In [17]:
import pandas as pd
import numpy as np

# Method 1: More robust loading with error handling
try:
    # Try with explicit encoding and error handling
    messages = pd.read_csv(r'C:\Users\avira\OneDrive\Desktop\UDEMY\NLP & Deep Learning\NLP\SMSSpamCollection.csv', 
                          sep='\t', 
                          names=['label', 'message'], 
                          encoding='utf-8',
                          engine='python',
                          on_bad_lines='skip')
    print("Method 1 attempted...")
    print(f"Shape: {messages.shape}")
    print("First few rows:")
    print(messages.head())
    
    # Check if tab separation actually worked
    if messages.shape[1] == 2:
        # Check if the first column still contains tabs (meaning separation failed)
        sample_label = str(messages.iloc[0]['label']) if len(messages) > 0 else ""
        if '\t' in sample_label:
            print("Tab separation failed - labels still contain tab characters")
            raise Exception("Tab separation didn't work properly")
        else:
            print("Method 1 successful - tab separation worked!")
    else:
        raise Exception("Wrong number of columns")
    
except Exception as e:
    print(f"Method 1 issue detected: {e}")
    
    # Method 2: Manual splitting approach (most reliable)
    try:
        # Read the file as a single column first
        with open(r'C:\Users\avira\OneDrive\Desktop\UDEMY\NLP & Deep Learning\NLP\SMSSpamCollection.csv', 'r', encoding='utf-8') as file:
            lines = file.readlines()
        
        # Parse each line manually
        labels = []
        messages_text = []
        
        for line in lines:
            line = line.strip()
            if line:  # Skip empty lines
                # Split on the first tab only
                parts = line.split('\t', 1)
                if len(parts) == 2:
                    labels.append(parts[0])
                    messages_text.append(parts[1])
        
        # Create dataframe
        messages = pd.DataFrame({
            'label': labels,
            'message': messages_text
        })
        
        print("Method 2 (manual parsing) successful!")
        print(f"Shape: {messages.shape}")
        print("Sample data:")
        for i in range(min(3, len(messages))):
            print(f"  {i}: '{messages.iloc[i]['label']}' - '{messages.iloc[i]['message'][:50]}...'")
            
    except Exception as e2:
        print(f"Method 2 failed: {e2}")
        
        # Method 3: Let pandas auto-detect
        try:
            messages = pd.read_csv(r'C:\Users\avira\OneDrive\Desktop\UDEMY\NLP & Deep Learning\NLP\SMSSpamCollection.csv', 
                                  sep='\t', 
                                  header=None,
                                  encoding='utf-8')
            # Manually assign column names
            if messages.shape[1] >= 2:
                messages.columns = ['label', 'message'] + [f'extra_{i}' for i in range(2, messages.shape[1])]
                # Keep only first two columns if there are extras
                messages = messages[['label', 'message']]
            print("Method 3 (auto-detect) successful!")
            print(f"Shape: {messages.shape}")
            print(messages.head())
            
        except Exception as e3:
            print(f"All methods failed. Last error: {e3}")

# Clean up and validate the data
if 'messages' in locals() and len(messages) > 0:
    print(f"\n" + "="*50)
    print("DATA VALIDATION AND CLEANUP:")
    
    # Check for any remaining issues
    print(f"Dataset shape: {messages.shape}")
    print(f"Any null values? {messages.isnull().sum().sum()}")
    
    # Validate labels
    unique_labels = messages['label'].unique()
    print(f"Unique labels: {unique_labels}")
    
    # Clean up any whitespace
    messages['label'] = messages['label'].str.strip()
    messages['message'] = messages['message'].str.strip()
    
    # Remove any empty rows
    messages = messages[(messages['label'] != '') & (messages['message'] != '')]
    
    print(f"Final shape after cleanup: {messages.shape}")
    print(f"Label distribution:")
    print(messages['label'].value_counts())
    
    print(f"\nFirst 3 examples:")
    for i in range(min(3, len(messages))):
        label = messages.iloc[i]['label']
        message = messages.iloc[i]['message']
        print(f"  Row {i}: [{label}] {message[:60]}...")
    
    print(f"\nDataset is ready to use!")
    print("Access with:")
    print("  - messages['label'] for spam/ham labels") 
    print("  - messages['message'] for SMS text")

else:
    print("No valid data loaded. Please check your file path and format.")

# Fix the column splitting issue
print("\n" + "="*50)
print("FIXING THE COLUMN SPLITTING:")

if 'messages' in locals() and messages.shape[1] == 1:
    print("Data loaded but columns not split properly. Fixing...")
    
    # Get the single column that contains both label and message
    combined_column = messages.iloc[:, 0]
    
    # Split on the first tab character
    split_data = combined_column.str.split('\t', n=1, expand=True)
    
    # Create new dataframe with proper columns
    messages_fixed = pd.DataFrame({
        'label': split_data[0],
        'message': split_data[1]
    })
    
    # Remove any rows where splitting failed
    messages_fixed = messages_fixed.dropna()
    
    print(f"Fixed dataset shape: {messages_fixed.shape}")
    print(f"Label distribution:")
    print(messages_fixed['label'].value_counts())
    
    print(f"\nFirst 5 rows after fixing:")
    for i in range(min(5, len(messages_fixed))):
        label = messages_fixed.iloc[i]['label']
        message = messages_fixed.iloc[i]['message'][:50]
        print(f"{i}: {label} - {message}...")
    
    # Replace the original messages dataframe
    messages = messages_fixed
    print(f"\nFinal dataset ready to use!")
    print(f"Access with: messages['label'] and messages['message']")

else:
    print("Data appears to be properly formatted already.")

Method 1 attempted...
Shape: (5574, 2)
First few rows:
                                               label  message
0  ham\tGo until jurong point, crazy.. Available ...      NaN
1                 ham\tOk lar... Joking wif u oni...      NaN
2  spam\tFree entry in 2 a wkly comp to win FA Cu...      NaN
3  ham\tU dun say so early hor... U c already the...      NaN
4  ham\tNah I don't think he goes to usf, he live...      NaN
Tab separation failed - labels still contain tab characters
Method 1 issue detected: Tab separation didn't work properly
Method 2 (manual parsing) successful!
Shape: (5574, 2)
Sample data:
  0: '"ham' - 'Go until jurong point, crazy.. Available only in b...'
  1: '"ham' - 'Ok lar... Joking wif u oni..."...'
  2: '"spam' - 'Free entry in 2 a wkly comp to win FA Cup final tk...'

DATA VALIDATION AND CLEANUP:
Dataset shape: (5574, 2)
Any null values? 0
Unique labels: ['"ham' '"spam']
Final shape after cleanup: (5574, 2)
Label distribution:
label
"ham     4827
"spam     

In [18]:
# Access the data
X = messages['message']  # SMS text features
y = messages['label']    # ham/spam labels

# Check the distribution
print(messages['label'].value_counts())

label
"ham     4827
"spam     747
Name: count, dtype: int64


In [19]:
# Clean up the quotation marks from labels and messages
print("Cleaning up quotation marks...")

# Remove quotes from labels (ham/spam)
messages['label'] = messages['label'].str.replace('"', '', regex=False)

# Remove quotes from the beginning and end of messages
messages['message'] = messages['message'].str.replace('"', '', regex=False)

print("Cleanup complete!")
print(f"\nCleaned data preview:")
print(f"Shape: {messages.shape}")

# Show first few rows to verify cleanup
for i in range(5):
    label = messages.iloc[i]['label']
    message = messages.iloc[i]['message'][:60]
    print(f"Row {i}: [{label}] {message}...")

print(f"\nLabel distribution after cleanup:")
print(messages['label'].value_counts())

print(f"\nUnique labels: {messages['label'].unique()}")

# Verify no quotes remain
has_quotes_labels = messages['label'].str.contains('"', na=False).sum()
has_quotes_messages = messages['message'].str.contains('"', na=False).sum()

print(f"\nQuotes remaining in labels: {has_quotes_labels}")
print(f"Quotes remaining in messages: {has_quotes_messages}")

print(f"\n✅ Dataset is now clean and ready for NLP analysis!")


Cleaning up quotation marks...
Cleanup complete!

Cleaned data preview:
Shape: (5574, 2)
Row 0: [ham] Go until jurong point, crazy.. Available only in bugis n gre...
Row 1: [ham] Ok lar... Joking wif u oni......
Row 2: [spam] Free entry in 2 a wkly comp to win FA Cup final tkts 21st Ma...
Row 3: [ham] U dun say so early hor... U c already then say......
Row 4: [ham] Nah I don't think he goes to usf, he lives around here thoug...

Label distribution after cleanup:
label
ham     4827
spam     747
Name: count, dtype: int64

Unique labels: ['ham' 'spam']

Quotes remaining in labels: 0
Quotes remaining in messages: 0

✅ Dataset is now clean and ready for NLP analysis!


In [20]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [21]:
# Data Cleaning and Preprocessing

import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordlemmatize = WordNetLemmatizer()

In [23]:
corpus = []
for i in range(0, len(messages)):
  review = re.sub('[^a-zA-z]',' ', messages['message'][i]) ##substitute any thing rather than a-z/A-Z with blank from message and store in review
  review = review.lower()
  review = review.split()
  review = [wordlemmatize.lemmatize(word) for word in review if not word in stopwords.words('english')] 
  review = ' '.join(review)
  corpus.append(review)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
tfidf = TfidfVectorizer(max_features=100)
tfidf.fit_transform(corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.434, 0, 0, 0.461, 0.544, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.456, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.473, 0, 0, 0, 0, 0, 0, 0, 0.492, 0, 0, 0, 0, 0, 0, 0, 0.571, 0, 0, 0, 0, 0, 0],
       [0.465, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.486, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.574, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [26]:
import numpy as np
np.set_printoptions(edgeitems = 30, linewidth= 100000,
                    formatter=dict(float = lambda x: "%.3g" % x))

In [27]:
x

NameError: name 'x' is not defined

N-Grams

In [31]:
tfidf = TfidfVectorizer(max_features=100, ngram_range=(2,2))
tfidf.fit_transform(corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [32]:
tfidf.vocabulary_

{'free entry': 32,
 'claim call': 15,
 'call claim': 3,
 'claim code': 16,
 'free call': 31,
 'chance win': 14,
 'txt word': 88,
 'let know': 54,
 'please call': 68,
 'lt gt': 58,
 'want go': 97,
 'first time': 30,
 'like lt': 55,
 'sorry call': 80,
 'call later': 8,
 'ur awarded': 90,
 'hi hi': 46,
 'call customer': 4,
 'customer service': 23,
 'guaranteed cash': 42,
 'cash prize': 13,
 'po box': 70,
 'trying contact': 85,
 'draw show': 28,
 'show prize': 79,
 'prize guaranteed': 74,
 'guaranteed call': 41,
 'valid hr': 95,
 'selected receive': 76,
 'private account': 72,
 'account statement': 0,
 'statement show': 81,
 'call identifier': 5,
 'identifier code': 50,
 'code expires': 20,
 'urgent mobile': 94,
 'call landline': 7,
 'wat time': 98,
 'ur mob': 93,
 'gud ni': 44,
 'new year': 62,
 'send stop': 78,
 'co uk': 19,
 'gud mrng': 43,
 'nice day': 63,
 'lt decimal': 57,
 'decimal gt': 25,
 'txt nokia': 86,
 'good morning': 36,
 'ur friend': 92,
 'good night': 37,
 'reply call': 75

In [34]:
x

NameError: name 'x' is not defined