In [1]:
import scipy, gensim
from gensim.models import Word2Vec, keyedvectors

In [2]:
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(
    r"C:\Users\avira\OneDrive\Desktop\UDEMY\NLP & Deep Learning\NLP\GoogleNews-vectors-negative300.bin",
    binary=True
)

In [3]:
import pandas as pd
import numpy as np

# Method 1: More robust loading with error handling
try:
    # Try with explicit encoding and error handling
    messages = pd.read_csv(r'C:\Users\avira\OneDrive\Desktop\UDEMY\NLP & Deep Learning\NLP\SMSSpamCollection.csv', 
                          sep='\t', 
                          names=['label', 'message'], 
                          encoding='utf-8',
                          engine='python',
                          on_bad_lines='skip')
    print("Method 1 attempted...")
    print(f"Shape: {messages.shape}")
    print("First few rows:")
    print(messages.head())
    
    # Check if tab separation actually worked
    if messages.shape[1] == 2:
        # Check if the first column still contains tabs (meaning separation failed)
        sample_label = str(messages.iloc[0]['label']) if len(messages) > 0 else ""
        if '\t' in sample_label:
            print("Tab separation failed - labels still contain tab characters")
            raise Exception("Tab separation didn't work properly")
        else:
            print("Method 1 successful - tab separation worked!")
    else:
        raise Exception("Wrong number of columns")
    
except Exception as e:
    print(f"Method 1 issue detected: {e}")
    
    # Method 2: Manual splitting approach (most reliable)
    try:
        # Read the file as a single column first
        with open(r'C:\Users\avira\OneDrive\Desktop\UDEMY\NLP & Deep Learning\NLP\SMSSpamCollection.csv', 'r', encoding='utf-8') as file:
            lines = file.readlines()
        
        # Parse each line manually
        labels = []
        messages_text = []
        
        for line in lines:
            line = line.strip()
            if line:  # Skip empty lines
                # Split on the first tab only
                parts = line.split('\t', 1)
                if len(parts) == 2:
                    labels.append(parts[0])
                    messages_text.append(parts[1])
        
        # Create dataframe
        messages = pd.DataFrame({
            'label': labels,
            'message': messages_text
        })
        
        print("Method 2 (manual parsing) successful!")
        print(f"Shape: {messages.shape}")
        print("Sample data:")
        for i in range(min(3, len(messages))):
            print(f"  {i}: '{messages.iloc[i]['label']}' - '{messages.iloc[i]['message'][:50]}...'")
            
    except Exception as e2:
        print(f"Method 2 failed: {e2}")
        
        # Method 3: Let pandas auto-detect
        try:
            messages = pd.read_csv(r'C:\Users\avira\OneDrive\Desktop\UDEMY\NLP & Deep Learning\NLP\SMSSpamCollection.csv', 
                                  sep='\t', 
                                  header=None,
                                  encoding='utf-8')
            # Manually assign column names
            if messages.shape[1] >= 2:
                messages.columns = ['label', 'message'] + [f'extra_{i}' for i in range(2, messages.shape[1])]
                # Keep only first two columns if there are extras
                messages = messages[['label', 'message']]
            print("Method 3 (auto-detect) successful!")
            print(f"Shape: {messages.shape}")
            print(messages.head())
            
        except Exception as e3:
            print(f"All methods failed. Last error: {e3}")

# Clean up and validate the data
if 'messages' in locals() and len(messages) > 0:
    print(f"\n" + "="*50)
    print("DATA VALIDATION AND CLEANUP:")
    
    # Check for any remaining issues
    print(f"Dataset shape: {messages.shape}")
    print(f"Any null values? {messages.isnull().sum().sum()}")
    
    # Validate labels
    unique_labels = messages['label'].unique()
    print(f"Unique labels: {unique_labels}")
    
    # Clean up any whitespace
    messages['label'] = messages['label'].str.strip()
    messages['message'] = messages['message'].str.strip()
    
    # Remove any empty rows
    messages = messages[(messages['label'] != '') & (messages['message'] != '')]
    
    print(f"Final shape after cleanup: {messages.shape}")
    print(f"Label distribution:")
    print(messages['label'].value_counts())
    
    print(f"\nFirst 3 examples:")
    for i in range(min(3, len(messages))):
        label = messages.iloc[i]['label']
        message = messages.iloc[i]['message']
        print(f"  Row {i}: [{label}] {message[:60]}...")
    
    print(f"\nDataset is ready to use!")
    print("Access with:")
    print("  - messages['label'] for spam/ham labels") 
    print("  - messages['message'] for SMS text")

else:
    print("No valid data loaded. Please check your file path and format.")

# Fix the column splitting issue
print("\n" + "="*50)
print("FIXING THE COLUMN SPLITTING:")

if 'messages' in locals() and messages.shape[1] == 1:
    print("Data loaded but columns not split properly. Fixing...")
    
    # Get the single column that contains both label and message
    combined_column = messages.iloc[:, 0]
    
    # Split on the first tab character
    split_data = combined_column.str.split('\t', n=1, expand=True)
    
    # Create new dataframe with proper columns
    messages_fixed = pd.DataFrame({
        'label': split_data[0],
        'message': split_data[1]
    })
    
    # Remove any rows where splitting failed
    messages_fixed = messages_fixed.dropna()
    
    print(f"Fixed dataset shape: {messages_fixed.shape}")
    print(f"Label distribution:")
    print(messages_fixed['label'].value_counts())
    
    print(f"\nFirst 5 rows after fixing:")
    for i in range(min(5, len(messages_fixed))):
        label = messages_fixed.iloc[i]['label']
        message = messages_fixed.iloc[i]['message'][:50]
        print(f"{i}: {label} - {message}...")
    
    # Replace the original messages dataframe
    messages = messages_fixed
    print(f"\nFinal dataset ready to use!")
    print(f"Access with: messages['label'] and messages['message']")

else:
    print("Data appears to be properly formatted already.")

Method 1 attempted...
Shape: (5574, 2)
First few rows:
                                               label  message
0  ham\tGo until jurong point, crazy.. Available ...      NaN
1                 ham\tOk lar... Joking wif u oni...      NaN
2  spam\tFree entry in 2 a wkly comp to win FA Cu...      NaN
3  ham\tU dun say so early hor... U c already the...      NaN
4  ham\tNah I don't think he goes to usf, he live...      NaN
Tab separation failed - labels still contain tab characters
Method 1 issue detected: Tab separation didn't work properly
Method 2 (manual parsing) successful!
Shape: (5574, 2)
Sample data:
  0: '"ham' - 'Go until jurong point, crazy.. Available only in b...'
  1: '"ham' - 'Ok lar... Joking wif u oni..."...'
  2: '"spam' - 'Free entry in 2 a wkly comp to win FA Cup final tk...'

DATA VALIDATION AND CLEANUP:
Dataset shape: (5574, 2)
Any null values? 0
Unique labels: ['"ham' '"spam']
Final shape after cleanup: (5574, 2)
Label distribution:
label
"ham     4827
"spam     

In [4]:
# Clean up the quotation marks from labels and messages
print("Cleaning up quotation marks...")

# Remove quotes from labels (ham/spam)
messages['label'] = messages['label'].str.replace('"', '', regex=False)

# Remove quotes from the beginning and end of messages
messages['message'] = messages['message'].str.replace('"', '', regex=False)

print("Cleanup complete!")
print(f"\nCleaned data preview:")
print(f"Shape: {messages.shape}")

# Show first few rows to verify cleanup
for i in range(5):
    label = messages.iloc[i]['label']
    message = messages.iloc[i]['message'][:60]
    print(f"Row {i}: [{label}] {message}...")

print(f"\nLabel distribution after cleanup:")
print(messages['label'].value_counts())

print(f"\nUnique labels: {messages['label'].unique()}")

# Verify no quotes remain
has_quotes_labels = messages['label'].str.contains('"', na=False).sum()
has_quotes_messages = messages['message'].str.contains('"', na=False).sum()

print(f"\nQuotes remaining in labels: {has_quotes_labels}")
print(f"Quotes remaining in messages: {has_quotes_messages}")

print(f"\n✅ Dataset is now clean and ready for NLP analysis!")


Cleaning up quotation marks...
Cleanup complete!

Cleaned data preview:
Shape: (5574, 2)
Row 0: [ham] Go until jurong point, crazy.. Available only in bugis n gre...
Row 1: [ham] Ok lar... Joking wif u oni......
Row 2: [spam] Free entry in 2 a wkly comp to win FA Cup final tkts 21st Ma...
Row 3: [ham] U dun say so early hor... U c already then say......
Row 4: [ham] Nah I don't think he goes to usf, he lives around here thoug...

Label distribution after cleanup:
label
ham     4827
spam     747
Name: count, dtype: int64

Unique labels: ['ham' 'spam']

Quotes remaining in labels: 0
Quotes remaining in messages: 0

✅ Dataset is now clean and ready for NLP analysis!


In [5]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [7]:
import re

In [8]:
corpus = []
for i in range(0, len(message)):
  review = re.sub('[^a-zA-Z]',' ', messages['message'][i])
  review = review.lower()
  review = review.split()
  
  review = [lemmatizer.lemmatize(word) for word in review]
  review = ' '.join(review)
  corpus.append(review)

In [29]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [10]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [None]:
words=[]
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))
    
## Simple preprocess : convert a document unto a list of lower case tokens, ignoring tokens that are too short or too long
  

In [12]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [13]:
import gensim 

In [14]:
## Lets train word2vec from scratch

In [18]:
model = gensim.models.Word2Vec(words)


In [19]:
# to get all the vocabulary
model.wv.index_to_key

['you',
 'to',
 'the',
 'your',
 'and',
 'for',
 'that',
 'in',
 'go',
 'call',
 'not',
 'free',
 'he',
 'like',
 'my',
 'is',
 'with',
 'have',
 'did',
 'all',
 'it',
 'or',
 'txt',
 'so',
 'no',
 'me',
 'do',
 'know',
 'if',
 'how',
 'been',
 'now',
 'at',
 'about',
 'on',
 'mobile',
 'wa',
 'anything']

In [35]:
model.corpus_count

60

In [21]:
model.epochs

5

In [36]:
model.wv.similar_by_word('mobile')

[('at', 0.14668814837932587),
 ('not', 0.1452500820159912),
 ('you', 0.09852369129657745),
 ('on', 0.09167248755693436),
 ('or', 0.09009508043527603),
 ('no', 0.08799311518669128),
 ('do', 0.07903679460287094),
 ('in', 0.07666514813899994),
 ('now', 0.07027751207351685),
 ('about', 0.05136902630329132)]

In [38]:
model.wv['mobile'].shape

(100,)

In [None]:
words[0]  # for all the words it will 100 dimension vectors

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [45]:
def avg_word2vec(doc):
  # remove out of vocab words
  # sent = [word of word in doc if word in model.wv.index_to_key]
  # print(sent)
  
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key])

In [40]:
!pip install tqdm



In [None]:
from tqdm import tqdm


In [46]:
# apply for the entire sentences

X=[]
for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  0%|          | 0/60 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 60/60 [00:00<00:00, 282.94it/s]


In [49]:
len(X)


60

In [50]:
# independent features
X_new = np.array(X)

In [51]:
X_new.shape

(60,)

In [52]:
X_new[0]

4.3459608605189715e-06

In [53]:
## Dependent features
y = pd.get_dummies(messages['label'])
y = y.iloc[:,0].values

In [54]:
y

array([ True,  True, False, ...,  True,  True,  True])

In [56]:
messages.shape

(5574, 2)

In [57]:
y.shape

(5574,)