### 🤖 Spam classifier using multiple **NLP** techniques 📧

In [1]:
# imports
import pandas as pd
import nltk
import gensim

In [2]:
df=pd.read_csv('spam.csv',encoding='latin1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Preprocessing (fe and eda)

In [5]:
df.drop(columns=df.columns[2:],inplace=True)

In [6]:
df.shape

(5572, 2)

In [7]:
df.rename(columns={'v1':'label','v2':'message'},inplace=True)

In [8]:
import nltk
import re
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
wordlemmatize=WordNetLemmatizer()

In [10]:
corpus=[]
for i in range(0,len(df)):
    message=re.sub('[^a-zA-z]',' ',df['message'][i])
    message=message.lower()
    message=message.split()
    message=[wordlemmatize.lemmatize(word) for word in message if not word in stopwords.words('english')]
    df['message'][i]=message
    corpus.append(' '.join(message))

In [11]:
df.sample(7)

Unnamed: 0,label,message
441,ham,"[supposed, wake, gt]"
4218,ham,"[anything, lor, go, go, lor]"
2784,ham,"[k, wat, tht, incident]"
1056,ham,"[u, drive, lor]"
3743,ham,"[nobody, name, penis, girl, name, story, add]"
3787,spam,"[want, funk, ur, fone, weekly, new, tone, repl..."
1023,ham,"[may, call, later, pls]"


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer(max_features=100,ngram_range=(1,2),binary=True)

In [13]:
# Here we need to do binary true then we can also use bernoulli naivebayes

In [14]:
X=bow.fit_transform(corpus).toarray()

In [15]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
bow.vocabulary_

{'go': 22,
 'great': 26,
 'got': 25,
 'wat': 90,
 'ok': 59,
 'free': 18,
 'win': 94,
 'text': 79,
 'txt': 86,
 'say': 70,
 'already': 0,
 'think': 82,
 'life': 38,
 'hey': 29,
 'week': 92,
 'back': 5,
 'like': 39,
 'still': 75,
 'send': 72,
 'friend': 19,
 'prize': 65,
 'claim': 9,
 'call': 6,
 'mobile': 50,
 'co': 10,
 'home': 31,
 'want': 89,
 'today': 84,
 'cash': 8,
 'day': 14,
 'reply': 67,
 'www': 96,
 'right': 68,
 'take': 77,
 'time': 83,
 'message': 47,
 'com': 11,
 'oh': 58,
 'yes': 99,
 'make': 45,
 'way': 91,
 'dont': 16,
 'miss': 49,
 'ur': 88,
 'going': 23,
 'da': 13,
 'lor': 41,
 'meet': 46,
 'really': 66,
 'know': 34,
 'lol': 40,
 'love': 42,
 'let': 37,
 'work': 95,
 'yeah': 97,
 'tell': 78,
 'anything': 2,
 'thanks': 80,
 'uk': 87,
 'please': 63,
 'msg': 52,
 'see': 71,
 'pls': 64,
 'need': 54,
 'tomorrow': 85,
 'hope': 32,
 'well': 93,
 'lt': 43,
 'gt': 27,
 'lt gt': 44,
 'get': 20,
 'ask': 3,
 'morning': 51,
 'happy': 28,
 'sorry': 74,
 'give': 21,
 'new': 55,
 'fin

In [17]:
# X is independent features
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le.fit(df['label'])

In [18]:
y=le.transform(df['label'])

In [19]:
le.classes_

array(['ham', 'spam'], dtype=object)

In [20]:
import numpy as np

In [21]:
X.shape,y.shape

((5572, 100), (5572,))

In [22]:
type(X),type(y)

(numpy.ndarray, numpy.ndarray)

In [23]:
df_final=pd.DataFrame(np.column_stack((X,y)),columns=[f'X{i}' for i in range(1,101)]+['y'])

In [24]:
df_final.sample(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X92,X93,X94,X95,X96,X97,X98,X99,X100,y
2955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1331,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2506,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1064,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
3992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3930,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [26]:
# Imbalance dataset -- so accuracy is not good measure
#  and obviously spam would be less
# we can apply various techniques like upsampling etc but for now we will move to other accuracy score

In [27]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)

In [28]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()

In [29]:
mnb.fit(X_train,y_train)

In [30]:
y_pred=mnb.predict(X_test)

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1214
           1       0.89      0.82      0.85       179

    accuracy                           0.96      1393
   macro avg       0.93      0.90      0.92      1393
weighted avg       0.96      0.96      0.96      1393



In [32]:
# Using tfidf

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=2000,ngram_range=(1,2))
# These are hyperparameters we need to tune

In [34]:
len(corpus)

5572

In [35]:
# ✅ Summary for your exact question:
# ✅ Your doubt	✅ Reality
# MNB expects input categories like one-hot or labels?	❌ No
# MNB expects discrete counts?	✅ Originally, yes
# Can MNB work on TF-IDF continuous features?	✅ Yes (common practice, works fine even though it's a hack)
# Does MNB learn category-wise bins or categories for each feature?	❌ No, it learns conditional feature likelihoods for each feature index per class.

In [36]:
# Just for showing otherwise we can't do split like this because we need random sampling and preserve orginal distributin
corpus_train=corpus[:int(3*(5572/4))]
corpus_test=corpus[int(3*(5572/4)):]
y_train=y[:int(3*(5572/4))]
y_test=y[int(3*(5572/4)):]

In [37]:
X_train=tfidf.fit_transform(corpus_train)

In [38]:
X_test=tfidf.transform(corpus_test)

In [39]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(X_train,y_train)
y_pred=mnb.predict(X_test)

In [40]:
print(classification_report(y_pred=y_pred,y_true=y_test))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1211
           1       0.97      0.86      0.91       182

    accuracy                           0.98      1393
   macro avg       0.97      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [41]:
# Using word2vec and averae word2vec

In [42]:
# Till this stage it wil remain similar as it is basic text preprocessing
df

Unnamed: 0,label,message
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, think, go, usf, life, around, though]"
...,...,...
5567,spam,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,ham,"[_, b, going, esplanade, fr, home]"
5569,ham,"[pity, mood, suggestion]"
5570,ham,"[guy, bitching, acted, like, interested, buyin..."


In [43]:
import gensim

In [44]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [45]:
wv.get_mean_vector(['apple' ,'mango'])

array([-0.05100081, -0.01163026,  0.0112685 ,  0.05735083, -0.03516588,
        0.04631148,  0.05745351, -0.06669649, -0.03802702,  0.06472728,
        0.02161846, -0.03494708, -0.02307806,  0.02040998, -0.05585345,
        0.07940796, -0.02461164,  0.01876362, -0.05041343,  0.01298413,
       -0.0925411 ,  0.05851515,  0.05984956, -0.01032645,  0.00220206,
       -0.01485917, -0.06615578,  0.04388044,  0.00570427,  0.04196128,
       -0.01958964, -0.01832396, -0.02153732,  0.05990501, -0.04396031,
       -0.00090472,  0.00508461, -0.11603972,  0.07676972,  0.03122808,
        0.00115335, -0.0140543 , -0.01912641,  0.02279656, -0.02415244,
       -0.07157112, -0.01524638, -0.02177763,  0.01131025,  0.02391658,
       -0.03354566,  0.04918388,  0.03490923,  0.02717781, -0.03186986,
       -0.01987224,  0.00960829, -0.00728065, -0.03924953, -0.0776751 ,
        0.01155698, -0.0396637 , -0.01874629, -0.00555709,  0.00196626,
       -0.02333881, -0.01406609, -0.12383032,  0.04311423, -0.01

In [46]:
wv['snajkn/']

KeyError: "Key 'snajkn/' not present"

In [None]:
# this avgword2vec function doesn't rase error if not present , instead give 0 
wv.get_mean_vector(['snajkn/'])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [None]:
wv.get_mean_vector([])
# but raises on empty list

ValueError: cannot compute mean with no input

In [None]:
# method 1: each xi:
# wv.get_mean_vector() run for that and get the Xnew

# or method2:
# train our own word2vec from scratch


In [None]:
len(corpus)

5572

In [47]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [48]:
from sklearn.model_selection import train_test_split
corpus_train,corpus_test,y_train,y_test=train_test_split(corpus,y,test_size=0.25)

In [49]:
len(corpus_test)

1393

In [50]:
# tqdm
from tqdm import tqdm
import time

for i in tqdm(range(100)):
    time.sleep(0.01) 

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:01<00:00, 94.86it/s]


In [56]:
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
from tqdm import tqdm

def tokenize_text(text):
    """Tokenize one document: sentence split + word tokenization + cleaning."""
    tokens = []
    sentences = sent_tokenize(text)
    for sent in sentences:
        tokens.extend(simple_preprocess(sent))
    return tokens

def tokenize_corpus(corpus):
    """Tokenize the whole corpus with progress bar."""
    return [tokenize_text(text) for text in tqdm(corpus, desc="Tokenizing Corpus")]

# Example usage:
# corpus = ["Hello World! This is NLP.", "Another text..."]
tokenized_corpus = tokenize_corpus(corpus)

# Optional: Check result
print(tokenized_corpus[:3])


Tokenizing Corpus: 100%|██████████| 5572/5572 [00:00<00:00, 111271.91it/s]

[['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'la', 'buffet', 'cine', 'got', 'amore', 'wat'], ['ok', 'lar', 'joking', 'wif', 'oni'], ['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'st', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply']]





In [57]:
# Scratch
import gensim
from gensim.models import KeyedVectors,Word2Vec
model=Word2Vec(window=5,vector_size=100,sentences=tokenized_corpus)

In [58]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'text',
 'love',
 'want',
 'send',
 'txt',
 'need',
 'one',
 'today',
 'going',
 'home',
 'stop',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'think',
 'tell',
 'dont',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'pls',
 'please',
 'co',
 'msg',
 'min',
 'make',
 'dear',
 'night',
 'message',
 'say',
 'well',
 'thing',
 'much',
 'great',
 'claim',
 'oh',
 'hope',
 'hey',
 'number',
 'friend',
 'happy',
 'wat',
 'work',
 'give',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'cash',
 'yeah',
 'life',
 'really',
 'amp',
 'meet',
 'im',
 'babe',
 'find',
 'morning',
 'miss',
 'service',
 'year',
 'last',
 'uk',
 'thanks',
 'com',
 'would',
 'nokia',
 'anything',
 'lol',
 'also',
 'care',
 'every',
 'feel',
 'keep',
 'pick',
 'sure',
 'sent',
 'urgent',
 'contact',


In [None]:
df['message'][0]
# -->list of words

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [None]:
model.wv.get_mean_vector(['ignore','cash']).shape

(100,)

In [65]:
type(df['message'][0])

list

In [72]:
import numpy as np

vector_size = model.wv.vector_size

def safe_mean_vector(words):
	# Filter words to those in the vocabulary
	words_in_vocab = [w for w in words if w in model.wv]
	if words_in_vocab: #(checking list is not empty)
		return model.wv.get_mean_vector(words_in_vocab)
	else:
		return np.zeros(vector_size, dtype=np.float32)

X = [safe_mean_vector(df['message'][i]) for i in range(len(df))]

In [73]:
# This should be performed before word2vec and all but here we were learning word2vec that's why not a issue
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y)

In [75]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(X_train,y_train)

ValueError: Negative values in data passed to MultinomialNB (input X).

In [None]:
# mnb navie bayes can't be used with word2vec

In [76]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,y_train)

In [77]:
y_pred=rf.predict(X_test)

In [79]:
print(classification_report(y_pred=y_pred,y_true=y_test))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1215
           1       0.95      0.71      0.81       178

    accuracy                           0.96      1393
   macro avg       0.95      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393

