In [1]:
import pandas as pd
import numpy as np
df= pd.read_csv("SMSSpamCollection.csv", sep='\t', names=['label', 'messages'])

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import 
from nltk.corpus import stopwordsword_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('all')
import string


In [2]:
df.shape

(5572, 2)

In [3]:
df.duplicated().sum()


403

In [4]:
df.drop_duplicates(keep='first', inplace=True)

In [5]:
df.duplicated().sum()
df

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
import plotly.express as px
px.histogram(df,x=df['label'],color=df['label'])

SMOTE (Synthetic Minority Over-sampling Technique) is an algorithm for oversampling in unbalanced datasets. It generates synthetic data samples of the minority class by interpolating between existing minority class instances. The algorithm selects two or more nearest neighbors of a minority class instance and then generates a synthetic sample by interpolating between the selected instances. The new samples are added to the original dataset to balance the class distribution.

In [7]:
def text_process(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [i for i in text if i not in stopwords.words('english')]
    text = [i for i in text if i not in string.punctuation]
    text = [i for i in text if i.isalnum()]
    text = [WordNetLemmatizer().lemmatize(i) for i in text]
    return " ".join(text)


Lemmatization is a process of reducing a word to its base form, called a lemma, while considering the context in which the word appears. The goal of lemmatization is to group together different inflected forms of a word so they can be analyzed as a single item, allowing for better text analysis and natural language processing.

Lemmatization is different from stemming, which involves removing the suffixes of a word to reduce it to its root form without considering context, which often results in words with a different meaning. Lemmatization uses a dictionary or a morphological analysis to determine the correct lemma for a word.

In [8]:
temp_1 = pd.get_dummies(df['label'],drop_first=True)

In [9]:
df = pd.concat([df, temp_1], axis=1)
df.drop(columns='label', inplace=True)

In [10]:
df['Transformed_Message'] = df['messages'].apply(text_process)

In [11]:
from sklearn.metrics import accuracy_score,f1_score, recall_score, precision_score,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression

Generative models focus on the distribution of individual classes in a dataset and the learning algorithms tend to model the underlying patterns or distribution of the data points

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = pd.DataFrame(cv.fit_transform(df['Transformed_Message']).toarray())

In [15]:
y=df['spam'].values

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [None]:
from imblearn.combine import SMOTETomek

smk = SMOTETomek()
X_res,y_res=smk.fit_resample(X_train,y_train)
X_res.shape,y_res.shape
from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_res)))

In [None]:

GNB = GaussianNB()
accuracy_GNB = cross_val_score(GNB, X_test, y_test, cv=5)
GNB.fit(X_res, y_res)
y_pred1 = GNB.predict(X_test)
cm1= confusion_matrix(y_test,y_pred1)
print("Confusion Matrix", confusion_matrix(y_test,y_pred1))
print("Precision Score",precision_score(y_test,y_pred1))
print("Accuracy Score:",np.mean(accuracy_GNB))
print("F1 score:",f1_score(y_test,y_pred1))
print("Train: ",GNB.score(X_train,y_train))
print("Test: ",GNB.score(X_test,y_test))
print("F1: ",f1_score(y_test,y_pred1))

Confusion Matrix [[1295  168]
 [  30  213]]
Precision Score 0.5590551181102362
Accuracy Score: 0.8915727735761692
F1 score: 0.6826923076923077
Train:  0.9419578400231013
Test:  0.8839390386869871
F1:  0.6826923076923077


In [None]:
MNB = MultinomialNB()
accuracy_MNB = cross_val_score(MNB, X_test, y_test, cv=5)
MNB.fit(X_res, y_res)
y_pred2 = MNB.predict(X_test)
cm2 = confusion_matrix(y_test,y_pred2)
print("Confusion Matrix", confusion_matrix(y_test,y_pred2))
print("Precision Score",precision_score(y_test,y_pred2))
print("Accuracy Score:",np.mean(accuracy_MNB))
print("Train: ",MNB.score(X_train,y_train))
print("Test: ",MNB.score(X_test,y_test))
print("F1: ",f1_score(y_test,y_pred2))

Confusion Matrix [[1398   65]
 [  13  230]]
Precision Score 0.7796610169491526
Accuracy Score: 0.9753785735110014
Train:  0.9742997401097314
Test:  0.9542790152403282
F1:  0.8550185873605949


In [None]:
BNB = BernoulliNB()
accuracy_BNB = cross_val_score(BNB, X_test, y_test, cv=5)
BNB.fit(X_res, y_res)
y_pred3 = BNB.predict(X_test)
cm3 = confusion_matrix(y_test,y_pred3)
print("Confusion Matrix", confusion_matrix(y_test,y_pred3))
print("Precision Score",precision_score(y_test,y_pred3))
print("Accuracy Score:",np.mean(accuracy_BNB))
print("Train: ",BNB.score(X_res,y_res))
print("Test: ",BNB.score(X_test,y_test))
print("F1: ",f1_score(y_test,y_pred3))

Confusion Matrix [[1340  123]
 [  17  226]]
Precision Score 0.6475644699140402
Accuracy Score: 0.8915641988647082
Train:  0.9697019325253848
Test:  0.917936694021102
F1:  0.7635135135135135


In [None]:
KNN = KNeighborsClassifier()
accuracy_KNN = cross_val_score(KNN,X_test,y_test)
KNN.fit(X_res,y_res)
y_pred4 = KNN.predict(X_test)
cm4 = confusion_matrix(y_test,y_pred4)
print("Confusion Matrix", confusion_matrix(y_test,y_pred4))
print("Precision Score",precision_score(y_test,y_pred4))
print("Accuracy Score:",np.mean(accuracy_BNB))
print("Train: ",KNN.score(X_res,y_res))
print("Test: ",KNN.score(X_test,y_test))
print("F1: ",f1_score(y_test,y_pred4))

Confusion Matrix [[ 392 1071]
 [   3  240]]
Precision Score 0.18306636155606407
Accuracy Score: 0.8915641988647082
Train:  0.6832623648869964
Test:  0.3704572098475967
F1:  0.3088803088803089


In [None]:
LogR = LogisticRegression()
accuracy_LogR = cross_val_score(LogR,X_test,y_test)
LogR.fit(X_res,y_res)
y_pred5 = LogR.predict(X_test)
cm5 = confusion_matrix(y_test,y_pred5)

print("Confusion Matrix", confusion_matrix(y_test,y_pred5))
print("Precision Score",precision_score(y_test,y_pred5))
print("Accuracy Score:",np.mean(accuracy_LogR))
print("Train: ",LogR.score(X_res,y_res))
print("Test: ",LogR.score(X_test,y_test))

print("F1 score : ",f1_score(y_test,y_pred5))

Confusion Matrix [[1287  176]
 [  19  224]]
Precision Score 0.56
Accuracy Score: 0.9660012690572962
Train:  0.9824762528660335
Test:  0.8856975381008206
F1 score :  0.6967340590979783


In [None]:
result = BNB.predict(vector_input)[0]

if result==1:
    print("Spam")
else:
    print("Not Spam")

Spam


In [None]:
def knn_predict(x,y,xtest,k_neighbour=5,matric='cosine'):
    if matric not in matrics: raise Exception("Enter Correct Matric : ",matrics)
    y_pred = list()
    rev = False
    if matric == 'cosine': 
        rev = True
    for i in range(len(xtest)):
        store_distance = dict()
        for j in range(len(x)): 
            store_distance[x[j]]=globals()[matric](sparse_matrix[xtest[i]],sparse_matrix[x[j]])
        count = k_neighbour
        ham = 0
        spam = 0
        for k in sorted(store_distance.items(),key=lambda x:x[1], reverse=rev):
            count-=1
            if(email_marking[k[0]]==1): ham +=1
            else: spam +=1
            if(count==0): break
        y_pred.append(ham > spam)

    return y_pred

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=82255365-501c-401b-819d-9fad1f8d4e79' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>