In [None]:
# Numerical, tabular and plotting libraries used by the cells below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the whole session,
# so any warning raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw CSV, decoding it as ISO-8859-1 (Latin-1) instead of pandas' default encoding.
dataset = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [None]:
# Preview the first rows to inspect the raw column layout.
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Give the label column (v1) and the text column (v2) descriptive names.
column_names = {'v1': 'Target', 'v2': 'mailText'}
dataset = dataset.rename(columns=column_names)

In [None]:
# Number of distinct values in each column.
dataset.nunique()

Target           2
mailText      5169
Unnamed: 2      43
Unnamed: 3      10
Unnamed: 4       5
dtype: int64

In [None]:
# Append the contents of the three unnamed columns (presumably overflow fragments
# of the message -- verify against the CSV) to the message text. Missing pieces
# are treated as empty strings and no separator is inserted.
merged_text = dataset['mailText'].fillna('')
for overflow_column in ('Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'):
    merged_text = merged_text + dataset[overflow_column].fillna('')
dataset['mailText'] = merged_text

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

Target           0
mailText         0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [None]:
# The unnamed columns were already appended to mailText in an earlier cell,
# so they can be discarded.
overflow_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
dataset = dataset.drop(columns=overflow_columns)
dataset.head()

Unnamed: 0,Target,mailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Tally the labels once; .get(..., 0) yields zero when a label is absent.
label_counts = dataset['Target'].value_counts()
count_spam = label_counts.get('spam', 0)
count_not_spam = label_counts.get('ham', 0)

print("Count of 'spam':", count_spam)
print("Count of 'not spam':", count_not_spam)

Count of 'spam': 747
Count of 'not spam': 4825


In [None]:
# Lower-case every message so later token comparisons ignore capitalisation.
lowercased_text = dataset['mailText'].str.lower()
dataset['mailText'] = lowercased_text
dataset.head()

Unnamed: 0,Target,mailText
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [None]:
from nltk.tokenize import word_tokenize

def tokenize_text(text) :
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Tokenize every message and store the token lists in a new column.
dataset['Tokenize Text'] = dataset['mailText'].apply(tokenize_text)

In [None]:
# Check the newly added token column.
dataset.head()

Unnamed: 0,Target,mailText,Tokenize Text
0,ham,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy, .., avail..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,..."


In [None]:
def remove_numbers(tokens):
    """Return the tokens that are not made up solely of digit characters.

    Tokens that mix digits with other characters are kept, and the original
    order is preserved.
    """
    kept = []
    for tok in tokens:
        if tok.isdigit():
            continue
        kept.append(tok)
    return kept


# Overwrite the token column with the digit-only tokens removed.
dataset['Tokenize Text'] = dataset['Tokenize Text'].apply(remove_numbers)

In [None]:
import string

def remove_punctuations(tokens):
    """Drop tokens that consist only of ASCII punctuation characters.

    A token is removed when every one of its characters is in
    ``string.punctuation``; this covers single marks (``","``) as well as
    runs such as ``".."``, ``"..."`` or ``"!!"``. Tokens that merely contain
    punctuation (e.g. ``"n't"``) are kept. Empty tokens are removed.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the punctuation-only tokens removed, order preserved.
    """
    # The previous `token in string.punctuation` check was a substring test
    # against one long string, so multi-character runs like "..." slipped
    # through. A per-character membership test fixes that.
    punctuation_chars = set(string.punctuation)
    return [
        token
        for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

# Overwrite the token column with punctuation tokens filtered out.
dataset['Tokenize Text'] = dataset['Tokenize Text'].apply(remove_punctuations)

In [None]:
from nltk.corpus import stopwords

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Load NLTK's English stopword list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return ``tokens`` without NLTK's English stopwords, order preserved.

    The stopword list is loaded once and cached as a set, so applying this
    function to every row does not reload the corpus or scan a list for
    each token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list containing only the tokens that are not stopwords.
    """
    stopword_set = _english_stopwords()
    return [token for token in tokens if token not in stopword_set]

# Overwrite the token column with stopwords filtered out.
dataset['Tokenize Text'] = dataset['Tokenize Text'].apply(remove_stopwords)

4. Stemming
----

In [None]:
from nltk import SnowballStemmer

# A single Snowball stemmer instance is shared by every call below.
lang = "english"
stemmer = SnowballStemmer(lang)


def adding_Stemming(tokens):
    """Return the Snowball stem of every token, in the original order."""
    return list(map(stemmer.stem, tokens))

# Overwrite the token column with the stemmed tokens.
dataset['Tokenize Text'] = dataset['Tokenize Text'].apply(adding_Stemming)

In [None]:
# Inspect the token column after the cleaning and stemming steps.
dataset.head()

Unnamed: 0,Target,mailText,Tokenize Text
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazi, .., avail, bugi, n,..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joke, wif, u, oni, ...]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goe, usf, live, around, though]"


In [None]:
# NOTE: plain assignment -- d2set is a second name for the same DataFrame
# object as `dataset`, not a copy.
d2set = dataset
d2set.head()

Unnamed: 0,Target,mailText,Tokenize Text
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazi, .., avail, bugi, n,..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joke, wif, u, oni, ...]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goe, usf, live, around, though]"


In [None]:
# scikit-learn's TF-IDF class is named TfidfVectorizer; the previous spelling
# (`Tfid3setVectorizer`) does not exist and made this cell fail on import.
from sklearn.feature_extraction.text import TfidfVectorizer

# Variable names are left unchanged in case other cells refer to them.
tfid3set_vectorizer = TfidfVectorizer()

# The vectorizer expects strings, so each token list is joined back into one
# space-separated document.
tfid3set_matrix = tfid3set_vectorizer.fit_transform([" ".join(doc) for doc in d2set['Tokenize Text']])

# Densify the sparse matrix into a DataFrame with one column per vocabulary term.
tfid3set_d3set = pd.DataFrame(tfid3set_matrix.toarray(), columns=tfid3set_vectorizer.get_feature_names_out())

# Place the TF-IDF columns next to the original columns; this rebinds d2set to a new frame.
d2set = pd.concat([dataset, tfid3set_d3set], axis=1)

In [None]:
# Preview the frame with the TF-IDF columns appended.
d2set.head()

Unnamed: 0,Target,mailText,Tokenize Text,00,000,000pes,02,0207,02072069400,03,...,ó_,û_,û_thank,ûªm,ûªt,ûªv,ûï,ûïharri,ûò,ûówel
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazi, .., avail, bugi, n,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joke, wif, u, oni, ...]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, wkli, comp, win, fa, cup, final,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, earli, hor, ..., u, c, alreadi, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goe, usf, live, around, though]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Binary-encode the label: "ham" becomes 1, every other value becomes 0.
is_ham = d2set['Target'] == "ham"
d2set['Target'] = is_ham.astype('int64')

In [None]:
# Confirm that Target now holds the numeric encoding.
d2set.head()

Unnamed: 0,Target,mailText,Tokenize Text,00,000,000pes,02,0207,02072069400,03,...,ó_,û_,û_thank,ûªm,ûªt,ûªv,ûï,ûïharri,ûò,ûówel
0,1,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazi, .., avail, bugi, n,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,ok lar... joking wif u oni...,"[ok, lar, ..., joke, wif, u, oni, ...]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, wkli, comp, win, fa, cup, final,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,u dun say so early hor... u c already then say...,"[u, dun, say, earli, hor, ..., u, c, alreadi, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goe, usf, live, around, though]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Features are whatever remains once the label and the two text columns are
# removed; the label vector is the encoded Target column.
non_feature_columns = ['Target', 'mailText', 'Tokenize Text']
X = d2set.drop(columns=non_feature_columns)
Y = d2set['Target']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the TF-IDF training features.
tf_naive = GaussianNB()
tf_naive.fit(x_train,y_train)

from sklearn.metrics import accuracy_score,classification_report
pred_naive = tf_naive.predict(x_test)
print("Naive Bayes Classifier ")
print("--------------------------------------------------------------")
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, precision and recall trade places and "support" counts predictions
# instead of true labels.
print("Accuracy Score : ",accuracy_score(y_test,pred_naive))
print("Calssification Report : \n",classification_report(y_test,pred_naive))

Naive Bayes Classifier 
--------------------------------------------------------------
Accuracy Score :  0.8708520179372198
Calssification Report : 
               precision    recall  f1-score   support

           0       0.90      0.51      0.65       264
           1       0.87      0.98      0.92       851

    accuracy                           0.87      1115
   macro avg       0.88      0.75      0.79      1115
weighted avg       0.87      0.87      0.86      1115



In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (library defaults) on the TF-IDF training features.
tf_log = LogisticRegression()
tf_log.fit(x_train,y_train)

pred_log = tf_log.predict(x_test)
print("Logistic Regression ")
print("--------------------------------------------------------------")
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, precision and recall trade places and "support" counts predictions
# instead of true labels.
print("Accuracy Score : ",accuracy_score(y_test,pred_log))
print("Calssification Report : \n",classification_report(y_test,pred_log))

Logistic Regression 
--------------------------------------------------------------
Accuracy Score :  0.9497757847533632
Calssification Report : 
               precision    recall  f1-score   support

           0       0.65      0.96      0.78       102
           1       1.00      0.95      0.97      1013

    accuracy                           0.95      1115
   macro avg       0.82      0.95      0.87      1115
weighted avg       0.96      0.95      0.95      1115



In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM on the TF-IDF training features.
tf_svm = SVC(kernel='linear',C=1,random_state=42)
tf_svm.fit(x_train,y_train)

pred_svm = tf_svm.predict(x_test)
print("Support Vector Machine ")
print("--------------------------------------------------------------")
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, precision and recall trade places and "support" counts predictions
# instead of true labels.
print("Accuracy Score : ",accuracy_score(y_test,pred_svm))
print("Calssification Report : \n",classification_report(y_test,pred_svm))

Support Vector Machine 
--------------------------------------------------------------
Accuracy Score :  0.9802690582959641
Calssification Report : 
               precision    recall  f1-score   support

           0       0.87      0.98      0.92       134
           1       1.00      0.98      0.99       981

    accuracy                           0.98      1115
   macro avg       0.94      0.98      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Work on an independent copy: a plain `d3set = dataset` only creates a second
# name for the same object, so columns assigned to d3set later would silently
# appear on `dataset` as well.
d3set = dataset.copy()
d3set.head()

Unnamed: 0,Target,mailText,Tokenize Text
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazi, .., avail, bugi, n,..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joke, wif, u, oni, ...]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goe, usf, live, around, though]"


In [None]:
from gensim.models import Word2Vec

tokenized_text = d3set['Tokenize Text']
# Train CBOW (sg=0) embeddings on the cleaned token lists. A fixed seed plus a
# single worker thread makes training repeatable between runs; without them
# the vectors, and every score derived from them, change on each execution.
# NOTE(review): gensim also requires a fixed PYTHONHASHSEED for full
# reproducibility across interpreter launches.
model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, sg=0, seed=42, workers=1)

In [None]:
def document_vector(tokens, model, num_features):
    """Average the embeddings of the tokens known to ``model``.

    Args:
        tokens: Iterable of tokens making up one document.
        model: Object exposing ``wv``, which supports ``in`` and ``[]`` lookups.
        num_features: Length of the returned vector.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or an all-zero ``float32`` vector of length
        ``num_features`` when none of the tokens is known.
    """
    known_vectors = [model.wv[tok] for tok in tokens if tok in model.wv]

    # Sum in token order, starting from a float32 zero vector.
    total = np.zeros((num_features,), dtype="float32")
    for vec in known_vectors:
        total = np.add(total, vec)

    if not known_vectors:
        return total
    return np.divide(total, len(known_vectors))
num_features = 100
# One averaged vector per message, in the same order as tokenized_text.
document_vectors = [
    document_vector(message_tokens, model, num_features)
    for message_tokens in tokenized_text
]

In [None]:
# Labels come straight from d3set's Target column; features are the averaged
# document vectors, one row per message.
Y = d3set['Target']
X = pd.DataFrame(document_vectors)

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as the TF-IDF experiment.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the averaged Word2Vec document vectors.
w2v_naive = GaussianNB()
w2v_naive.fit(x_train,y_train)

from sklearn.metrics import accuracy_score,classification_report
pred_naive = w2v_naive.predict(x_test)
print("Naive Bayes Classifier ")
print("--------------------------------------------------------------")
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, precision and recall trade places and "support" counts predictions
# instead of true labels.
print("Accuracy Score : ",accuracy_score(y_test,pred_naive))
print("Calssification Report : \n",classification_report(y_test,pred_naive))

Naive Bayes Classifier 
--------------------------------------------------------------
Accuracy Score :  0.5004484304932736
Calssification Report : 
               precision    recall  f1-score   support

         ham       0.45      0.94      0.61       460
        spam       0.83      0.19      0.31       655

    accuracy                           0.50      1115
   macro avg       0.64      0.57      0.46      1115
weighted avg       0.67      0.50      0.43      1115



In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (library defaults) on the averaged Word2Vec document vectors.
w2v_log = LogisticRegression()
w2v_log.fit(x_train,y_train)

pred_log = w2v_log.predict(x_test)
print("Logistic Regression ")
print("--------------------------------------------------------------")
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, the report's "support" counted predictions, which is how a class
# could show a support of 0.
print("Accuracy Score : ",accuracy_score(y_test,pred_log))
print("Calssification Report : \n",classification_report(y_test,pred_log))

Logistic Regression 
--------------------------------------------------------------
Accuracy Score :  0.8654708520179372
Calssification Report : 
               precision    recall  f1-score   support

         ham       1.00      0.87      0.93      1115
        spam       0.00      0.00      0.00         0

    accuracy                           0.87      1115
   macro avg       0.50      0.43      0.46      1115
weighted avg       1.00      0.87      0.93      1115



In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel SVM on the averaged Word2Vec document vectors.
w2v_svm = SVC(kernel='linear',C=1,random_state=42)
w2v_svm.fit(x_train,y_train)

pred_svm = w2v_svm.predict(x_test)
print("Support Vector Machine ")
print("--------------------------------------------------------------")
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, the report's "support" counted predictions, which is how a class
# could show a support of 0.
print("Accuracy Score : ",accuracy_score(y_test,pred_svm))
print("Calssification Report : \n",classification_report(y_test,pred_svm))

Support Vector Machine 
--------------------------------------------------------------
Accuracy Score :  0.8654708520179372
Calssification Report : 
               precision    recall  f1-score   support

         ham       1.00      0.87      0.93      1115
        spam       0.00      0.00      0.00         0

    accuracy                           0.87      1115
   macro avg       0.50      0.43      0.46      1115
weighted avg       1.00      0.87      0.93      1115



In [None]:
# Preview d3set before the GloVe column is added.
d3set.head()

Unnamed: 0,Target,mailText,Tokenize Text
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazi, .., avail, bugi, n,..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joke, wif, u, oni, ...]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, earli, hor, ..., u, c, alreadi, ..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goe, usf, live, around, though]"


In [None]:
def load_glove_embeddings(file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        file_path: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Blank or whitespace-only lines carry no entry; indexing
            # values[0] on them would raise IndexError.
            if not values:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Specify the path to your GloVe file
glove_file = './glove.42B.300d.txt'
# Parse the whole file into an in-memory dict mapping word -> vector.
glove_embeddings = load_glove_embeddings(glove_file)


In [None]:
def get_word_vector(token, embeddings_index, dim=300):
    """Look up the embedding for ``token``, falling back to a zero vector.

    Args:
        token: Word to look up.
        embeddings_index: Mapping of word -> vector, as produced by
            ``load_glove_embeddings``.
        dim: Length of the zero vector returned for unknown tokens.
            Defaults to 300, the size previously hard-coded here.

    Returns:
        The stored vector when ``token`` is known; otherwise a new all-zero
        ``float32`` array of length ``dim``.
    """
    vector = embeddings_index.get(token)
    if vector is None:
        # Allocate the fallback only on a miss, and in float32 so unknown
        # tokens have the same dtype as the vectors load_glove_embeddings builds.
        return np.zeros(dim, dtype='float32')
    return vector

# For every message, replace each token with its GloVe vector (zeros for
# unknown tokens), giving one list of vectors per row in a new column.
d3set['GloVe Embeddings'] = d3set['Tokenize Text'].apply(lambda tokens: [get_word_vector(token, glove_embeddings) for token in tokens])


In [None]:
# Preview the frame with the GloVe column added.
d3set.head()

Unnamed: 0,Target,mailText,Tokenize Text,GloVe Embeddings
0,ham,"go until jurong point, crazy.. available only ...","[go, jurong, point, crazi, .., avail, bugi, n,...","[[0.094418, 0.26803, -0.18872, -0.34682, 0.173..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, ..., joke, wif, u, oni, ...]","[[0.05973, 0.11751, -0.19544, -0.2859, 0.34065..."
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entri, wkli, comp, win, fa, cup, final,...","[[-0.61984, -0.31242, 0.39918, 0.48442, 0.1743..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, earli, hor, ..., u, c, alreadi, ...","[[-0.078214, 0.95937, 0.12532, 0.52195, 0.0887..."
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, n't, think, goe, usf, live, around, though]","[[0.28848, 0.1572, 0.49064, -0.057261, -0.5658..."


In [None]:
# Column dtypes.
d3set.dtypes

Target              object
mailText            object
Tokenize Text       object
GloVe Embeddings    object
dtype: object

In [None]:
# Pull out the embeddings column as a Series; this name is reassigned in a later cell.
requriedData = d3set['GloVe Embeddings']

In [None]:
# Display the embeddings Series.
requriedData

0       [[0.094418, 0.26803, -0.18872, -0.34682, 0.173...
1       [[0.05973, 0.11751, -0.19544, -0.2859, 0.34065...
2       [[-0.61984, -0.31242, 0.39918, 0.48442, 0.1743...
3       [[-0.078214, 0.95937, 0.12532, 0.52195, 0.0887...
4       [[0.28848, 0.1572, 0.49064, -0.057261, -0.5658...
                              ...                        
5567    [[0.10397, -0.20526, -0.29512, 0.016276, 0.007...
5568    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
5569    [[-0.39909, -0.38653, 0.082255, 0.10243, -0.52...
5570    [[-0.46733, 0.5856, 0.057228, -0.2123, -0.0979...
5571    [[-0.2477, 0.14809, 0.047774, -0.50507, 0.1089...
Name: GloVe Embeddings, Length: 5572, dtype: object

In [None]:
def extract_embeddings(embedding_list):
    """Render each embedding as a ``"Token_<i> v0 v1 ..."`` string.

    Args:
        embedding_list: Sequence of vectors, one per token.

    Returns:
        A list with one string per vector: the positional label
        ``Token_<i>``, a space, then the vector's components converted with
        ``str`` and joined by single spaces.
    """
    return [
        f"Token_{position} " + ' '.join(str(component) for component in embedding)
        for position, embedding in enumerate(embedding_list)
    ]


# Apply the function to the 'GloVe Embeddings' column
impData = d3set['GloVe Embeddings'].apply(extract_embeddings)

# Wrap the resulting Series in a single-column DataFrame (rebinds requriedData).
requriedData = pd.DataFrame(impData)



In [None]:
# Display the DataFrame of string-rendered embeddings.
requriedData

Unnamed: 0,GloVe Embeddings
0,[Token_0 0.094418 0.26803 -0.18872 -0.34682 0....
1,[Token_0 0.05973 0.11751 -0.19544 -0.2859 0.34...
2,[Token_0 -0.61984 -0.31242 0.39918 0.48442 0.1...
3,[Token_0 -0.078214 0.95937 0.12532 0.52195 0.0...
4,[Token_0 0.28848 0.1572 0.49064 -0.057261 -0.5...
...,...
5567,[Token_0 0.10397 -0.20526 -0.29512 0.016276 0....
5568,[Token_0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0...
5569,[Token_0 -0.39909 -0.38653 0.082255 0.10243 -0...
5570,[Token_0 -0.46733 0.5856 0.057228 -0.2123 -0.0...
