#**Problem Definition**
**Goal: Classify text messages (emails, SMS, etc.) as spam or ham (not spam).**




In [1]:
# steps1:   writing  code to add google drive and import basic librery

import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#**2. Data Collection**


In [2]:
# steps2:importing the data from the google drive path
df2 = pd.read_csv("/content/drive/MyDrive/projects /machine learning implementation/email smap classifier/spam.csv", encoding='ISO-8859-1')


#**3 Data viewing**

In [3]:
#checking the entire dataframe of top five row
df2.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#**4 Data cleaning and processing**

**Dropping column**

In [4]:

# Dropping the specified columns from the dataframe because of not uses of any thing
df = df2.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [5]:


# Display the new dataframe head
print(df.sample(5))


        v1                                                 v2
1292   ham  Da my birthdate in certificate is in april but...
5170   ham  I'm in school now n i'll be in da lab doing so...
3490   ham                                                Ok.
4237   ham       Lol wtf random. Btw is that your lunch break
65    spam  As a valued customer, I am pleased to advise y...


In [6]:
#displaying shape of dataframe
df.shape

(5572, 2)

**information gathering about the dataframe**

In [7]:
#displaying information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


**Description checking**

In [8]:
#checking the description of the dataset
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


**levelling the spam with 1 and ham with 0**

In [9]:
#  Convert labels to binary (e.g., spam → 1, ham → 0).

# Mapping labels 'spam' to 1 and 'ham' to 0
df['v1'] = df['v1'].map({'spam': 1, 'ham': 0})

# Display the dataframe with binary labels
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


**counting the totla value of spam and ham**

In [10]:

# Counting the number of 0s and 1s in the 'v1' column ie spam and ham
label_counts = df['v1'].value_counts()

print("Count of 0s (ham):", label_counts.get(0, 0))
print("Count of 1s (spam):", label_counts.get(1, 0))

Count of 0s (ham): 4825
Count of 1s (spam): 747


**lowering the case v2 column**

In [11]:
# lowering case the v2 column

# Converting text to lowercase for consistency
df['v2'] = df['v2'].str.lower()

# Display the dataframe with the text in lowercase
df.head()

Unnamed: 0,v1,v2
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


**removing the punction , numbers and special characters**

In [12]:
# Using dataframe df: in v2  Remove punctuation, numbers, and special characters

# Import the regular expression library
import re

# Define a function to clean the text by removing punctuation, numbers, and special characters
def clean_text(text):
    # Remove characters that are not letters or spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Apply the cleaning function to the 'v2' column
df['v2'] = df['v2'].apply(clean_text)

In [13]:
df.head(2)

Unnamed: 0,v1,v2
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni


**removing the stopwords**

In [14]:
#  Using dataframe df: in v2 Remove stopwords (like "the", "is", "in")

# Importing necessary library
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Geting the English stopwords
stop_words = set(stopwords.words('english'))

# Defing a function to remove stopwords from a string
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
# Applying the function to the 'v2' column to remove stopwords
df['v2'] = df['v2'].apply(lambda x: remove_stopwords(x))


In [16]:
df.head(2)

Unnamed: 0,v1,v2
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni


**tokenizing the sentence in v2 column**

In [17]:
#Using dataframe df: tokenize the v2 using nltk

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab') # Download the punkt_tab resource


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
# Using dataframe df: tokenize the v2 using nltk

# Apply word_tokenize to the 'v2' column
df['tokens'] = df['v2'].apply(lambda x: word_tokenize(str(x)))

df.head()

Unnamed: 0,v1,v2,tokens
0,0,go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, n..."
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,free entry wkly comp win fa cup final tkts st ...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,u dun say early hor u c already say,"[u, dun, say, early, hor, u, c, already, say]"
4,0,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t..."


**leminization of tokens column**

In [19]:
# Using dataframe df: lemenization of column tokens and make new column leminazation
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to lemmatize a list of tokens
def lemmatize_tokens(tokens):
    if isinstance(tokens, list):
        return [lemmatizer.lemmatize(token) for token in tokens]
    return []

# Apply the lemmatization function to the 'tokens' column to create the 'leminazation' column
df['leminization'] = df['tokens'].apply(lemmatize_tokens)

In [21]:
df.sample()

Unnamed: 0,v1,v2,tokens,leminization
1457,1,claire havin borin time alone u wanna cum nite...,"[claire, havin, borin, time, alone, u, wan, na...","[claire, havin, borin, time, alone, u, wan, na..."


#**5 feature_extraction (vectorization_word2vec)**

In [22]:
pip install numpy==1.24.4




In [23]:
pip install gensim==4.3.3



In [24]:
import gensim
from gensim.models import Word2Vec


In [25]:
# Example: List of tokenized sentences
sentences = df['leminization'].tolist()

# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)


In [26]:
def get_avg_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)  # return zero vector if no known word
    return np.mean(vectors, axis=0)

# Add new 'vector' column with Word2Vec vector
df['vector'] = df['leminization'].apply(lambda x: get_avg_vector(x, model))


In [27]:
print(df[['leminization', 'vector']].head())
df['vector'].iloc[0].shape
df.head()


                                        leminization  \
0  [go, jurong, point, crazy, available, bugis, n...   
1                     [ok, lar, joking, wif, u, oni]   
2  [free, entry, wkly, comp, win, fa, cup, final,...   
3      [u, dun, say, early, hor, u, c, already, say]   
4  [nah, dont, think, go, usf, life, around, though]   

                                              vector  
0  [0.01298303, 0.18281235, 0.008474714, -0.03085...  
1  [0.0155949285, 0.20437358, 0.0016384419, -0.00...  
2  [-0.058027893, 0.10076328, 0.09810457, 0.03028...  
3  [0.03193054, 0.25300437, -0.020393435, -0.0153...  
4  [0.07610119, 0.25514925, -0.0284994, -0.057699...  


Unnamed: 0,v1,v2,tokens,leminization,vector
0,0,go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...","[0.01298303, 0.18281235, 0.008474714, -0.03085..."
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[0.0155949285, 0.20437358, 0.0016384419, -0.00..."
2,1,free entry wkly comp win fa cup final tkts st ...,"[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,...","[-0.058027893, 0.10076328, 0.09810457, 0.03028..."
3,0,u dun say early hor u c already say,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]","[0.03193054, 0.25300437, -0.020393435, -0.0153..."
4,0,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]","[0.07610119, 0.25514925, -0.0284994, -0.057699..."


In [28]:
df.columns

Index(['v1', 'v2', 'tokens', 'leminization', 'vector'], dtype='object')

#**splitting the data**

In [29]:
from sklearn.model_selection import train_test_split

X = df['vector']  # This is word2vec
y = df['v1']  # This should be 'spam' or 'ham'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#**model1--Gaussian Naive-Bias**

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB  # Import GaussianNB
from sklearn.metrics import accuracy_score, classification_report


X = df['vector']
y = df['v1']

# Convert the pandas Series of arrays into a 2D NumPy array
# np.vstack stacks the individual arrays vertically
X_array = np.vstack(X)

X_train, X_test, y_train, y_test = train_test_split(X_array, y, test_size=0.2, random_state=42)

# Use GaussianNB instead of MultinomialNB for continuous data
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred_nb = nb_model.predict(X_test)

print("🔹 Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

🔹 Gaussian Naive Bayes Accuracy: 0.9264573991031391
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       965
           1       0.70      0.81      0.75       150

    accuracy                           0.93      1115
   macro avg       0.83      0.88      0.85      1115
weighted avg       0.93      0.93      0.93      1115



#**model2--logistic Regression**

In [31]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

print("🔹 Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


🔹 Logistic Regression Accuracy: 0.9300448430493273
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       965
           1       0.84      0.59      0.70       150

    accuracy                           0.93      1115
   macro avg       0.89      0.79      0.83      1115
weighted avg       0.93      0.93      0.92      1115



#**model-3-Support Vector Machine (SVM)**

In [32]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

print("🔹 SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


🔹 SVM Accuracy: 0.947085201793722
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       965
           1       0.87      0.71      0.78       150

    accuracy                           0.95      1115
   macro avg       0.91      0.85      0.88      1115
weighted avg       0.94      0.95      0.94      1115



#**model-4- Random Forest**

In [33]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

print("🔹 Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


🔹 Random Forest Accuracy: 0.9650224215246637
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.93      0.80      0.86       150

    accuracy                           0.97      1115
   macro avg       0.95      0.90      0.92      1115
weighted avg       0.96      0.97      0.96      1115



#**model-5-xgboost classifier**

In [34]:
pip install xgboost




In [35]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

print("🔹 XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



🔹 XGBoost Accuracy: 0.9659192825112107
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.91      0.83      0.87       150

    accuracy                           0.97      1115
   macro avg       0.94      0.91      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [37]:
#  writing code to pickle this xgb model because this is performing best out of five

import pickle

# Pickling the XGBoost model
with open('xgb_model.pkl', 'wb') as f:
  pickle.dump(xgb_model, f)

print("XGBoost model pickled successfully as xgb_model.pkl")

XGBoost model pickled successfully as xgb_model.pkl


In [38]:
# writing  code for the saving model word2vec

# Saving the Word2Vec model
model.save("word2vec_model.bin")

print("Word2Vec model saved successfully as word2vec_model.bin")

Word2Vec model saved successfully as word2vec_model.bin


#**choosing the best model out of five is XG-Boost**
our datadset was a imbalanced dataset and thats why we will  focus more on precision , recall  and f1score not just for the accuracy only
so out of all five model x-G Boost is giving the best result.so we will select the xg-boost model