Import important libraries

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from collections import Counter

Now we need to import the dataset (The link of the dataset: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/data)

In [8]:
original_dataset = pd.read_csv("spam.csv", encoding='latin1')
original_dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


There are some unnamed columns with no values so we will remove them

In [9]:
# dropping the redundent columns
original_dataset = original_dataset.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

original_dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Check if the data have any duplicates

In [10]:
original_dataset.duplicated().any()

True

Remove the duplicates

In [13]:
original_dataset.drop_duplicates(inplace=True)

Check if the dataset is imbalanced

In [14]:
# get the number of unique values in the label column ("v1")
unique_labels = original_dataset['v1'].value_counts()

# get the percentages of them
percentages = unique_labels / len(original_dataset) * 100

# print the result out
print(percentages)

v1
ham     87.366996
spam    12.633004
Name: count, dtype: float64


The data is imbalanced

The Next few cells we will clean the text of each message in the data and we will make a new column ('text') of the cleaned texts.

1- Converting all of the characters to lower case

In [15]:
original_dataset['text'] = original_dataset['v2'].str.lower()

2- Removing Punctuation

In [16]:
original_dataset['text'] = original_dataset['text'].str.replace('[^\w\s]', '')

3- Removing Stopwords

In [17]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop = stopwords.words('english')
original_dataset['text'] = original_dataset['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


5- Tokenization (split the text into individual words'Tokens')

In [18]:
original_dataset['text'] = original_dataset['text'].apply(lambda x: x.split())

6. Stemming/Lemmatization (convert the words to thier base form)

In [19]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
original_dataset['text'] = original_dataset['text'].apply(lambda x: [stemmer.stem(word) for word in x])

7- Removing the original column of the texts

In [20]:
original_dataset.drop('v2',axis=1,inplace=True)

In [21]:
original_dataset.head()

Unnamed: 0,v1,text
0,ham,"[go, jurong, point,, crazy.., avail, bugi, n, ..."
1,ham,"[ok, lar..., joke, wif, u, oni...]"
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa..."
4,ham,"[nah, think, goe, usf,, live, around, though]"


Feature Engineering 

We will add a new feature ('no_words') which represents the number of the words in each message

In [22]:
original_dataset['no_words'] = original_dataset['text'].apply(len)

rename the column of each class from 'v1' to 'class'

In [23]:
original_dataset.rename(columns={'v1':'class'}, inplace=True)

In [24]:
original_dataset.head()

Unnamed: 0,class,text,no_words
0,ham,"[go, jurong, point,, crazy.., avail, bugi, n, ...",16
1,ham,"[ok, lar..., joke, wif, u, oni...]",6
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",23
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",9
4,ham,"[nah, think, goe, usf,, live, around, though]",7


Feature Extraction

In the feature extraction we will transform the textual data into numerical features using TF-IDF technique which works only with strings so we will convert back the column text from list of words ('Tokens') into string then apply the technique

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join the tokens back into a string 
original_dataset['text'] = original_dataset['text'].apply(lambda x: ' '.join(x))

# Initialize the TF-IDF 
vectorizer = TfidfVectorizer()

# Transform the text data into TF-IDF vectors
features_text = vectorizer.fit_transform(original_dataset['text'])

combine the features_text with the feature no_words to make the features dataset which will be splitted

In [26]:
import numpy as np
from scipy.sparse import hstack

no_words = original_dataset['no_words'].values.reshape(-1, 1)
features = hstack([features_text, no_words])

sperate the label column and encode it 

In [27]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels as 0 (ham) and 1 (spam)
encoder = LabelEncoder()
original_dataset['class'] = encoder.fit_transform(original_dataset['class'])
label = original_dataset['class']

Split the data into train and test dataframes

In [28]:
from sklearn.model_selection import train_test_split

features_train,features_test,label_train,label_test = train_test_split(features, label, test_size=0.2, random_state=42)

Import the model which is Logisitic Regression and train it

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

model = LogisticRegression()

model.fit(features_train, label_train)

Make predictions on the test set

In [30]:
predictions = model.predict(features_test)

Evaluate the model 

In [31]:
print(classification_report(label_test, predictions))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       889
           1       0.97      0.77      0.86       145

    accuracy                           0.96      1034
   macro avg       0.97      0.88      0.92      1034
weighted avg       0.96      0.96      0.96      1034

