> ## **Machine Learning Task:** SMS Spam Detection ##

We will start by importing all the necessary libraries and the dataset.

In [450]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [451]:
# Load the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')

Now, lets check for any null values and any duplicates. To clean our data. 

In [452]:
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [453]:
data.dropna(inplace=True, axis=1)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [454]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [455]:
data.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [456]:
data.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [457]:
data.duplicated().sum()

403

In [458]:
data.drop_duplicates(inplace=True)

In [459]:
data.duplicated().sum()

0

Now lets move onto the data preprocessing. So that we can make our data suitable for our model. We will start by cleaning our text and removing any special characters from our text.

In [460]:
# Step 1: Text Cleaning
def clean_text(text):
    # Remove special characters and numbers
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

In [461]:
data['v2'] = data['v2'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print(data.head())

     v1                                                 v2
0   ham  Go until jurong point crazy Available only in ...
1   ham                            Ok lar Joking wif u oni
2  spam  Free entry in  a wkly comp to win FA Cup final...
3   ham        U dun say so early hor U c already then say
4   ham  Nah I dont think he goes to usf he lives aroun...


Now that we are done with that. It better to convert our text into lowercase so that there are no discrepancies or uneven cases in our text. This can cause problems in our model.

In [462]:
# Step 1: Text Clearning (Continued)
def lowercase_text(text):
    # Convert text to lowercase
    lowercase_text = text.lower()
    return lowercase_text

In [463]:
data['v2'] = data['v2'].apply(lowercase_text)

# Display the first few rows of the dataset with lowercase text
print(data.head())

     v1                                                 v2
0   ham  go until jurong point crazy available only in ...
1   ham                            ok lar joking wif u oni
2  spam  free entry in  a wkly comp to win fa cup final...
3   ham        u dun say so early hor u c already then say
4   ham  nah i dont think he goes to usf he lives aroun...


Now, just in case. I am going to download some necessary packages and update them if required. 

In [464]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

This part of the process is to divide the text into small tokens. What this means is that each word of our dataset is a seperate element. That will be reviewed by our model. 

In [465]:
# Step 1: Text Cleaning (Continued)
def tokenize_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    return tokens

In [466]:
data['v2'] = data['v2'].apply(tokenize_text)

# Display the first few rows of the dataset with tokenized text
print(data.head())

     v1                                                 v2
0   ham  [go, until, jurong, point, crazy, available, o...
1   ham                     [ok, lar, joking, wif, u, oni]
2  spam  [free, entry, in, a, wkly, comp, to, win, fa, ...
3   ham  [u, dun, say, so, early, hor, u, c, already, t...
4   ham  [nah, i, dont, think, he, goes, to, usf, he, l...


After making our tokens. We are going to remove any stopwords. Stopwords are words that are not important for our model. They are words like 'the', 'a', 'an', 'is', 'are', etc. These words are not important for our model. So, we are going to remove them.

In [467]:
# Step 1: Text Cleaning (Continued)
def remove_stopwords(tokens):
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

In [468]:
data['v2'] = data['v2'].apply(remove_stopwords)

# Display the first few rows of the dataset with filtered text
print(data.head())

     v1                                                 v2
0   ham  [go, jurong, point, crazy, available, bugis, n...
1   ham                     [ok, lar, joking, wif, u, oni]
2  spam  [free, entry, wkly, comp, win, fa, cup, final,...
3   ham      [u, dun, say, early, hor, u, c, already, say]
4   ham  [nah, dont, think, goes, usf, lives, around, t...


This is the final step of our data preprocessing. We are going to stem our words. Stemming is the process of reducing a word to its root form. For example, the word 'running' will be reduced to 'run'. This is done so that our model can understand the words better. We are going to use the PorterStemmer for this task. Moreover, we have also provided an option to use lemmatization, if that might be better. Although a very small difference is observed between the two. However, stemming is faster than lemmatization. So, we are going to use stemming and also has a slight edge in accuracy.

In [469]:
# Step 1: Text Cleaning (Continued)
def apply_stemming(tokens):
    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

def apply_lemmatization(tokens):
    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens


In [470]:
# Choose either stemming or lemmatization
data['v2'] = data['v2'].apply(apply_stemming)
# or
#data['v2'] = data['v2'].apply(apply_lemmatization)

In [471]:
# Display the first few rows of the dataset with processed text
print(data.head())

     v1                                                 v2
0   ham  [go, jurong, point, crazi, avail, bugi, n, gre...
1   ham                       [ok, lar, joke, wif, u, oni]
2  spam  [free, entri, wkli, comp, win, fa, cup, final,...
3   ham      [u, dun, say, earli, hor, u, c, alreadi, say]
4   ham  [nah, dont, think, goe, usf, live, around, tho...


Now we are going to perform vectorization our data. This is done so that our model can understand our data better. We are going to use the TF-IDF vectorizer for this task. Additionally, this is also done so that all the words are given equal importance and the data can be tested as a whole. Hence, removing any bias and making it easier for our model to understand the data. Our features are reduced so that the models works faster and more efficiently.

In [472]:
# Step 2: Feature Extraction (TF-IDF)
corpus = [' '.join(tokens) for tokens in data['v2']]
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(corpus)

In [473]:
# Display the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X.shape)

TF-IDF Matrix Shape: (5169, 7050)


Just for the layman, TF-IDF is a measure of originality of a word by comparing the number of times a word appears in a document with the number of documents the word appears in. Hence, giving us a measure of how important a word is to a document in a collection or corpus.

Also, I have seperately made the label variable so it would be easier for anyone reading to understand the code. The column v1 are the labels and the column v2 are the text. 

In [474]:
labels = data['v1']

This part of the process is fairly common but extremely necessary part of our machine learning process. We are going to split our data into training and testing data. This is done so that we can train our model on the training data and then test it on the testing data. This is done so that we can check the accuracy of our model. We are going to use the train_test_split function from the sklearn library. We are going to use 20% of our data for testing and 80% for training.

In [475]:
# Assuming you have 'X' as your TF-IDF matrix and 'labels' as corresponding labels
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

> ## **Machine Learning Model:** Naive Bayes ##

We will start by finding the parameters that are best for our model. I have used the GridSearchCV function from the sklearn library. This function will help us find the best parameters for our model. We are going to use the Multinomial Naive Bayes model for this task. Alpha is from range 0.1 to 10.0, with seperations of 0.1 for each value of alpha. Just wide range is taken so that the best results can be achieved.

In [476]:
# Hyperparameter tuning for Naive Bayes
param_grid_nb = {'alpha': [0.1 * i for i in range(1, 101)]}
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
grid_search_nb.fit(X_train, y_train)
best_nb_classifier = grid_search_nb.best_estimator_

# Deploy the best Naive Bayes model
new_sms_nb = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus_nb = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms_nb]
new_sms_tfidf_nb = tfidf_vectorizer.transform(new_sms_corpus_nb)
predicted_labels_nb = best_nb_classifier.predict(new_sms_tfidf_nb)

for sms, label in zip(new_sms_nb, predicted_labels_nb):
    print(f"SMS: {sms} - Predicted Label: {label}")
    
print(f"Best Score: {grid_search_nb.best_score_}")
print(f"Best Estimator: {grid_search_nb.best_estimator_}")
print(f"Best Value of Alpha: {grid_search_nb.best_params_}")

SMS: Get a free gift now! - Predicted Label: spam
SMS: Hi, how are you? - Predicted Label: ham
Best Score: 0.9719486049818394
Best Estimator: MultinomialNB(alpha=0.1)
Best Value of Alpha: {'alpha': 0.1}


We are now going to use the Naive Bayes model for our task. We are going to use the Multinomial Naive Bayes model. This is because our data is discrete and not continuous. We are going to use the MultinomialNB function from the sklearn library. A very important thing to note is that we are going to use the fit function on our training data. This is done so that our model can learn from our training data. We will now create a model variable.

In [477]:
# Step 3: Model Training (Naive Bayes)
nb_classifier = best_nb_classifier
nb_classifier.fit(X_train, y_train)

Now that we have trained our model. We would use that model to predict the labels of our testing data. We are going to use the predict function on our testing data. We will now create a predictions variable. Additionally, we would also check the accuracy of our model.

In [478]:
# Step 4: Model Evaluation
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9758220502901354


In [479]:
# Display classification report and confusion matrix
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       889
        spam       0.92      0.91      0.91       145

    accuracy                           0.98      1034
   macro avg       0.95      0.95      0.95      1034
weighted avg       0.98      0.98      0.98      1034


Confusion Matrix:
 [[877  12]
 [ 13 132]]


In [480]:
# Hyperparameter tuning for Logistic Regression
param_grid_logreg = {'C': [0.1 * i for i in range(1, 101)]}
grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_logreg, cv=3)
grid_search_logreg.fit(X_train, y_train)
best_logreg_classifier = grid_search_logreg.best_estimator_

# Deploy the best Logistic Regression model
new_sms_logreg = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus_logreg = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms_logreg]
new_sms_tfidf_logreg = tfidf_vectorizer.transform(new_sms_corpus_logreg)
predicted_labels_logreg = best_logreg_classifier.predict(new_sms_tfidf_logreg)

for sms, label in zip(new_sms_logreg, predicted_labels_logreg):
    print(f"SMS: {sms} - Predicted Label: {label}")

print(f"Best Score: {grid_search_logreg.best_score_}")
print(f"Best Estimator: {grid_search_logreg.best_estimator_}")
print(f"Best Value of Alpha: {grid_search_logreg.best_params_}")

SMS: Get a free gift now! - Predicted Label: ham
SMS: Hi, how are you? - Predicted Label: ham
Best Score: 0.9680772090024078
Best Estimator: LogisticRegression(C=10.0, max_iter=1000)
Best Value of Alpha: {'C': 10.0}


In [481]:
# Step 3: Model Training (Logistic Regression)
logreg_classifier = best_logreg_classifier
logreg_classifier.fit(X_train, y_train)

In [482]:
# Step 4: Model Evaluation
y_pred_logreg = logreg_classifier.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.9729206963249516


In [483]:
# Display classification report and confusion matrix for Logistic Regression
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))
print("\nLogistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))


Logistic Regression Classification Report:
               precision    recall  f1-score   support

         ham       0.97      0.99      0.98       889
        spam       0.96      0.84      0.90       145

    accuracy                           0.97      1034
   macro avg       0.97      0.92      0.94      1034
weighted avg       0.97      0.97      0.97      1034


Logistic Regression Confusion Matrix:
 [[884   5]
 [ 23 122]]


In [484]:
# Hyperparameter tuning for SVM
param_grid_svm = {
    'C': [0.1 * i for i in range(1, 101)],
    'kernel': ['linear', 'rbf', 'sigmoid']
}
grid_search = GridSearchCV(SVC(), param_grid, cv=3)
grid_search.fit(X_train, y_train)
best_svm_classifier = grid_search.best_estimator_

# Deploy the best SVM model
new_sms = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms]
new_sms_tfidf = tfidf_vectorizer.transform(new_sms_corpus)
predicted_labels = best_svm_classifier.predict(new_sms_tfidf)

for sms, label in zip(new_sms, predicted_labels):
    print(f"SMS: {sms} - Predicted Label: {label}")

print(f"Best Score: {grid_search.best_score_}")
print(f"Best Estimator: {grid_search.best_estimator_}")
print(f"Best Value of Alpha: {grid_search.best_params_}")

SMS: Get a free gift now! - Predicted Label: ham
SMS: Hi, how are you? - Predicted Label: ham
Best Score: 0.9743656401064694
Best Estimator: SVC(C=10, kernel='linear')
Best Value of Alpha: {'C': 10, 'kernel': 'linear'}


In [485]:
# Step 3: Model Training (Support Vector Machines - SVM)
svm_classifier = best_svm_classifier
svm_classifier.fit(X_train, y_train)

In [486]:
# Step 4: Model Evaluation
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

SVM Accuracy: 0.9758220502901354


In [487]:
# Display classification report and confusion matrix for SVM
print("\nSVM Classification Report:\n", classification_report(y_test, y_pred_svm))
print("\nSVM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


SVM Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.99      0.99       889
        spam       0.95      0.87      0.91       145

    accuracy                           0.98      1034
   macro avg       0.97      0.93      0.95      1034
weighted avg       0.98      0.98      0.98      1034


SVM Confusion Matrix:
 [[883   6]
 [ 19 126]]
