**Importing the required libraries**

In [16]:
import pandas as pd
import numpy as np
import regex as re
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")
import random
random.seed(1024)

**Reading in the train, test and validation data**

In [17]:
path_train = './data/train.csv'
path_val = './data/validation.csv'
path_test = './data/test.csv'
path_mod_df = './data/mod_df.csv'

train_df = pd.read_csv(path_train)
val_df = pd.read_csv(path_val)
test_df = pd.read_csv(path_test)
mod_df = pd.read_csv(path_mod_df)

**Creating the bag of words transformer**

In [18]:
vectorizer = CountVectorizer()
vectorizer.fit(train_df.X_train)

bow_transformer = vectorizer.vocabulary_
print(len(bow_transformer))

7331


**Converting all the data to be used into bag of words form**

In [19]:
X_train = vectorizer.transform(train_df.X_train)
X_val = vectorizer.transform(val_df.X_val)
X_test = vectorizer.transform(test_df.X_test)

print(X_train.shape, X_val.shape, X_test.shape)

(4514, 7331) (502, 7331) (558, 7331)


**Creating the tf-idf transformer**

In [20]:
tfidf_transformer = TfidfTransformer().fit(X_train)

**Converting the data into tf-idf form**

In [21]:
tfidf_X_train = tfidf_transformer.transform(X_train)
tfidf_X_val = tfidf_transformer.transform(X_val)
tfidf_X_test = tfidf_transformer.transform(X_test)

print(tfidf_X_train.shape, tfidf_X_val.shape, tfidf_X_test.shape)

(4514, 7331) (502, 7331) (558, 7331)


**Multinomial Naive Bayes Model based on tf-idf tokenizer**

**Creating the spam detection model and computing the evaluation metrics for the predicted values based on the validation set**

In [22]:
for i in np.arange(0.05, 2.25, 0.25):
    spam_detection_model = MultinomialNB(alpha = i).fit(tfidf_X_train, train_df.y_train)
    val_predictions = spam_detection_model.predict(tfidf_X_val)
    print("Alpha:", i, "\n","Accuracy Score:",accuracy_score(val_df.y_val, val_predictions), '\n', classification_report(val_df.y_val, val_predictions))
    print("\nConfusion Matrix:\n",confusion_matrix(val_df.y_val, val_predictions),"\n\n")

Alpha: 0.05 
 Accuracy Score: 0.9920318725099602 
               precision    recall  f1-score   support

         ham       0.99      1.00      1.00       440
        spam       1.00      0.94      0.97        62

    accuracy                           0.99       502
   macro avg       1.00      0.97      0.98       502
weighted avg       0.99      0.99      0.99       502


Confusion Matrix:
 [[440   0]
 [  4  58]] 


Alpha: 0.3 
 Accuracy Score: 0.9880478087649402 
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       440
        spam       1.00      0.90      0.95        62

    accuracy                           0.99       502
   macro avg       0.99      0.95      0.97       502
weighted avg       0.99      0.99      0.99       502


Confusion Matrix:
 [[440   0]
 [  6  56]] 


Alpha: 0.55 
 Accuracy Score: 0.9860557768924303 
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99    

Best value is for alpha = 0.05

**Computing the predicted values on the test set based on the best model**

In [23]:
spam_detector_model_best = MultinomialNB(alpha = 0.05).fit(tfidf_X_train, train_df.y_train)
test_predictions = spam_detection_model.predict(tfidf_X_test)

**Computing the evaluation metrics for the predicted values based on the test set**

In [24]:
print("Accuracy Score:",accuracy_score(test_df.y_test, test_predictions), '\n',classification_report(test_df.y_test, test_predictions))
print("\nConfusion Matrix:\n",confusion_matrix(test_df.y_test, test_predictions),"\n\n")

Accuracy Score: 0.9408602150537635 
               precision    recall  f1-score   support

         ham       0.94      1.00      0.97       478
        spam       1.00      0.59      0.74        80

    accuracy                           0.94       558
   macro avg       0.97      0.79      0.85       558
weighted avg       0.94      0.94      0.93       558


Confusion Matrix:
 [[478   0]
 [ 33  47]] 




**Logistic Regression Model based on tf-idf tokenizer**

**Creating the spam detection model and computing the evaluation metrics for the predicted values based on the validation set**

In [25]:
for i in [0.1, 0.5, 1, 10, 20, 50, 100]:
    spam_detection_model_2 = LogisticRegression(C = i)
    spam_detection_model_2.fit(tfidf_X_train, train_df.y_train)
    val_predictions = spam_detection_model_2.predict(tfidf_X_val)
    print("C =",i,"\nAccuracy Score",accuracy_score(val_df.y_val, val_predictions), '\n', classification_report(val_df.y_val, val_predictions))
    print("\nConfusion Matrix:\n",confusion_matrix(val_df.y_val, val_predictions),"\n\n")

C = 0.1 
Accuracy Score 0.8764940239043825 
               precision    recall  f1-score   support

         ham       0.88      1.00      0.93       440
        spam       0.00      0.00      0.00        62

    accuracy                           0.88       502
   macro avg       0.44      0.50      0.47       502
weighted avg       0.77      0.88      0.82       502


Confusion Matrix:
 [[440   0]
 [ 62   0]] 


C = 0.5 
Accuracy Score 0.952191235059761 
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97       440
        spam       0.97      0.63      0.76        62

    accuracy                           0.95       502
   macro avg       0.96      0.81      0.87       502
weighted avg       0.95      0.95      0.95       502


Confusion Matrix:
 [[439   1]
 [ 23  39]] 


C = 1 
Accuracy Score 0.9760956175298805 
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       440
        spam 

Best value is for C = 100

**Computing the predicted values on the test set based on the best model**

In [26]:
spam_detector_model_best = LogisticRegression(C = 100)
test_predictions = spam_detection_model.predict(tfidf_X_test)

**Computing the evaluation metrics for the predicted values based on the test set**

In [27]:
print("Accuracy Score:",accuracy_score(test_df.y_test, test_predictions), '\n',classification_report(test_df.y_test, test_predictions))
print("\nConfusion Matrix:\n",confusion_matrix(test_df.y_test, test_predictions),"\n\n")

Accuracy Score: 0.9408602150537635 
               precision    recall  f1-score   support

         ham       0.94      1.00      0.97       478
        spam       1.00      0.59      0.74        80

    accuracy                           0.94       558
   macro avg       0.97      0.79      0.85       558
weighted avg       0.94      0.94      0.93       558


Confusion Matrix:
 [[478   0]
 [ 33  47]] 




**Support Vector Classifier Model**

In [28]:
for i in [0.1, 0.5, 1, 10, 20, 50, 100]:
    spam_detection_model_3 = SVC(C = i)
    spam_detection_model_3.fit(tfidf_X_train, train_df.y_train)
    val_predictions = spam_detection_model_3.predict(tfidf_X_val)
    print("C =",i,"\nAccuracy Score",accuracy_score(val_df.y_val, val_predictions), '\n', classification_report(val_df.y_val, val_predictions))
    print("\nConfusion Matrix:\n",confusion_matrix(val_df.y_val, val_predictions),"\n\n")

C = 0.1 
Accuracy Score 0.8784860557768924 
               precision    recall  f1-score   support

         ham       0.88      1.00      0.94       440
        spam       1.00      0.02      0.03        62

    accuracy                           0.88       502
   macro avg       0.94      0.51      0.48       502
weighted avg       0.89      0.88      0.82       502


Confusion Matrix:
 [[440   0]
 [ 61   1]] 


C = 0.5 
Accuracy Score 0.9820717131474104 
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       440
        spam       1.00      0.85      0.92        62

    accuracy                           0.98       502
   macro avg       0.99      0.93      0.96       502
weighted avg       0.98      0.98      0.98       502


Confusion Matrix:
 [[440   0]
 [  9  53]] 


C = 1 
Accuracy Score 0.9820717131474104 
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       440
        spam

Model performs same for nearly all values of the regularization parameter for values > 1 hence we use C = 10 in the model for evaluating the test set

In [29]:
spam_detector_model_3_best = SVC(C = 10)
spam_detection_model_3.fit(tfidf_X_train, train_df.y_train)
test_predictions = spam_detection_model_3.predict(tfidf_X_test)


**Computing the evaluation metrics for the predicted values based on the test set**

In [30]:
print("Accuracy Score:",accuracy_score(test_df.y_test, test_predictions), '\n',classification_report(test_df.y_test, test_predictions))
print("\nConfusion Matrix:\n",confusion_matrix(test_df.y_test, test_predictions),"\n\n")

Accuracy Score: 0.9874551971326165 
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       478
        spam       1.00      0.91      0.95        80

    accuracy                           0.99       558
   macro avg       0.99      0.96      0.97       558
weighted avg       0.99      0.99      0.99       558


Confusion Matrix:
 [[478   0]
 [  7  73]] 




Among the 3 benchmark models the Support Vector Classifier performs the best on the test set.