In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [5]:
data.columns

Index(['v1', 'v2'], dtype='object')

In [6]:
data.columns = ['tag', 'message']

In [7]:
data.head()

Unnamed: 0,tag,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data['tag'] = data['tag'].map({'ham':0, 'spam':1})

In [9]:
data.head()

Unnamed: 0,tag,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
data.shape

(5572, 2)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['tag'], test_size=0.2, random_state=40)

In [12]:
vectorizer = TfidfVectorizer(stop_words='english')

In [13]:
X_train_tfidf = vectorizer.fit_transform(X_train)

In [14]:
X_test_tfidf = vectorizer.transform(X_test)

In [15]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

In [16]:
y_pred_nb = nb_model.predict(X_test_tfidf)

In [17]:
print("Naive Bayes Classification Report:\n", classification_report(y_pred_nb, y_test))

Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98       998
           1       0.79      1.00      0.88       117

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.93      1115
weighted avg       0.98      0.97      0.97      1115



In [18]:
print("Naive Bayes Confusion Matrix:\n", confusion_matrix(y_pred_nb, y_test))

Naive Bayes Confusion Matrix:
 [[967  31]
 [  0 117]]


In [19]:
lr_model = LogisticRegression(max_iter=1000)

In [20]:
lr_model.fit(X_train_tfidf, y_train)

In [21]:
y_pred_lr = lr_model.predict(X_test_tfidf)

In [22]:
print("Logistic Regression Classification Report:\n", classification_report(y_pred_lr, y_test))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98      1004
           1       0.74      0.99      0.85       111

    accuracy                           0.97      1115
   macro avg       0.87      0.98      0.91      1115
weighted avg       0.97      0.97      0.97      1115



In [23]:
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_pred_lr, y_test))

Logistic Regression Confusion Matrix:
 [[966  38]
 [  1 110]]


In [24]:
svm_model = SVC(kernel='linear')

In [25]:
svm_model.fit(X_train_tfidf, y_train)

In [26]:
y_pred_svm = svm_model.predict(X_test_tfidf)

In [27]:
print("SVM Classification Report:\n", classification_report(y_pred_svm, y_test))

SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       978
           1       0.91      0.99      0.95       137

    accuracy                           0.99      1115
   macro avg       0.96      0.99      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [28]:
print("SVM Confusion Matrix:\n", confusion_matrix(y_pred_svm, y_test))

SVM Confusion Matrix:
 [[965  13]
 [  2 135]]


In [29]:
accuracy_nb = accuracy_score(y_test, y_pred_nb)

In [30]:
accuracy_lr = accuracy_score(y_test, y_pred_lr)

In [31]:
accuracy_svm = accuracy_score(y_test, y_pred_svm)

In [32]:
print(f"Naive Bayes Accuracy: {accuracy_nb}")

Naive Bayes Accuracy: 0.9721973094170404


In [33]:
print(f"Logistic Regression Accuracy: {accuracy_lr}")

Logistic Regression Accuracy: 0.9650224215246637


In [34]:
print(f"SVM Accuracy: {accuracy_svm}")

SVM Accuracy: 0.9865470852017937
