In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [3]:
data = pd.read_csv(r'Dataset\spam.csv', encoding = 'ISO-8859-1')

In [4]:
#selecting only the relevant columns
data = data[['v1', 'v2']]

In [8]:
data.columns = ['label', 'message']

In [10]:
#Preprocessing
data['label'] = data['label'].map({'ham':0, 'spam':1})

In [12]:
vectorizer = TfidfVectorizer(stop_words = 'english')
X = vectorizer.fit_transform(data['message'])
y = data['label']

In [13]:
#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [16]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [18]:
#predictions
y_pred = model.predict(X_test)

In [19]:
#Model Rvaluation
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix\n:', confusion_matrix(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))

Accuracy: 0.968609865470852
Confusion Matrix
: [[965   0]
 [ 35 115]]
Classification Report:               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115



# Logistic Regression

In [31]:
log_reg = LogisticRegression(random_state=42)

log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.9443946188340807
Precision: 0.9680851063829787
Recall: 0.6066666666666667
F1-Score: 0.7459016393442623


# SVM

In [33]:
SVM = LogisticRegression(kernel='linear')

SVM.fit(X_train, y_train)

y_pred = SVM.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.9443946188340807
Precision: 0.9680851063829787
Recall: 0.6066666666666667
F1-Score: 0.7459016393442623
