In [3]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Location of the SMS spam CSV. Set the SPAM_CSV_PATH environment variable to
# run the notebook on another machine; the fallback is the path this notebook
# was originally written against.
DATA_PATH = os.environ.get(
    "SPAM_CSV_PATH",
    r"C:\Users\Kunjal Thorat\Desktop\SMS Detection\spam.csv",
)

# The file is decoded as latin-1.
data_raw = pd.read_csv(DATA_PATH, encoding='latin-1')

# Keep only the label column ('v1') and the text column ('v2') under readable
# names. rename() returns a new DataFrame, so `data` is an independent frame
# rather than a slice of `data_raw`; later column assignments on it therefore
# do not raise SettingWithCopyWarning.
data = data_raw[['v1', 'v2']].rename(columns={'v1': 'Class', 'v2': 'Message'})

In [4]:
# Preview the first five rows of the loaded frame; the bare last expression
# lets the notebook render it as a table.
print("Training Data - First 5 rows:")
data.head(n=5)

Training Data - First 5 rows:


Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Data Preprocessing

# Share of rows held out for evaluation, and the seed that makes the split
# reproducible.
TEST_SIZE = 0.25
RANDOM_STATE = 42

# Lower-case the message text. assign() builds a new frame and rebinds `data`
# instead of writing into the existing one, so re-running this cell is safe and
# no SettingWithCopyWarning is raised on a column-subset frame.
data = data.assign(Message=data['Message'].str.lower())

X = data['Message']
y = data['Class']

# NOTE(review): the classes are imbalanced (the recorded test split holds 1202
# ham vs 191 spam). Consider passing stratify=y; it is left out here so the
# split, and therefore the recorded metrics, stay exactly as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [7]:
# Naive Bayes

# Learn the TF-IDF vocabulary and weights from the training messages only,
# then apply that same fitted vectorizer to the test messages.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

# Evaluate the predictions against the held-out labels.
report_nb = classification_report(y_test, y_pred_nb)
confusion_matrix_nb = confusion_matrix(y_test, y_pred_nb)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

# One print call with a newline separator emits the same lines as six
# consecutive print calls.
print(
    "Naive Bayes Model:",
    f'Accuracy: {accuracy_nb * 100:.2f}%',
    'Confusion Matrix:',
    confusion_matrix_nb,
    'Classification Report:',
    report_nb,
    sep="\n",
)

Naive Bayes Model:
Accuracy: 95.98%
Confusion Matrix:
[[1202    0]
 [  56  135]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1202
        spam       1.00      0.71      0.83       191

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



In [11]:
# Number of test messages the Naive Bayes model labelled as spam. The count is
# taken directly from the model's test-set predictions (`y_pred_nb`), so this
# cell does not depend on a variable that no cell in the notebook defines.
spam_count_nb = list(y_pred_nb).count('spam')
print(f"Naive Bayes: {spam_count_nb} spam messages")

Naive Bayes: 135 spam messages


In [12]:
# Logistic Regression

# Train on the TF-IDF features with a raised iteration cap (max_iter=1000).
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

# Evaluate the predictions against the held-out labels.
report_lr = classification_report(y_test, y_pred_lr)
confusion_matrix_lr = confusion_matrix(y_test, y_pred_lr)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# One print call with a newline separator emits the same lines as six
# consecutive print calls.
print(
    "\nLogistic Regression Model:",
    f'Accuracy: {accuracy_lr * 100:.2f}%',
    'Confusion Matrix:',
    confusion_matrix_lr,
    'Classification Report:',
    report_lr,
    sep="\n",
)



Logistic Regression Model:
Accuracy: 96.34%
Confusion Matrix:
[[1201    1]
 [  50  141]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1202
        spam       0.99      0.74      0.85       191

    accuracy                           0.96      1393
   macro avg       0.98      0.87      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [16]:
# Number of test messages the Logistic Regression model labelled as spam. The
# count is taken directly from the model's test-set predictions (`y_pred_lr`),
# so this cell does not depend on a variable that no cell in the notebook
# defines.
spam_count_lr = list(y_pred_lr).count('spam')
print(f"Logistic Regression: {spam_count_lr} spam messages")

Logistic Regression: 142 spam messages


In [17]:
# Support Vector Machines (SVM)

# Train an SVC with its default settings on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
svm_model = SVC().fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluate the predictions against the held-out labels.
report_svm = classification_report(y_test, y_pred_svm)
confusion_matrix_svm = confusion_matrix(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# One print call with a newline separator emits the same lines as six
# consecutive print calls.
print(
    "\nSupport Vector Machines (SVM) Model:",
    f'Accuracy: {accuracy_svm * 100:.2f}%',
    'Confusion Matrix:',
    confusion_matrix_svm,
    'Classification Report:',
    report_svm,
    sep="\n",
)


Support Vector Machines (SVM) Model:
Accuracy: 97.92%
Confusion Matrix:
[[1202    0]
 [  29  162]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1202
        spam       1.00      0.85      0.92       191

    accuracy                           0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [19]:
# Number of test messages the SVM labelled as spam. The count is taken
# directly from the model's test-set predictions (`y_pred_svm`), so this cell
# does not depend on a variable that no cell in the notebook defines.
spam_count_svm = list(y_pred_svm).count('spam')

print(f"SVM: {spam_count_svm} spam messages")

SVM: 162 spam messages
