In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
spamsms_dataset = pd.read_csv('4. SMS Spam Collection.csv', encoding='cp1252')

In [3]:
spamsms_dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
spamsms_dataset=spamsms_dataset.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [5]:
spamsms_dataset.isnull().sum()

v1    0
v2    0
dtype: int64

In [6]:
spamsms_dataset.shape

(5572, 2)

In [7]:
spamsms_dataset.replace({'v1': {'spam':0, 'ham':1}}, inplace = True)

In [8]:
spamsms_dataset.head()

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
X = spamsms_dataset['v2']
y = spamsms_dataset['v1']

In [10]:
print(X)
print(y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will ?_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: int64


In [11]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [13]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': MultinomialNB()
}

In [14]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test)

    print(f"{model_name} - Test Accuracy: {test_accuracy:.4f}")
    print(f"{model_name} - Test F1 Score: {test_f1:.4f}")
    print(f"{model_name} - Test Recall: {test_recall:.4f}")
    print(f"{model_name} - Test Precision: {test_precision:.4f}\n")

Logistic Regression - Test Accuracy: 0.9623
Logistic Regression - Test F1 Score: 0.9787
Logistic Regression - Test Recall: 1.0000
Logistic Regression - Test Precision: 0.9583

Support Vector Machine - Test Accuracy: 0.9767
Support Vector Machine - Test F1 Score: 0.9867
Support Vector Machine - Test Recall: 1.0000
Support Vector Machine - Test Precision: 0.9738

Random Forest - Test Accuracy: 0.9776
Random Forest - Test F1 Score: 0.9872
Random Forest - Test Recall: 0.9990
Random Forest - Test Precision: 0.9757

Decision Tree - Test Accuracy: 0.9659
Decision Tree - Test F1 Score: 0.9803
Decision Tree - Test Recall: 0.9813
Decision Tree - Test Precision: 0.9793

K-Nearest Neighbors - Test Accuracy: 0.9130
K-Nearest Neighbors - Test F1 Score: 0.9521
K-Nearest Neighbors - Test Recall: 1.0000
K-Nearest Neighbors - Test Precision: 0.9087

Naive Bayes - Test Accuracy: 0.9623
Naive Bayes - Test F1 Score: 0.9787
Naive Bayes - Test Recall: 1.0000
Naive Bayes - Test Precision: 0.9583



In [15]:
input_mail=("Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030")
input_data_features=tfidf_vectorizer.transform([input_mail])
input_data_features = input_data_features.reshape(1, -1)
prediction=model.predict(input_data_features)
if(prediction[0]==1):
    print('Ham mail')
else:
    print('Spam mail')

Spam mail


In [16]:
input_mail=("I only haf msn. It's yijue@hotmail.com")
input_data_features=tfidf_vectorizer.transform([input_mail])
input_data_features = input_data_features.reshape(1, -1)
prediction=model.predict(input_data_features)
if(prediction[0]==1):
    print('Ham mail')
else:
    print('Spam mail')

Ham mail
