In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,  precision_score, recall_score, f1_score
import joblib

In [2]:
# Load the spam/ham CSV, keeping only the first two columns and naming them
# `label` (class string) and `text` (message body).
file = '/content/spam.csv'
df = pd.read_csv(file, encoding='latin1', usecols=[0, 1])
df.columns = ['label', 'text']

# Encode the string label as an integer target: ham -> 0, spam -> 1.
label_to_int = {'ham': 0, 'spam': 1}
df['bool_label'] = df['label'].map(label_to_int)

# Series.map yields NaN for any value missing from the mapping (e.g. a label
# with different casing or stray whitespace). Fail fast here instead of
# letting NaN targets surface later as a confusing error during training.
unmapped_labels = df.loc[df['bool_label'].isna(), 'label'].unique().tolist()
assert not unmapped_labels, f"Unexpected label values: {unmapped_labels}"

In [3]:
# Sanity-check the loaded frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       5572 non-null   object
 1   text        5572 non-null   object
 2   bool_label  5572 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


In [4]:
# Features are the raw message strings; the target is the `bool_label` column.
X, Y = df['text'], df['bool_label']

# Hold out 20% of the rows for the final evaluation. The fixed seed keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Turn the raw text into TF-IDF features, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# The vectorizer is fitted on the training split only ...
X_train_tfidf = tfidf.fit_transform(X_train)
# ... and the test split is transformed with that already-fitted vectorizer,
# so nothing from the held-out messages influences the fitted vocabulary.
X_test_tfidf = tfidf.transform(X_test)

In [8]:
# Logistic regression, tuned with a 5-fold grid search over regularisation
# strength (C) and penalty type.
# random_state is set because the liblinear solver has a stochastic component;
# without a seed the grid-search scores are not guaranteed to be reproducible.
model = LogisticRegression(max_iter=1000, random_state=42)
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # liblinear supports both the l1 and l2 penalties
}

# Candidates are ranked by F1 on the positive class (label 1).
# NOTE(review): the TF-IDF features were fitted on the whole training split
# before cross-validation, so each validation fold has influenced the
# vocabulary/IDF weights; the CV scores are therefore slightly optimistic.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_tfidf, Y_train)

print("Best Parameters: ", grid_search.best_params_)
# best_estimator_ is the winning configuration refitted on the full training set.
best_model = grid_search.best_estimator_

# Evaluate the selected model on the held-out test split.
Y_pred = best_model.predict(X_test_tfidf)

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, pos_label=1)
recall = recall_score(Y_test, Y_pred, pos_label=1)
f1 = f1_score(Y_test, Y_pred, pos_label=1)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Rows are true classes, columns are predicted classes (order: 0, 1).
print('Confusion Matrix:')
print(confusion_matrix(Y_test, Y_pred))



Best Parameters:  {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.9757847533632287
Precision: 0.9624060150375939
Recall: 0.8533333333333334
F1 Score: 0.9045936395759717
Confusion Matrix:
[[960   5]
 [ 22 128]]




In [9]:
# Multinomial Naive Bayes: tune the additive smoothing parameter `alpha`
# with a 5-fold grid search scored by F1 on the positive class (label 1).
model_nb = MultinomialNB()
param_grid_nb = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

grid_search_nb = GridSearchCV(
    estimator=model_nb,
    param_grid=param_grid_nb,
    scoring='f1',
    cv=5,
)
grid_search_nb.fit(X_train_tfidf, Y_train)

# Keep the winning configuration (refitted on the full training split).
best_model_nb = grid_search_nb.best_estimator_
print("Best Parameters for NB: ", grid_search_nb.best_params_)

# Score the selected model on the held-out test split.
Y_pred_nb = best_model_nb.predict(X_test_tfidf)

accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred_nb)
precision = precision_score(y_true=Y_test, y_pred=Y_pred_nb, pos_label=1)
recall = recall_score(y_true=Y_test, y_pred=Y_pred_nb, pos_label=1)
f1 = f1_score(y_true=Y_test, y_pred=Y_pred_nb, pos_label=1)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Rows are true classes, columns are predicted classes (order: 0, 1).
print('Confusion Matrix:')
print(confusion_matrix(y_true=Y_test, y_pred=Y_pred_nb))

Best Parameters for NB:  {'alpha': 0.1}
Accuracy: 0.9811659192825112
Precision: 0.9640287769784173
Recall: 0.8933333333333333
F1 Score: 0.9273356401384083
Confusion Matrix:
[[960   5]
 [ 16 134]]


In [16]:
# Persist the tuned Naive Bayes classifier. It was trained on features produced
# by `tfidf`, so it must be used together with that same fitted vectorizer.
joblib.dump(best_model_nb, 'model.pkl')

['model.pkl']

In [17]:
# Persist the fitted TF-IDF vectorizer so new text can be transformed into the
# same feature space the saved model was trained on.
joblib.dump(tfidf, 'vectorizer.pkl')

['vectorizer.pkl']