**Import library**

In [89]:
import pandas as pd
import matplotlib.pyplot as mpl
from  sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

**Data loading**

In [52]:
dataset = pd.read_csv("spam.csv",encoding="latin-1")[["v1","v2"]]

**Explore data analysis**

In [53]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [54]:
dataset["v1"].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [37]:
dataset["v2"].duplicated().value_counts()

v2
False    5169
True      403
Name: count, dtype: int64

In [55]:
dataset[dataset.duplicated()]

Unnamed: 0,v1,v2
102,ham,As per your request 'Melle Melle (Oru Minnamin...
153,ham,As per your request 'Melle Melle (Oru Minnamin...
206,ham,"As I entered my cabin my PA said, '' Happy B'd..."
222,ham,"Sorry, I'll call later"
325,ham,No calls..messages..missed calls
...,...,...
5524,spam,You are awarded a SiPix Digital Camera! call 0...
5535,ham,"I know you are thinkin malaria. But relax, chi..."
5539,ham,Just sleeping..and surfing
5553,ham,Hahaha..use your brain dear


**Data preprocess**

In [58]:
#change the column name
dataset = dataset.rename(columns = {"v1":"label","v2":"message"})

In [29]:
#remove the duplicates based on message column
dataset = dataset.drop_duplicates(keep="first",subset="message")

In [60]:
#categorical encoding
dataset["label"] = dataset["label"].map({"ham":0,"spam":1})

In [62]:
#split dataset into train and test
x_train,x_test,y_train,y_test = train_test_split(dataset["message"],dataset["label"],test_size=0.2,random_state=42)

In [63]:
#feature extraction
tfidf  = TfidfVectorizer(stop_words="english",max_df=0.7)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)


**Build model**

In [87]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": LinearSVC(dual = "auto")
}

In [88]:
for name,model in models.items():
    model.fit(x_train_tfidf,y_train)
    y_pred = model.predict(x_test_tfidf)
    print(f"\n{name} Results:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Metrics:\n", confusion_matrix(y_test,y_pred))


Naive Bayes Results:
Accuracy: 0.9668161434977578
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Metrics:
 [[965   0]
 [ 37 113]]

Logistic Regression Results:
Accuracy: 0.9524663677130045
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115

Confusion Metrics:
 [[962   3]
 [ 50 100]]

Support Vector Machine Results:
Accuracy: 0.97847533632287
Classification Report:
               prec

**Save the model**

In [91]:
joblib.dump(models["Support Vector Machine"],"spam_classifier.pkl")
joblib.dump(tfidf,"tfidfVectorizer.pkl")

['tfidfVectorizer.pkl']