In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer 

In [3]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Data Load

In [4]:
df = pd.read_csv("D:/dataset/spam.csv",encoding='latin-1')[['v1','v2']]
df.columns=['label','text']
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5867,spam,Human resource management is essential for the...
5868,spam,That&#39;s less than $19/month! 27 Dec 2024 | ...
5869,spam,Claim one of our best offers of the year to le...
5870,ham,This is a copy of a security alert sent to asi...


In [5]:
df['label'].value_counts()

label
ham     4943
spam     929
Name: count, dtype: int64

## Data Preprocess

In [14]:
def clean_text(text):
    text=text.lower()
    text = re.sub(r'[^a-zA-Z0-9!%@ ]+','',text)
    text = text.translate(str.maketrans('','',string.punctuation))
    
    text = text.strip()

    return text

In [15]:
df['clean_text'] = df['text'].apply(clean_text)

In [16]:
df

Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...
...,...,...,...
5867,spam,Human resource management is essential for the...,human resource management is essential for the...
5868,spam,That&#39;s less than $19/month! 27 Dec 2024 | ...,that39s less than 19month 27 dec 2024 can39t ...
5869,spam,Claim one of our best offers of the year to le...,claim one of our best offers of the year to le...
5870,ham,This is a copy of a security alert sent to asi...,this is a copy of a security alert sent to asi...


In [17]:
df.tail()

Unnamed: 0,label,text,clean_text
5867,spam,Human resource management is essential for the...,human resource management is essential for the...
5868,spam,That&#39;s less than $19/month! 27 Dec 2024 | ...,that39s less than 19month 27 dec 2024 can39t ...
5869,spam,Claim one of our best offers of the year to le...,claim one of our best offers of the year to le...
5870,ham,This is a copy of a security alert sent to asi...,this is a copy of a security alert sent to asi...
5871,spam,Plus more awesome Boxing Day deals you wouldn&...,plus more awesome boxing day deals you wouldn3...


## Vectorize

In [18]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])

y = df['label'].map({'ham':0,'spam':1}) 

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### SVM Model

In [36]:
svm = SVC()
svm.fit(X_train,y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [37]:
svm.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [20]:
param_svm = {
    'C':[.1,1,10],
    'kernel':['rbf','linear','sigmoid']
    }

In [21]:
grid_svm = GridSearchCV(SVC(),param_svm,cv=5,scoring='accuracy')
grid_svm.fit(X_train,y_train)

0,1,2
,estimator,SVC()
,param_grid,"{'C': [0.1, 1, ...], 'kernel': ['rbf', 'linear', ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,10
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [22]:
grid_svm.best_params_

{'C': 10, 'kernel': 'linear'}

In [23]:
y_pred = grid_svm.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9668085106382979

Confusion Matrix:
 [[984   7]
 [ 32 152]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       991
           1       0.96      0.83      0.89       184

    accuracy                           0.97      1175
   macro avg       0.96      0.91      0.93      1175
weighted avg       0.97      0.97      0.97      1175



### Logistic Regression

In [39]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [40]:
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [24]:
param_lr = {
    'C':[.1,1,10],
    'penalty':['l2'],
    'solver': ['liblinear'],
    'max_iter':[200]
}

In [25]:
grid_lr = GridSearchCV(LogisticRegression(),param_lr,cv=5,scoring='accuracy')
grid_lr.fit(X_train,y_train)

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'C': [0.1, 1, ...], 'max_iter': [200], 'penalty': ['l2'], 'solver': ['liblinear']}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,200


In [26]:
grid_lr.best_params_

{'C': 10, 'max_iter': 200, 'penalty': 'l2', 'solver': 'liblinear'}

In [27]:
y_pred = grid_lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9617021276595744

Confusion Matrix:
 [[987   4]
 [ 41 143]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       991
           1       0.97      0.78      0.86       184

    accuracy                           0.96      1175
   macro avg       0.97      0.89      0.92      1175
weighted avg       0.96      0.96      0.96      1175



### Naive Bayes

In [28]:
naive = MultinomialNB()
naive.fit(X_train,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [29]:
naive.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}

In [30]:
y_pred = naive.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9523404255319149

Confusion Matrix:
 [[989   2]
 [ 54 130]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       991
           1       0.98      0.71      0.82       184

    accuracy                           0.95      1175
   macro avg       0.97      0.85      0.90      1175
weighted avg       0.95      0.95      0.95      1175



## Predictions

In [31]:
def predict_spam(text):
    cleaned = clean_text(text)
    vector = vectorizer.transform([cleaned])
    result = grid_lr.predict(vector)

    return "Spam" if result[0]==1 else "Ham"

In [32]:
print(predict_spam("Congratulations! You've won a $1000 gift card!"))

Spam


In [33]:
print(predict_spam("Hey, are we still meeting at 6?"))

Ham


In [36]:
import joblib

In [37]:
joblib.dump(naive,'naive_spam_classifier.pkl')
joblib.dump(grid_svm,'grid_svm_spam_classifier.pkl')
joblib.dump(grid_lr,'grid_lr_spam_classifier.pkl')
joblib.dump(vectorizer,'tf-idf_vectorizer.pkl')

['tf-idf_vectorizer.pkl']