In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [25]:
# Load the SMS corpus, keeping only the raw label (v1) and text (v2) columns
# under friendlier names.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Binary target: ham -> 0, spam -> 1.
df['is_spam'] = df['label'].map({'ham': 0, 'spam': 1})

# Keyword counts on the lower-cased text. These are substring counts, so
# e.g. 'win' is also counted inside longer words that contain it.
message_lower = df['message'].str.lower()
df['word_freq_free'] = message_lower.str.count('free')
df['word_freq_win'] = message_lower.str.count('win')
df['word_freq_offer'] = message_lower.str.count('offer')

# Message length in characters.
df['sms_length'] = df['message'].str.len()

# Numeric modelling frame: the four engineered features plus the target.
df_features = df[[
    'word_freq_free',
    'word_freq_win',
    'word_freq_offer',
    'sms_length',
    'is_spam',
]]

In [26]:
# Class balance of the target: row count per is_spam value (0 = ham, 1 = spam).
df['is_spam'].value_counts()

is_spam
0    4825
1     747
Name: count, dtype: int64

In [27]:
# Preview the first rows of the engineered feature frame.
df_features.head()

Unnamed: 0,word_freq_free,word_freq_win,word_freq_offer,sms_length,is_spam
0,0,0,0,111,0
1,0,0,0,29,0
2,1,1,0,155,1
3,0,0,0,49,0
4,0,0,0,61,0


In [28]:
# Missing-value count per column of the feature frame.
df_features.isna().sum()

word_freq_free     0
word_freq_win      0
word_freq_offer    0
sms_length         0
is_spam            0
dtype: int64

In [29]:
# Print the feature frame's column dtypes, non-null counts and memory usage.
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   word_freq_free   5572 non-null   int64
 1   word_freq_win    5572 non-null   int64
 2   word_freq_offer  5572 non-null   int64
 3   sms_length       5572 non-null   int64
 4   is_spam          5572 non-null   int64
dtypes: int64(5)
memory usage: 217.8 KB


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df_features.describe()

Unnamed: 0,word_freq_free,word_freq_win,word_freq_offer,sms_length,is_spam
count,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.058686,0.03374,0.009332,80.118808,0.134063
std,0.286039,0.203036,0.106775,59.690841,0.340751
min,0.0,0.0,0.0,2.0,0.0
25%,0.0,0.0,0.0,36.0,0.0
50%,0.0,0.0,0.0,61.0,0.0
75%,0.0,0.0,0.0,121.0,0.0
max,3.0,3.0,2.0,910.0,1.0


In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import GridSearchCV

In [32]:
# Split the modelling frame into the predictor matrix and the target vector.
feature_columns = [
    'word_freq_free',
    'word_freq_win',
    'word_freq_offer',
    'sms_length',
]
X = df_features[feature_columns]
y = df_features['is_spam']

In [33]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Baseline 1: multinomial Naive Bayes on the keyword counts and message length.
model = MultinomialNB().fit(x_train, y_train)
pred_y = model.predict(x_test)

# Headline metrics on the held-out set, one per line, followed by the
# per-class report.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, pred_y))
print(classification_report(y_test, pred_y))

0.8923766816143498
0.7586206896551724
0.29333333333333333
0.4230769230769231
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       965
           1       0.76      0.29      0.42       150

    accuracy                           0.89      1115
   macro avg       0.83      0.64      0.68      1115
weighted avg       0.88      0.89      0.87      1115



In [35]:
# Baseline 2: Gaussian Naive Bayes on the same features.
model2 = GaussianNB()
model2.fit(x_train, y_train)
pred_y2 = model2.predict(x_test)

# Headline metrics on the held-out set, then the per-class report.
# (The previous bare accuracy_score(...) call was dropped: its result was
# discarded because it was not the last expression of the cell.)
print(accuracy_score(y_test, pred_y2))
print(precision_score(y_test, pred_y2))
print(recall_score(y_test, pred_y2))
print(f1_score(y_test, pred_y2))
print(classification_report(y_test, pred_y2))

0.8896860986547085
0.6451612903225806
0.4
0.49382716049382713
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       965
           1       0.65      0.40      0.49       150

    accuracy                           0.89      1115
   macro avg       0.78      0.68      0.72      1115
weighted avg       0.88      0.89      0.88      1115



In [36]:
# Baseline 3: Bernoulli Naive Bayes on the same features.
# NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0), so
# each keyword count is reduced to a present/absent flag and sms_length, which
# is always >= 2 in this dataset, presumably becomes a constant 1 that carries
# no signal. Confirm this is intended or drop/bucket that feature here.
model3 = BernoulliNB()
model3.fit(x_train, y_train)
pred_y3 = model3.predict(x_test)

# Headline metrics on the held-out set, then the per-class report.
# (The previous bare accuracy_score(...) call was dropped: its result was
# discarded because it was not the last expression of the cell.)
print(accuracy_score(y_test, pred_y3))
print(precision_score(y_test, pred_y3))
print(recall_score(y_test, pred_y3))
print(f1_score(y_test, pred_y3))
print(classification_report(y_test, pred_y3))

0.8887892376681614
0.6382978723404256
0.4
0.4918032786885246
              precision    recall  f1-score   support

           0       0.91      0.96      0.94       965
           1       0.64      0.40      0.49       150

    accuracy                           0.89      1115
   macro avg       0.78      0.68      0.71      1115
weighted avg       0.88      0.89      0.88      1115



In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [38]:
# Search space for the decision-tree grid search: depth and leaf size sweep
# 1..9, both split criteria are tried, and min_samples_split uses a
# hand-picked, unevenly spaced ladder.
param_grid = {
    'max_depth': list(range(1, 10)),
    'min_samples_leaf': list(range(1, 10)),
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 4, 5, 8, 9, 10, 13, 17, 18, 20],
}


In [39]:
# Exhaustive 5-fold cross-validated search over param_grid for a decision
# tree, using all available cores. The estimator is seeded so that ties
# between equally good splits are broken the same way on every run and
# best_params_ is reproducible.
# NOTE(review): no `scoring` is passed, so the classifier's default score is
# used for model selection; with the class imbalance in this dataset a
# spam-focused metric may be a better choice - confirm.
best = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
best.fit(x_train, y_train)
best.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 2,
 'min_samples_split': 13}

In [40]:
# Final decision tree. Hyper-parameters are taken directly from the grid
# search result instead of being hand-copied from its printed output, so they
# cannot go stale when the search is re-run. class_weight='balanced'
# re-weights the classes to counter the ham/spam imbalance, and the seed makes
# the fitted tree reproducible.
# NOTE(review): the grid search tuned an estimator without class weighting,
# so these parameters were not selected for the weighted tree - confirm this
# is acceptable or add class_weight to the searched estimator.
model_dt = DecisionTreeClassifier(
    **best.best_params_,
    class_weight='balanced',
    random_state=42,
)
model_dt.fit(x_train, y_train)
pred_y_dt = model_dt.predict(x_test)

# Headline metrics on the held-out set, then the per-class report.
# (The previous bare accuracy_score(...) call was dropped: its result was
# discarded because it was not the last expression of the cell.)
print(accuracy_score(y_test, pred_y_dt))
print(precision_score(y_test, pred_y_dt))
print(recall_score(y_test, pred_y_dt))
print(f1_score(y_test, pred_y_dt))
print(classification_report(y_test, pred_y_dt))

0.8573991031390135
0.4811715481171548
0.7666666666666667
0.5912596401028277
              precision    recall  f1-score   support

           0       0.96      0.87      0.91       965
           1       0.48      0.77      0.59       150

    accuracy                           0.86      1115
   macro avg       0.72      0.82      0.75      1115
weighted avg       0.90      0.86      0.87      1115



In [41]:
# Random forest of 100 class-weighted trees. This is the model that gets
# persisted, so it is seeded: without random_state the bootstrap samples and
# feature sub-sampling differ on every run, and neither the saved model nor
# the metrics below would be reproducible.
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    class_weight='balanced',
    random_state=42,
)
model_rf.fit(x_train, y_train)
pred_y_rf = model_rf.predict(x_test)

# Headline metrics on the held-out set, then the per-class report.
# (The previous bare accuracy_score(...) call was dropped: its result was
# discarded because it was not the last expression of the cell.)
print(accuracy_score(y_test, pred_y_rf))
print(precision_score(y_test, pred_y_rf))
print(recall_score(y_test, pred_y_rf))
print(f1_score(y_test, pred_y_rf))
print(classification_report(y_test, pred_y_rf))

0.8645739910313901
0.49783549783549785
0.7666666666666667
0.6036745406824147
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       965
           1       0.50      0.77      0.60       150

    accuracy                           0.86      1115
   macro avg       0.73      0.82      0.76      1115
weighted avg       0.90      0.86      0.88      1115



In [42]:
import pickle
# Persist the trained random forest. The context manager guarantees the file
# is flushed and closed even if serialization raises; the previous bare
# open(...) left the handle to be closed whenever it was garbage-collected.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(model_rf, model_file)