In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [32]:
# Load the SMS corpus, keep only the two columns used here and give them
# readable names: v1 -> label, v2 -> message.
df = pd.read_csv("spam.csv", encoding="latin-1")[['v1','v2']].rename(
    columns={'v1': 'label', 'v2': 'message'}
)

# Numeric target: ham -> 0, spam -> 1 (any other label value maps to NaN).
df['is_spam'] = df['label'].map({'ham':0,'spam':1})

# Lower-case the text once and reuse it for every keyword count.
message_lower = df['message'].str.lower()

# Keyword features. Series.str.count counts pattern occurrences anywhere in the
# text, so these are substring hits (e.g. 'win' also matches inside 'winner').
keyword_columns = {
    'word_freq_free': 'free',
    'word_freq_win': 'win',
    'word_freq_offer': 'offer',
}
for column_name, keyword in keyword_columns.items():
    df[column_name] = message_lower.str.count(keyword)

# Message length in characters.
df['sms_length'] = df['message'].str.len()

# Modelling frame: the four engineered features followed by the target.
df_features = df[[
    'word_freq_free',
    'word_freq_win',
    'word_freq_offer',
    'sms_length',
    'is_spam',
]]

In [33]:
# Preview the first five rows of the engineered feature frame.
df_features.head(5)

Unnamed: 0,word_freq_free,word_freq_win,word_freq_offer,sms_length,is_spam
0,0,0,0,111,0
1,0,0,0,29,0
2,1,1,0,155,1
3,0,0,0,49,0
4,0,0,0,61,0


In [108]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV

In [35]:
# Predictor matrix: the three keyword counts plus the message length.
X = df_features[[
    'word_freq_free',
    'word_freq_win',
    'word_freq_offer',
    'sms_length',
]]
# Target vector: 1 for spam, 0 for ham.
y = df_features.is_spam

In [36]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical from run to run.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [72]:
# Baseline 1: multinomial Naive Bayes fitted on the training split
# (fit() returns the estimator itself, so it can be chained).
model = MultinomialNB().fit(x_train, y_train)

# Score on the held-out split; the last expression is the cell's output.
pred_y = model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=pred_y)

0.8923766816143498

In [69]:
# Baseline 2: Gaussian Naive Bayes on the same train/test split.
model2 = GaussianNB()
pred_y2 = model2.fit(x_train, y_train).predict(x_test)

# Hold-out accuracy (displayed as the cell's output).
accuracy_score(y_true=y_test, y_pred=pred_y2)

0.8896860986547085

In [71]:
# Baseline 3: Bernoulli Naive Bayes on the same train/test split.
# With its default settings BernoulliNB binarizes each feature at 0, so only
# the presence/absence of a value is used, not its magnitude.
model3 = BernoulliNB()
model3.fit(x_train, y_train)

# Keep these predictions under their own name: storing them in `pred_y2`
# would silently overwrite the GaussianNB predictions from the previous cell.
pred_y3 = model3.predict(x_test)
accuracy_score(y_test, pred_y3)

0.8887892376681614

In [95]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [130]:
# Hyper-parameter search space for the decision tree.
# Depth and leaf size each sweep 1..9; the split sizes are a hand-picked,
# unevenly spaced set of candidates.
param_grid = dict(
    max_depth=list(range(1, 10)),
    min_samples_leaf=list(range(1, 10)),
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 3, 4, 5, 8, 9, 10, 13, 17, 18, 20],
)


In [131]:
# Exhaustive 5-fold cross-validated search over `param_grid`.
# The tree is seeded because scikit-learn randomly permutes the candidate
# features at every split; without a fixed random_state, ties between equally
# good splits can be broken differently on each run, which can change the
# winning parameter set.
best = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
best.fit(x_train, y_train)

# Show the best-scoring parameter combination as the cell's output.
best.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 2,
 'min_samples_split': 13}

In [133]:
# Decision tree using the hyper-parameters reported by the grid search
# (criterion='entropy', max_depth=7, min_samples_leaf=2, min_samples_split=13).
# random_state is fixed so the fitted tree, and therefore the reported
# accuracy, is reproducible across kernel restarts.
model_dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=13,
    random_state=42,
)
model_dt.fit(x_train, y_train)

# Hold-out accuracy (displayed as the cell's output).
pred_y_dt = model_dt.predict(x_test)
accuracy_score(y_test, pred_y_dt)

0.9094170403587444

In [100]:
# Random forest of 100 trees, each allowed to grow up to depth 100.
# random_state is fixed because the forest bootstraps rows and samples
# features at random; unseeded, every run trains a different model and the
# accuracy (and any artifact saved from `model_rf`) cannot be reproduced.
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    random_state=42,
)
model_rf.fit(x_train, y_train)

# Hold-out accuracy (displayed as the cell's output).
pred_y_rf = model_rf.predict(x_test)
accuracy_score(y_test, pred_y_rf)

0.895067264573991

In [134]:
import pickle

# Persist the fitted random forest to disk.
# The context manager guarantees the file handle is flushed and closed even
# if serialization raises; a bare open() passed straight to pickle.dump
# leaves the handle open until garbage collection.
# NOTE(review): the recorded outputs show the decision tree scoring higher on
# the hold-out set (0.909 vs 0.895) - confirm the random forest is the model
# intended for export.
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(model_rf, model_file)