In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

import chardet
with open("spam.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}

In [2]:
data = pd.read_csv('spam.csv', encoding='Windows-1252').drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis = 1)
data = data.rename(columns={'v1':'label','v2':'text'})
data.to_csv('spam_clean.csv', index=False, encoding='Windows-1252')
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [40]:
# Features & Target

X = data['text']
Y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2)

In [41]:
# Label encoder

Y = Y.replace({"spam":1,"ham":0})

In [42]:
# Vectorizer

vectorizer = CountVectorizer()

# Updates of X_train & X_test

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [47]:
# Model

model = GradientBoostingClassifier(n_estimators=100, 
                                   learning_rate=1, 
                                   max_depth=1)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [52]:
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))
print("\n")
print("score train :", model.score(X_train, y_train))
print("score test :", model.score(X_test, y_test))

              precision    recall  f1-score   support

         ham       0.96      0.93      0.94       944
        spam       0.67      0.80      0.73       171

    accuracy                           0.91      1115
   macro avg       0.81      0.86      0.83      1115
weighted avg       0.92      0.91      0.91      1115

[[876  68]
 [ 35 136]]


score train : 0.9178819833969037
score test : 0.9076233183856502


In [54]:
parameters = {
    "n_estimators" : [10,100,200],
    "learning_rate" : [0.1,0.5,1],
    "max_depth" : [1,3,5]
}

clf = GridSearchCV(GradientBoostingClassifier(), parameters, n_jobs= -1)
clf.fit(X_train, y_train)

print(clf.best_params_)


{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}


In [55]:
# Model with best params

model = GradientBoostingClassifier(n_estimators=200, 
                                   learning_rate=0.1, 
                                   max_depth=5)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [56]:
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))
print("\n")
print("score train :", model.score(X_train, y_train))
print("score test :", model.score(X_test, y_test))

              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       944
        spam       0.97      0.88      0.92       171

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

[[939   5]
 [ 21 150]]


score train : 1.0
score test : 0.9766816143497757


In [57]:
import pickle

with open('spam_detector.pkl', 'wb') as model_file :
    pickle.dump(model, model_file)