# In [6]:
import os
import pickle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder


# Training script: load the raw CSV, write a cleaned UTF-8 copy, fit a
# bag-of-words gradient-boosting classifier on the full dataset, and pickle
# the fitted model and vectorizer under ``models/``.

# The raw file is read as Windows-1252; the three unnamed columns are dropped
# and the remaining ``v1``/``v2`` columns are renamed to ``label``/``text``.
data = pd.read_csv('spam.csv', encoding='Windows-1252').drop(
    ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1
)
data = data.rename(columns={'v1': 'label', 'v2': 'text'})
# Keep a cleaned copy of the dataset, re-encoded as UTF-8.
data.to_csv('spam_clean.csv', index=False, encoding='utf-8')

# Features & Target
X = data['text']
Y = data['label']

# Label encoder: replaces each label with an integer code. The codes follow
# the order of ``encoder.classes_``.
# NOTE(review): only the model and the vectorizer are pickled below; the
# encoder is not, so mapping predictions back to the original label values
# depends on ``encoder.classes_`` from this session -- confirm consumers
# do not need it persisted.
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)

# Vectorizer: learns the vocabulary from the text column and replaces X with
# the resulting document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Model with best params (fitted on the whole dataset; no hold-out split here)
model = GradientBoostingClassifier(n_estimators=200,
                                   learning_rate=0.1,
                                   max_depth=5)

model.fit(X, Y)

# Make sure the output directory exists so the (slow) fit above is not lost
# to a FileNotFoundError when the artifacts are written.
os.makedirs('models', exist_ok=True)

with open('models/spam_detector.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# The fitted vectorizer must be shipped with the model: inputs at prediction
# time have to be transformed with the same vocabulary.
with open('models/vectorizer.pkl', 'wb') as model_file:
    pickle.dump(vectorizer, model_file)