In [1]:
import pandas as pd

In [2]:
# Load the raw dataset from the working directory. The file is decoded as
# ISO-8859-1 (presumably because it is not valid UTF-8 -- confirm).
data = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

In [3]:
# Discard the three 'Unnamed' columns that come along with the CSV.
# Reassigning the result (rather than using inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the columns have already been removed.
data = data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

In [4]:
# Preview the first five rows to confirm which columns remain.
data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Separate targets from inputs: per the preview above, column 'v1' holds the
# ham/spam label and column 'v2' holds the message text.
check = data.loc[:, 'v1']
message = data.loc[:, 'v2']

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    message,
    check,
    test_size=0.2,
    random_state=0,
)
X_train.shape  # size of the training portion

(4457,)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding. The vocabulary is learned from the training messages
# only; the held-out messages are then encoded with that same vocabulary.
vectorizer = CountVectorizer()
train_x_vector = vectorizer.fit_transform(X_train)
test_x_vector = vectorizer.transform(X_test)

# (held-out messages, vocabulary size)
test_x_vector.shape

(1115, 7612)

## Testing different models

### SVM

In [8]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the word counts.
# fit() returns the estimator itself, so construction and training are chained.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vector, y_train)

# Sanity check: classify only the first held-out message.
clf_svm.predict(test_x_vector[0])

array(['ham'], dtype=object)

### Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so that repeated runs build the same model. scikit-learn
# randomly permutes the candidate features at every split, so an unseeded
# tree (and the scores reported for it) can differ between executions.
# The seed 0 matches the one used for the train/test split and for the
# logistic regression model in this notebook.
clf_dec = DecisionTreeClassifier(random_state=0)
clf_dec.fit(train_x_vector, y_train)

# Sanity check: classify only the first held-out message.
clf_dec.predict(test_x_vector[0])

array(['ham'], dtype=object)

### Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, trained on the word counts.
# fit() returns the estimator itself, so construction and training are chained.
clf_mnb = MultinomialNB().fit(train_x_vector, y_train)

# Sanity check: classify only the first held-out message.
clf_mnb.predict(test_x_vector[0])

array(['ham'], dtype='<U4')

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, trained on the word counts.
# fit() returns the estimator itself, so construction and training are chained.
clf_lrs = LogisticRegression(random_state=0).fit(train_x_vector, y_train)

# Sanity check: classify only the first held-out message.
clf_lrs.predict(test_x_vector[0])

array(['ham'], dtype=object)

In [12]:
# Held-out accuracy for each classifier, one value per line, in the order:
# SVM, decision tree, naive Bayes, logistic regression.
for model in (clf_svm, clf_dec, clf_mnb, clf_lrs):
    print(model.score(test_x_vector, y_test))

0.9811659192825112
0.9632286995515695
0.9874439461883409
0.9766816143497757


In [13]:
from sklearn.metrics import f1_score

# Per-class F1 on the held-out set. Each printed pair is ordered
# [spam, ham] (fixed by `labels`); rows follow the order SVM, decision tree,
# naive Bayes, logistic regression.
for model in (clf_svm, clf_dec, clf_mnb, clf_lrs):
    predictions = model.predict(test_x_vector)
    print(f1_score(y_test, predictions, average=None, labels=['spam', 'ham']))

[0.93375394 0.98902248]
[0.87306502 0.97850026]
[0.95652174 0.99266247]
[0.91612903 0.98645833]


In [14]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for multinomial naive Bayes: class priors on/off and six
# values of the smoothing parameter alpha.
# NOTE(review): the scores recorded for this cell are identical to those of
# the untuned MultinomialNB above, so the search may be settling on the
# default configuration -- consider adding alpha values below 1.0.
parameters = {
    'fit_prior': (True, False),
    'alpha': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
}
MNB = MultinomialNB()

# Search every combination in the grid with 5-fold cross-validation.
clf = GridSearchCV(MNB, parameters, cv=5)
clf.fit(train_x_vector, y_train)

# Report held-out accuracy, then per-class F1 ordered as [spam, ham].
print(clf.score(test_x_vector, y_test))
print(f1_score(y_test, clf.predict(test_x_vector), average=None, labels=['spam', 'ham']))

0.9874439461883409
[0.95652174 0.99266247]


In [15]:
import os
import pickle

# Create the output directory if it is missing: open(..., 'wb') does not
# create parent directories, so on a fresh checkout without ./models the
# dump would fail with FileNotFoundError.
os.makedirs('./models', exist_ok=True)

# Persist the fitted grid-search object, which is used for prediction.
with open('./models/spam_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Persist the fitted vectorizer as well; new messages must be encoded with
# this same vocabulary before they are passed to the classifier.
with open('./models/text_vectorizer.pkl', 'wb') as f1:
    pickle.dump(vectorizer, f1)