# Spam detection AML 2017

In [7]:
# to not affect python config, but only current session: 
%config IPCompleter.greedy=True

In [8]:
# Import package 

import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt
import pylab as pl

# data processing
from nltk.tokenize import RegexpTokenizer 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re

# models: 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import IsolationForest
from sklearn.naive_bayes import BernoulliNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn import svm
from sklearn import grid_search

#Plotly for all the plots
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

# Improve the models:
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import average_precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score


# Processing Data

In [14]:
# extract numbers
token = RegexpTokenizer(r'\b[^\d\W]+\b') 

# remove stop words
stopwords = set(stopwords.words('english')) 

# remove subject
stopwords.update(set(['_', 'subject']))

lemmatizer = WordNetLemmatizer()

# lemmatize 
def lemmatize(data):
    for i, line in enumerate(data['text']): 
        newline = [] 
        words = line.split() 
        for word in words: 
            word = lemmatizer.lemmatize(word) 
            newline.append(word) 
        new = ' '.join(newline)  
        data.loc[i, 'text'] = new 


In [15]:
# Get data
train = pd.read_csv('./Data/emails.train.csv')
test  = pd.read_csv('./Data/emails.test.csv')

# lemmatize text
lemmatize(train)
lemmatize(test)

text = train['text']
Y_train = train['spam']
Y_test = test['spam']

# vectorize 
vectorizer = TfidfVectorizer(stop_words=stopwords, tokenizer=token.tokenize, max_features = 1000)
vectorizer.fit(text)

X_train = vectorizer.transform(train['text']).todense()
X_test = vectorizer.transform(test['text']).todense()

# Empty lists for plots 
Models = []
Scores = []
Cross_scores = []

In [43]:
# get descriptives 

plotly.tools.set_credentials_file(username='fvs10533176', api_key='r56zsrqviN0WhBa3R05F')
total_email = train.shape[0] + test.shape[0]
total_spam = train[train['spam'] == 1].shape[0] + test[test['spam'] == 1].shape[0]
total_ham =  train[train['spam'] == 0].shape[0] + test[test['spam'] == 0].shape[0]

train_spam = train[train['spam'] == 1].shape[0]
train_ham = train[train['spam'] == 0].shape[0]

test_spam = test[test['spam'] == 1].shape[0]
test_ham = test[test['spam'] == 0].shape[0]

print (train_spam)
print (test_spam)
print (train_ham)
print (test_ham)

print (total_spam)
print (total_ham)
# headerColor = 'grey'
# rowEvenColor = 'lightgrey'
# rowOddColor = 'white'

# trace0 = go.Table(
#   type = 'table',
#   header = dict(
#     values = [['<b>Descriptive Statistics</b>'],
#                   ['<b>Spam</b>'],
#                   ['<b>Ham</b>'],
#                   ['<b>Total</b>']],
#     line = dict(color = '#506784'),
#     fill = dict(color = headerColor),
#     align = ['left','center'],
#     font = dict(color = 'white', size = 12)
#   ),
#   cells = dict(
#     values = [
#       [['Train emails', 'Test emails','<b>TOTAL</b>']],
#       [[train_spam, test_spam, total_spam]],
#       [[train_ham, test_ham, total_ham]],
#       [[total_spam, total_ham, total_email]],
#     line = dict(color = '#506784'),
#     fill = dict(color = [[rowOddColor,rowEvenColor,rowOddColor,
#                                rowEvenColor,rowOddColor]]),
#     align = ['left', 'center'],
#     font = dict(color = '#506784', size = 11)
#     ))

# data = [trace0]

# py.iplot(data, filename = "descriptives")


953
415
3068
1292
1368
4360


# KNN model classifier

In [24]:
random_k = list(range(1,50))

# only odd numbers
neighbor = filter(lambda x: x % 2 != 0, random_k)

cross_scores = []

# 10 fold crossvalidation using training set, to find optimal k-value for classifier
for k in neighbor:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, Y_train, cv=10, scoring='accuracy')
    cross_scores.append(scores.mean())

# misclassification error
MSE = [1 - x for x in cross_scores]
optimal_k = neighbor[MSE.index(min(MSE))]
print "The optimal number of neighbors is %d" % optimal_k

The optimal number of neighbors is 35


In [None]:
# plot with MSE vs k
plt.plot(neighbor, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()

In [73]:
# with use of optimal_k
knn = KNeighborsClassifier(n_neighbors= optimal_k)

# model fitting
knn.fit(X_train, Y_train)

# predict
Y_pred = knn.predict(X_test)

# accuracy evaluation
print accuracy_score(Y_test, Y_pred)
Models.append('KNN')
Scores.append(accuracy_score(Y_test, Y_pred))
Cross_scores.append(cross_val_score(knn, X_train, Y_train, cv = 10, scoring = 'accuracy').mean())

pd.DataFrame({
    'id': test.id,
    'spam': Y_pred
}).to_csv('knn.csv', index=False)

NameError: name 'optimal_k' is not defined

In [None]:
h = .02  # step size in the mesh
 
# Calculate min, max and limits
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 
# Put the result into a color plot
plt.figure()
plt.scatter(X_train[:, 0], X_train[:, 1])
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Data points")
plt.show()

# Random Forest Model Classifier

In [21]:
rf_class = RandomForestClassifier() 
 
# Use a grid search to find optimal parameters
param_grid = { 
           "n_estimators" : [8, 16, 32, 40, 48, 56],
           "max_depth" : [1, 5, 10, 15, 20, 25],
           "min_samples_leaf" : [1, 2, 4, 6, 8, 10]}
 
CV_rf_class = GridSearchCV(estimator=rf_class, param_grid=param_grid, n_jobs=-1)
CV_rf_class.fit(X_train, Y_train)
print CV_rf_class.best_params_


{'n_estimators': 40, 'max_depth': 25, 'min_samples_leaf': 1}


In [11]:
print CV_rf_class.best_score_
# print accuracy_score(Y_test, Y_pred)


0.973887092763


In [22]:
est = CV_rf_class.best_estimator_
Y_pred = est.predict(X_test)
Models.append('Random Forest')
Scores.append(accuracy_score(Y_test, Y_pred))
Cross_scores.append(cross_val_score(rf_class, X_train, Y_train, cv = 10, scoring = 'accuracy').mean())

pd.DataFrame({
    'id': test.id,
    'spam': Y_pred
}).to_csv('rforest.csv', index=False)

In [23]:
print Scores

[0.93204452255418868, 0.93204452255418868, 0.93204452255418868, 0.93204452255418868, 0.93204452255418868, 0.96485061511423553]


# Gaussian Process Model Classifier

In [19]:
gaus_class = GaussianProcessClassifier()

gaus_class.fit(X_train, Y_train)

Y_pred = gaus_class.predict(X_test)

print accuracy_score(Y_test, Y_pred)

Models.append('Gaussian Process')
Scores.append(accuracy_score(Y_test, Y_pred))
Cross_scores.append(cross_val_score(gaus_class, X_train, Y_train, cv = 10, scoring = 'accuracy').mean())

pd.DataFrame({
    'id': test.id,
    'spam': Y_pred
}).to_csv('gaussian.csv', index=False)


0.960749853544


# Naive Bayes classifier for multivariate Bernoulli models

In [19]:
NB = BernoulliNB()

NB.fit(X_train, Y_train)

Y_pred = NB.predict(X_test)

print (accuracy_score(Y_test, Y_pred))

Models.append('Bernoulli Naive Bayes')
Scores.append(accuracy_score(Y_test, Y_pred))
Cross_scores.append(cross_val_score(NB, X_train, Y_train, cv = 10, scoring = 'accuracy').mean())

pd.DataFrame({
    'id': test.id,
    'spam': Y_pred
}).to_csv('BernoulliNaiveBayes.csv', index=False)

0.932044522554


# Isolation Forest Model 

In [20]:
isolationForest = IsolationForest()

isolationForest.fit(X_train, Y_train)

y_pred = isolationForest.predict(X_test)

print (accuracy_score(Y_test, Y_pred))

Models.append('Isolation Forest')
Scores.append(accuracy_score(Y_test, Y_pred))
Cross_scores.append(accuracy_score(Y_test, Y_pred))

pd.DataFrame({
    'id': test.id,
    'spam': Y_pred
}).to_csv('isolationForest.csv', index=False)

0.932044522554


# C Support Vector Classification

In [17]:
# def svc_param_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
#     grid_search.fit(X, y)
#     grid_search.best_params_
#     return grid_search.best_params_

# print (svc_param_selection(X_train, Y_train, 10)) kwam ene gamma van 1 uit en een c van 10
model = svm.SVC(kernel='rbf', C=10, gamma=1) 

model.fit(X_train, Y_train)
model.score(X_train, Y_train)
Y_pred = model.predict(X_test)

print (cross_val_score(model, X_train, Y_train, cv = 10, scoring = 'accuracy').mean())
# print (accuracy_score(Y_test, Y_pred))

Models.append('C-Support Vector')
Scores.append(accuracy_score(Y_test, Y_pred))
Cross_scores.append(cross_val_score(model, X_train, Y_train, cv = 10, scoring = 'accuracy').mean())

pd.DataFrame({
    'id': test.id,
    'spam': Y_pred
}).to_csv('C-Support.csv', index=False)


0.986822682563


# Plot scores

In [18]:
plotly.tools.set_credentials_file(username='fvs10533176', api_key='r56zsrqviN0WhBa3R05F')

trace1 = go.Bar(
    x=Models,
    y=Scores,
    name='Accuracy'
)

trace2 = go.Bar(
    x=Models,
    y= Cross_scores,
    name='Accuracy with cross validation'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group',
    title= 'Accuracy scores for Spam detection',
    yaxis=dict(
        type='log',
        autorange=True
    )
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Scores')

