# CMPUT 466 Final Project
Arun Woosaree

I will be doing binary classification with 3 different algorithms to detect spam in SMS messages

In [None]:
%%bash
# Install the project dependencies, then download and unpack the dataset.
# Stop at the first failing command so a failed download is not masked.
set -e
pipenv install
kaggle datasets download -d uciml/sms-spam-collection-dataset
# -o extracts every file, overwriting without prompting.
# (-f only "freshens" files that already exist on disk, so on a clean
# checkout it extracts nothing and spam.csv is never created.)
unzip -o sms-spam-collection-dataset.zip

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [None]:
# Load the raw dataset from the working directory, decoding it as latin-1,
# and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='latin-1',
)
data.head()

In [None]:
# Keep only the label and message columns and give them readable names.
# Built as a new frame (no inplace=True) with errors='ignore' so the cell can
# be re-run without raising KeyError on the already-dropped columns.
data = (
    data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={"v1": "label", "v2": "text"})
)

# Encode the target as spam: 1, ham: 0.
# Only the label column is touched: a DataFrame-wide replace would also rewrite
# any message whose whole text happens to be "spam" or "ham".
# Values that are already encoded pass through unchanged (safe on re-run), and
# the explicit astype(int) fails loudly on any unexpected label instead of
# relying on pandas' implicit downcasting.
label_codes = {'spam': 1, 'ham': 0}
data['label'] = (
    data['label']
    .map(lambda value: label_codes.get(value, value))
    .astype(int)
)
data.head()

In [None]:
# Class balance: number of messages per label, ordered 0 (legitimate), 1 (spam)
# so the bars line up with the tick labels below.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
num = data['label'].value_counts().sort_index()
num.plot(kind='bar')
plt.title('Number of Messages in Dataset')
plt.xticks((0, 1), ("Legitimate", "Spam"), rotation=0)
plt.ylabel('Count')
# savefig does not create missing directories, so make sure the target exists.
Path("images").mkdir(parents=True, exist_ok=True)
plt.savefig("images/histogram.png")

In [None]:

from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer,
    TfidfTransformer,
)
from sklearn.model_selection import GridSearchCV

# Feature matrix: TF-IDF weights of every message, converted to a dense array.
# NOTE(review): the vectorizer is fitted on the full dataset before any
# cross-validation split, so fold scores see test-fold vocabulary statistics.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data['text']).toarray()

# Target vector: the 0/1 label column.
t = data["label"]

# Baseline to beat: always predict label 0 ("Legitimate").
baseline_accuracy = accuracy_score(t, np.zeros_like(t))
print(f"Trivial classifier: all Legitimate accuracy: {baseline_accuracy}")



In [None]:
from sklearn.model_selection import learning_curve, validation_curve, cross_val_score

# https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html?highlight=validation%20curve
# https://www.dataquest.io/blog/learning-curves-machine-learning/
# https://jakevdp.github.io/PythonDataScienceHandbook/05.03-hyperparameters-and-model-validation.html


# https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py


## Linear Classification
using sklearn

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model with default settings; the cell output lists
# the hyper-parameters it was created with.
lr = LinearRegression()
lr.get_params()

In [None]:
from sklearn.metrics import accuracy_score, make_scorer


def thresholded_accuracy(y_true, y_pred):
    """Accuracy of a regressor used as a binary classifier.

    LinearRegression predicts real numbers, so the predictions are turned into
    class labels first: values >= 0.5 count as spam (1), the rest as
    legitimate (0).

    Parameters
    ----------
    y_true : array-like
        Ground-truth 0/1 labels.
    y_pred : array-like
        Real-valued regression outputs.

    Returns
    -------
    float
        Fraction of thresholded predictions that equal ``y_true``.
    """
    predicted_labels = (np.asarray(y_pred) >= 0.5).astype(int)
    return accuracy_score(y_true, predicted_labels)


# Without an explicit scorer, cross_val_score falls back to the estimator's
# own score(), which for LinearRegression is R^2 -- not an accuracy. Score the
# thresholded predictions so the number is comparable with the classifiers.
accuracies = cross_val_score(
    lr, X, t, scoring=make_scorer(thresholded_accuracy), n_jobs=-1
)
print(accuracies)
f"Mean accuracy: {np.mean(accuracies)}"

## Logistic regression


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with default settings; the cell output lists
# the hyper-parameters it was created with.
lgr = LogisticRegression()
lgr.get_params()

In [None]:
# Cross-validated accuracy of the default logistic regression:
# per-fold scores are printed, the mean is the cell's output.
accuracies = cross_val_score(
    estimator=lgr,
    X=X,
    y=t,
    scoring="accuracy",
    n_jobs=-1,
)
print(accuracies)
f"Mean accuracy: {accuracies.mean()}"

In [None]:
# Hyper-parameter search for logistic regression.
# The grid is a list of sub-grids rather than one cross-product, because not
# every solver supports every penalty: a plain product spends most of its fits
# on combinations that LogisticRegression rejects (e.g. lbfgs + l1,
# liblinear + none), and elasticnet additionally needs an l1_ratio.
# NOTE(review): newer scikit-learn releases spell the unpenalised option as
# None instead of the string 'none' -- switch it if those fits are reported as
# failed on your installed version.
params = [
    {'solver': ('newton-cg', 'lbfgs', 'sag'), 'penalty': ('l2', 'none')},
    {'solver': ('liblinear',), 'penalty': ('l1', 'l2')},
    {'solver': ('saga',), 'penalty': ('l1', 'l2', 'none')},
    {
        'solver': ('saga',),
        'penalty': ('elasticnet',),
        'l1_ratio': (0.25, 0.5, 0.75),
    },
]
lg_gs = GridSearchCV(lgr, params, n_jobs=-1, verbose=4)
lg_gs = lg_gs.fit(X, t)
print("best score", lg_gs.best_score_)
print("best params", lg_gs.best_params_)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier with default settings; the cell output
# lists the hyper-parameters it was created with.
nb = MultinomialNB()
nb.get_params()

In [None]:
# Cross-validated accuracy of the default naive Bayes model:
# per-fold scores are printed, the mean is the cell's output.
accuracies = cross_val_score(
    estimator=nb,
    X=X,
    y=t,
    scoring="accuracy",
    n_jobs=-1,
)
print(accuracies)
f"Mean accuracy: {accuracies.mean()}"

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron classifier with default settings; the cell output lists the
# hyper-parameters it was created with.
p = Perceptron()
p.get_params()

In [None]:
# Cross-validated accuracy of the default perceptron:
# per-fold scores are printed, the mean is the cell's output.
accuracies = cross_val_score(
    estimator=p,
    X=X,
    y=t,
    scoring="accuracy",
    n_jobs=-1,
)
print(accuracies)
f"Mean accuracy: {accuracies.mean()}"

In [None]:
# Hyper-parameter search for the perceptron over penalty type and iteration cap.
# "No penalty" is written as None (Perceptron's own default value) rather than
# the string 'none'.
params = {
    'penalty': ('l1', 'l2', 'elasticnet', None),
    'max_iter': (500, 1000, 2000)
}
p_gs = GridSearchCV(p, params, n_jobs=-1, verbose=4)
# Fit the grid search itself. Fitting the bare estimator (p.fit) would return a
# plain Perceptron with no best_score_ / best_params_ attributes.
p_gs = p_gs.fit(X, t)
print("best score", p_gs.best_score_)
print("best params", p_gs.best_params_)

In [None]:
from sklearn.neural_network import MLPClassifier

# Multilayer perceptron classifier with default settings; the cell output
# lists the hyper-parameters it was created with.
mlp = MLPClassifier()
mlp.get_params()

In [None]:
# Cross-validated accuracy of the default multilayer perceptron:
# per-fold scores are printed, the mean is the cell's output.
accuracies = cross_val_score(
    estimator=mlp,
    X=X,
    y=t,
    scoring="accuracy",
    n_jobs=-1,
)
print(accuracies)
f"Mean accuracy: {accuracies.mean()}"

In [None]:
# Hyper-parameter search for the multilayer perceptron over activation,
# network shape, learning-rate schedule and iteration cap.
# NOTE(review): per the scikit-learn docs, learning_rate is only used when
# solver='sgd'; `mlp` is created without a solver argument, so confirm this
# axis actually changes the fitted models before paying for it.
params = {
    'activation': ('relu', 'identity', 'logistic', 'tanh'),
    # (100,) is a one-element tuple; a bare (100) is just the integer 100.
    'hidden_layer_sizes': ((100,), (100, 50, 25), (100, 80, 60, 40, 20, 10)),
    'learning_rate': ('invscaling', 'adaptive'),
    'max_iter': (400, 800, 1600)
}
mlp_gs = GridSearchCV(mlp, params, n_jobs=-1, verbose=4)
# Fit the grid search itself. Fitting the bare estimator (mlp.fit) would return
# a plain MLPClassifier with no best_score_ / best_params_ attributes.
mlp_gs = mlp_gs.fit(X, t)
print("best score", mlp_gs.best_score_)
print("best params", mlp_gs.best_params_)