In [1]:
import string
import pandas as pd

In [2]:
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
df['spam'].value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [4]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocessor(text):
    # remove punctuation and lowercase
    text = "".join([t.lower() for t in text if t not in string.punctuation])

    # tokenize
    tokens = text.split(' ') # "hello world" -> ["hello", "world"]

    # filter out stopwords
    return ' '.join([t for t in tokens if t not in ENGLISH_STOP_WORDS])

df['text'] = df['text'].apply(lambda text: preprocessor(text))

In [5]:
df.head()

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merril...,1
2,subject unbelievable new homes easy im wantin...,1
3,subject 4 color printing special request addi...,1
4,subject money software cds software compati...,1


In [6]:
x = df['text']
y = df['spam']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorize the text so that we can use it in our model using TF-IDF algorithm
tfidf = TfidfVectorizer()
tfidf.fit(x)
x_vectors = tfidf.transform(x)

In [8]:
x_vectors

<5728x37023 sparse matrix of type '<class 'numpy.float64'>'
	with 508390 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_vectors, y, test_size=0.2, random_state=42)

print(x_train.shape)
print(y_train.shape)

(4582, 37023)
(4582,)


In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier()
knn_classifier.fit(x_train, y_train)

In [11]:
y_pred = knn_classifier.predict(x_test)
y_pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [12]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')

Accuracy: 0.9729493891797557
Precision: 0.9814126394052045
Recall: 0.9103448275862069
Confusion Matrix: 
[[851   5]
 [ 26 264]]


In [13]:
from joblib import dump


dump(knn_classifier, 'knn_classifier.joblib')
dump(tfidf, 'tfidf.joblib')

['tfidf.joblib']