In [1]:
import pandas as pd
import numpy as np 
from sklearn.datasets import load_files

# Text cleaning and preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tqdm.notebook import tqdm

In [2]:
# Load the training CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./spam/train.csv")
# Show (rows, columns) as the cell output.
df.shape

(30344, 3)

In [3]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,Subject,Message,Spam/Ham
0,transfers from ees,attached is the latest version of the cost cen...,spam
1,fw : re ivanhoe e . s . d,"fyi , kim .\n- - - - - original message - - - ...",spam
2,re : enerfin meter 980439 for 10 / 00,it did but tetco prorated the flow between the...,ham
3,meoh plant status,the methanol plant has determined extensive re...,ham
4,re : tenaska iv,i tried calling you this am but your phone rol...,spam


In [4]:
# Keep the first 5000 rows: message text as input, spam/ham label as target.
X = df['Message'].iloc[:5000]
y = df['Spam/Ham'].iloc[:5000]

In [5]:
# Instantiate the Porter stemmer and the WordNet lemmatizer once, up front.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

We now have a list of texts that appear to be binary-encoded. Calling `decode` is one possible solution, but some of the texts cannot be decoded. We could delete those texts, but then we might lose important information. Instead, we can do the following:

* Remove all special symbols.
* Remove the 'b' at the beginning of each text.
* Replace all whitespace gaps (\t, \n, \r, \f) between words with spaces.
* Remove all non-letter characters.

In [6]:
# Create corpus: one cleaned, stemmed, space-joined string per message.

# Build the stop-word lookup once, outside the loop. Evaluating
# stopwords.words('english') for every token is loop-invariant work, and a
# set gives constant-time membership checks instead of a list scan.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly rather than via X[i], so the loop does
# not rely on X carrying a 0..n-1 integer index.
for message in tqdm(X):
    # Replace literal backslash-r backslash-n sequences with a space
    review = re.sub(r'\\r\\n', ' ', str(message))
    # Remove all symbols except letters
    review = re.sub('[^a-zA-Z]', ' ', review)
    # Collapse every run of whitespace into a single space
    review = re.sub(r'\s+', ' ', review)
    # Remove 'b' in the beginning of each text
    review = re.sub(r'^b\s+', '', review)

    # Lower-case, tokenise, drop stop words, then stem what is left
    tokens = review.lower().split()
    stems = [stemmer.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

  0%|          | 0/5000 [00:00<?, ?it/s]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Bag-of-words features: fit the vectorizer on the cleaned corpus and
# obtain the per-document term counts.
cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)
# Convert to a dense array. Note: X is rebound here from the raw messages
# to the feature matrix.
X = term_counts.toarray()

In [8]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

We try a Naive Bayes model because the likelihood that an email is spam or ham is a posterior probability, and Naive Bayes models usually perform well in spam detection.

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Fit multinomial Naive Bayes on the training features, then predict the
# held-out set.
model = MultinomialNB().fit(X_train, y_train)
pred = model.predict(X_test)

# 'spam' is treated as the positive class for precision and recall.
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, pos_label='spam')
recall = recall_score(y_test, pred, pos_label='spam')
# scikit-learn convention: rows are true labels, columns are predicted
# labels, both in sorted label order.
conf_m = confusion_matrix(y_test, pred)

# Real f-strings with format specs replace the previous mix of a
# placeholder-free f-prefix and %-formatting; the printed text is unchanged.
print(f"accuracy: {accuracy:.3f}")
print(f"precision: {precision:.3f}")
print(f"recall: {recall:.3f}")
print("confusion matrix: ")
print(conf_m)

accuracy: 0.979
precision: 0.962
recall: 1.000
confusion matrix: 
[[452  21]
 [  0 527]]
