In [1]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re

stpowords = nltk.corpus.stopwords.words('english')

In [2]:
import os
path = os.getcwd()

In [3]:
# reading the email message collection from file
messages = pd.read_csv(path + '/data/spam.csv', encoding='latin-1')

In [4]:
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# drop the unwanted columns and add appropriate labels
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]

In [6]:
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# clean the text to identify tokens and remove stopwords and punctuation
def clean_text(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stpowords]
    return text

In [20]:
# example application of clean_text
# messages['text'][0]
clean_text(str(messages['text'][0]))

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [21]:
# define the tfidf vector
tfidf_vect = TfidfVectorizer(analyzer=clean_text)

# implement fit_transform on the messages and generate a sparse matrix
X_tfidf = tfidf_vect.fit_transform(messages['text'])

In [23]:
# convert the sparse matrix into a dataframe
X_features = pd.DataFrame(X_tfidf.toarray())
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9385,9386,9387,9388,9389,9390,9391,9392,9393,9394
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# view the defaults and arguments for RandmoForestClassifier
# RandomForestClassifier()

In [37]:
# import the other required functions and packages
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [29]:
# split the data into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X_features, messages['label'], test_size=0.2)

In [30]:
# fit the random forest model on the training set
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

In [31]:
# make predictions on the test set 
y_pred = rf_model.predict(X_test)

In [38]:
# evaluate the predictions based on the precision and recall metrices
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1_scr = f1_score(y_test, y_pred, pos_label='spam')

In [39]:
# print the precision and recall scores. Also f1 score
print(f"Precision: {round(precision, 3)}")
print(f"Recall: {round(recall, 3)}")
print(f"F1-Score: {round(f1_scr, 3)}")

Precision: 1.0
Recall: 0.822
F1-Score: 0.902
