In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam_clean.csv')
df.shape

(5572, 2)

In [8]:
df.head(5)

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df['type'].value_counts()

ham     4825
spam     747
Name: type, dtype: int64

## Text-Processing
- tokenization
- lower case
- remove punctuation 
- stopword removal

In [36]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [14]:
word_tokenize("Hi my name is anthony")

['Hi', 'my', 'name', 'is', 'anthony']

In [40]:
# all the english stopwords
sw = stopwords.words('english')
len(sw)

179

In [44]:
def get_clean_email(email):
    
    email = email.lower()
    
    email = re.sub("[^a-z ]+", "", email)
    
    email = word_tokenize(email)
    
    email = [word for word in email if word not in sw]
    
    email = " ".join(email)
    
    # do lemmatisation
    
    return email

In [46]:
get_clean_email("Hi , i have been there germany !!!. @hello anthony @@@")

'hi germany hello anthony'

In [47]:
# i've -> i have 

In [50]:
df['cleaned_message'] = df['message'].apply(get_clean_email)

In [51]:
df.head()

Unnamed: 0,type,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [78]:
df['type'] = df.type.astype('category').cat.codes

In [79]:
df['cleaned_message'][7]

'per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune'

In [80]:
df['message'][7]

"As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"

In [81]:
from sklearn.model_selection import train_test_split

df_X_train, df_X_test, y_train, y_test = train_test_split(df['cleaned_message'], df['type'], test_size=0.25, random_state=42)

print(np.shape(df_X_train), np.shape(df_X_test))

(4179,) (1393,)


## Vectorisation

In [82]:
from sklearn.feature_extraction.text import CountVectorizer # BOW

In [83]:
vectoriser = CountVectorizer()

In [84]:
vectoriser.fit(df_X_train)

In [85]:
X_train = vectoriser.transform(df_X_train)
X_test = vectoriser.transform(df_X_test)

In [86]:
X_train.shape, X_test.shape

((4179, 7093), (1393, 7093))

In [87]:
X_train.shape

(4179, 7093)

In [107]:
# vectoriser.vocabulary_

In [108]:
len(vectoriser.vocabulary_)

7093

# Classfication Algorithm

In [93]:
df['type'].value_counts(normalize=True)

0    0.865937
1    0.134063
Name: type, dtype: float64

In [97]:
df['cleaned_message']

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts st ...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
5    freemsg hey darling weeks word back id like fu...
6       even brother like speak treat like aids patent
7    per request melle melle oru minnaminunginte nu...
8    winner valued network customer selected receiv...
9    mobile months u r entitled update latest colou...
Name: cleaned_message, dtype: object

In [101]:
df[df['type'] == 1]['cleaned_message'].str.contains('free').sum()

199

In [104]:
199/727

0.2737276478679505

In [102]:
df[df['type'] == 0]['cleaned_message'].str.contains('free').sum()

66

In [103]:
66/4800

0.01375

In [128]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [129]:
model = BernoulliNB()

In [111]:
model.fit(X_train, y_train)

In [115]:
y_pred = model.predict(X_test)
y_pred[:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int8)

In [113]:
from sklearn.metrics import f1_score

In [116]:
f1_score(y_test, y_pred)

0.873900293255132

In [126]:
x_query = "i nigerian prince need help send money"

In [127]:
vectoriser.transform([x_query])

<1x7093 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

## Multionomial

In [158]:
model_mnb = MultinomialNB(alpha=1)

In [159]:
model_mnb.fit(X_train, y_train)

In [160]:
y_pred = model_mnb.predict(X_test)

In [161]:
f1_score(y_test, y_pred)

0.9239130434782609

In [162]:
from sklearn.model_selection import GridSearchCV

In [169]:
mnb = MultinomialNB()

clf = GridSearchCV(mnb, param_grid={'alpha': [1.6,1.65, 1.7,1.75, 1.8]}, scoring='f1', cv=3)
clf.fit(X_train, y_train)

In [170]:
clf.best_score_

0.894107148091923

In [171]:
clf.best_params_

{'alpha': 1.75}

In [172]:
final_model = MultinomialNB(alpha=1.75)

In [173]:
final_model.fit(X_train, y_train)

In [174]:
y_pred = model_mnb.predict(X_test)

In [175]:
f1_score(y_test, y_pred)

0.9239130434782609