In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('spam_clean.csv')

In [3]:
df.shape

(5572, 2)

In [4]:
df.head()

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df['message'][10]

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [54]:
df['type'].value_counts()

0    4825
1     747
Name: type, dtype: int64

## Text-Preprocessing
- lower
- remove punctuations
- tokenization
- stopword removal

In [11]:
email = "Hi..!! My name is Anthony### 2010 @@..."

In [13]:
email = email.lower()

In [17]:
re.sub("[^a-z ]+", "", email).strip()

'hi my name is anthony'

In [26]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [40]:
sw = set(stopwords.words('english'))
# print(sw)

In [35]:
def get_clean_email(email):
    email = email.lower()
    email = re.sub("[^a-z ]+", "", email).strip()
    email = word_tokenize(email)
    
    email = [word for word in email if word not in sw]
    
    email = " ".join(email)
    
    return email

In [36]:
email = "Hi..!! My name is Anthony### 2010 @@..."

In [37]:
get_clean_email(email)

'hi name anthony'

In [43]:
df['cleaned_message'] = df['message'].apply(get_clean_email)

In [44]:
df.head()

Unnamed: 0,type,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [46]:
df['type'] = df['type'].astype('category').cat.codes

In [47]:
df.head()

Unnamed: 0,type,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [52]:
df['message'][7]

"As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"

In [53]:
df['cleaned_message'][7]

'per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune'

### train test split

In [55]:
from sklearn.model_selection import train_test_split

In [57]:
df_X_train, df_X_test, y_train, y_test = train_test_split(df['cleaned_message'], df['type'], test_size=0.25, random_state=42)

In [58]:
df_X_train.shape

(4179,)

In [59]:
df_X_test.shape

(1393,)

In [62]:
df_X_train

4281                                               u call
585               tell u headache want use hour sick time
4545    never try alone take weight tear comes ur hear...
3034    rajipls favour pls convey birthday wishes nimy...
2758                                         time im prob
                              ...                        
3772        came hostel going sleep plz call class hrishi
5191                                 sorry ill call later
5226                prabhaim sorydarealyfrm heart im sory
5390                             nt joking seriously told
860                                        work going min
Name: cleaned_message, Length: 4179, dtype: object

## Vectorization

In [63]:
from sklearn.feature_extraction.text import CountVectorizer # BOW

In [80]:
vectorizer = CountVectorizer(max_features=5000)

In [81]:
# learning the unique vocabulary
vectorizer.fit(df_X_train)

In [82]:
X_train = vectorizer.transform(df_X_train)
X_test  = vectorizer.transform(df_X_test)

In [83]:
X_train

<4179x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 31833 stored elements in Compressed Sparse Row format>

In [84]:
# X_train.toarray()

In [85]:
X_train.nnz

31833

In [86]:
X_train.nnz/(4179*7093)

0.0010739281794969085

In [87]:
X_test.shape

(1393, 5000)

In [112]:
# vectorizer.vocabulary_

## Prior Probability

In [90]:
y_train.value_counts(normalize= True)

0    0.866954
1    0.133046
Name: type, dtype: float64

### Conditional Probability

In [96]:
df[df['type'] == 1]['cleaned_message'].str.contains('free').sum()

199

In [98]:
len(df[df['type'] == 1])

747

In [99]:
199/747

0.26639892904953144

In [100]:
df[df['type'] == 0]['cleaned_message'].str.contains('free').sum()

66

In [102]:
len(df[df['type'] == 0])

4825

In [103]:
66/4825

0.013678756476683937

In [104]:
df[df['type'] == 0]['cleaned_message'].str.contains('love').sum()

217

In [105]:
df[df['type'] == 1]['cleaned_message'].str.contains('love').sum()

8

### Sklearn - NB

In [121]:
from sklearn.naive_bayes import BernoulliNB
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [114]:
bnb = BernoulliNB()

In [115]:
bnb.fit(X_train, y_train)

In [119]:
y_pred = bnb.predict(X_test)

In [120]:
f1_score(y_test, y_pred)

0.8843930635838151