In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from preprocessing import process_text
from nltk import flatten


In [190]:
df = pd.read_csv('spam.csv', encoding="ISO-8859-1")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.rename(columns={'v1': 'Label', 'v2': 'Mail'}, inplace=True)
print(df['Label'].value_counts())


ham     4825
spam     747
Name: Label, dtype: int64


In [191]:
df['Label'] = df['Label'].map({'ham': 1, 'spam': 0})
df['Msg'] = df['Mail'].apply(lambda x: process_text(x))
df.head()


Unnamed: 0,Label,Mail,Msg
0,1,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,1,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,1,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"


In [192]:
all_words = list(flatten((list(df['Msg']))))
print("Length all words: {}".format(len(all_words)))
unique_words = list(set(all_words))
print("Length of unique words: {}".format(len(unique_words)))


Length all words: 49999
Length of unique words: 7937


In [193]:
ham_df = df[df["Label"] == 1]
spam_df = df[df["Label"] == 0]
ham_df.head()


Unnamed: 0,Label,Mail,Msg
0,1,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
3,1,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,1,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"
6,1,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aid,..."


In [194]:
spam_df.head()


Unnamed: 0,Label,Mail,Msg
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
5,0,FreeMsg Hey there darling it's been 3 week's n...,"[freemsg, hey, darling, week, word, back, id, ..."
8,0,WINNER!! As a valued network customer you have...,"[winner, valued, network, customer, selected, ..."
9,0,Had your mobile 11 months or more? U R entitle...,"[mobile, month, u, r, entitled, update, latest..."
11,0,"SIX chances to win CASH! From 100 to 20,000 po...","[six, chance, win, cash, pound, txt, csh, send..."


In [195]:
positive_words = list(flatten((list(ham_df['Msg']))))
negative_words = list(flatten((list(spam_df['Msg']))))

print('Pos: {}, Neg: {}'.format(len(positive_words),len(negative_words)))

Pos: 39090, Neg: 10909


In [196]:
word_map={}
for word in unique_words:
    word_map[word]={'pos':positive_words.count(word),'neg':negative_words.count(word)}

In [197]:
data=[]
for word in word_map:
    data.append([word,word_map[word]['pos'],word_map[word]['neg']])
prob_df= pd.DataFrame(data=data,columns=['word','pos','neg'])

In [198]:
def laplacian_smoothing(word_freq,total_sum_occurences,unique_words):
    smooth= (word_freq+1) / (total_sum_occurences+unique_words)
    return smooth


In [199]:
prob_df.head()

Unnamed: 0,word,pos,neg
0,salary,4,0
1,student,8,0
2,callon,1,0
3,temales,1,0
4,high,4,3


In [200]:
#do laplacian smoothi to avoid 0 probability cases
prob_df['laplacian_pos'] = prob_df.apply(lambda x: laplacian_smoothing(x['pos'], len(positive_words),len(unique_words)), axis=1)
prob_df['laplacian_neg'] = prob_df.apply(lambda x: laplacian_smoothing(x['neg'], len(negative_words),len(unique_words)), axis=1)
prob_df

Unnamed: 0,word,pos,neg,laplacian_pos,laplacian_neg
0,salary,4,0,0.000106,0.000053
1,student,8,0,0.000191,0.000053
2,callon,1,0,0.000043,0.000053
3,temales,1,0,0.000043,0.000053
4,high,4,3,0.000106,0.000212
...,...,...,...,...,...
7932,txtx,0,1,0.000021,0.000106
7933,ultimately,1,0,0.000043,0.000053
7934,jst,3,1,0.000085,0.000106
7935,formclark,2,0,0.000064,0.000053


In [201]:
#log likelihood
def log_likelihood(a,b):
    return np.log(a/b)


In [202]:
# data_try=[['i',3,3],['am',3,3],['happy',2,1],['because',1,0],['learning',1,1],['nlp',1,1],['sad',1,2],['not',1,2]]
# data_dum=pd.DataFrame(data=data_try,columns=['word','pos','neg'])
# data_dum
# data_dum['laplacian_pos'] = data_dum.apply(lambda x: laplacian_smoothing(x['pos'], data_dum['pos'].sum(),8), axis=1)
# data_dum['laplacian_neg'] = data_dum.apply(lambda x: laplacian_smoothing(x['neg'], data_dum['neg'].sum(),8), axis=1)
# data_dum['log']= data_dum.apply(lambda x: log_likelihood(x['laplacian_pos'],x['laplacian_neg']),axis=1)
# data_dum

In [203]:
prob_df['log']= prob_df.apply(lambda x: log_likelihood(x['laplacian_pos'],x['laplacian_neg']),axis=1)
prob_df

Unnamed: 0,word,pos,neg,laplacian_pos,laplacian_neg,log
0,salary,4,0,0.000106,0.000053,0.695017
1,student,8,0,0.000191,0.000053,1.282803
2,callon,1,0,0.000043,0.000053,-0.221274
3,temales,1,0,0.000043,0.000053,-0.221274
4,high,4,3,0.000106,0.000212,-0.691278
...,...,...,...,...,...,...
7932,txtx,0,1,0.000021,0.000106,-1.607568
7933,ultimately,1,0,0.000043,0.000053,-0.221274
7934,jst,3,1,0.000085,0.000106,-0.221274
7935,formclark,2,0,0.000064,0.000053,0.184191


In [204]:
#total positive tweets by total negative tweets in dataset
log_prior= np.log(ham_df.shape[0]/spam_df.shape[0])
log_prior


1.865500828640269

In [None]:
def get_spam_or_ham(string,df,log_prior):
    pass
