In [191]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import flatten

from preprocessing import process_text


In [192]:
# Read the raw CSV, discard the unnamed filler columns, and give the two
# remaining columns descriptive names.
raw = pd.read_csv('spam.csv', encoding="ISO-8859-1")
keep_columns = ~raw.columns.str.contains('^Unnamed')
df = raw.loc[:, keep_columns].rename(columns={'v1': 'Label', 'v2': 'Mail'})
# Class balance of the full dataset.
print(df['Label'].value_counts())


ham     4825
spam     747
Name: Label, dtype: int64


In [193]:
# Encode the class as an integer: ham -> 1, spam -> 0.
label_codes = {'ham': 1, 'spam': 0}
df['Label'] = df['Label'].map(label_codes)
# Run every raw message through the project's text preprocessing helper;
# the result is stored alongside the original text in 'Msg'.
df['Msg'] = df['Mail'].apply(process_text)
df.head()


Unnamed: 0,Label,Mail,Msg
0,1,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,1,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,1,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"


In [194]:
# Hold out a reproducible 20% sample of the rows for evaluation.
test_df = df.sample(frac=0.20, random_state=42)

# Remove the held-out messages from the training frame. Matching is done on the
# message text, so training rows whose text duplicates a held-out message are
# removed as well.
cond = df['Mail'].isin(test_df['Mail'])
df.drop(index=df.loc[cond].index, inplace=True)

In [195]:
# (rows, columns) of the training frame after the held-out messages were dropped.
df.shape

(4281, 3)

In [196]:
# (rows, columns) of the held-out evaluation sample.
test_df.shape

(1114, 3)

In [197]:
# Flatten the per-message token collections in 'Msg' into one token list
# covering the whole training frame.
token_lists = df['Msg'].tolist()
all_words = list(flatten(token_lists))
print("Length all words: {}".format(len(all_words)))

# Vocabulary: every distinct token once. Going through a set means the
# resulting order is arbitrary.
unique_words = list(set(all_words))
print("Length of unique words: {}".format(len(unique_words)))


Length all words: 38260
Length of unique words: 6823


In [198]:
# Partition the training messages by class code: Label 1 is ham, Label 0 is spam.
ham_df = df.loc[df["Label"].eq(1)]
spam_df = df.loc[df["Label"].eq(0)]
ham_df.head()


Unnamed: 0,Label,Mail,Msg
0,1,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
3,1,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,1,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"
6,1,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aid,..."


In [199]:
# Preview the first spam (Label 0) training rows.
spam_df.head()


Unnamed: 0,Label,Mail,Msg
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
5,0,FreeMsg Hey there darling it's been 3 week's n...,"[freemsg, hey, darling, week, word, back, id, ..."
19,0,England v Macedonia - dont miss the goals/team...,"[england, v, macedonia, dont, miss, goalsteam,..."
34,0,Thanks for your subscription to Ringtone UK yo...,"[thanks, subscription, ringtone, uk, mobile, c..."
42,0,07732584351 - Rodger Burns - MSG = We tried to...,"[rodger, burn, msg, tried, call, reply, sm, fr..."


In [200]:
# Flat token lists per class: "positive" tokens come from ham messages,
# "negative" tokens from spam messages.
positive_words = list(flatten(ham_df['Msg'].tolist()))
negative_words = list(flatten(spam_df['Msg'].tolist()))

print('Pos: {}, Neg: {}'.format(len(positive_words), len(negative_words)))

Pos: 30007, Neg: 8253


In [201]:
# Per-word occurrence counts in each class.
# Each token list is tallied once with Counter; calling list.count() for every
# vocabulary word would rescan both full token lists per word.
pos_counts = Counter(positive_words)
neg_counts = Counter(negative_words)
# Counter returns 0 for a word that never occurs in a class, so every
# vocabulary word gets both a 'pos' and a 'neg' entry.
word_map = {
    word: {'pos': pos_counts[word], 'neg': neg_counts[word]}
    for word in unique_words
}

In [202]:
# One [word, pos, neg] record per vocabulary entry, in word_map's iteration order.
data = [[word, counts['pos'], counts['neg']] for word, counts in word_map.items()]
prob_df = pd.DataFrame(data=data, columns=['word', 'pos', 'neg'])

In [203]:
def laplacian_smoothing(word_freq,total_sum_occurences,unique_words):
    """Return the add-one (Laplace) smoothed relative frequency of a word.

    Computes (word_freq + 1) / (total_sum_occurences + unique_words).

    Parameters
    ----------
    word_freq : number
        Occurrence count of the word.
    total_sum_occurences : number
        Total count added to the denominator.
    unique_words : number
        Vocabulary size added to the denominator (a count, not a collection).

    Returns
    -------
    The smoothed ratio; strictly positive for non-negative counts.
    """
    numerator = word_freq + 1
    denominator = total_sum_occurences + unique_words
    return numerator / denominator


In [204]:
# Preview the per-word class counts.
prob_df.head()

Unnamed: 0,word,pos,neg
0,galcan,1,0
1,city,1,2
2,geeee,5,0
3,meok,1,0
4,sullivan,0,1


In [205]:
# Apply Laplacian (add-one) smoothing to avoid zero-probability cases for words
# that never occur in one of the classes.
# laplacian_smoothing is plain arithmetic, so it is applied to each whole count
# column at once rather than row by row through DataFrame.apply(axis=1).
prob_df['laplacian_pos'] = laplacian_smoothing(prob_df['pos'], len(positive_words), len(unique_words))
prob_df['laplacian_neg'] = laplacian_smoothing(prob_df['neg'], len(negative_words), len(unique_words))
prob_df

Unnamed: 0,word,pos,neg,laplacian_pos,laplacian_neg
0,galcan,1,0,0.000054,0.000066
1,city,1,2,0.000054,0.000199
2,geeee,5,0,0.000163,0.000066
3,meok,1,0,0.000054,0.000066
4,sullivan,0,1,0.000027,0.000133
...,...,...,...,...,...
6818,alaikkumpride,1,0,0.000054,0.000066
6819,gbpweek,0,2,0.000027,0.000199
6820,allo,1,0,0.000054,0.000066
6821,rebel,1,0,0.000054,0.000066


In [206]:
#log likelihood
def log_likelihood(a,b):
    """Return the natural logarithm of the ratio a / b.

    The result is positive when a > b, zero when they are equal, and negative
    when a < b (for positive inputs).
    """
    ratio = a / b
    return np.log(ratio)


In [207]:
# data_try=[['i',3,3],['am',3,3],['happy',2,1],['because',1,0],['learning',1,1],['nlp',1,1],['sad',1,2],['not',1,2]]
# data_dum=pd.DataFrame(data=data_try,columns=['word','pos','neg'])
# data_dum
# data_dum['laplacian_pos'] = data_dum.apply(lambda x: laplacian_smoothing(x['pos'], data_dum['pos'].sum(),8), axis=1)
# data_dum['laplacian_neg'] = data_dum.apply(lambda x: laplacian_smoothing(x['neg'], data_dum['neg'].sum(),8), axis=1)
# data_dum['log']= data_dum.apply(lambda x: log_likelihood(x['laplacian_pos'],x['laplacian_neg']),axis=1)
# data_dum

In [208]:
# Per-word log ratio of the smoothed ham ("pos") to spam ("neg") probabilities.
# log_likelihood is elementwise (np.log of a ratio), so it is applied to the
# two columns directly instead of row by row through DataFrame.apply(axis=1).
prob_df['log'] = log_likelihood(prob_df['laplacian_pos'], prob_df['laplacian_neg'])
prob_df

Unnamed: 0,word,pos,neg,laplacian_pos,laplacian_neg,log
0,galcan,1,0,0.000054,0.000066,-0.200061
1,city,1,2,0.000054,0.000199,-1.298674
2,geeee,5,0,0.000163,0.000066,0.898551
3,meok,1,0,0.000054,0.000066,-0.200061
4,sullivan,0,1,0.000027,0.000133,-1.586356
...,...,...,...,...,...,...
6818,alaikkumpride,1,0,0.000054,0.000066,-0.200061
6819,gbpweek,0,2,0.000027,0.000199,-1.991821
6820,allo,1,0,0.000054,0.000066,-0.200061
6821,rebel,1,0,0.000054,0.000066,-0.200061


In [209]:
# Log prior: log of (number of ham training messages / number of spam training messages).
ham_to_spam_ratio = len(ham_df) / len(spam_df)
log_prior = np.log(ham_to_spam_ratio)
log_prior


1.8856179187991933

In [210]:
def predict_spam_ham(words, df, log_prior):
    """Classify a tokenized message from per-word log ratios.

    Parameters
    ----------
    words : iterable of str
        Tokens of the message. Tokens missing from ``df`` contribute nothing.
    df : pandas.DataFrame
        Table with a 'word' column and a 'log' column holding each word's
        log ratio.
    log_prior : float
        Value added to the summed word log ratios.

    Returns
    -------
    int
        1 if the summed log ratios plus ``log_prior`` are strictly positive,
        otherwise 0. In this notebook 1 is the ham code and 0 the spam code.
    """
    # Build the word -> log-ratio lookup once per call. This replaces a boolean
    # scan of the whole frame for every token, and avoids float() on a
    # one-element Series, which newer pandas versions deprecate.
    log_by_word = dict(zip(df['word'], df['log']))
    log_sum = 0.0
    for word in words:
        log_sum += float(log_by_word.get(word, 0.0))
    score = log_sum + log_prior
    return 1 if score > 0 else 0


In [211]:
# text=input('Enter Message: ')
# text = process_text(text)
# score = predict_spam_ham(text,prob_df,log_prior)
# label = "Ham" if score > 0 else "Spam"
# print("The given Message is {} \nscore: {}".format(label.upper(),score))



In [212]:
# Testing
# Classify every held-out message using the word table and log prior built
# from the training messages, then compare against the true labels.
test_df['Predict'] = test_df['Msg'].apply(
    lambda tokens: predict_spam_ham(tokens, prob_df, log_prior))
values = test_df['Predict'] == test_df['Label']

# Share of held-out messages whose predicted code equals the true code.
print('Accuracy: {:.2f}%'.format(values.sum() / len(values) * 100))

Accuracy: 97.49%
