In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import nltk
nltk.download('wordnet')
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer 
from wordcloud import WordCloud

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, precision_recall_curve, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zedin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zedin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Read the corpus into a two-column frame. Because explicit column names are
# passed, pandas treats the first line of the file as data, not as a header.
column_names = ["Spam", "Text"]
df = pd.read_csv("data/dataset.csv", names=column_names)
df.head()

In [None]:
# Print the frame's column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
print("Dimension of the data: ", df.shape)

# DataFrame.shape is a (rows, columns) pair; unpack both values in one step.
no_of_rows, no_of_columns = df.shape

print("\nNo. of Rows: %d" % no_of_rows)
print("No. of Columns: %d" % no_of_columns)

In [None]:
# Numeric target: 'ham' becomes 0 and every other label value becomes 1.
df['Class'] = df['Spam'].ne('ham').astype('int64')
df

In [None]:
# Add a "length" column holding the number of characters of the text in each row.
df['length'] = df['Text'].map(len)

# Summary statistics of that length, computed separately for each label.
df.groupby('Spam')['length'].describe()

In [None]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Lower-case ``text``, tokenize it with NLTK and re-join the lemmatized tokens with single spaces."""
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


df['text_lemmatized'] = df['Text'].map(_lemmatize_text)
df

In [None]:
# For each value of 'Class', count the non-null entries in every other column.
df.groupby('Class').count()

In [None]:
# Bar chart of how many rows carry each label.
label_counts = df.Spam.value_counts()
plt.figure(figsize=(12, 6))
# Pass x and y by keyword: seaborn >= 0.12 no longer accepts them positionally,
# and the keyword form also works on older releases.
sns.barplot(x=label_counts.index, y=label_counts.values, alpha=0.9)

plt.xticks(rotation='vertical')
plt.xlabel('Spam', fontsize=12)
plt.ylabel('Counts', fontsize=12)
plt.show()

In [None]:
# Use a length threshold to visualize the distribution of length per class

# Keep only the rows whose length is below the threshold, then draw one
# 50-bin histogram of 'length' per value of 'Spam'.
length_threshold = 1000
is_short = df['length'] < length_threshold
emails_subset = df.loc[is_short]
emails_subset.hist(column='length', by='Spam', bins=50)

In [None]:
# Features are the lemmatized texts; the target is the 1-D vector of numeric class labels.
X, y = df["text_lemmatized"], df['Class']

In [None]:
# Bag-of-words encoding with lower-casing and English stop-word removal.
# binary=True records only whether a term occurs in a document; switch it to
# False to keep raw term counts instead.
count_vect = CountVectorizer(
    binary=True,
    stop_words='english',
    lowercase=True,
)
X_counts = count_vect.fit_transform(X)

In [None]:
# Shape of the document-term matrix returned by fit_transform: (documents, vocabulary terms).
X_counts.shape

In [None]:
# vocabulary_ maps every term to its column index in the document-term matrix;
# the integers shown are positions, not occurrence counts.
print("\nIts Index rather than count")
count_vect.vocabulary_


In [None]:
# Dense NumPy versions of the target vector and the document-term matrix.
# toarray() already returns a fresh ndarray, so no extra np.array() wrapper is needed.
y = np.array(y)
X = X_counts.toarray()
print(X.shape)
print(y.shape)

In [None]:
def mse(Y_true, Y_pred):
    """Return the mean squared error between two equally sized label collections.

    Both inputs are converted to NumPy arrays and flattened into column
    vectors. The squared differences are summed through an inner product and
    scaled by one over the length of ``Y_true``'s first axis.
    """
    truth = np.array(Y_true)
    residual = truth.reshape(-1, 1) - np.array(Y_pred).reshape(-1, 1)
    # (1 x n) . (n x 1) gives a 1 x 1 matrix holding the sum of squared residuals.
    sum_of_squares = residual.T.dot(residual)
    scaled = 1 / truth.shape[0] * sum_of_squares
    return scaled[(0, 0)]

In [None]:
class Multivariate_NB:
    """Bernoulli (multivariate) Naive Bayes for 0/1 labels and presence/absence features.

    Attributes set by ``fit``:
        pie_1, pie_0: smoothed prior probabilities of class 1 and class 0.
        theta_jc_1, theta_jc_0: per-feature probability that the feature is
            present in a document of class 1 / class 0.
    """

    def __init__(self, alpha=1.0):
        """Store the additive smoothing strength used for the feature probabilities.

        Args:
            alpha: pseudo-count added to every feature count (1.0 = Laplace).
        """
        self.alpha = alpha
        self.pie_1 = None
        self.pie_0 = None
        self.theta_jc_1 = None
        self.theta_jc_0 = None

    def fit(self, X, Y):
        """Estimate class priors and per-feature presence probabilities.

        Args:
            X: 2-D array-like (documents x features); a feature counts as
                present in a document when its value is non-zero downstream,
                and its column sum is used as the presence count here, so X is
                expected to hold 0/1 values.
            Y: 1-D array-like of labels, 0 or 1, one per row of X.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        in_class_1 = Y == 1
        in_class_0 = Y == 0

        # Add-one smoothed prior of class 1. The count of Y == 1 belongs to
        # pie_1 (it was previously assigned to pie_0, swapping the two priors).
        n_classes = np.unique(Y).size
        self.pie_1 = (in_class_1.sum() + 1) / (n_classes + len(Y))
        self.pie_0 = 1 - self.pie_1

        # theta = (documents of the class containing the feature + alpha)
        #         / (documents of the class + 2 * alpha)
        # The factor 2 is the number of outcomes of a feature: present or absent.
        docs_0 = X[in_class_0]
        self.theta_jc_0 = (docs_0.sum(axis=0) + self.alpha) / (docs_0.shape[0] + 2 * self.alpha)

        docs_1 = X[in_class_1]
        self.theta_jc_1 = (docs_1.sum(axis=0) + self.alpha) / (docs_1.shape[0] + 2 * self.alpha)

    def predict(self, X):
        """Return the predicted label (0 or 1) for every row of X; ties go to class 1."""
        log_prob = self.predict_log_proba(X)
        return (log_prob[:, 0] <= log_prob[:, 1]).astype(int)

    def predict_log_proba(self, X):
        """Return an (n_documents, 2) array of joint log scores.

        Column 0 holds ``log P(class 0) + log P(x | class 0)`` and column 1 the
        same for class 1. The scores are NOT normalised over the classes, so
        they are suitable for comparison/argmax but are not log posteriors.
        With ``alpha = 0`` a probability of exactly 0 or 1 yields ``-inf``.
        """
        present = np.asarray(X) != 0

        def joint_log_score(prior, theta):
            # Take log(theta_j) where feature j is present and log(1 - theta_j)
            # where it is absent, then add everything up per document.
            per_feature = np.where(present, np.log(theta), np.log(1 - theta))
            return np.log(prior) + per_feature.sum(axis=1)

        score_0 = joint_log_score(self.pie_0, self.theta_jc_0)
        score_1 = joint_log_score(self.pie_1, self.theta_jc_1)
        return np.column_stack((score_0, score_1))
        

In [None]:
class Multinomial_NB:
    """Multinomial Naive Bayes for 0/1 labels and non-negative term-count features.

    Attributes set by ``fit``:
        pie_1: add-one smoothed prior probability of class 1.
        theta_jc: array of shape (2, n_features); row c holds the smoothed
            probability of each term within class c.
    """

    def __init__(self, alpha=1.0):
        """Store the additive smoothing strength used for the term probabilities.

        Args:
            alpha: pseudo-count added to every term count (1.0 = Laplace).
        """
        self.alpha = alpha
        self.pie_1 = None
        self.theta_jc = None

    def fit(self, X, Y):
        """Estimate the class-1 prior and the per-class term probabilities.

        Args:
            X: 2-D array-like (documents x terms) of counts.
            Y: 1-D array-like of labels, 0 or 1, one per row of X.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        n_features = X.shape[1]

        self.pie_1 = ((Y == 1).sum() + 1) / (np.unique(Y).size + len(Y))

        # theta[c, j] = (count of term j in class c + alpha)
        #               / (total term count in class c + alpha * n_features)
        self.theta_jc = np.zeros((2, n_features))
        for label in (0, 1):
            class_docs = X[Y == label]
            self.theta_jc[label] = (class_docs.sum(axis=0) + self.alpha) / (
                class_docs.sum() + self.alpha * n_features
            )

    def predict(self, X):
        """Return the label (0 or 1) with the highest joint log score for every row of X."""
        return np.argmax(self.predict_log_proba(X), axis=1)

    def predict_log_proba(self, X):
        """Return an (n_documents, 2) array of joint log scores.

        Entry [i, c] is ``log P(class c) + sum_j X[i, j] * log theta[c, j]``.
        The scores are not normalised over the classes.
        """
        log_prior = np.log([1 - self.pie_1, self.pie_1])
        # A matrix product gives the same sums as broadcasting to an
        # (n_documents, 2, n_features) array, without allocating that temporary.
        return np.asarray(X) @ np.log(self.theta_jc).T + log_prior

In [None]:
def accuracy(x, y):
    """Return the fraction of positions at which ``x`` and ``y`` hold equal values.

    Both inputs are flattened first, so an (n, 1) column and an (n,) vector
    are compared element by element instead of being broadcast against each
    other.

    Raises:
        ValueError: if the two inputs do not contain the same number of elements.
    """
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    if x.size != y.size:
        raise ValueError(
            "accuracy() needs inputs of equal size, got %d and %d" % (x.size, y.size)
        )
    # The mean of a boolean array is the share of True entries. The former
    # cast through np.int fails on NumPy >= 1.24, where that alias was removed.
    return (x == y).mean()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

Multivariate_NB

In [None]:
# Fit the hand-written Bernoulli model on the training split and score it on the test split.
model = Multivariate_NB()
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
print(y_pre)
# Only the last bare expression of a cell is echoed, so the MSE used to be
# computed and then dropped. Print both metrics explicitly instead.
print("MSE:", mse(y_test, y_pre))
print("Accuracy:", accuracy(y_test, y_pre))

In [None]:
# Reference run: scikit-learn's BernoulliNB on the same split.
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)
y_pre_sk = clf.predict(X_test)
print(y_pre_sk)
# Only the last bare expression of a cell is echoed, so the MSE used to be
# computed and then dropped. Print both metrics explicitly instead.
print("MSE:", mse(y_test, y_pre_sk))
print("Accuracy:", accuracy(y_test, y_pre_sk))

Multinomial_NB

In [None]:
# Fit the hand-written multinomial model on the training split and score it on the test split.
model = Multinomial_NB()
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
# A bare `y_pre` in the middle of a cell is never displayed; print it so the
# predictions are visible alongside the error.
print(y_pre)
print("MSE:", mse(y_test, y_pre))

In [None]:
# Reference run: scikit-learn's MultinomialNB on the same split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pre_sk = clf.predict(X_test)
# A bare `y_pre_sk` in the middle of a cell is never displayed; print it so
# the predictions are visible alongside the error.
print(y_pre_sk)
print("MSE:", mse(y_test, y_pre_sk))