In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer , PorterStemmer
from wordcloud import WordCloud

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, precision_recall_curve, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zedin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("data/dataset.csv", names = ["spam", "text"])
df.head()

FileNotFoundError: [Errno 2] File data/dataset.csv does not exist: 'data/dataset.csv'

In [None]:
df.info()

In [None]:
# Report the overall shape first, then the row and column counts separately.
no_of_rows, no_of_columns = df.shape

print("Dimension of the data: ", df.shape)

print("\nNo. of Rows: %d" % no_of_rows)
print("No. of Columns: %d" % no_of_columns)

In [None]:
df.groupby('spam').count()

In [None]:
# Bar chart of how many messages fall into each class.
label_counts = df.spam.value_counts()
plt.figure(figsize = (12,6))
# seaborn >= 0.12 only accepts x / y as keyword arguments; the positional form raises TypeError.
sns.barplot(x = label_counts.index, y = label_counts.values, alpha = 0.9)

plt.xticks(rotation = 'vertical')
plt.xlabel('Spam', fontsize =12)
plt.ylabel('Counts', fontsize = 12)
plt.show()

In [None]:
# Character length of every message, then summary statistics per class.
df['length'] = df['text'].map(len)

df.groupby('spam')['length'].describe()

In [None]:
emails_subset = df[df.length < 1800]
emails_subset.hist(column='length', by='spam', bins=50);

In [None]:
# Encode the labels as integers (ham -> 0, spam -> 1).
# Assign the result back instead of calling replace(inplace=True) on `df.spam`:
# the chained in-place form does not update `df` under pandas copy-on-write.
df['spam'] = df['spam'].replace({'ham': 0, 'spam': 1})

In [None]:
df

In [None]:
# Shuffle the rows. The later train/test split is purely positional, so fix the
# seed to get the same split (and the same scores) on every Restart & Run All.
df = df.sample(frac=1, random_state=42)

In [None]:
%%time
##Lemmatization


lemmatizer = WordNetLemmatizer()

df['text_lemmatized'] = df['text'].map(lambda text: ' '.join(lemmatizer.lemmatize(w) for w in nltk.word_tokenize(text.lower())))

In [None]:
%%time
##Stemming
stemmer = PorterStemmer()
df['text_steam'] = df['text'].map(lambda text: ' '.join(stemmer.stem(w) for w in nltk.word_tokenize(text.lower())))

In [None]:
# Word cloud of the lemmatized spam messages.
# Join with a space: with '' the last word of one message is glued to the first
# word of the next one and shows up as a bogus token.
spam_words = ' '.join(df.loc[df['spam'] == 1, 'text_lemmatized'])
spam_wordclod = WordCloud(width = 512,height = 512).generate(spam_words)
plt.figure(figsize = (10, 8), facecolor = 'k')
plt.imshow(spam_wordclod)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()


In [None]:
# Word cloud of the lemmatized ham (spam == 0) messages. The variable names are
# reused from the spam cell; here they hold the ham text.
# Join with a space: with '' the last word of one message is glued to the first
# word of the next one and shows up as a bogus token.
spam_words = ' '.join(df.loc[df['spam'] == 0, 'text_lemmatized'])
spam_wordclod = WordCloud(width = 512,height = 512).generate(spam_words)
plt.figure(figsize = (10, 8), facecolor = 'k')
plt.imshow(spam_wordclod)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()

In [None]:
# Word cloud of the stemmed spam messages.
# Join with a space: with '' the last word of one message is glued to the first
# word of the next one and shows up as a bogus token.
spam_words = ' '.join(df.loc[df['spam'] == 1, 'text_steam'])
spam_wordclod = WordCloud(width = 512,height = 512).generate(spam_words)
plt.figure(figsize = (10, 8), facecolor = 'k')
plt.imshow(spam_wordclod)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()

In [None]:
# Word cloud of the stemmed ham (spam == 0) messages. The variable names are
# reused from the spam cell; here they hold the ham text.
# Join with a space: with '' the last word of one message is glued to the first
# word of the next one and shows up as a bogus token.
spam_words = ' '.join(df.loc[df['spam'] == 0, 'text_steam'])
spam_wordclod = WordCloud(width = 512,height = 512).generate(spam_words)
plt.figure(figsize = (10, 8), facecolor = 'k')
plt.imshow(spam_wordclod)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()

In [None]:
X = df["text_lemmatized"]
y = df['spam']

In [None]:
count_vect_multinom = CountVectorizer(lowercase=True, stop_words='english',binary = False)
count_vect_multivar = CountVectorizer(lowercase=True, stop_words='english',binary = True)

In [None]:
X_counts_nom = count_vect_multinom.fit_transform(X)
X_counts_var = count_vect_multivar.fit_transform(X)

In [None]:
# Dense copies for the hand-written models, which index rows with boolean masks.
# `.toarray()` already returns a fresh ndarray, so wrapping it in np.array()
# only duplicated the (large) dense matrix a second time.
y = np.array(y)
X_nom = X_counts_nom.toarray()
X_var = X_counts_var.toarray()

In [None]:

class Multivariate_NB:
    """Bernoulli (multivariate) naive Bayes for binary labels 0 / 1.

    Every feature is treated as present (non-zero) or absent (zero).

    Parameters
    ----------
    alpha : float, default 1.0
        Additive (Laplace) smoothing applied to the class priors and to the
        per-class feature probabilities. Should be > 0 to avoid log(0).

    Attributes set by ``fit``
    -------------------------
    pie_0, pie_1 : smoothed prior probability of class 0 / class 1.
    theta_jc_0, theta_jc_1 : 1-D arrays, smoothed probability that each
        feature is present in a document of class 0 / class 1.
    """

    def __init__(self, alpha=1.0):
        # NOTE: this used to be spelled `__init_`, so it was never called and
        # `Multivariate_NB(alpha=...)` raised TypeError.
        self.alpha = alpha
        self.pie_1 = None
        self.pie_0 = None
        self.theta_jc_1 = None
        self.theta_jc_0 = None

    def fit(self, X, Y):
        """Estimate priors and feature probabilities.

        X is a 2-D array-like (documents x features); Y is a 1-D array-like of
        0 / 1 labels. Returns ``self``.
        """
        present = np.asarray(X) != 0          # binarise, consistent with prediction
        Y = np.asarray(Y)
        alpha = self.alpha

        is_spam = Y == 1
        is_ham = Y == 0
        n_spam = is_spam.sum()
        n_ham = is_ham.sum()

        # Smoothed priors over the two classes. The original stored the spam
        # count under pie_0, i.e. the two priors were swapped.
        self.pie_1 = (n_spam + alpha) / (Y.shape[0] + 2 * alpha)
        self.pie_0 = 1 - self.pie_1

        # Each feature has two outcomes (present / absent), hence 2 * alpha.
        self.theta_jc_0 = (present[is_ham].sum(axis=0) + alpha) / (n_ham + 2 * alpha)
        self.theta_jc_1 = (present[is_spam].sum(axis=0) + alpha) / (n_spam + 2 * alpha)
        return self

    def _joint_log_likelihood(self, X):
        """Return an (n_samples, 2) array of log P(class) + log P(x | class)."""
        present = (np.asarray(X) != 0).astype(float)
        columns = []
        for prior, theta in ((self.pie_0, self.theta_jc_0),
                             (self.pie_1, self.theta_jc_1)):
            log_on = np.log(theta)
            log_off = np.log1p(-theta)
            # sum_j [x_j*log(t_j) + (1-x_j)*log(1-t_j)] written as a single
            # matrix-vector product; avoids a features x samples temporary.
            columns.append(np.log(prior) + present @ (log_on - log_off) + log_off.sum())
        return np.column_stack(columns)

    def predict(self, X):
        """Return the most probable class (0 or 1) for each row of X."""
        return np.argmax(self._joint_log_likelihood(X), axis=1)

    def predict_log_proba(self, X):
        """Return normalised log posteriors, shape (n_samples, 2), columns = class 0, 1."""
        jll = self._joint_log_likelihood(X)
        # log-sum-exp over the two classes, computed stably.
        log_norm = np.logaddexp(jll[:, 0], jll[:, 1])
        return jll - log_norm[:, None]

    def predict_proba(self, X):
        """Return posterior probabilities; each row sums to 1."""
        return np.exp(self.predict_log_proba(X))

In [None]:

class Multinomial_NB:
    """Multinomial naive Bayes on term counts for binary labels 0 / 1.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive (Laplace / Lidstone) smoothing. Should be > 0 to avoid log(0).

    Attributes set by ``fit``
    -------------------------
    pie_1 : smoothed prior probability of class 1.
    theta_jc : array of shape (2, n_features); row c holds the smoothed
        probability of each term given class c.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, Y, alpha=None):
        """Estimate the class prior and per-class term probabilities.

        X is a 2-D array-like of counts (documents x features); Y is a 1-D
        array-like of 0 / 1 labels. ``alpha`` optionally overrides the value
        given to the constructor; when omitted the constructor value is kept
        (previously the default of 1.0 silently overwrote it). Returns ``self``.
        """
        if alpha is not None:
            self.alpha = alpha
        smoothing = self.alpha

        X = np.asarray(X)
        Y = np.asarray(Y)
        n_features = X.shape[1]

        # Smoothed prior over the two classes.
        self.pie_1 = ((Y == 1).sum() + smoothing) / (Y.shape[0] + 2 * smoothing)

        self.theta_jc = np.zeros((2, n_features))
        for cls in (0, 1):
            term_counts = X[Y == cls].sum(axis=0)
            self.theta_jc[cls] = (term_counts + smoothing) / (term_counts.sum() + n_features * smoothing)
        return self

    def _joint_log_likelihood(self, X):
        """Return an (n_samples, 2) array of log P(class) + log P(x | class)."""
        log_prior = np.log([1 - self.pie_1, self.pie_1])
        # One matrix product instead of an (n_samples, 2, n_features) temporary.
        return np.asarray(X) @ np.log(self.theta_jc).T + log_prior

    def predict(self, X):
        """Return the most probable class (0 or 1) for each row of X."""
        return np.argmax(self._joint_log_likelihood(X), axis=1)

    def predict_log_proba(self, X):
        """Return normalised log posteriors, shape (n_samples, 2), columns = class 0, 1."""
        jll = self._joint_log_likelihood(X)
        # log-sum-exp over the two classes, computed stably.
        log_norm = np.logaddexp(jll[:, 0], jll[:, 1])
        return jll - log_norm[:, None]

    def predict_proba(self, X):
        """Return posterior probabilities; each row sums to 1."""
        return np.exp(self.predict_log_proba(X))

In [None]:

def mse(Y_true, Y_pred):
    """Mean squared error between two equally long sequences of values."""
    truth = np.array(Y_true)
    residual = truth.reshape(-1, 1) - np.array(Y_pred).reshape(-1, 1)
    # residual' * residual is a 1x1 matrix holding the sum of squared errors.
    squared_sum = residual.T.dot(residual)
    scaled = 1 / truth.shape[0] * squared_sum
    return scaled[0, 0]

def accuracy(x,y):
    """Fraction of positions at which the two label sequences agree.

    Raises ValueError when the inputs have different shapes, because numpy
    would otherwise broadcast them and return a meaningless number.
    """
    x, y = np.asarray(x), np.asarray(y)
    if x.shape != y.shape:
        raise ValueError(f"shape mismatch: {x.shape} vs {y.shape}")
    # The mean of a boolean array is the share of True values; the former
    # `.astype(np.int)` fails because `np.int` was removed in NumPy 1.24.
    return (x == y).mean()

In [None]:
def split_trainTest(X,y,t):
    """Positional hold-out split.

    The last fraction ``t`` of the rows becomes the test set; nothing is
    shuffled here. Returns (X_train, X_test, y_train, y_test).
    """
    cut = int((1-t) * X.shape[0])
    X_head, X_tail = X[:cut], X[cut:]
    y_head, y_tail = y[:cut], y[cut:]
    return X_head, X_tail, y_head, y_tail



In [None]:
X_train, X_test, y_train, y_test = split_trainTest(X_var,y,t=0.2)

In [None]:
# Hand-written Bernoulli model on the binary features.
model = Multivariate_NB()
model.fit(X_train,y_train)
y_pre = model.predict(X_test)
# Print both metrics: a bare expression in the middle of a cell is evaluated
# but never displayed, so the MSE used to be silently discarded.
print('MSE :', mse(y_test, y_pre))
print('Accuracy :', accuracy(y_test, y_pre))

In [None]:

# Reference run with scikit-learn's BernoulliNB (imported in the first cell).
clf = BernoulliNB()
clf.fit(X_train,y_train)
y_pre_sk = clf.predict(X_test)
print(y_pre_sk)
# Print both metrics: a bare expression in the middle of a cell is evaluated
# but never displayed, so the MSE used to be silently discarded.
print('MSE :', mse(y_test, y_pre_sk))
print('Accuracy :', accuracy(y_test, y_pre_sk))

In [None]:
X_train, X_test, y_train, y_test = split_trainTest(X_nom,y,t=0.2)

In [None]:
# Hand-written multinomial model on the count features.
model = Multinomial_NB()
model.fit(X_train,y_train)
y_pre = model.predict(X_test)
# The stray `y_pre` expression was dropped: mid-cell expressions are not displayed.
print('MSE :', mse(y_test, y_pre))


In [None]:
# Reference run with scikit-learn's MultinomialNB (imported in the first cell).
clf = MultinomialNB()
clf.fit(X_train,y_train)
y_pre_sk = clf.predict(X_test)
# The stray `y_pre_sk` expression was dropped: mid-cell expressions are not displayed.
print('MSE :', mse(y_test, y_pre_sk))

In [None]:
def sFold(folds,data,labels,model,error_fuction,**model_args):
    """Run s-fold cross-validation and score the pooled out-of-fold predictions.

    Parameters
    ----------
    folds : int
        Number of folds; must satisfy 2 <= folds <= number of rows.
    data : 2-D array-like, rows are samples.
    labels : array-like of shape (n,) or (n, 1).
    model : object exposing ``fit(X, y, **model_args)`` and ``predict(X)``.
    error_fuction : callable ``f(actual, predicted)``, the same argument order
        as the metric helpers in this notebook. Its result is averaged.
    **model_args : forwarded to ``model.fit`` on every fold.

    Returns
    -------
    dict with keys 'Expected labels', 'Predicted labels', 'Average error'.

    Raises
    ------
    ValueError if ``folds`` is out of range.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    if labels.ndim == 1:
        labels = labels[:, None]
    if not 2 <= folds <= data.shape[0]:
        raise ValueError(f"folds must be between 2 and {data.shape[0]}, got {folds}")

    # Keep the labels as the last column so rows and labels are split together.
    dataset = np.concatenate([data, labels], axis=1)
    s_part = s_partition(dataset, folds)

    pred_y = []
    true_y = []
    for idx, val in enumerate(s_part):
        # Train on every fold except the held-out one. Building the list
        # explicitly also works when the folds have unequal lengths, which
        # np.delete on the list of folds does not.
        train = np.concatenate([fold for j, fold in enumerate(s_part) if j != idx])
        model.fit(train[:, :-1], train[:, -1], **model_args)
        pred_y.append(model.predict(val[:, :-1]))
        true_y.append(val[:, -1])
    pred_y = np.concatenate(pred_y)
    true_y = np.concatenate(true_y)

    # Metrics in this notebook take (actual, predicted); the arguments used to
    # be passed the other way round.
    avg_error = np.mean(error_fuction(true_y, pred_y))
    result = {'Expected labels':true_y, 'Predicted labels': pred_y,'Average error':avg_error }
    return result


#helper
def s_partition(x,s):
    """Split ``x`` along its first axis into ``s`` nearly equal consecutive parts."""
    # The fold count used to be hard-coded to 3 regardless of ``s``.
    return np.array_split(x, s)

In [None]:
validation_accuracy = np.empty((4,2,2))
validation_accuracy

In [None]:
def compute_recall(actual, predicted):
    """Recall of class 1: TP / (TP + FN).

    Returns 0.0 when there are no actual positives, instead of NaN from 0/0.
    """
    CM = compute_confusion_matrix(actual, predicted).to_numpy()  # 2 x 2 array

    FN = CM[1,0]; TP = CM[1,1]

    if TP + FN == 0:
        return np.float64(0.0)
    return TP / (TP + FN)

def compute_confusion_matrix(actual, predicted):
    """Return the 2 x 2 confusion matrix for labels 0 / 1 as a DataFrame.

    Rows are the actual class, columns the predicted class, so the layout is
    [[TN, FP], [FN, TP]]. Both categories are always present, even when one
    of them never occurs in the data.
    """
    # Fixing the categories (together with dropna=False) keeps the table 2 x 2.
    pd_actual = pd.Categorical(np.asarray(actual).ravel(), categories=[0, 1])
    pd_predicted = pd.Categorical(np.asarray(predicted).ravel(), categories=[0, 1])

    # The axis names are passed explicitly; names set on intermediate Series
    # were lost when the data was converted to Categorical.
    CM = pd.crosstab(pd_actual, pd_predicted,
                     rownames=['Actual'], colnames=['Predicted'], dropna=False)

    return CM

def compute_precision(actual, predicted):
    """Precision of class 1: TP / (TP + FP).

    Returns 0.0 when nothing was predicted positive, instead of NaN from 0/0.
    """
    CM = compute_confusion_matrix(actual, predicted).to_numpy()  # 2 x 2 array

    FP = CM[0,1]; TP = CM[1,1]

    if TP + FP == 0:
        return np.float64(0.0)
    return TP / (TP + FP)

In [None]:
def compute_F1_score(actual, predicted):
    """F1 score of class 1: harmonic mean of precision and recall.

    Returns 0.0 when precision + recall is zero or undefined (NaN), so that
    callers comparing scores never see NaN.
    """
    precision = compute_precision(actual, predicted)
    recall = compute_recall(actual, predicted)

    denominator = precision + recall
    # `not (d > 0)` is also True for NaN, which a plain `d == 0` would miss.
    if not denominator > 0:
        return np.float64(0.0)

    return 2 * precision * recall / denominator

In [None]:
# Choose the smoothing strength for the multinomial model by cross-validated
# F1 on the training split. Note: `validation_accuracy` holds F1 scores.
alpha = [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 1.5, 2.0]
validation_accuracy = np.empty((len(alpha), 1))
for i, k in enumerate(alpha):
    multi_nom = Multinomial_NB()
    model_args = {'alpha' : k}
    result = sFold(5,X_train,y_train,multi_nom,compute_F1_score,**model_args)
    validation_accuracy[i] = result['Average error']

# First best score wins, as before. NaN scores are ranked last, and `index` is
# always defined (it used to stay unbound when no score exceeded 0).
index = int(np.argmax(np.nan_to_num(validation_accuracy[:, 0], nan=-np.inf)))
maxScore = validation_accuracy[index, 0]

In [None]:
print('optimal alpha: ',alpha[index])

In [None]:
# Refit on the whole training split with the alpha chosen by cross-validation
# (it used to be hard-coded to 2, regardless of the search result).
model = Multinomial_NB()
model.fit(X_train,y_train,alpha = alpha[index])
y_pre = model.predict(X_test)
compute_F1_score(y_test, y_pre)

In [None]:
print('Testing on Test DATA')
print('Precision :',compute_precision(y_test, y_pre))
print('Recall :',compute_recall(y_test, y_pre))
print('F1 Score :',compute_F1_score(y_test, y_pre))
print('\n\nConfusion Matrix :\n')
print(compute_confusion_matrix(y_test, y_pre))
print('\n\nAccuracy :',accuracy(y_test, y_pre))

In [None]:
def generate_ROC_elements(y_label, y_prob, target_label = 1):
    """Compute the points of a ROC curve.

    Every score in ``y_prob`` is used once as a threshold; a sample is
    predicted positive when its score is >= the threshold.

    Parameters
    ----------
    y_label : array-like of 0 / 1 labels.
    y_prob : array-like of scores, higher = more likely ``target_label``.
    target_label : 0 or 1, the class treated as positive.

    Returns
    -------
    (fpr, tpr) : two arrays of length n + 2, ordered from the highest
        threshold to the lowest, starting at (0, 0) and ending at (1, 1).
        A rate whose denominator is zero (a class is absent) is reported
        as 0 instead of NaN.

    Raises
    ------
    ValueError if ``target_label`` is not 0 or 1, or the inputs differ in length.
    """
    if target_label not in (0, 1):
        raise ValueError("target_label must be 0 or 1")

    labels = np.asarray(y_label).ravel()
    scores = np.asarray(y_prob, dtype=float).ravel()
    if labels.shape != scores.shape:
        raise ValueError("y_label and y_prob must have the same length")

    is_target = labels == target_label
    target_scores = np.sort(scores[is_target])
    other_scores = np.sort(scores[~is_target])

    # Thresholds from high to low, one per sample.
    thresholds = np.sort(scores)[::-1]

    # Number of scores >= t in a sorted array = size - (index of first score >= t).
    # This replaces one confusion matrix per sample (O(n^2)) with two binary searches.
    tp = target_scores.size - np.searchsorted(target_scores, thresholds, side="left")
    fp = other_scores.size - np.searchsorted(other_scores, thresholds, side="left")

    tpr = tp / target_scores.size if target_scores.size else np.zeros(thresholds.size)
    fpr = fp / other_scores.size if other_scores.size else np.zeros(thresholds.size)

    # Pin the curve's end points explicitly rather than sorting them in via
    # fake thresholds of 1 and 0, which ties with real scores of exactly 1 or 0.
    return np.concatenate(([0.0], fpr, [1.0])), np.concatenate(([0.0], tpr, [1.0]))

def plotting_roc_curve(fpr, tpr, label = None): 
    """Draw a ROC curve together with the chance diagonal and show the figure.

    ``label`` is used both for the legend entry and inside the title.
    """
    line_width = 2
    text_size = 20

    plt.figure(figsize = (10, 10))

    # The curve itself, then the y = x reference line.
    plt.plot(fpr, tpr, color='darkorange', lw = line_width, label = label)
    plt.plot([0, 1], [0, 1], color='navy', lw = line_width, linestyle = '--')

    # Axis limits: x in [0, 1], y slightly above 1 so the top of the curve is visible.
    plt.axis([0, 1, 0, 1.05])

    plt.title(f'Receiver operating characteristic Curve ({label})', fontsize = text_size)
    for set_axis_label, text in ((plt.xlabel, 'False Positive Rate'),
                                 (plt.ylabel, 'True Positive Rate')):
        set_axis_label(text, fontsize = text_size)
    plt.legend(loc="lower right", fontsize = text_size)

    plt.show()

def generate_precision_recall_curve_elements(y_label, y_prob, target_label = 1):
    """Compute precision and recall at every score used as a threshold.

    A sample is predicted positive when its score is >= the threshold.

    Parameters
    ----------
    y_label : array-like of 0 / 1 labels.
    y_prob : array-like of scores, higher = more likely ``target_label``.
    target_label : 0 or 1, the class treated as positive.

    Returns
    -------
    (precision, recall, threshold) : three arrays of length n, ordered by
        ascending threshold. Recall is reported as 0 when the positive class
        is absent, instead of NaN.

    Raises
    ------
    ValueError if ``target_label`` is not 0 or 1, or the inputs differ in length.
    """
    if target_label not in (0, 1):
        raise ValueError("target_label must be 0 or 1")

    labels = np.asarray(y_label).ravel()
    scores = np.asarray(y_prob, dtype=float).ravel()
    if labels.shape != scores.shape:
        raise ValueError("y_label and y_prob must have the same length")

    is_target = labels == target_label
    target_scores = np.sort(scores[is_target])
    other_scores = np.sort(scores[~is_target])

    # Thresholds from low to high, one per sample.
    thresholds = np.sort(scores)

    # Number of scores >= t in a sorted array = size - (index of first score >= t).
    # This replaces one confusion matrix per sample (O(n^2)) with two binary searches.
    tp = target_scores.size - np.searchsorted(target_scores, thresholds, side="left")
    fp = other_scores.size - np.searchsorted(other_scores, thresholds, side="left")

    # tp + fp >= 1 for every threshold: the sample that supplied the threshold
    # is itself predicted positive, so precision never divides by zero.
    precision = tp / (tp + fp)
    recall = tp / target_scores.size if target_scores.size else np.zeros(thresholds.size)

    return precision, recall, thresholds

def calculate_auc(fpr_x_axis, tpr_y_axis):
    """Area under the curve by trapezoidal integration of tpr over fpr."""
    # NumPy 2.0 renamed `trapz` to `trapezoid` and deprecated the old name;
    # use whichever this NumPy version provides.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz

    return integrate(tpr_y_axis, fpr_x_axis)

In [None]:
# A ROC curve needs a continuous score; hard 0/1 predictions collapse it to a
# single operating point. Use the posterior probability of class 1:
# P(1 | x) = 1 / (1 + exp(l0 - l1)), computed stably with logaddexp.
# The difference l0 - l1 is unaffected by any per-row constant, so this holds
# whether or not predict_log_proba is normalised.
log_scores = model.predict_log_proba(X_test)
y_score = np.exp(-np.logaddexp(0.0, log_scores[:, 0] - log_scores[:, 1]))
fpr, tpr = generate_ROC_elements(y_test, y_score)
plotting_roc_curve(fpr, tpr, "test data")
print('\n\nAUC :',calculate_auc(fpr, tpr))