In [20]:
import pandas as pd
import numpy as np
import string
import math
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def classification_reportt(ytest, ypred):
    tp = tn = fp = fn = 0
    for i in range(len(ytest)):
        if ytest[i] == ypred[i]:
            if ytest[i] == 1:
                tp += 1
            else:
                tn += 1
        else:
            if ypred[i] == 1:
                fp += 1
            else:
                fn += 1
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = (tp) / (tp + fp)
    recall = (tp) / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    print("accuracy  : " + str(accuracy))
    print("precision : " + str(precision))
    print("recall    : " + str(recall))
    print("f1_score  : " + str(f1_score))
    
def laplace_smoothing(freq, alpha, class_count, total_count):
    return math.log2((freq + alpha) / (class_count + total_count))    

def calculate_probability(test, freq, class_count, is_spam, ngram):
    prob = 0
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram, ngram))
    y = vectorizer.fit_transform([test])
    test = vectorizer.get_feature_names()
    
    for word in test:
        if word in freq:
            prob += laplace_smoothing(freq[word][is_spam], 1, class_count, len(freq))
            
        else:
            prob += laplace_smoothing(0, 1, class_count, len(freq))
    return prob


def naive_bayes(ham_count, spam_count, freq, xtest, ngram):
    y_pred = []
    for test in xtest:
        p_spam = math.log2(spam_count / (spam_count + ham_count))
        p_ham = math.log2(ham_count / (spam_count + ham_count))

        # probability for spam
        p_spam += calculate_probability(test, freq, spam_count, 1, ngram)

        # probability for spam
        p_ham += calculate_probability(test, freq, ham_count, 0, ngram)

        if p_spam > p_ham:
            y_pred.append(1)
        else:
            y_pred.append(0)

    return y_pred


def create_dictionary(freq, words, row, is_spam):
    i = 0
    for item in row:
        if item != 0:
            if words[i] in freq:
                arr = freq[words[i]]
                arr[is_spam] += 1
                freq[words[i]] = arr

            else:
                arr = [0, 0]
                arr[is_spam] = 1
                freq[words[i]] = arr
        i += 1
    return freq


def get_frequencies(vectorizer, xtrain, ytrain):
    y = vectorizer.fit_transform(xtrain)
    doc_array = y.toarray()

    frequency_matrix = pd.DataFrame(data=doc_array, columns=vectorizer.get_feature_names())

    words = list(frequency_matrix.columns.values)
    freq = {}

    for i, j in frequency_matrix.iterrows():
        create_dictionary(freq, words, j.tolist(), ytrain[i])
    return freq


def get_frequencies_unigram(xtrain, ytrain, min_df, max_df):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), token_pattern=r"(?u)\b\w+\b",
                                 lowercase=True, min_df=min_df, max_df=max_df)
    return get_frequencies(vectorizer, xtrain, ytrain)


def get_frequencies_unigram_without_stopwords(xtrain, ytrain, min_df, max_df):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), token_pattern=r"(?u)\b\w+\b",
                                 lowercase=True, stop_words=ENGLISH_STOP_WORDS, min_df=min_df, max_df=max_df)
    return get_frequencies(vectorizer, xtrain, ytrain)


def get_frequencies_bigram(xtrain, ytrain, min_df, max_df):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2), token_pattern=r"(?u)\b\w+\b",
                                 lowercase=True, min_df=min_df, max_df=max_df)
    return get_frequencies(vectorizer, xtrain, ytrain)


def get_frequencies_bigram_without_stopwords(xtrain, ytrain, min_df, max_df):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2), token_pattern=r"(?u)\b\w+\b",
                                 lowercase=True, stop_words=ENGLISH_STOP_WORDS, min_df=min_df, max_df=max_df)
    return get_frequencies(vectorizer, xtrain, ytrain)


def get_frequencies_unigram_bigram(xtrain, ytrain):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", lowercase=True)
    return get_frequencies(vectorizer, xtrain, ytrain)


def main():
    df = pd.read_csv('emails.csv')

    x = df.text.values
    y = df.spam.values

    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, shuffle=True)

    ham_count = np.count_nonzero(ytrain == 0)
    spam_count = np.count_nonzero(ytrain == 1)
    
    print("--------------------------UNIGRAM DEFAULT PARAM----------------------------")
    freq = get_frequencies_unigram(xtrain, ytrain, 1, 1.0)
    ypred = naive_bayes(ham_count, spam_count, freq, xtest, 1)
    classification_reportt(ytest, ypred)
    
    print("\n-------------------UNIGRAM min_df = 0.01, max_df = 0.8-------------------")
    freq = get_frequencies_unigram(xtrain, ytrain, 0.01, 0.8)
    ypred = naive_bayes(ham_count, spam_count, freq, xtest, 1)
    classification_reportt(ytest, ypred)
    
    print("\n----------UNIGRAM WITHOUT STOPWORDS min_df = 0.01, max_df = 0.8----------")
    freq = get_frequencies_unigram_without_stopwords(xtrain, ytrain, 0.01, 0.8)
    ypred = naive_bayes(ham_count, spam_count, freq, xtest, 1)
    classification_reportt(ytest, ypred)
    
    print("\n--------------------------BIGRAM DEFAULT PARAM---------------------------")
    freq = get_frequencies_bigram(xtrain, ytrain, 1, 1.0)
    ypred = naive_bayes(ham_count, spam_count, freq, xtest, 2)
    classification_reportt(ytest, ypred)
    
    print("\n--------------------BIGRAM min_df = 0.01, max_df = 0.8--------------------")
    freq = get_frequencies_bigram(xtrain, ytrain, 0.01, 0.8)
    ypred = naive_bayes(ham_count, spam_count, freq, xtest, 2)
    classification_reportt(ytest, ypred)
    
    print("\n----------BIGRAM WITHOUT STOPWORDS min_df = 0.01, max_df = 0.8-----------")
    freq = get_frequencies_bigram_without_stopwords(xtrain, ytrain, 0.01, 0.8)
    ypred = naive_bayes(ham_count, spam_count, freq, xtest, 2)
    classification_reportt(ytest, ypred)
    
main()

--------------------------UNIGRAM DEFAULT PARAM----------------------------
accuracy  : 0.8237347294938918
precision : 1.0
recall    : 0.23193916349809887
f1_score  : 0.3765432098765432

-------------------UNIGRAM min_df = 0.01, max_df = 0.8-------------------
accuracy  : 0.8656195462478184
precision : 0.9658119658119658
recall    : 0.4296577946768061
f1_score  : 0.5947368421052632

----------UNIGRAM WITHOUT STOPWORDS min_df = 0.01, max_df = 0.8----------
accuracy  : 0.9685863874345549
precision : 0.8873720136518771
recall    : 0.9885931558935361
f1_score  : 0.9352517985611511

--------------------------BIGRAM DEFAULT PARAM---------------------------
accuracy  : 0.9066317626527051
precision : 0.9936708860759493
recall    : 0.596958174904943
f1_score  : 0.7458432304038006

--------------------BIGRAM min_df = 0.01, max_df = 0.8--------------------
accuracy  : 0.705933682373473
precision : 0.43833333333333335
recall    : 1.0
f1_score  : 0.6095017381228273

----------BIGRAM WITHOUT STOPWOR

TypeError: get_frequencies_bigram_without_stopwords() takes 2 positional arguments but 4 were given