In [1]:
import collections
from collections import Counter
import numpy
import math
import csv
import operator
from random import seed
from random import randrange
import multiprocessing
from random import randrange
import statistics 

In [2]:
redundant_words = ["the", "of", "and", "a", "in", "to", "that", "is", "with", "for", "from", "are", "by", " ", "was", "we", "this", "were", "as", "an", "have" ,"which", "has", "these", "at", "be"]
with open('trg.csv', newline='') as csvfile:
    data = list(csv.reader(csvfile))
    for i in data:
        a_str = ""
        for j in i[2].split():
            if j not in redundant_words:
                a_str += j + " "
        i[2] = a_str

['1', 'B', '4 202 353 bp genome alkaliphilic bacterium bacillus halodurans c-125 contains 4066 predicted protein coding sequences cdss 2141 527 functional assignments 1182 29 conserved cdss unknown function 743 18 3 no match any protein database among total cdss 88 match sequences proteins found only bacillus subtilis 667 widely conserved comparison proteins various organisms including bsubtilis b halodurans genome contains 112 transposase genes indicating transposases played important evolutionary role horizontal gene transfer also internal genetic rearrangement genome strain c-125 lacks some necessary genes competence such coms srfa rapc supporting fact competence not been demonstrated experimentally c-125 there no paralog tupa encoding teichuronopeptide contributes alkaliphily c-125 genome ortholog tupa cannot found bsubtilis genome out 11 sigma factors belong extracytoplasmic function family 10 unique b halodurans suggesting they may role special mechanism adaptation alkaline envir

In [3]:
with open('tst.csv', newline='') as csvfile:
    test_data = list(csv.reader(csvfile))
    for i in test_data:
        a_str = ""
        for j in i[1].split():
            if j not in redundant_words:
                a_str += j + " "
        i[1] = a_str
test_data.pop(0)

['1', 'previous work all three components comamonas testosteroni b-356 biphenyl bphchlorobiphenyls pcbs dioxygenase dox been purified characterized they include iron-sulphur protein ispbph terminal oxygenase composed two subunits encoded bpha bphe ferredoxin ferbph encoded bphf reductase redbph encoded bphg bphg not located neighbourhood bphaef b-356 reporting cloning b-356-bphg sequencing b-356-bph dox genes comparative analysis genes provided genetic evidence showing two bph dox lineages emerged gram-negative bacteria main features lineage includes b-356 location bphg outside bph gene cluster structure redbph very distinct all other aryl dioxygenase-reductases ']


### Reading in and cleaning the data
Reading in the data for both the test set and the training set I've decided to remove stopwords before we get any counts for any words at all. Removing these words will help our predicting performance as these stop words aren't actually really related to the class, for example the which is the most common word in the english language will not have any correlation to what the actual text is about. Hence removing these words will improve our model performance and not try to make predictions on attributes which are uncorrelated/unimportant.

In [4]:
# Calculate accuracy percentage between two lists
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i][1] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

### Accuracy
Here we are simply creating a accuracy calculating function for our cross validation

In [5]:
seed(1)
def cross_validation_split(dataset, folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds)
    for i in range(folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

#### Cross validation
To create a cross validation split we need to first split the dataset into k equal parts, which we do above and then use 1 of these as a test set and the rest as the training set while changing the test set to another after we've iterated after each test set so we can get an accurate representation of our model performance.

In [6]:
text_in_class = {'E':{}, 'B':{}, 'A': {}, 'V':{}}
a_counter = Counter()
e_counter = Counter()
b_counter = Counter()
v_counter = Counter()

### Training the model and getting prior probabilities
This is a function where we get a dataset and then calculate each word count for each class and how many times a class will appear overall in our training dataset. Through these word counts we can start calculating probabilites with Naive Bayes.

In [53]:
def get_counts(data):
    text_in_class = {'E':{}, 'B':{}, 'A': {}, 'V':{}}
    a_counter = Counter()
    e_counter = Counter()
    b_counter = Counter()
    v_counter = Counter()
    word_freq = Counter()
    x = []
    y = []
    count = 0
    for i in data:
        if count != 0:
            x.append(i[2])
            y.append(i[1])
            if (i[1] == "A"):
                a = Counter(i[2].split())
                a_counter += a
                word_freq += Counter(set(a))
            if (i[1] == "B"):
                b = Counter(i[2].split())
                b_counter += b
                word_freq += Counter(set(b))
            if (i[1] == "E"):
                e = Counter(i[2].split())
                e_counter += e
                word_freq += Counter(set(e))
            if (i[1] == "V"):
                v = Counter(i[2].split())
                v_counter += v
                word_freq += Counter(set(v))
        count += 1
    target_counter = Counter(y)
    
    return word_freq, target_counter, a_counter, b_counter, e_counter, v_counter

word_freq_in_doc, target_counter, a_counter, b_counter, e_counter, v_counter = get_counts(data)
##print(a_counter)
print(target_counter)
print(len(word_freq))

Counter({'E': 2144, 'B': 1602, 'A': 128, 'V': 126})
32216


In [8]:
a_sum = sum(target_counter.values())
a_probability = target_counter["A"] / a_sum
b_probability = target_counter["B"] / a_sum
v_probability = target_counter["V"] / a_sum
e_probability = target_counter["E"] / a_sum
all_unique = a_counter + b_counter + e_counter + v_counter

In [9]:
print(a_probability)
print(b_probability)
print(v_probability)
print(e_probability)

0.032
0.4005
0.0315
0.536


#### Priori
As we can see above we have very unbalanced priori between the classes which will affect the predicted probabilites of our naive bayes. Through just looking at these prior probabilites (The probability of a class being that class before we've seen the data) we can predict that we will most likely get a lot of E's in our predictions since it has such a high prior probability.

In [11]:
a_probabilites = {}
b_probabilites = {}
v_probabilites = {}
e_probabilites = {}

### Standard Naive Bayes Training

In [15]:
a_word_sum = sum(a_counter.values())
b_word_sum = sum(b_counter.values())
v_word_sum = sum(v_counter.values())
e_word_sum = sum(e_counter.values())

In [16]:
all_unique = a_counter + b_counter + v_counter + e_counter
sum_unique = len(all_unique.keys())

In [17]:
for word in all_unique:
    if word not in a_counter.keys():
        a_prob = 1
    elif word in a_counter.keys():
        a_prob = a_counter.get(word) + 1
    a_probabilites[word] = a_prob / a_word_sum + sum_unique
    
    if word not in b_counter.keys():
        b_prob = 1
    elif word in b_counter.keys():
        b_prob = b_counter.get(word) + 1
    b_probabilites[word] = b_prob / b_word_sum + sum_unique
    
    if word not in v_counter.keys():
        v_prob = 1
    elif word in b_counter.keys():
        v_prob = v_counter.get(word) + 1
    v_probabilites[word] = v_prob / v_word_sum + sum_unique
    
        
    if word not in e_counter.keys():
        e_prob = 1
    elif word in e_counter.keys():
        e_prob = e_counter.get(word) + 1
    e_probabilites[word] = e_prob / e_word_sum + sum_unique
    

### Standard Naive Bayes testing

In [18]:
all_unique = a_counter + b_counter + v_counter + e_counter
sum_unique = len(all_unique.keys())
a_word_sum = len(a_counter.values())
b_word_sum = len(b_counter.values())
v_word_sum = len(v_counter.values())
e_word_sum = len(e_counter.values())
predictions = []

for i in test_data:
    count_train = Counter(i[1].split())
    #count_train = Counter(i[1])
    class_prob = {"A": 0, "B": 0, "V": 0, "E": 0}
    for word in i[1].split():
        a_prob = a_probabilites.get(word)
        if (a_prob == None):
            a_prob = 1 / a_word_sum + sum_unique
        class_prob["A"] = math.log(a_prob) * count_train[word] + class_prob["A"]
        

        b_prob = b_probabilites.get(word) 
        if (b_prob == None):
            b_prob = 1 / b_word_sum + sum_unique
        class_prob["B"] =  math.log(b_prob) * count_train[word] + class_prob["B"]
        

        v_prob = v_probabilites.get(word)
        if (v_prob == None):
            v_prob = 1 / v_word_sum + sum_unique
        class_prob["V"] = math.log(v_prob) * count_train[word] + class_prob["V"]

        
        e_prob = e_probabilites.get(word)
        if (e_prob == None):
            e_prob = 1 / e_word_sum + sum_unique
        class_prob["E"] = math.log(e_prob) * count_train[word] + class_prob["E"]
    #print(class_prob)
    class_prob["A"] = class_prob["A"] + math.log(a_probability)
    class_prob["B"] = class_prob["B"] + math.log(b_probability)
    class_prob["V"] = class_prob["V"] + math.log(v_probability)
    class_prob["E"] = class_prob["E"] + math.log(e_probability)
    predictions.append(max(class_prob.items(), key=operator.itemgetter(1))[0])
    
##print(all_unique)
print(predictions)

['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',

### TF * IDF Naive Bayes
For my extension of Naive Bayes I decided to add TF * IDF to discount words which would appear very frequently in the documents. If a word would appear in many documents in different classes then it won't really help us in our classification as that word most likely isn't related to the class and is hence pretty useless. With TF * IDF we apply a weighting to our words to prefer the rarer words and make their probability for a class higher than common words which will apear in every text, the texts with high frequency will recieve a weighting which reduces the probability of that word given a class which has significantly increased the performance of the algorithm.

### TF * IDF Naive Bayes Training

In [19]:
def get_word_freq(data, index):
    real_counter = Counter()
    for doc in data:
        a_set = set(doc[index].split())
        real_counter = Counter(a_set) + real_counter
    return real_counter

In [55]:
print(len(get_word_freq(data, 2)))

32216


In [29]:
def tf_idf_train(training_split_len, all_unique, word_freq_in_doc,a_sum, b_sum, e_sum, v_sum, sum_unique, a_counter, b_counter, e_counter, v_counter):
    a_probabilites = {}
    b_probabilites = {}
    v_probabilites = {}
    e_probabilites = {}
    
    for word in all_unique:

        idf = math.log(training_split_len / word_freq_in_doc.get(word))

        if word not in a_counter.keys():
            a_prob = 1
        elif word in a_counter.keys():
            a_prob = a_counter.get(word) + 1
        a_prob = (math.log(a_prob)*idf) / ((math.log(a_sum)*idf) + sum_unique)
        if a_prob == 0 or a_prob < 0:
            a_prob = 0.00001
        a_probabilites[word] = a_prob 

        if word not in b_counter.keys():
            b_prob = 1
        elif word in b_counter.keys():
            b_prob = b_counter.get(word) + 1
        b_prob = (math.log(b_prob)*idf) / ((math.log(b_sum)*idf) + sum_unique)
        if b_prob == 0 or b_prob < 0:
            b_prob = 0.00001
        b_probabilites[word] = b_prob

        if word not in v_counter.keys():
            v_prob = 1
        elif word in b_counter.keys():
            v_prob = v_counter.get(word) + 1
        v_prob = (math.log(v_prob)*idf) / ((math.log(v_sum)*idf) + sum_unique)
        if v_prob == 0 or v_prob < 0:
            v_prob = 0.00001
        v_probabilites[word] = v_prob


        if word not in e_counter.keys():
            e_prob = 1
        elif word in e_counter.keys():
            e_prob = e_counter.get(word) + 1
        e_prob = (math.log(e_prob)*idf) / ((math.log(e_sum)*idf) + sum_unique)
        if e_prob == 0 or e_prob < 0:
            e_prob = 0.00001
        e_probabilites[word] = e_prob
        
    return a_probabilites, b_probabilites, e_probabilites, v_probabilites

In [31]:
a_probabilites, b_probabilites, e_probabilites, v_probabilites = tf_idf_train(len(data),all_unique, word_freq_in_doc,a_sum, b_sum, e_sum, v_sum, sum_unique,  a_counter, b_counter, e_counter, v_counter)

### TF * IDF Naive Bayes Testing

In [36]:
def tf_idf_test(a_word_sum, b_word_sum, v_word_sum, e_word_sum,test_data, a_probabilites, b_probabilites, e_probabilites, v_probabilites, a_probability, b_probability, v_probability, e_probability, index):
    predictions = []
    for i in test_data:
        count_train = Counter(i[index].split())
        #count_train = Counter(i[1])
        class_prob = {"A": 0, "B": 0, "V": 0, "E": 0}
        for word in i[index].split():
            a_prob = a_probabilites.get(word)
            if (a_prob == None):
                a_prob = 1 / a_word_sum + sum_unique
            class_prob["A"] = math.log(a_prob) * count_train[word] + class_prob["A"]


            b_prob = b_probabilites.get(word) 
            if (b_prob == None):
                b_prob = 1 / b_word_sum + sum_unique
            class_prob["B"] =  math.log(b_prob) * count_train[word] + class_prob["B"]


            v_prob = v_probabilites.get(word)
            if (v_prob == None):
                v_prob = 1 / v_word_sum + sum_unique
            class_prob["V"] = math.log(v_prob) * count_train[word] + class_prob["V"]


            e_prob = e_probabilites.get(word)
            if (e_prob == None):
                e_prob = 1 / e_word_sum + sum_unique
            class_prob["E"] = math.log(e_prob) * count_train[word] + class_prob["E"]
        #print(class_prob)
        class_prob["A"] =  math.log(a_probability) + class_prob["A"]
        class_prob["B"] =  math.log(b_probability) + class_prob["B"]
        class_prob["V"] =  math.log(v_probability) + class_prob["V"]
        class_prob["E"] =  math.log(e_probability) + class_prob["E"]
        predictions.append(max(class_prob.items(), key=operator.itemgetter(1))[0])
    
    return predictions
predictions = tf_idf_test(a_sum, b_sum, v_sum, e_sum,test_data, a_probabilites, b_probabilites, e_probabilites, v_probabilites, a_probability, b_probability, v_probability, e_probability, 1)
##print(all_unique)
print(predictions)

['B', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'B', 'A', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'E', 'B', 'B', 'E', 'B', 'E', 'B', 'B', 'E', 'E', 'B', 'B', 'E', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'B', 'B', 'E', 'E', 'B', 'B', 'B', 'E', 'E', 'E', 'B', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'B', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'E', 'E', 'E', 'B', 'B', 'E', 'E', 'B', 'B', 'B', 'B', 'B', 'B', 'E', 'E', 'E', 'B', 'B', 'E', 'B', 'B', 'B', 'E', 'E', 'B', 'B', 'B', 'B', 'B', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'B', 'B', 'E', 'B', 'B', 'E', 'B', 'E', 'E', 'E', 'B', 'B', 'E', 'B', 'B', 'E', 'B', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'B', 'A', 'E', 'E', 'E', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'B', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'B', 'B', 'E', 'A', 'B', 'B', 'E', 'B', 'A', 'B', 'E', 'B', 'E',

### Cross Validation
To test how our algoirthm  would perform I chose to use cross validation as my tactic to get a significant result, although I wanted to use stratified cross validation but this would be really really computationally expensive due to the nature of how I created the algorithm. Stratified k-fold cross validation would ensure that we got a good distribution of classes in our testing and training set but due to time constraints, running such an algorithm would t

In [56]:
k = 10
rand_split = cross_validation_split(data, k)
accuracies = []
for i in range(5):
    for i in range(k):
        test_split = rand_split[i]
        training_split = rand_split.copy()
        training_split.pop(i)

        training_split = [item for sublist in training_split for item in sublist]
        text_in_class = {'E':{}, 'B':{}, 'A': {}, 'V':{}}
        a_counter = Counter()
        e_counter = Counter()
        b_counter = Counter()
        v_counter = Counter()

        word_freq_in_doc,target_counter, a_counter, b_counter, e_counter, v_counter = get_counts(training_split)

    #    a_counter = Counter(dict(a_counter.most_common(1250)))
    #    b_counter = Counter(dict(b_counter.most_common(1250)))
    #    e_counter = Counter(dict(e_counter.most_common(1250)))
    #    v_counter = Counter(dict(v_counter.most_common(1250)))

        all_unique = a_counter + b_counter + v_counter + e_counter
        sum_unique = len(all_unique.keys())

        a_sum = sum(target_counter.values())
        a_probability = target_counter["A"] / a_sum
        b_probability = target_counter["B"] / a_sum
        v_probability = target_counter["V"] / a_sum
        e_probability = target_counter["E"] / a_sum


        a_sum = len(a_counter.values())
        b_sum = len(b_counter.values())
        v_sum = len(v_counter.values())
        e_sum = len(e_counter.values())

        a_probabilites, b_probabilites, e_probabilites, v_probabilites = tf_idf_train(len(training_split), all_unique, word_freq_in_doc,a_sum, b_sum, e_sum, v_sum, sum_unique, a_counter, b_counter, e_counter, v_counter)
        predictions = tf_idf_test(a_sum, b_sum, v_sum, e_sum,test_split, a_probabilites, b_probabilites, e_probabilites, v_probabilites, a_probability, b_probability, v_probability, e_probability, 2)


        accuracies.append(accuracy_metric(test_split, predictions))
        print(accuracies)

[92.75]
[92.75, 91.75]


KeyboardInterrupt: 

In [48]:
print(statistics.mean(accuracies))

91.575


### Accuracy with TF*IDF
after a 10-fold cross validation which was run 10 times I got an average accuracy of 91.704166666667 which shows us that our model performed much better after applying a wieghting to the frequencies of the words overall.

### Complement TF*IDF Naive Bayes Training

In [49]:
def complement_train(training_split_len, all_unique, word_freq_in_doc,a_sum, b_sum, e_sum, v_sum, sum_unique,a_counter, b_counter, e_counter, v_counter ):
    a_probabilites = {}
    b_probabilites = {}
    v_probabilites = {}
    e_probabilites = {}
    
    for word in all_unique:

        idf = math.log(training_split_len / word_freq_in_doc.get(word))

        if word not in a_counter.keys():
            a_prob = 1
        elif word in a_counter.keys():
            a_prob = a_counter.get(word) + 1
        a_prob = (math.log(a_prob)*idf) / ((math.log(a_sum)*idf) + sum_unique)
        if a_prob == 0 or a_prob < 0:
            a_prob = 0.00001
        a_probabilites[word] = math.log(a_prob)

        if word not in b_counter.keys():
            b_prob = 1
        elif word in b_counter.keys():
            b_prob = b_counter.get(word) + 1
        b_prob = (math.log(b_prob)*idf) / ((math.log(b_sum)*idf) + sum_unique)
        if b_prob == 0 or b_prob < 0:
            b_prob = 0.00001
        b_probabilites[word] = math.log(b_prob)

        if word not in v_counter.keys():
            v_prob = 1
        elif word in b_counter.keys():
            v_prob = v_counter.get(word) + 1
        v_prob = (math.log(v_prob)*idf) / ((math.log(v_sum)*idf) + sum_unique)
        if v_prob == 0 or v_prob < 0:
            v_prob = 0.00001
        v_probabilites[word] = math.log(v_prob)


        if word not in e_counter.keys():
            e_prob = 1
        elif word in e_counter.keys():
            e_prob = e_counter.get(word) + 1
        e_prob = (math.log(e_prob)*idf) / ((math.log(e_sum)*idf) + sum_unique)
        if e_prob == 0 or e_prob < 0:
            e_prob = 0.00001
        e_probabilites[word] = math.log(e_prob)
        
    return a_probabilites, b_probabilites, e_probabilites, v_probabilites

In [50]:
def complement_test(word_freq_in_doc,a_sum, b_sum, v_sum, e_sum, training_split_len, test_data, a_probabilites, b_probabilites, e_probabilites, v_probabilites, a_probability, b_probability, v_probability, e_probability, index):
    predictions = []
    for i in test_data:
        count_train = Counter(i[index].split())
        #count_train = Counter(i[1])
        class_prob = {"A": 0, "B": 0, "V": 0, "E": 0}
        for word in i[index].split():
            
            if word in word_freq_in_doc:
                word_in_doc = word_freq_in_doc[word]
            else:
                word_in_doc = 0
    #        print(word_in_doc)
            #Calculate the idf 
            if word_in_doc != 0:
                idf = math.log(training_split_len / word_in_doc)
            else:
                idf = 0
            
            
            a_prob = a_probabilites.get(word)
            if (a_prob == None):
                a_prob = (math.log(1)*idf) / ((math.log(a_sum)*idf) + sum_unique)
                
            if a_prob == 0 or a_prob < 0:
                a_prob = 0.000001
            else:
                a_prob = math.log(a_prob) 
            class_prob["A"] = a_prob * count_train[word] + class_prob["A"]


            b_prob = b_probabilites.get(word) 
            if (b_prob == None):
                b_prob = (math.log(1)*idf) / ((math.log(b_sum)*idf) + sum_unique)
                
            if b_prob == 0 or b_prob < 0:
                b_prob = 0.00001
            else:
                b_prob = math.log(b_prob)
            class_prob["B"] =  b_prob * count_train[word] + class_prob["B"]


            v_prob = v_probabilites.get(word)
            if (v_prob == None):
                v_prob = (math.log(1)*idf) / ((math.log(v_sum)*idf) + sum_unique)
            if v_prob == 0 or v_prob < 0:
                v_prob = 0.00001
            else:
                v_prob = math.log(v_prob)
            class_prob["V"] = v_prob * count_train[word] + class_prob["V"]


            e_prob = e_probabilites.get(word)
            if (e_prob == None):
                e_prob = (math.log(1)*idf) / ((math.log(e_sum)*idf) + sum_unique)
                
            if e_prob == 0 or e_prob < 0:
                e_prob = 0.00001
            else:
                e_prob = math.log(b_prob)
            class_prob["E"] = e_prob * count_train[word] + class_prob["E"]
        #print(class_prob)
        class_prob["A"] =  math.log(a_probability) - class_prob["A"]
        class_prob["B"] =  math.log(b_probability) - class_prob["B"]
        class_prob["V"] =  math.log(v_probability) - class_prob["V"]
        class_prob["E"] =  math.log(e_probability) - class_prob["E"]
        predictions.append(max(class_prob.items(), key=operator.itemgetter(1))[0])
    
    return predictions
##print(all_unique)


In [51]:
k = 10
rand_split = cross_validation_split(data, k)
accuracies = []
for i in range(k):
    test_split = rand_split[i]
    training_split = rand_split.copy()
    training_split.pop(i)
   
    training_split = [item for sublist in training_split for item in sublist]
    text_in_class = {'E':{}, 'B':{}, 'A': {}, 'V':{}}
    a_counter = Counter()
    e_counter = Counter()
    b_counter = Counter()
    v_counter = Counter()
    
    word_freq_in_doc,target_counter, a_counter, b_counter, e_counter, v_counter = get_counts(training_split)

    all_unique = a_counter + b_counter + v_counter + e_counter
    sum_unique = len(all_unique.keys())
    
    a_sum = sum(target_counter.values())
    a_probability = target_counter["A"] / a_sum
    b_probability = target_counter["B"] / a_sum
    v_probability = target_counter["V"] / a_sum
    e_probability = target_counter["E"] / a_sum
    
    
    # Complement naive bayes so get all words in other classes
    not_a = e_counter + b_counter + v_counter
    not_b = a_counter + e_counter + v_counter
    not_e = a_counter + v_counter + b_counter
    not_v = a_counter + e_counter + b_counter
    #Get the sum of all counts in a class. Total word count for each class
    a_sum = sum(not_a.values())
    b_sum = sum(not_b.values())
    e_sum = sum(not_e.values())
    v_sum = sum(not_v.values())
    
    a_probabilites, b_probabilites, e_probabilites, v_probabilites = complement_train(len(training_split), all_unique, word_freq_in_doc,a_sum, b_sum, e_sum, v_sum, sum_unique, not_a, not_b, not_e, not_v)
    predictions = complement_test(word_freq_in_doc,a_sum, b_sum, v_sum, e_sum,len(training_split),test_split, a_probabilites, b_probabilites, e_probabilites, v_probabilites, a_probability, b_probability, v_probability, e_probability, 2)
    
    
    accuracies.append(accuracy_metric(test_split, predictions))
    print(accuracies)

[48.25]
[48.25, 53.0]
[48.25, 53.0, 54.0]
[48.25, 53.0, 54.0, 52.75]
[48.25, 53.0, 54.0, 52.75, 56.00000000000001]
[48.25, 53.0, 54.0, 52.75, 56.00000000000001, 56.25]
[48.25, 53.0, 54.0, 52.75, 56.00000000000001, 56.25, 52.5]
[48.25, 53.0, 54.0, 52.75, 56.00000000000001, 56.25, 52.5, 53.25]
[48.25, 53.0, 54.0, 52.75, 56.00000000000001, 56.25, 52.5, 53.25, 56.25]
[48.25, 53.0, 54.0, 52.75, 56.00000000000001, 56.25, 52.5, 53.25, 56.25, 53.75]


In [64]:
overall_accuracy = []
for j in range(10):
    k = 10
    rand_split = cross_validation_split(data, k)
    accuracies = []
    for i in range(k):
        test_split = rand_split[i]
        training_split = rand_split.copy()
        training_split.pop(i)

        training_split = [item for sublist in training_split for item in sublist]
        text_in_class = {'E':{}, 'B':{}, 'A': {}, 'V':{}}
        a_counter = Counter()
        e_counter = Counter()
        b_counter = Counter()
        v_counter = Counter()

        word_freq_in_doc,target_counter, a_counter, b_counter, e_counter, v_counter = get_counts(training_split)

        a_sum = sum(target_counter.values())
        a_probability = target_counter["A"] / a_sum
        b_probability = target_counter["B"] / a_sum
        v_probability = target_counter["V"] / a_sum
        e_probability = target_counter["E"] / a_sum

        training_len = len(training_split)

        predictions = complement_idf(a_counter, b_counter, v_counter, e_counter, a_probability, b_probability, v_probability, e_probability, test_split, training_len, 2)


    #    a_probabilites, b_probabilites, e_probabilites, v_probabilites = complement_train(len(training_split), all_unique, word_freq_in_doc,a_sum, b_sum, e_sum, v_sum, sum_unique, not_a, not_b, not_e, not_v)
     #   predictions = complement_test(word_freq_in_doc,a_sum, b_sum, v_sum, e_sum,len(training_split),test_split, a_probabilites, b_probabilites, e_probabilites, v_probabilites, a_probability, b_probability, v_probability, e_probability, 2)


        accuracies.append(accuracy_metric(test_split, predictions))
    print(accuracies)
    overall_accuracy.append(statistics.mean(accuracies))

[91.25]
[91.25, 94.75]
[91.25, 94.75, 89.5]
[91.25, 94.75, 89.5, 94.5]
[91.25, 94.75, 89.5, 94.5, 92.75]
[91.25, 94.75, 89.5, 94.5, 92.75, 92.25]
[91.25, 94.75, 89.5, 94.5, 92.75, 92.25, 95.25]
[91.25, 94.75, 89.5, 94.5, 92.75, 92.25, 95.25, 92.5]
[91.25, 94.75, 89.5, 94.5, 92.75, 92.25, 95.25, 92.5, 91.75]
[91.25, 94.75, 89.5, 94.5, 92.75, 92.25, 95.25, 92.5, 91.75, 92.75]
[93.25]
[93.25, 90.5]
[93.25, 90.5, 91.75]
[93.25, 90.5, 91.75, 94.5]
[93.25, 90.5, 91.75, 94.5, 93.0]
[93.25, 90.5, 91.75, 94.5, 93.0, 91.25]
[93.25, 90.5, 91.75, 94.5, 93.0, 91.25, 94.75]
[93.25, 90.5, 91.75, 94.5, 93.0, 91.25, 94.75, 93.5]
[93.25, 90.5, 91.75, 94.5, 93.0, 91.25, 94.75, 93.5, 91.75]
[93.25, 90.5, 91.75, 94.5, 93.0, 91.25, 94.75, 93.5, 91.75, 94.5]
[92.0]
[92.0, 91.75]
[92.0, 91.75, 94.0]
[92.0, 91.75, 94.0, 93.25]
[92.0, 91.75, 94.0, 93.25, 94.75]
[92.0, 91.75, 94.0, 93.25, 94.75, 93.75]
[92.0, 91.75, 94.0, 93.25, 94.75, 93.75, 92.5]
[92.0, 91.75, 94.0, 93.25, 94.75, 93.75, 92.5, 93.75]
[92.0, 91.

In [65]:
print(statistics.mean(overall_accuracy))

92.985


### Smoothing out the data

In [8]:
all_unique = a_counter + b_counter + v_counter + e_counter
all_unique = Counter(dict(all_unique.most_common(5000)))
sum_unique = len(all_unique.keys())

In [41]:
for i in list(a_counter.keys()):
    if i not in all_unique.keys():
        a_counter.pop(i)
for i in list(b_counter.keys()):
    if i not in all_unique.keys():
        b_counter.pop(i)
for i in list(v_counter.keys()):
    if i not in all_unique.keys():
        v_counter.pop(i)
for i in list(e_counter.keys()):
    if i not in all_unique.keys():
        e_counter.pop(i)

In [48]:
a_counter = Counter(dict(a_counter.most_common(5000)))
b_counter = Counter(dict(b_counter.most_common(5000)))
e_counter = Counter(dict(e_counter.most_common(5000)))
v_counter = Counter(dict(v_counter.most_common(5000)))
all_unique = a_counter + b_counter + v_counter + e_counter
sum_unique = len(all_unique.keys())

In [None]:
print(a_counter)

In [49]:
a_sum = sum(target_counter.values())
a_probability = target_counter["A"] / a_sum
b_probability = target_counter["B"] / a_sum
v_probability = target_counter["V"] / a_sum
e_probability = target_counter["E"] / a_sum

In [50]:
print("A prior = {0}".format(a_probability))
print("B prior = {0}".format(b_probability))
print("V prior = {0}".format(v_probability))
print("E prior = {0}".format(e_probability))

A prior = 0.032
B prior = 0.4005
V prior = 0.0315
E prior = 0.536


In [10]:
all_unique = a_counter + b_counter + v_counter + e_counter
sum_unique = len(all_unique.keys())
a_word_sum = len(a_counter.keys())
b_word_sum = len(b_counter.keys())
v_word_sum = len(v_counter.keys())
e_word_sum = len(e_counter.keys())

In [56]:
all_unique = a_counter + b_counter + v_counter + e_counter
sum_unique = len(all_unique.keys())
a_word_sum = len(a_counter.values())
b_word_sum = len(b_counter.values())
v_word_sum = len(v_counter.values())
e_word_sum = len(e_counter.values())
predictions = []


for i in test_data:
    count_train = Counter(i[1].split())
    #count_train = Counter(i[1])
    class_prob = {"A": 0, "B": 0, "V": 0, "E": 0}
    for word in i[1].split():
        a_prob = a_probabilites.get(word)
        if (a_prob == None):
            a_prob = 1 / a_word_sum + sum_unique
        class_prob["A"] = math.log(a_prob) * count_train[word] + class_prob["A"]
        

        b_prob = b_probabilites.get(word) 
        if (b_prob == None):
            b_prob = 1 / b_word_sum + sum_unique
        class_prob["B"] =  math.log(b_prob) * count_train[word] + class_prob["B"]
        

        v_prob = v_probabilites.get(word)
        if (v_prob == None):
            v_prob = 1 / v_word_sum + sum_unique
        class_prob["V"] = math.log(v_prob) * count_train[word] + class_prob["V"]

        
        e_prob = e_probabilites.get(word)
        if (e_prob == None):
            e_prob = 1 / e_word_sum + sum_unique
        class_prob["E"] = math.log(e_prob) * count_train[word] + class_prob["E"]
    #print(class_prob)
    class_prob["A"] = class_prob["A"] + math.log(a_probability)
    class_prob["B"] = class_prob["B"] + math.log(b_probability)
    class_prob["V"] = class_prob["V"] + math.log(v_probability)
    class_prob["E"] = class_prob["E"] + math.log(e_probability)
    predictions.append(max(class_prob.items(), key=operator.itemgetter(1))[0])
    
##print(all_unique)
print(predictions)
print(len(predictions))

['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',

In [56]:
def create_ngram(s, n):
    
    ngrams = [ngram for ngram in s.split(" ") if ngram != ""]
    
    ngrams = zip(*[ngrams[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [66]:
predictions = []

not_a = e_counter + b_counter + v_counter
not_b = a_counter + e_counter + v_counter
not_e = a_counter + v_counter + b_counter
not_v = a_counter + e_counter + b_counter

a_sum = sum(not_a.values())
b_sum = sum(not_b.values())
v_sum = sum(not_v.values())
e_sum = sum(not_e.values())

for i in test_data:
    #count_train = Counter(i[1].split())
    count_train = Counter(i[1])
    class_prob = {"A": 0, "B": 0, "V": 0, "E": 0}
    for word in i[1]:
        if word not in not_a.keys():
            a_prob = 1
        elif word in not_a.keys():
            a_prob = not_a.get(word) + 1
        a_prob = a_prob /  a_sum + len(not_a.keys())
        class_prob["A"] = math.log(a_prob) * count_train[word] + class_prob["A"]
        
        if word not in not_b.keys():
            b_prob = 1
        elif word in not_b.keys():
            b_prob = not_b.get(word) + 1
        b_prob = b_prob / b_sum + len(not_b.keys())
        class_prob["B"] =  math.log(b_prob) * count_train[word] + class_prob["B"]
        

        if word not in not_v.keys():
            v_prob = 1
        elif word in not_v.keys():
            v_prob = not_v.get(word) + 1
        v_prob = v_prob / v_sum + len(not_v.keys())
        class_prob["V"] = math.log(v_prob) * count_train[word] + class_prob["V"]

        if word not in not_e.keys():
            e_prob = 1
        elif word in not_e.keys():
            e_prob = not_e.get(word) + 1
        e_prob = e_prob / e_sum + len(not_v.keys())
        class_prob["E"] = math.log(e_prob) * count_train[word] + class_prob["E"]
    class_prob["A"] = 1/class_prob["A"] + math.log(a_probability)
    class_prob["B"] = 1/class_prob["B"] + math.log(b_probability)
    class_prob["V"] = 1/class_prob["V"] + math.log(v_probability)
    class_prob["E"] = 1/class_prob["E"] + math.log(e_probability)
    predictions.append(min(class_prob.items(), key=operator.itemgetter(1))[0])
print(predictions)

['V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V', 'V',

In [19]:
#Set all unique words to 1 as it will appear once in a document
word_freq_in_doc = {}
for word in all_unique.keys():
    word_freq_in_doc[word] = 1

In [52]:
#Get the frequency of word in separate documents
for word in all_unique.keys():
    for doc in data:
        if word in doc[2]:
            word_freq_in_doc[word] += 1

In [60]:
def complement_idf(a_counter, b_counter, v_counter, e_counter, a_probability, b_probability, v_probability, e_probability, test_data, training_len, index):
    predictions = []
    # Complement naive bayes so get all words in other classes
    not_a = e_counter + b_counter + v_counter
    not_b = a_counter + e_counter + v_counter
    not_e = a_counter + v_counter + b_counter
    not_v = a_counter + e_counter + b_counter
    
    sum_unique = a_counter + b_counter + v_counter + e_counter
    sum_unique = len(sum_unique.keys())

    #Get the sum of all counts in a class. Total word count for each class
    a_sum = sum(not_a.values())
    b_sum = sum(not_b.values())
    e_sum = sum(not_e.values())
    v_sum = sum(not_v.values())

    #Iterate through the csv
    for i in test_data:
        count_train = Counter(i[index].split())
        # For every document, do a word count
        #count_train = Counter(i[1])
        class_prob = {"A": 1, "B": 1, "V": 1, "E": 1}
        # Iterate through each word in a document
        for word in i[index].split():
            # Get the frequency of a word in all training documents
            if word in word_freq_in_doc:
                word_in_doc = word_freq_in_doc[word]
            else:
                word_in_doc = 0
    #        print(word_in_doc)
            #Calculate the idf 
            if word_in_doc != 0:
                idf = math.log(training_len / word_in_doc)
            else:
                idf = 0
            if word not in not_a.keys():
                a_prob = 1
            elif word in not_a.keys():
                a_prob = not_a.get(word) + 1
            a_prob = (math.log(a_prob)*idf) / ((math.log(a_sum)*idf) + sum_unique)
            if a_prob == 0 or a_prob < 0:
                a_prob = 0.00001
           # print(a_prob)
            class_prob["A"] = math.log(a_prob) * count_train[word] + class_prob["A"]

            if word not in not_b.keys():
                b_prob = 1
            elif word in not_b.keys():
                b_prob = not_b.get(word) + 1
           # print("b_Probability " + str(b_prob))
            #print("idf " + str(idf))
            b_prob = (math.log(b_prob)*idf) / ((math.log(b_sum)*idf) + sum_unique)
            if b_prob == 0 or b_prob < 0:
                b_prob = 0.00001
            class_prob["B"] =  math.log(b_prob) * count_train[word] + class_prob["B"]


            if word not in not_v.keys():
                v_prob = 1
            elif word in not_v.keys():
                v_prob = not_v.get(word) + 1
            v_prob = (math.log(v_prob)*idf) / ((math.log(v_sum)*idf) + sum_unique)
            if v_prob == 0 or v_prob <0:
                v_prob = 0.00001
            class_prob["V"] = math.log(v_prob) * count_train[word] + class_prob["V"]

            if word not in not_e.keys():
                e_prob = 1
            elif word in not_e.keys():
                e_prob = not_e.get(word) + 1
            e_prob = (math.log(e_prob)*idf) / ((math.log(e_sum)*idf) + sum_unique)
            if e_prob == 0 or e_prob < 0:
                e_prob = 0.00001
            class_prob["E"] = math.log(e_prob) * count_train[word] + class_prob["E"]
        class_prob["A"] =  math.log(a_probability) - class_prob["A"]
        class_prob["B"] =  math.log(b_probability) - class_prob["B"]
        class_prob["V"] =  math.log(v_probability) - class_prob["V"]
        class_prob["E"] =  math.log(e_probability) - class_prob["E"]
        #print(class_prob)
        predictions.append(max(class_prob.items(), key=operator.itemgetter(1))[0])
    return predictions
print(predictions)

['B', 'E', 'E', 'B', 'B', 'B', 'E', 'B', 'E', 'B', 'E', 'E', 'E', 'B', 'B', 'E', 'E', 'B', 'B', 'B', 'E', 'B', 'E', 'B', 'E', 'B', 'E', 'B', 'E', 'B', 'B', 'E', 'B', 'E', 'E', 'E', 'B', 'A', 'B', 'B', 'B', 'E', 'B', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'B', 'E', 'B', 'E', 'E', 'B', 'E', 'B', 'B', 'E', 'B', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'B', 'B', 'E', 'B', 'B', 'E', 'E', 'E', 'B', 'E', 'B', 'B', 'B', 'B', 'E', 'B', 'B', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'A', 'B', 'E', 'E', 'B', 'E', 'B', 'B', 'B', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'A', 'E', 'B', 'B', 'B', 'E', 'B', 'E', 'E', 'B', 'E', 'E', 'B', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'B', 'E', 'B', 'E', 'B', 'B', 'E', 'E', 'E', 'E', 'E', 'B', 'E', 'E', 'E', 'E', 'B', 'E', 'B', 'E', 'E', 'E', 'E', 'B', 'B', 'E', 'E', 'B', 'B', 'E', 'E', 'B', 'B', 'E', 'E', 'B', 'E', 'B', 'B', 'E', 'E', 'B', 'E', 'E', 'B', 'E',

In [43]:
predictions = []
# Complement naive bayes so get all words in other classes
not_a = e_counter + b_counter + v_counter
not_b = a_counter + e_counter + v_counter
not_e = a_counter + v_counter + b_counter
not_v = a_counter + e_counter + b_counter
#Get the sum of all counts in a class. Total word count for each class
a_sum = sum(not_a.values())
b_sum = sum(not_b.values())
e_sum = sum(not_e.values())
v_sum = sum(not_v.values())

#Iterate through the csv
for i in test_data:
    count_train = Counter(i[1].split())
    # For every document, do a word count
    #count_train = Counter(i[1])
    class_prob = {"A": 1, "B": 1, "V": 1, "E": 1}
    # Iterate through each word in a document
    for word in i[1].split():
        # Get the frequency of a word in all training documents
        if word in word_freq_in_doc:
            word_in_doc = word_freq_in_doc[word]
        else:
            word_in_doc = 0
#        print(word_in_doc)
        #Calculate the idf 
        if word_in_doc != 0:
            idf = math.log(4000 / word_in_doc)
        else:
            idf = 0
        if word not in not_a.keys():
            a_prob = 1
        elif word in not_a.keys():
            a_prob = not_a.get(word) + 1
        a_prob = (math.log(a_prob)*idf) / ((math.log(a_sum)*idf) + sum_unique)
        if a_prob == 0 or a_prob < 0:
            a_prob = 0.00001
       # print(a_prob)
        class_prob["A"] = math.log(a_prob) * count_train[word] + class_prob["A"]
        class_prob["A"] = class_prob["A"] * count_train[word]
    
        
        if word not in not_b.keys():
            b_prob = 1
        elif word in not_b.keys():
            b_prob = not_b.get(word) + 1
       # print("b_Probability " + str(b_prob))
        #print("idf " + str(idf))
        b_prob = (math.log(b_prob)*idf) / ((math.log(b_sum)*idf) + sum_unique)
        if b_prob == 0 or b_prob < 0:
            b_prob = 0.00001
        class_prob["B"] =  math.log(b_prob) * count_train[word] + class_prob["B"]
        class_prob["B"] = class_prob["B"] * count_train[word]

        if word not in not_v.keys():
            v_prob = 1
        elif word in not_v.keys():
            v_prob = not_v.get(word) + 1
        v_prob = (math.log(v_prob)*idf) / ((math.log(v_sum)*idf) + sum_unique)
        if v_prob == 0 or v_prob <0:
            v_prob = 0.00001
        class_prob["V"] = math.log(v_prob) * count_train[word] + class_prob["V"]
        class_prob["V"] = class_prob["V"] * count_train[word]
        
        if word not in not_e.keys():
            e_prob = 1
        elif word in not_e.keys():
            e_prob = not_e.get(word) + 1
        e_prob = (math.log(e_prob)*idf) / ((math.log(e_sum)*idf) + sum_unique)
        if e_prob == 0 or e_prob < 0:
            e_prob = 0.00001
        class_prob["E"] = math.log(e_prob) * count_train[word] + class_prob["E"]
        class_prob["E"] = class_prob["E"] * count_train[word]
        
        #print(sum(class_prob.values()))
    class_prob["A"] =  math.log(a_probability) - class_prob["A"]
    class_prob["B"] =  math.log(b_probability) - class_prob["B"]
    class_prob["V"] =  math.log(v_probability) - class_prob["V"]
    class_prob["E"] =  math.log(e_probability) - class_prob["E"]
    #print(class_prob)
    predictions.append(max(class_prob.items(), key=operator.itemgetter(1))[0])
print(predictions)

['A', 'A', 'A', 'V', 'V', 'A', 'A', 'V', 'V', 'V', 'V', 'A', 'V', 'A', 'A', 'A', 'A', 'V', 'A', 'V', 'A', 'A', 'V', 'V', 'A', 'V', 'V', 'V', 'A', 'V', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'V', 'V', 'A', 'V', 'V', 'A', 'A', 'V', 'A', 'V', 'V', 'A', 'A', 'V', 'V', 'V', 'V', 'V', 'V', 'A', 'V', 'V', 'V', 'A', 'V', 'A', 'A', 'V', 'V', 'A', 'V', 'A', 'A', 'V', 'A', 'A', 'A', 'V', 'A', 'V', 'A', 'V', 'V', 'A', 'A', 'A', 'V', 'A', 'A', 'A', 'A', 'V', 'A', 'A', 'A', 'V', 'V', 'V', 'A', 'A', 'A', 'V', 'A', 'A', 'V', 'V', 'A', 'A', 'V', 'A', 'A', 'V', 'A', 'V', 'A', 'A', 'V', 'V', 'V', 'V', 'A', 'V', 'A', 'V', 'V', 'A', 'A', 'V', 'A', 'A', 'A', 'A', 'V', 'A', 'A', 'V', 'A', 'V', 'V', 'V', 'A', 'V', 'A', 'V', 'V', 'A', 'A', 'V', 'A', 'A', 'A', 'V', 'V', 'V', 'V', 'A', 'V', 'A', 'V', 'A', 'V', 'V', 'A', 'V', 'V', 'A', 'A', 'A', 'A', 'V', 'A', 'V', 'A', 'A', 'A', 'A', 'V', 'V', 'V', 'V', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'V', 'V', 'V', 'A', 'V', 'A', 'V', 'A', 'V', 'A', 'V', 'V', 'A', 'A', 'V', 'A',

In [115]:
with open('tst.csv', newline='') as csvfile:
    test_data = list(csv.reader(csvfile))
    for i in test_data:
        n_gram = create_ngram(i[1], 2)
        i[1] = n_gram
print(test_data[1])

['1', ['in a', 'a previous', 'previous work', 'work all', 'all three', 'three components', 'components of', 'of comamonas', 'comamonas testosteroni', 'testosteroni b-356', 'b-356 biphenyl', 'biphenyl bphchlorobiphenyls', 'bphchlorobiphenyls pcbs', 'pcbs dioxygenase', 'dioxygenase dox', 'dox have', 'have been', 'been purified', 'purified and', 'and characterized', 'characterized they', 'they include', 'include an', 'an iron-sulphur', 'iron-sulphur protein', 'protein ispbph', 'ispbph which', 'which is', 'is the', 'the terminal', 'terminal oxygenase', 'oxygenase composed', 'composed of', 'of two', 'two subunits', 'subunits encoded', 'encoded by', 'by bpha', 'bpha and', 'and bphe', 'bphe a', 'a ferredoxin', 'ferredoxin ferbph', 'ferbph encoded', 'encoded by', 'by bphf', 'bphf and', 'and a', 'a reductase', 'reductase redbph', 'redbph encoded', 'encoded by', 'by bphg', 'bphg bphg', 'bphg is', 'is not', 'not located', 'located in', 'in the', 'the neighbourhood', 'neighbourhood of', 'of bphaef

In [79]:
with open('trg.csv', newline='') as csvfile:
    data = list(csv.reader(csvfile))
    for i in data:
        n_gram = create_ngram(i[2], 2)
        i[2] = n_gram
print(data[1])

['1', 'B', ['the 4', '4 202', '202 353', '353 bp', 'bp genome', 'genome of', 'of the', 'the alkaliphilic', 'alkaliphilic bacterium', 'bacterium bacillus', 'bacillus halodurans', 'halodurans c-125', 'c-125 contains', 'contains 4066', '4066 predicted', 'predicted protein', 'protein coding', 'coding sequences', 'sequences cdss', 'cdss 2141', '2141 527', '527 of', 'of which', 'which have', 'have functional', 'functional assignments', 'assignments 1182', '1182 29', '29 of', 'of which', 'which are', 'are conserved', 'conserved cdss', 'cdss with', 'with unknown', 'unknown function', 'function and', 'and 743', '743 18', '18 3', '3 of', 'of which', 'which have', 'have no', 'no match', 'match to', 'to any', 'any protein', 'protein database', 'database among', 'among the', 'the total', 'total cdss', 'cdss 88', '88 match', 'match sequences', 'sequences of', 'of proteins', 'proteins found', 'found only', 'only in', 'in bacillus', 'bacillus subtilis', 'subtilis and', 'and 667', '667 are', 'are widel

In [69]:
with open('predictions.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["id", "class"])
    for i in range(len(predictions)):
        writer.writerow([i+1, predictions[i]])