In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the notebook
# later changes its working directory to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import operator
import pandas as pd
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [0]:
# Work from the assignment folder on the mounted Drive so that the relative
# "aclImdb/..." paths used when loading the data resolve against it.
# NOTE: a bare os.getcwd() ahead of this call would have no effect (its value
# is discarded and it is not the cell's last expression), so none is made.
os.chdir('/content/drive/My Drive/NLP Assign 2')

In [0]:
# --- Training split: read the svmlight file, store counts as uint16 ---------
X_train, y_train = load_svmlight_file("aclImdb/train/labeledBow.feat")
X_train = X_train.astype(np.uint16)

# --- Test split: same treatment ----------------------------------------------
X_test, y_test = load_svmlight_file("aclImdb/test/labeledBow.feat")
X_test = X_test.astype(np.uint16)

# Binarise the targets: values >= 7 become class 1, everything else class 0.
# (presumably the raw targets are review ratings -- confirm against the dataset README)
y_train = (y_train >= 7).astype(int)
y_test = (y_test >= 7).astype(int)

In [0]:
def conf_matrix(pred, test):
    """Tally the binary confusion-matrix cells for predictions vs. true labels.

    Args:
        pred: Indexable sequence of predicted labels (expected 0 or 1).
        test: Indexable sequence of true labels (expected 0 or 1). Its length
            decides how many positions are compared.

    Returns:
        dict with integer counts under the keys 'TP', 'TN', 'FP' and 'FN'.
        A position is only counted when both labels are 0 or 1; any other
        combination is silently skipped.
    """
    counts = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

    for idx in range(len(test)):
        actual = test[idx]
        if actual == 1:
            # True label is positive: the prediction is either a miss or a hit.
            predicted = pred[idx]
            if predicted == 0:
                counts['FN'] += 1
            elif predicted == 1:
                counts['TP'] += 1
        elif actual == 0:
            # True label is negative: false alarm or correct rejection.
            predicted = pred[idx]
            if predicted == 1:
                counts['FP'] += 1
            elif predicted == 0:
                counts['TN'] += 1

    return counts

def get_accuracy(pred, test):
    """Return the fraction of counted samples that were classified correctly.

    Accuracy is (TP + TN) / (TP + TN + FP + FN), using the counts produced by
    ``conf_matrix(pred, test)``.

    Args:
        pred: Indexable sequence of predicted labels (0 or 1).
        test: Indexable sequence of true labels (0 or 1).

    Returns:
        The accuracy as a float, or 0.0 when no sample was counted (for
        example, empty input). This mirrors the zero-denominator convention
        of the precision/recall/F1 helpers instead of raising
        ZeroDivisionError.
    """
    matrix = conf_matrix(pred, test)
    correct = matrix['TP'] + matrix['TN']
    total = correct + matrix['FN'] + matrix['FP']
    if total == 0:
        # Nothing to score: avoid dividing by zero.
        return 0.0
    return correct / total

def get_precision(pred, test):
    """Return the precision of the predictions: TP / (TP + FP).

    Args:
        pred: Indexable sequence of predicted labels (0 or 1).
        test: Indexable sequence of true labels (0 or 1).

    Returns:
        The precision as a float, or 0.0 when there are no positive
        predictions (TP + FP == 0).
    """
    matrix = conf_matrix(pred, test)
    predicted_positive = matrix['TP'] + matrix['FP']
    # Explicit guard instead of a bare ``except:`` so that only the intended
    # zero-denominator case is mapped to 0 and nothing else is swallowed.
    if predicted_positive == 0:
        return 0.0
    return matrix['TP'] / predicted_positive

def get_recall(pred,test):
    """Return the recall of the predictions: TP / (TP + FN).

    Args:
        pred: Indexable sequence of predicted labels (0 or 1).
        test: Indexable sequence of true labels (0 or 1).

    Returns:
        The recall as a float, or 0.0 when there are no actual positives
        among the counted samples (TP + FN == 0).
    """
    matrix = conf_matrix(pred, test)
    actual_positive = matrix['TP'] + matrix['FN']
    # Explicit guard instead of a bare ``except:`` so that only the intended
    # zero-denominator case is mapped to 0 and nothing else is swallowed.
    if actual_positive == 0:
        return 0.0
    return matrix['TP'] / actual_positive

def get_f1_measure(pred,test):
    """Return the F1 score: the harmonic mean of precision and recall.

    Computed as 2 * precision * recall / (precision + recall), with both
    terms obtained from ``get_precision`` and ``get_recall``.

    Args:
        pred: Indexable sequence of predicted labels (0 or 1).
        test: Indexable sequence of true labels (0 or 1).

    Returns:
        The F1 score as a float, or 0.0 when precision and recall are both 0.
    """
    precision = get_precision(pred, test)
    recall = get_recall(pred, test)
    denominator = precision + recall
    # Explicit guard instead of a bare ``except:`` so that only the intended
    # zero-denominator case is mapped to 0 and nothing else is swallowed.
    if denominator == 0:
        return 0.0
    return 2 * precision * recall / denominator

In [0]:
# uint16 training matrix under the name the fitting/prediction cells use.
# X_train was already cast to uint16 when it was loaded.
X_train_arr = X_train.astype(np.uint16)

In [0]:
# Distinct class labels present in the training targets; handed to
# partial_fit as the list of classes when the classifier is trained.
a = np.unique(y_train)

In [8]:
# Multinomial naive Bayes with default settings, trained in a single
# partial_fit pass over the whole training matrix.
clf1 = MultinomialNB()
clf1.partial_fit(X_train_arr, y_train, classes=a)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# Predict on the same matrix the classifier was trained on, to obtain the
# training-set metrics reported below.
pred_train = clf1.predict(X_train_arr)


### Evaluation on training data

In [17]:
# Score the training-set predictions with the metric helpers defined earlier.
train_labels = list(y_train)  # converted once and shared by all four helpers

accuracy = get_accuracy(pred_train, train_labels)
precision = get_precision(pred_train, train_labels)
recall = get_recall(pred_train, train_labels)
f1_measure = get_f1_measure(pred_train, train_labels)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-measure: ", f1_measure)

Accuracy:  0.89996
Precision:  0.9315494173500216
Recall:  0.86336
F-measure:  0.8961594353331949


In [0]:
# uint16 test matrix under the name used by the densify step that follows.
# X_test was already cast to uint16 when it was loaded.
x_test = X_test.astype(np.uint16)

In [0]:
# Dense form of the test features, needed by the column padding that follows.
# NOTE(review): a dense copy of the whole test set is presumably large in
# memory -- confirm it fits the runtime, or consider padding in sparse form.
X_test_arr = x_test.todense()

In [0]:
# Pad the dense test matrix with all-zero columns so it has as many feature
# columns as the matrix the classifier was trained on. The row count and the
# number of missing columns are read from the matrices themselves rather than
# hard-coded, so this cell keeps working if either split changes size.
n_test_rows, n_test_cols = X_test_arr.shape
n_missing_cols = X_train_arr.shape[1] - n_test_cols
if n_missing_cols < 0:
    # Padding cannot help here; fail with a clear message instead of a
    # confusing shape error at prediction time.
    raise ValueError(
        "test matrix has more feature columns than the training matrix"
    )
z = np.zeros((n_test_rows, n_missing_cols), dtype=np.uint16)
Xtest = np.append(X_test_arr, z, axis=1)

In [0]:
# Predict the test set piece by piece and collect the labels in order.
pred_test = []
chunkSize = 25  # despite the name, this is the NUMBER of chunks, not rows per chunk
# array_split yields the same equal pieces as np.split when the row count is
# divisible by chunkSize, but also copes with a remainder instead of raising.
chunks = np.array_split(Xtest, chunkSize, axis=0)
for chunk in chunks:
    op = clf1.predict(chunk)
    pred_test.extend(op)

In [23]:
# Score the test-set predictions with the metric helpers defined earlier.
test_labels = list(y_test)  # converted once and shared by all four helpers

accuracy = get_accuracy(pred_test, test_labels)
precision = get_precision(pred_test, test_labels)
recall = get_recall(pred_test, test_labels)
f1_measure = get_f1_measure(pred_test, test_labels)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-measure: ", f1_measure)

Accuracy:  0.8136
Precision:  0.8590401172375893
Recall:  0.75032
F-measure:  0.8010077717994706


# Problem 6

Compared with all the other models, multinomial naive Bayes performed best, because the vocabulary is very large and the features are word counts.