In [5]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import operator
import pandas as pd
import numpy as np
from sklearn.datasets import load_svmlight_file

In [0]:
# Switch to the assignment folder on the mounted Drive so that the relative
# "aclImdb/..." dataset paths used when loading the data resolve correctly.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running outside this Colab/Drive layout.
os.chdir('/content/drive/My Drive/NLP Assign 2')

### Load Dataset

In [0]:
# Load the bag-of-words features and their labels from the svmlight files.
X_train, y_train = load_svmlight_file("aclImdb/train/labeledBow.feat")
# Pin the test matrix to the training feature count: when n_features is not
# given, the column count is inferred separately for each file, so the train
# and test matrices could otherwise end up with different widths.
X_test, y_test = load_svmlight_file(
    "aclImdb/test/labeledBow.feat", n_features=X_train.shape[1]
)
# Store the feature values as 16-bit unsigned integers.
# NOTE(review): assumes no single feature value exceeds 65535 — confirm.
X_train = X_train.astype(dtype=np.uint16)
X_test = X_test.astype(dtype=np.uint16)

# Binarise the labels: 1 where the raw label is >= 7, otherwise 0
# (presumably the raw labels are review ratings — verify against the dataset).
y_train = (y_train >= 7).astype(int)
y_test = (y_test >= 7).astype(int)

# Problem 1

In [0]:
def conf_matrix(pred, test):
    """Tally the cells of a binary confusion matrix.

    For every position ``i`` in ``range(len(test))`` the true label
    ``test[i]`` is compared with the predicted label ``pred[i]``.
    Positions where either value is neither 0 nor 1 are not counted.

    Returns a dict with integer counts under the keys 'TP', 'TN', 'FP', 'FN'.
    """
    counts = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

    for idx in range(len(test)):
        actual = test[idx]
        if actual == 1:
            # Truly positive sample: either missed (FN) or caught (TP).
            guess = pred[idx]
            if guess == 0:
                counts['FN'] += 1
            elif guess == 1:
                counts['TP'] += 1
        elif actual == 0:
            # Truly negative sample: either a false alarm (FP) or correct (TN).
            guess = pred[idx]
            if guess == 1:
                counts['FP'] += 1
            elif guess == 0:
                counts['TN'] += 1
    return counts

def get_accuracy(pred, test):
    """Return the fraction of predictions that agree with the true labels.

    Computed as (TP + TN) / (TP + TN + FN + FP) from
    ``conf_matrix(pred, test)``.

    Returns 0 when no sample was counted (for example on empty input),
    matching how the precision/recall/F1 helpers treat an undefined ratio
    instead of raising ZeroDivisionError.
    """
    matrix = conf_matrix(pred, test)
    correct = matrix['TP'] + matrix['TN']
    total = correct + matrix['FN'] + matrix['FP']
    if total == 0:
        return 0
    return correct / total

def get_precision(pred, test):
    """Return the precision TP / (TP + FP) of the positive class.

    Returns 0 when nothing was predicted positive (zero denominator).
    The zero case is tested explicitly rather than through a bare ``except``
    so that unrelated errors are never silently converted into a score of 0.
    """
    matrix = conf_matrix(pred, test)
    predicted_positive = matrix['TP'] + matrix['FP']
    if predicted_positive == 0:
        return 0
    return matrix['TP'] / predicted_positive

def get_recall(pred,test):
    """Return the recall TP / (TP + FN) of the positive class.

    Returns 0 when there are no truly positive samples (zero denominator).
    The zero case is tested explicitly rather than through a bare ``except``
    so that unrelated errors are never silently converted into a score of 0.
    """
    matrix = conf_matrix(pred, test)
    actual_positive = matrix['TP'] + matrix['FN']
    if actual_positive == 0:
        return 0
    return matrix['TP'] / actual_positive

def get_f1_measure(pred,test):
    """Return the F1 score: the harmonic mean of precision and recall.

    Computed as 2 * precision * recall / (precision + recall) using
    ``get_precision`` and ``get_recall``. Returns 0 when both are zero.
    The zero case is tested explicitly rather than through a bare ``except``
    so that unrelated errors are never silently converted into a score of 0.
    """
    precision = get_precision(pred, test)
    recall = get_recall(pred, test)
    denominator = precision + recall
    if denominator == 0:
        return 0
    return 2 * precision * recall / denominator

# Problem 2

In [0]:
def train_majority(y_train):
    """Return the most frequent label in ``y_train``.

    The labels are passed to ``np.bincount``, so they must be non-negative
    integers. When several labels share the highest count, ``np.argmax``
    returns the smallest of them.
    """
    label_counts = np.bincount(y_train)
    return np.argmax(label_counts)

In [0]:
def test_majority(X_test,majority):
    X_Test=X_test.toarray()
    y_pred=np.full(X_Test.shape[0], majority, dtype=int)
    return y_pred

In [0]:
# Fit the majority-class baseline: the most frequent label in the training set.
majority=train_majority(y_train)

In [0]:
# Baseline predictions for the training reviews (every row gets the majority label).
y_pred_train=test_majority(X_train,majority)

### Evaluation on Training Data

In [14]:
# Score the majority baseline against the training labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(y_pred_train, y_train))

Accuracy:  0.5
Precision:  0
Recall:  0.0
F-measure:  0


In [0]:
# Baseline predictions for the test reviews, reusing the majority label
# obtained from the training labels.
y_pred=test_majority(X_test,majority)

In [16]:
# Score the majority baseline against the test labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(y_pred, y_test))

Accuracy:  0.5
Precision:  0
Recall:  0.0
F-measure:  0


# Problem 3:

In [0]:
def train_length(thr_len,rev_length):
    """Classify reviews by length against a threshold.

    Each entry of ``rev_length`` that is strictly greater than ``thr_len``
    is labelled 1; every other entry is labelled 0.

    Returns a list of ints (0 or 1), one per entry of ``rev_length``, in order.
    """
    return [
        1 if rev_length[pos] > thr_len else 0
        for pos in range(len(rev_length))
    ]

### Train Data

In [0]:
# Dense copy of the training feature matrix.
# NOTE(review): toarray() materialises every entry of the matrix, zeros
# included, so this can need a lot of memory — confirm it fits on the runtime.
b=X_train.toarray()

In [0]:
# "Review length" here is the number of non-zero features in each row, i.e.
# how many distinct vocabulary terms occur in the review (not the total token
# count). It is counted on the sparse matrix directly, so no dense copy of the
# training data is required for this step.
rev_len = np.asarray((X_train != 0).sum(axis=1)).ravel()

In [0]:
# Split the training review lengths by label: lengths of reviews labelled 1
# go to `pos`, lengths of all other reviews go to `neg`.
pos = [rev_len[k] for k in range(len(y_train)) if y_train[k] == 1]
neg = [rev_len[k] for k in range(len(y_train)) if not y_train[k] == 1]

Threshold 1: Mean of the training review lengths

Threshold 2: Median of the training review lengths

Threshold 3: Mode of the training review lengths

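These thresholds were selected because the mean, median and mode each summarise the typical review length in the training data, which makes them natural cut-off points for separating shorter reviews from longer ones.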




In [0]:
# Candidate thresholds: three measures of central tendency of the training
# review lengths.
mean = np.mean(rev_len)
median = np.median(rev_len)
# Most frequent length. np.bincount tallies all lengths in a single pass and
# argmax resolves ties to the smallest length; it requires the lengths to be
# non-negative integers.
mode = np.bincount(rev_len).argmax()

In [0]:
# Predict on the training reviews once per threshold: a review is labelled 1
# when its length exceeds the threshold, otherwise 0.
pred_mean_train, pred_median_train, pred_mode_train = [
    train_length(threshold, rev_len) for threshold in (mean, median, mode)
]

### Evaluation on Training set

In [23]:
# Mean-length threshold scored against the training labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(pred_mean_train, y_train))

Accuracy:  0.502
Precision:  0.5027852049910874
Recall:  0.36104
F-measure:  0.4202831067237847


In [24]:
# Median-length threshold scored against the training labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(pred_median_train, y_train))

Accuracy:  0.49608
Precision:  0.4960661528580604
Recall:  0.49432
F-measure:  0.49519153710530533


In [25]:
# Mode-length threshold scored against the training labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(pred_mode_train, y_train))

Accuracy:  0.48284
Precision:  0.48766177739430544
Recall:  0.67824
F-measure:  0.5673749372594947


### Evaluation on Test set

In [0]:
# Dense copy of the test feature matrix.
# NOTE(review): toarray() materialises every entry of the matrix, zeros
# included, so this can need a lot of memory — confirm it fits on the runtime.
c=X_test.toarray()

In [0]:
# Review length of each test review: the number of non-zero features in its
# row (distinct vocabulary terms present, not the total token count). It is
# counted on the sparse matrix directly, so no dense copy of the test data is
# required for this step.
rev_len_test = np.asarray((X_test != 0).sum(axis=1)).ravel()

In [0]:
# Predict on the test reviews with the thresholds derived from the training
# lengths: a review is labelled 1 when its length exceeds the threshold.
pred_mean_test, pred_median_test, pred_mode_test = [
    train_length(threshold, rev_len_test) for threshold in (mean, median, mode)
]

In [29]:
# Mean-length threshold scored against the test labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(pred_mean_test, y_test))

Accuracy:  0.492
Precision:  0.4881544657664061
Recall:  0.32968
F-measure:  0.39356317448190237


In [30]:
# Median-length threshold scored against the test labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(pred_median_test, y_test))

Accuracy:  0.48692
Precision:  0.486303091228952
Recall:  0.4644
F-measure:  0.4750992347669517


In [31]:
# Mode-length threshold scored against the test labels.
for caption, metric_fn in (
    ("Accuracy: ", get_accuracy),
    ("Precision: ", get_precision),
    ("Recall: ", get_recall),
    ("F-measure: ", get_f1_measure),
):
    print(caption, metric_fn(pred_mode_test, y_test))

Accuracy:  0.4814
Precision:  0.4861812778603269
Recall:  0.6544
F-measure:  0.5578857630008526
