In [1]:
from collections import defaultdict as dd
import pandas as pd
import numpy as np
import time


In [2]:
#Reading the dataset

cols=["sentence", "sentiment"]

# Tab-separated file read without a header row (column names are supplied
# explicitly); sentences are lower-cased once here so later cells can treat
# the text as case-normalised.
raw_df = (
    pd.read_csv(
        "a1_d3.txt",
        sep='\t',
        names=cols,
        dtype={"sentence":"str", "sentiment":"int"},
    )
    .assign(sentence=lambda frame: frame["sentence"].str.lower())
)

In [3]:
class SimpleClassifier(object):
    """Naive Bayes text classifier with add-alpha smoothing for labels 0 and 1.

    Documents are split on single spaces, each word is lower-cased and, when
    ``n_gram`` is greater than 1, consecutive words are joined into
    overlapping n-grams. The same tokenisation is used for the vocabulary,
    the per-class counts and prediction, so the three always agree.

    Attributes:
        n: n-gram size given to the constructor.
        V: vocabulary (a set of tokens) once ``train`` has run.
        logprior: class -> log prior probability.
        loglikelihoods: class -> token -> log P(token | class).
        count_of_words: class -> list of raw documents while documents are
            being collected; ``train`` replaces it with class -> token counts.
        prior: class -> number of tokens accumulated by ``CPWC``.
    """

    def __init__(self, n_gram=1, printing=False):
        """Create an untrained classifier.

        Args:
            n_gram: size of the n-grams used as features (1 = single words).
            printing: accepted for backward compatibility; currently unused.
        """
        self.prior = dd(int)
        self.n = n_gram
        self.logprior = {}
        self.count_of_words = dd(list)
        self.loglikelihoods = dd(dd)
        self.V = []

    def words_to_grams(self, words):
        """Join consecutive words into space-separated n-grams of size ``self.n``.

        Args:
            words: list of word strings.

        Returns:
            A new list. For ``self.n <= 1`` it is a copy of ``words``; otherwise
            it holds every run of ``self.n`` consecutive words joined by a
            space (empty when there are fewer than ``self.n`` words).
        """
        size = self.n
        if size <= 1:
            return list(words)
        return [" ".join(words[i:i + size]) for i in range(len(words) - size + 1)]

    def _tokenize(self, doc):
        """Turn one document string into the list of tokens used as features."""
        # Split on single spaces (as the vocabulary always did) and lower-case
        # each word so counting and prediction match the vocabulary's casing.
        return self.words_to_grams([word.lower() for word in doc.split(" ")])

    def CPWC(self, training_set, training_labels):
        """Compute prior and word count: accumulate token totals and documents per class.

        Adds the number of tokens of every sentence to ``self.prior[label]``
        and appends the raw sentence to ``self.count_of_words[label]``.

        Args:
            training_set: mapping/frame with a 'sentence' column of strings.
            training_labels: mapping/frame with a 'sentiment' column of labels.
        """
        for x, y in zip(training_set['sentence'], training_labels['sentiment']):
            self.prior[y] += len(self._tokenize(x))
            self.count_of_words[y].append(x)

    def CWIC(self):
        """Compute words in classes: count every token of the collected documents.

        Expects ``self.count_of_words`` to map class -> list of raw documents.

        Returns:
            dict mapping class -> defaultdict(int) of token -> occurrence count.
        """
        count = {}
        for c, docs in list(self.count_of_words.items()):
            count[c] = dd(int)
            for doc in docs:
                for token in self._tokenize(doc):
                    count[c][token] += 1

        return count

    def CV(self, documents):
        """Compute the vocabulary: the set of distinct tokens in ``documents``.

        Args:
            documents: iterable of document strings.

        Returns:
            set of tokens.
        """
        vocabulary = set()

        for doc in documents:
            vocabulary.update(self._tokenize(doc))

        return vocabulary

    def train(self, N_c, training_set, training_labels, alpha=1):
        """Fit log priors and smoothed log likelihoods from the training data.

        Any state left by an earlier ``train`` or ``CPWC`` call is discarded
        first, so the method can be called repeatedly on the same instance.

        Args:
            N_c: number of training documents whose label is 0. Every other
                class is given ``len(training_set) - N_c`` documents, i.e. the
                prior computation is binary.
            training_set: frame with a 'sentence' column of strings.
            training_labels: frame with a 'sentiment' column of labels.
            alpha: additive smoothing constant.
        """
        # Number of documents
        N_doc = len(training_set)

        # Vocabulary used in the training set
        self.V = self.CV(training_set['sentence'])
        vocabulary_size = len(self.V)

        # Start from a clean slate: count_of_words is replaced by count dicts
        # at the end of a previous train(), and CPWC may already have appended
        # the same documents.
        self.logprior = {}
        self.loglikelihoods = dd(dd)
        self.count_of_words = dd(list)

        # Group the raw documents by class
        for x, y in zip(training_set['sentence'], training_labels['sentiment']):
            self.count_of_words[y].append(x)

        # Set of all classes
        all_classes = set(training_labels['sentiment'])

        # From here on count_of_words maps class -> token -> count
        self.count_of_words = self.CWIC()

        for c in all_classes:
            # N_c itself is never modified, so the result does not depend on
            # the order in which the classes are visited.
            docs_in_class = float(N_c) if c == 0 else float(N_doc - N_c)

            # Log prior of the class
            self.logprior[c] = np.log(docs_in_class / N_doc)

            # Total number of token occurrences in this class
            class_counts = self.count_of_words[c]
            total_count = sum(class_counts[word] for word in self.V)
            denominator = total_count + alpha * vocabulary_size

            # Smoothed log likelihood of every vocabulary token for this class
            for word in self.V:
                self.loglikelihoods[c][word] = np.log((class_counts[word] + alpha) / denominator)

    def predicting(self, test_doc):
        """Score a document against every trained class.

        Args:
            test_doc: document string.

        Returns:
            dict mapping class -> log prior plus the summed log likelihoods of
            the document's tokens that are in the vocabulary. Keys 0 and 1 are
            always present and stay 0 for a class that was not trained.
        """
        sums = {
            0: 0,
            1: 0,
        }
        # Tokenise once; the tokens are the same for every class.
        known_tokens = [token for token in self._tokenize(test_doc) if token in self.V]
        for c in self.logprior:
            sums[c] = self.logprior[c]
            for token in known_tokens:
                sums[c] += self.loglikelihoods[c][token]

        return sums

In [4]:
def prediction_evaluator(validation_set,validation_labels,trained_classifier):
    """Score a trained classifier on a validation split.

    Each sentence is passed to ``trained_classifier.predicting``; class 0 is
    predicted when its score is greater than or equal to the score of class 1,
    otherwise class 1. Class 1 is treated as the positive class for
    precision, recall and F-score.

    Args:
        validation_set: frame with a 'sentence' column of strings.
        validation_labels: frame with a 'sentiment' column of 0/1 labels.
        trained_classifier: object whose ``predicting(sentence)`` returns a
            mapping with scores under the keys 0 and 1.

    Returns:
        A tuple ``(predictions_list, accuracy, F_score)`` where
        ``predictions_list`` holds "correct"/"wrong" per sentence,
        ``accuracy`` is the percentage of correct predictions rounded to a
        whole number, and ``F_score`` is the F1 score. Precision, recall and
        F-score are reported as 0.0 when their denominator is zero, and the
        accuracy is 0 for an empty validation set.
    """
    correct_predictions = 0
    predictions_list = []

    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for sentence, label in zip(validation_set['sentence'], validation_labels['sentiment']):
        probabilities = trained_classifier.predicting(sentence)

        # Ties go to class 0. The decision is made afresh for every row so a
        # previous row's prediction can never leak into this one.
        prediction = 0 if probabilities[0] >= probabilities[1] else 1

        if prediction == label:
            correct_predictions += 1
            predictions_list.append("correct")
            if prediction == 1:
                TP += 1     # true positive
            else:
                TN += 1     # true negative
        else:
            predictions_list.append("wrong")
            if prediction == 1:
                FP += 1     # false positive
            else:
                FN += 1     # false negative

    total = len(validation_labels)
    accuracy_percent = correct_predictions / total * 100 if total else 0.0

    # Guard every ratio: a fold without positive predictions or without
    # positive labels must not abort the whole evaluation.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        F_score = 2 * precision * recall / (precision + recall)
    else:
        F_score = 0.0

    print("Correct Predictions: {} out of {} ({}%)".format(correct_predictions, total, round(accuracy_percent, 5)))
    # The returned accuracy is rounded to a whole percent (round() without
    # digits), unlike the printed value, which keeps five decimals.
    return predictions_list, round(accuracy_percent), F_score

# Number of cross-validation folds and the per-fold score accumulators.
n_folds = 5
F_Score_list, accuracy_list = [], []

In [5]:
#Time for splitting data and computation

start = time.time()

# Start from empty result lists so that re-running this cell does not keep
# appending to lists filled by a previous run.
F_Score_list = []
accuracy_list = []

# Contiguous, unshuffled folds whose size follows from n_folds instead of a
# fixed 200 rows (1000 rows with n_folds=5 still gives 200 rows per fold).
n_rows = len(raw_df)
fold_size = n_rows // n_folds

#splitting into n folds

for i in range(n_folds):

    fold_start = i * fold_size
    # The last fold also takes the rows left over by the integer division.
    fold_stop = (i + 1) * fold_size if i < n_folds - 1 else n_rows

    #Boolean numpy array for splitting: True = training row, False = held-out row

    msk = np.ones(n_rows, dtype=bool)
    msk[fold_start:fold_stop] = False

    #Making the training and test datasets for this fold

    train=raw_df[msk]
    test=raw_df[~msk]

    #Splitting into lines and their sentiments

    training_set=train.iloc[:, [0]]
    training_labels=train.iloc[:, [1]]
    validation_set=test.iloc[:, [0]]
    validation_labels=test.iloc[:, [1]]

    #Counting number of negative sentiments

    N_c = int((training_labels['sentiment'] == 0).sum())

    NaiveBayesClassifier = SimpleClassifier()
    NaiveBayesClassifier.train(N_c, training_set, training_labels, alpha=1)
    prediction_list, accuracy, F_score = prediction_evaluator(validation_set, validation_labels, NaiveBayesClassifier)
    print("F-score is" , F_score, "\n")
    accuracy_list.append(accuracy)
    F_Score_list.append((F_score))



print("\nF-Score is {}±{}".format(round(np.mean(F_Score_list),4),round(np.std(F_Score_list),4)))
print("Accuracy is {}±{}\n".format(round(np.mean(accuracy_list),2),round(np.std(accuracy_list),2)))

end = time.time()
print('Ran in {} seconds'.format(round(end - start, 3)))

Correct Predictions: 159 out of 200 (79.5%)
F-score is 0.8075117370892019 

Correct Predictions: 159 out of 200 (79.5%)
F-score is 0.8093023255813954 

Correct Predictions: 151 out of 200 (75.5%)
F-score is 0.7741935483870969 

Correct Predictions: 154 out of 200 (77.0%)
F-score is 0.7653061224489796 

Correct Predictions: 154 out of 200 (77.0%)
F-score is 0.7553191489361702 


F-Score is 0.7823±0.0221
Accuracy is 78.0±1.67

Ran in 0.271 seconds
