# Importing Data and Modules 

In [1]:
import csv
from collections import Counter

import numpy as np
import pandas as pd

# Read the tab-separated, header-less file (pandas labels the columns 0, 1, ...)
# and give columns 0 and 1 the names 'X' and 'Y' used by the rest of the notebook.
dataset = pd.read_csv(
    "sentiments.tsv",
    sep="\t",
    header=None,
).rename(columns={0: 'X', 1: 'Y'})

# Data Preprocessing 

In [2]:
# Characters that get blanked out of the text during cleaning. Despite the
# name, the list also holds the ten digits, and '-' occurs twice (harmless,
# since every entry is simply replaced by a space).
punctuation = (
    list(".?\"',-_!:;(){}[]/#*&$%^@")
    + list("+-\\><=")
    + list("0123456789")
)

# Tokens that are dropped from the text before any counting is done.
stopword = """
    i me my myself we our ours ourselves
    you your yours yourself yourselves he him
    his himself she her hers herself it its
    itself they them their theirs themselves
    what which who whom this that these those
    am is are was were be been being have
    has had having do does did doing a an
    the and but if or because as until while
    of at by for with about against gonna between into
    through during before after above below to from up
    down in out on off over under again further then
    once here there when where why how all any both
    each few more most other some such nor only own
    same so than too very s t can will just don
    should now d ll re ve m whose
""".split()


### Punctuation Removal

In [3]:
# Lower-case every entry of column 'X' and replace each character listed in
# `punctuation` with a space.
#
# A single translation table does the work of the former per-character
# `str.replace` passes, and lower-casing now happens once per string instead
# of once per punctuation entry. The result is the same because every entry
# of `punctuation` is a single character and the replacement (a space) is not
# itself in the list, so the order of replacements cannot matter.
punct_to_space = str.maketrans({ch: ' ' for ch in punctuation})
dataset['X'] = dataset['X'].str.lower().str.translate(punct_to_space)


### Stopword Removal

In [4]:
# Rebuild each entry of column 'X' from its whitespace-separated tokens,
# keeping only the tokens that are not listed in `stopword`.
for row_id in range(len(dataset['X'])):
    kept_tokens = [
        token
        for token in dataset.loc[row_id, 'X'].split()
        if token not in stopword
    ]
    dataset.loc[row_id, 'X'] = " ".join(kept_tokens)

In [5]:
dataset

Unnamed: 0,X,Y
0,slow moving aimless movie distressed drifting ...,0
1,not sure lost flat characters audience nearly ...,0
2,attempting artiness black white clever camera ...,0
3,little music anything speak,0
4,best scene movie gerardo trying find song keep...,1
...,...,...
804,got bored watching jessice lange take clothes,0
805,unfortunately virtue film production work lost...,0
806,word embarrassing,0
807,exceptionally bad,0


In [6]:
# Split every cleaned text into its list of tokens and collect the matching
# labels, both in row order (rows are addressed by the labels 0..len-1).
row_ids = range(0, len(dataset))
x_dataset = [dataset.loc[row_id, 'X'].split() for row_id in row_ids]
y_dataset = [dataset.loc[row_id, 'Y'] for row_id in row_ids]
        


# Splitting Dataset into train/test

In [7]:
def train_test_split(x=None, y=None, train_split=0.8, seed=776):
    """Shuffle the samples reproducibly and split them into train and test parts.

    Args:
        x: Sequence of samples. When omitted, the notebook-level
            ``x_dataset`` is used.
        y: Sequence of labels aligned with ``x``. When omitted, the
            notebook-level ``y_dataset`` is used.
        train_split: Fraction of the samples that goes into the training
            part; everything after that goes into the test part.
        seed: Seed of the private random generator used for shuffling.

    Returns:
        A tuple ``(xtrain, ytrain, xtest, ytest)`` of plain lists.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, or ``train_split``
            is outside ``[0, 1]``.
    """
    if x is None:
        x = x_dataset
    if y is None:
        y = y_dataset

    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    if not 0 <= train_split <= 1:
        raise ValueError("train_split must be between 0 and 1")

    n = len(x)
    idx = np.arange(n)

    # Shuffle with a private legacy RandomState instead of reseeding NumPy's
    # global generator, so calling this function no longer alters the random
    # state seen by other code. The same seed is meant to give the same
    # permutation as the former np.random.seed(...) + np.random.shuffle(...).
    np.random.RandomState(seed).shuffle(idx)

    train_cnt = round(train_split * n)
    train_idx, test_idx = idx[:train_cnt], idx[train_cnt:]

    xtrain = [x[i] for i in train_idx]
    ytrain = [y[i] for i in train_idx]
    xtest = [x[i] for i in test_idx]
    ytest = [y[i] for i in test_idx]

    return xtrain, ytrain, xtest, ytest

x_train,y_train,x_test,y_test = train_test_split()

# Creating Vocabulary on Training Dataset

In [8]:
# Distinct tokens that occur in the training samples; `vocabulary` is the same
# collection as a list so that it can be indexed.
vocabulary_set = {token for sample in x_train for token in sample}
vocabulary = list(vocabulary_set)



# Calculating Prior Probability for each Class

In [9]:
# Class priors from the training labels: `o_cnt` counts labels equal to 1,
# `z_cnt` counts every other label, and each prior is that count divided by
# the number of training labels.
n_train = len(y_train)
o_cnt = sum(1 for label in y_train if label == 1)
z_cnt = n_train - o_cnt

zero_class = z_cnt / n_train
one_class = o_cnt / n_train
print(zero_class)
print(one_class)

0.4652241112828439
0.5347758887171561


# Calculating Conditional Probabilities for Every Vocabulary Word

In [10]:
# Concatenate the tokens of all training samples per class: samples labelled 0
# go into `zero_class_data`, every other sample into `one_class_data`.
zero_class_data = []
one_class_data = []

for tokens, label in zip(x_train, y_train):
    bucket = zero_class_data if label == 0 else one_class_data
    bucket.extend(tokens)

In [11]:
# Laplace-smoothed conditional probability of every vocabulary word per class:
#     P(word | class) = (occurrences of word in class + 1)
#                       / (tokens in class + vocabulary size)
#
# NOTE: `z_cnt` and `o_cnt` are rebound here to per-class *token* counts; the
# prior-probability cell used the same names for per-class sample counts.
v_cnt = len(vocabulary)
z_cnt = len(zero_class_data)
o_cnt = len(one_class_data)

# Count each class's tokens once, instead of rescanning the full token list
# for every vocabulary word. A Counter returns 0 for words it has not seen.
zero_word_counts = Counter(zero_class_data)
one_word_counts = Counter(one_class_data)

zero_class_dict = {
    word: (zero_word_counts[word] + 1) / (z_cnt + v_cnt) for word in vocabulary
}
one_class_dict = {
    word: (one_word_counts[word] + 1) / (o_cnt + v_cnt) for word in vocabulary
}
    

# Calculating Posterior Probability

In [12]:
# Score every test sample under both classes and keep the more likely class.
#
# Scores are accumulated as sums of log-probabilities rather than products of
# probabilities: multiplying many values far below 1 underflows to 0.0 for
# longer samples, which makes both scores equal and forces the tie-break
# (class 1) regardless of the evidence. The logarithm is monotonic, so the
# comparison below picks the same class whenever the products were still
# representable.
#
# np.log(0) yields -inf (with a RuntimeWarning) if a class has prior 0, which
# keeps that class from ever winning, matching a zero product.
log_prior_zero = np.log(zero_class)
log_prior_one = np.log(one_class)

# Per-word log-likelihoods, looked up by dict key instead of scanning the
# `vocabulary` list for every token.
log_zero_likelihood = {word: np.log(p) for word, p in zero_class_dict.items()}
log_one_likelihood = {word: np.log(p) for word, p in one_class_dict.items()}

# Words absent from the training vocabulary get the smoothed probability of a
# zero-count word.
log_unseen_zero = np.log(1 / (z_cnt + v_cnt))
log_unseen_one = np.log(1 / (o_cnt + v_cnt))

y_predict = []

for tokens in x_test:
    z_score = log_prior_zero
    o_score = log_prior_one
    for token in tokens:
        z_score += log_zero_likelihood.get(token, log_unseen_zero)
        o_score += log_one_likelihood.get(token, log_unseen_one)

    # Ties go to class 1, as before.
    y_predict.append(0 if z_score > o_score else 1)


In [13]:
def confusion_matrix(y_true=None, y_pred=None):
    """Print accuracy, precision and recall (in percent) for binary 0/1 labels.

    Class 1 is treated as the positive class. Pairs in which either value is
    neither 0 nor 1 are not counted.

    Args:
        y_true: Sequence of actual labels. When omitted, the notebook-level
            ``y_test`` is used.
        y_pred: Sequence of predicted labels. When omitted, the
            notebook-level ``y_predict`` is used.

    Returns:
        None. The three metrics are written to standard output.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    if y_true is None:
        y_true = y_test
    if y_pred is None:
        y_pred = y_predict
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have the same length")

    def _percent(numerator, denominator):
        # A metric whose denominator is zero is undefined; report 0.0 instead
        # of raising ZeroDivisionError (e.g. precision with no positive
        # predictions, or any metric on empty input).
        if denominator == 0:
            return 0.0
        return (numerator / denominator) * 100

    TP, TN, FP, FN = 0, 0, 0, 0
    for predicted, actual in zip(y_pred, y_true):
        if predicted == 1 and actual == 1:
            TP += 1
        elif predicted == 1 and actual == 0:
            FP += 1
        elif predicted == 0 and actual == 1:
            FN += 1
        elif predicted == 0 and actual == 0:
            TN += 1

    accuracy = _percent(TP + TN, TP + TN + FP + FN)
    precision = _percent(TP, TP + FP)
    recall = _percent(TP, TP + FN)
    print("Accuracy "+str(accuracy))
    print("Precision "+str(precision))
    print("Recall "+str(recall))
    
confusion_matrix()

Accuracy 87.03703703703704
Precision 83.75
Recall 89.33333333333333
