In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from sklearn.metrics import accuracy_score 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adelalkhamisy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# create a dataframe with two columns "Rate" and "Comment" to make the preprocessing easy
reviewTrain = pd.read_csv("train_file.txt", sep='\t',header=None,names=['Rate','Comment'], skip_blank_lines=True, dtype = str, na_filter=False,infer_datetime_format=True, error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
reviewTrain.head()

Unnamed: 0,Rate,Comment
0,-1,German filmmaker Ulli Lommel has managed a tas...
1,1,It is an excellent thriller from Turkey which ...
2,1,"Finally, the uncut version of ""Baby Face"" surf..."
3,1,In this glorious telling of a weekend shared a...
4,1,"Dog Bite Dog isn't going to be for everyone, b..."


In [4]:
reviewTest = pd.read_csv("test_file.txt", sep='\t',header=None,names=['Comment'], skip_blank_lines=True, dtype = str, na_filter=False,infer_datetime_format=True, error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
"""
Function: preProcessing(corpus): 
It does many necessary preprocessing to data as follow:
1. delete anything that does not belong to [a-z] or [A-Z] from the data
2. transfer all characters to lower case
3. remove all stop words from the data except the word "not" because it is necessary information
4. apply stemming to data

Parameter: dataframe of reviewDataset 
return: list of preprocessed corpus of reviewDataset according to the above 4 steps
"""
def preProcessing(reviewDataset):
        preprocessedCorpus = []
        # iterate over ratring in datafame
        for i in range(len(reviewDataset)):
            #delete all symbols except a-z and A-Z in the Comment column
            comment = re.sub('[^a-zA-Z]', ' ', reviewDataset['Comment'][i])
            comment = comment.lower()
            comment = comment.split()
            porterstemmer = PorterStemmer() 
            sw = stopwords.words('english')
            #exclude not from stop word set
            sw.remove('not')
            # apply stemming to words in comment 
            comment = [porterstemmer.stem(word) for word in comment if not word in set(sw)]
            #join the words 
            comment = ' '.join(comment)
            #add train data to corpus
            preprocessedCorpus.append(comment)
        return preprocessedCorpus

In [6]:
#preprocess data
sentiment = reviewTrain.iloc[:,0].values
preprocessedReviewTrain = preProcessing(reviewTrain)
preprocessedReviewTest = preProcessing(reviewTest)

In [7]:
def knn(k, testTfidf, trainTfidf, x_test, y_train):
    prediction = []    
    for i, l in enumerate(x_test):
        # calculate cosine similarity with the help of cosine_similarity in sklearn 
        cs = cosine_similarity(tfidfTest[i], tfidfTrain).flatten()
        # utilizing numpy.argsort to find the K neighbors indices      
        nearestNeighborIndices = cs.argsort()[:-k:-1]
        #get the list of k nearest neighbor  from the sentiment  
        nearestNeighborList = y_train[nearestNeighborIndices]
        
        # convert nearestNeighborList from string type to int type
        integer_map=map(int, nearestNeighborList)
        nearestNeighborList = list(integer_map)
        
        ''' if sum of the nearestNeighborList is >0, then this mean that the +1 class is the majority 
        if the sum is zero, then the positive and negative are equal, but the program will classify it as positive
        if the the sum is negative, then the majority of the neighbors are "-1" 
        '''
        decision = sum(nearestNeighborList)
        
        if decision >= 0:
          prediction.append("+1")
        else:
          prediction.append("-1")
            
    return pd.DataFrame(prediction)

In [8]:
Kfold = KFold(n_splits=10)
for itrain, itest in Kfold.split(preprocessedReviewTrain):
   x_train, x_test = preprocessedReviewTrain[itrain[0]:itrain[-1]], preprocessedReviewTrain[itest[0]:itest[-1]]
   y_train, y_test = sentiment[itrain[0]:itrain[-1]], sentiment[itest[0]:itest[-1]]

In [9]:
# create spase matrix using Tf-idf vectorizer for training and testing data
v = TfidfVectorizer(ngram_range=(2,5), max_features=30000)
tfidfTrain = v.fit_transform(x_train)
tfidfTest = v.transform(x_test)

In [10]:
Accuracy = []
for i in range(1, 300, 5):
  print(i)
  y_hat = knn(i,tfidfTest, tfidfTrain, x_test, y_train)    
  ac = accuracy_score(y_test, y_hat)
  Accuracy.append((i, ac))
  print(i, ac)

1
1 0.4783188792528352
6
6 0.7058038692461641
11
11 0.7284856571047365
16
16 0.7531687791861241
21
21 0.7531687791861241
26
26 0.7671781187458305
31
31 0.7658438959306204
36
36 0.7725150100066711
41
41 0.7745163442294863
46
46 0.7751834556370913
51
51 0.7778519012675117
56
56 0.7791861240827218
61
61 0.7645096731154103
66
66 0.7671781187458305
71
71 0.7705136757838559
76
76 0.7785190126751167
81
81 0.781187458305537
86
86 0.7805203468979319
91
91 0.7791861240827218
96
96 0.7758505670446965
101
101 0.7858572381587725
106
106 0.7798532354903269
111
111 0.781187458305537
116
116 0.7791861240827218
121
121 0.781854569713142
126
126 0.7825216811207472
131
131 0.7925283522348232
136
136 0.7885256837891927
141
141 0.7938625750500333
146
146 0.7938625750500333
151
151 0.7911941294196131
156
156 0.7918612408272181
161
161 0.7911941294196131
166
166 0.7911941294196131
171
171 0.789859906604403
176
176 0.7851901267511674
181
181 0.7858572381587725
186
186 0.790527018012008
191
191 0.7951967978652