In [12]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics

In [15]:
import re
import nltk
import unicodedata
import contractions
from bs4 import BeautifulSoup
from nltk.lm import vocabulary
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [17]:
#Data Handling
train_datapath = '/content/drive/MyDrive/code/traindata.csv'
train_data = pd.read_csv(train_datapath, header=None)
train_data.columns = ["y", "x"]
x_train = train_data["x"]
y_train = train_data["y"]
test_datapath = '/content/drive/MyDrive/code/testdata.csv'
test_data = pd.read_csv(test_datapath, header=None)
test_data.columns = ["x"]
x_test = test_data["x"]

In [18]:
#Preprocessor function
def myPreprocessor(textreview):
    fixed_contractions = contractions.fix(textreview)
    #Removing HTML tags
    textreview_html = BeautifulSoup(fixed_contractions, features="html.parser").get_text() 

    #Cleaning accents, email IDs, URLs, numbers and punctuation
    textreview_proc = re.sub(r'(\s+@\s+)|(http[s]?://\s+)|(\d)|([^\w\s])','', textreview_html)

    #Tokenizing the text
    tokenizer = ToktokTokenizer()
    words= tokenizer.tokenize(textreview_proc)

    #Setting and removing stopwords
    stop_words = set(stopwords.words("english"))
    textreview_stop = [word for word in words if word not in stop_words and len(word)>3]                  

    #Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = ''
    lemmatized_words = [str(lemmatizer.lemmatize(word)) + ' ' for word in textreview_stop]
    textreview_lemmatized = "".join(lemmatized_words)

    return textreview_lemmatized.lower()

In [19]:
class KNearestNeighboursText(object):
  def __init__(self, k):
    self.k = k

  #Converting text into features for computing
  def getFeatures(self, train_text, test_text):
    count_vectorizer = CountVectorizer(preprocessor=myPreprocessor, ngram_range=(1,3), min_df=2, max_df=0.95, 
                                       max_features=8000)
    train_vector = count_vectorizer.fit_transform(train_text)
    test_vector = count_vectorizer.transform(test_text)
    
    return train_vector, test_vector

  #Cosine similarity for ranking feature neighbours
  def getCosineSimilarity(self, train_vector, test_vector):
    similarities = cosine_similarity(test_vector, train_vector)
    return similarities
  
  #k nearest neighbours for the features 
  def predictScoresNN(self, similarities, score):
    self.predicted_scores = []
    for similarity in similarities:
        #Ranking points by similarity
        nearest_neighbours = np.argsort(-similarity)[:self.k] 

        #Clustering neighbouring points 
        counter = 0
        for neighbor in nearest_neighbours:
            if int(score[neighbor]) == 1:
                counter += 1
                
        #Assigning predicted scores
        if counter > 0.5*self.k:
            self.predicted_scores.append(1)
        else:
            self.predicted_scores.append(-1)

    return self.predicted_scores

def main(x_train, y_train, x_test, k):
  model = KNearestNeighboursText(k)

  train_vec, test_vec = model.getFeatures(x_train, x_test)
  sim_matrix = model.getCosineSimilarity(train_vec, test_vec)
  del train_vec, test_vec
  predicted_scores = model.predictScoresNN(sim_matrix, y_train)

  return predicted_scores

In [20]:
predicted_testscores = main(x_train, y_train, x_test, k=134)

In [21]:
#Saving the csv file for predicted test scores
np.savetxt("predicted_scores.csv", predicted_testscores, delimiter="\n", fmt='%d')

In [24]:
#K Fold Cross Validation 
accuracy_list = []
k_fold = 5
dataset = train_data.to_numpy()
l = len(dataset)

i=0
while (i<k_fold):
  kf_split = int((l / k_fold))
  m = (i+1) * kf_split

  #Data splitting 
  ts_subset = dataset[l-m:l-m+kf_split]
  test_subset = pd.DataFrame(ts_subset)
  score_subset_ts = test_subset.iloc[:, 0]
  text_subset_ts = test_subset.iloc[:, -1]

  tr_subset = np.concatenate((dataset[:l-m], dataset[l-m+kf_split:]), axis=0)
  train_subset = pd.DataFrame(tr_subset)
  score_subset_tr = train_subset.iloc[:, 0]
  text_subset_tr = train_subset.iloc[:, -1]
      
  actual_scores = score_subset_ts.tolist()
 
  #Calling the KNN model for predicting sentiment scores
  predicted_valscores = main(text_subset_tr, score_subset_tr, text_subset_ts, k=134)    

  print("Fold %d:" %(i+1), "\nConfusion matrix:")
  print(metrics.confusion_matrix(actual_scores, predicted_valscores))
  accuracy = metrics.accuracy_score(actual_scores, predicted_valscores)
  print("Accuracy: %s" %(100*accuracy), "\n")
  accuracy_list.append(accuracy)
  i+=1

  np.random.shuffle(dataset)
  
print(accuracy_list)
np.mean(accuracy_list)

Fold 1: 
Confusion matrix:
[[1448  377]
 [ 299 1476]]
Accuracy: 81.22222222222221 

Fold 2: 
Confusion matrix:
[[1431  378]
 [ 329 1462]]
Accuracy: 80.36111111111111 

Fold 3: 
Confusion matrix:
[[1469  377]
 [ 301 1453]]
Accuracy: 81.16666666666667 

Fold 4: 
Confusion matrix:
[[1457  364]
 [ 313 1466]]
Accuracy: 81.19444444444444 

Fold 5: 
Confusion matrix:
[[1469  340]
 [ 336 1455]]
Accuracy: 81.22222222222221 

[0.8122222222222222, 0.8036111111111112, 0.8116666666666666, 0.8119444444444445, 0.8122222222222222]


0.8103333333333331

In [None]:
#TruncatedSVD for dimensionality reduction and spelling correction in preprocessing both resulted in low accuracies, hence omitted