## ***Step 0.0: Import required libraries***

In [1]:
# import libraries  for preprocessing
import os
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
stopWords = stopwords.words("english")

# import libraries for word2vec
from gensim.models import Word2Vec

# import libraries for classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cosine
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC



***
## ***Step 0.1: preprocessing***
##### **in this step I defined a function for preprocessing documents including removing stopWords, eliminate punctuations, convert to lower case and stemming each word.**

In [2]:
def preprocessing(folderPath):
    stemmer = PorterStemmer()
    uniqueTerms = set()
    documents = dict()
    for i in ["pos", "neg"]:
        for fileName in os.listdir(folderPath+i):
            filePath = os.path.join(folderPath+i, fileName)
            if os.path.isfile(filePath):
                file = open(filePath, 'r', encoding='utf-8')
                content = file.read()
                translator = str.maketrans('', '', string.punctuation + "“”’—")
                contentPreprocessed = content.lower().translate(translator)
                terms = [word for word in list(contentPreprocessed.split()) if word not in stopWords]
                terms = [stemmer.stem(word) for word in terms]
                uniqueTerms.update(terms)
                documents[fileName] = terms
    uniqueTerms = sorted(list(uniqueTerms))
    return uniqueTerms, documents

#### **calling the function for train documents.**

In [3]:
uniqueTermsTrain, trainDocuments = preprocessing("F:/uni/Term7/IR/homeWorks/HW5/aclImdb_v1/aclImdb/train/")
vocabSize = len(uniqueTermsTrain)

In [4]:
print("Vocabulary size: ", vocabSize)

Vocabulary size:  92785


#### **Making train data labels list:**

In [5]:
xTrain = list(trainDocuments.values())[0]
xTrain = np.array(xTrain)
yTrain = []
for key in trainDocuments.keys():
    if int(key[-5]) > 5 or (key[-5] == '0' and key[-6] == '1'):
        yTrain.append(1)
    else:
        yTrain.append(0)

***
#### **preprocessing the test set and creating labels list:**

In [6]:
uniqueTermsTest, testDocuments = preprocessing("F:/uni/Term7/IR/homeWorks/HW5/aclImdb_v1/aclImdb/test/")

In [7]:
xTest = list(testDocuments.values())[0]
xTest = np.array(xTest)
yTest = []
for key in testDocuments.keys():
    if int(key[-5]) > 5 or (key[-5] == '0' and key[-6] == '1'):
        yTest.append(1)
    else:
        yTest.append(0)

***
## ***Step 0.2: Words Embedding (Word2Vec)***

In [8]:
model = Word2Vec(sentences=trainDocuments.values(), vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [9]:
def documentEmbedding(doc, model):
    embeddings = [model.wv[word] for word in doc if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

trainEmbeddings = [documentEmbedding(doc, model) for doc in trainDocuments.values()]
testEmbedding = [documentEmbedding(doc, model) for doc in testDocuments.values()]


***
## ***Step 0.3: k-Nearest Neighbors (kNN) Classification:***
#### **I tested the result for different values of n_neighbors and i have notices n_neighbors=5 gives us the best result**

In [10]:
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine', n_jobs=-1)
knn.fit(trainEmbeddings, yTrain)

In [11]:
yPrediction = knn.predict(trainEmbeddings)
print("Accuracy:", accuracy_score(yTrain, yPrediction))
print(classification_report(yTrain, yPrediction))

Accuracy: 0.82568
              precision    recall  f1-score   support

           0       0.82      0.84      0.83     12500
           1       0.83      0.81      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



***
## ***Step 0.4: Rocchio Classifier:***
#### **I have defined a class for Rocchio classifier**

In [12]:
class Rocchio:
    def __init__(self, distance_function=cosine):
        self.centroids = {}
        self.distance_function = distance_function

    def fit(self, X, y):
        grouped_vectors = defaultdict(list)
        for i in range(len(X)):
            grouped_vectors[y[i]].append(X[i])
        self.centroids = {label: np.mean(vectors, axis=0) for label, vectors in grouped_vectors.items()}

    def predict(self, X):
        predictions = []
        for vector in X:
            distances = {label: self.distance_function(vector, centroid) for label, centroid in self.centroids.items()}
            predictions.append(min(distances, key=distances.get))
        return predictions

    def evaluate(self, X, y_true):
        y_pred = self.predict(X)
        accuracy = np.mean(np.array(y_pred) == np.array(y_true))
        return accuracy


#### **creating a instance of the class above:**

In [13]:
rocchio = Rocchio()
rocchio.fit(trainEmbeddings, yTrain)

#### **Predict and evaluate on train set:**

In [14]:
rocchioPrediction = rocchio.predict(trainEmbeddings)
rocchioAccuracy = rocchio.evaluate(trainEmbeddings, yTrain)
print("Accuracy:", rocchioAccuracy)
print(classification_report(yTrain, rocchioPrediction))

Accuracy: 0.6996
              precision    recall  f1-score   support

           0       0.70      0.69      0.70     12500
           1       0.70      0.71      0.70     12500

    accuracy                           0.70     25000
   macro avg       0.70      0.70      0.70     25000
weighted avg       0.70      0.70      0.70     25000



***
## ***Step 0.5: Word Embeddings and LSA with SVM Classification:***
#### **using TruncatedSVD library I have defined and used LSA:**

In [15]:
lsa = TruncatedSVD(n_components=100, random_state=42)
lsaTrain = lsa.fit_transform(trainEmbeddings)
xTrainLSA = lsaTrain
lsaTest = lsa.transform(testEmbedding)
xTestLSA = lsaTest 

#### **using SVM classifier:**

In [16]:
svm = SVC(kernel="poly", C=1)
svm.fit(xTrainLSA, yTrain)

#### **Evaluate SVM classifier on train set:**

In [17]:
y_pred = svm.predict(xTrainLSA)
print("Accuracy:", accuracy_score(yTrain, y_pred))
print(classification_report(yTrain, y_pred))

Accuracy: 0.82136
              precision    recall  f1-score   support

           0       0.83      0.81      0.82     12500
           1       0.81      0.83      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



***

## ***Step 0.6: Evaluate on test set***

#### ***Evaluate KNN on test set:***

In [21]:
yPrediction = knn.predict(testEmbedding)
print("Accuracy:", accuracy_score(yTest, yPrediction))
print(classification_report(yTest, yPrediction))

Accuracy: 0.72964
              precision    recall  f1-score   support

           0       0.72      0.76      0.74     12500
           1       0.75      0.70      0.72     12500

    accuracy                           0.73     25000
   macro avg       0.73      0.73      0.73     25000
weighted avg       0.73      0.73      0.73     25000



#### ***Evaluate rocchio classifier on test set:***

In [19]:
rocchioAccuracy = rocchio.evaluate(testEmbedding, yTest)
rocchioPrediction = rocchio.predict(testEmbedding)
print("Accuracy:", rocchioAccuracy)
print(classification_report(yTest, rocchioPrediction))

Accuracy: 0.69696
              precision    recall  f1-score   support

           0       0.69      0.71      0.70     12500
           1       0.70      0.68      0.69     12500

    accuracy                           0.70     25000
   macro avg       0.70      0.70      0.70     25000
weighted avg       0.70      0.70      0.70     25000



#### ***Evaluate SVM (with LAS) on test set:***

In [20]:
y_pred = svm.predict(xTestLSA)
print("Accuracy:", accuracy_score(yTest, y_pred))
print(classification_report(yTest, y_pred))

Accuracy: 0.81688
              precision    recall  f1-score   support

           0       0.83      0.80      0.81     12500
           1       0.81      0.83      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



***

## ***Comparison of Results:***
##### **we used three methods: KNN, rocchio and SVM with LSA. Surprisingly accuracy, precision and recall of Rocchio classifier was much lower than other methods, but the result on test set was exactly the same with the train set. KNN classifier had much higher accuracy, precision and recall but surprisingly this metrics was much lower in the test set, so we can conclude that the knn didn't learn data good enough. SVM classifier had both strength of the previous methods: accuracy, precision and recall was good in both train and test set. so the best classifier for this dataset and this problem might be SVM classifier.<br> <br> In all the three classifiers precision, recall and accuracy was almost close to each other.<br> It's clear that word embedding had a good effect over all these classifiers. <br><br> We should also consider the time for classification task, so SVM classifier took around 22 seconds to fit and around 22 seconds to predict the result, but rocchio classifier took much less time to give us an answer. knn also took more time than rocchio too. <br> <br> accuracy, precision and recall was shown in each related cells above.**