## Running the K Nearest Neighbors algorithm

This notebook applies the K Nearest Neighbors (KNN) algorithm to the IMDB dataset, using vectors computed with TF-IDF.
We use a training set of 20,000 reviews per class (i.e. 20k positive and 20k negative reviews).


In [8]:
import pickle
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.decomposition import PCA
import plotly.express as px


## Uncomment these lines the first time you run the code
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')


In [9]:
# Read the precomputed TF-IDF matrix back from its pickle file.
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open('./normalized_TF_IDF_matrix.pckl', mode='rb') as tfidf_file:
    normalized_TF_IDF_matrix = pickle.load(tfidf_file)

In [10]:
# Preview the loaded matrix (NumPy abbreviates the repr of large arrays).
normalized_TF_IDF_matrix

array([[-0.0038498 , -0.0038498 , -0.0038498 , ..., -0.0038498 ,
        -0.0038498 ,  0.09975314],
       [-0.003268  , -0.003268  , -0.003268  , ..., -0.003268  ,
        -0.003268  , -0.003268  ],
       [-0.00354295, -0.00354295, -0.00354295, ..., -0.00354295,
        -0.00354295, -0.00354295],
       ...,
       [-0.003316  , -0.003316  , -0.003316  , ..., -0.003316  ,
        -0.003316  , -0.003316  ],
       [-0.00366195, -0.00366195, -0.00366195, ..., -0.00366195,
        -0.00366195, -0.00366195],
       [-0.00298504, -0.00298504, -0.00298504, ..., -0.00298504,
        -0.00298504, -0.00298504]], shape=(50000, 2000))

In [13]:
# Sanity check: Euclidean (L2) norm of the row at position 1.
# A value of ~1 indicates that this row is unit-normalised.
np.linalg.norm(normalized_TF_IDF_matrix[1])

np.float64(1.0000000000000007)

In [14]:
# Load the reviews and their sentiment labels.
# No index column is specified, so pandas assigns its default integer index.
data = pd.read_csv(filepath_or_buffer="./IMDB_Dataset.csv")

In [15]:
# Display the loaded reviews DataFrame.
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
# Build the train/test index arrays.
# Row labels are collected per sentiment class in file order; the first 20000
# labels of each class go to training and the remainder of each class to test.
# NOTE: these labels are later used as row positions into the TF-IDF matrix,
# which assumes `data` keeps its default 0..n-1 index and the same row order
# as the matrix -- TODO confirm against the notebook that built the matrix.
positive_Ix = data.index[(data['sentiment'] == "positive").to_numpy()].tolist()
negative_Ix = data.index[(data['sentiment'] == "negative").to_numpy()].tolist()

# Positives first, then negatives, in both arrays.
train_Ix = np.concatenate((positive_Ix[:20000], negative_Ix[:20000]))
test_Ix = np.concatenate((positive_Ix[20000:], negative_Ix[20000:]))


In [17]:
# Inspect the last ten training indices; because negatives are concatenated
# after positives, these come from the negative-class slice.
train_Ix[-10:]

array([39969, 39970, 39974, 39976, 39978, 39979, 39980, 39981, 39983,
       39985])

In [18]:
# Pick the TF-IDF rows for each split. Indexing with an integer array
# returns a copy, with rows in the same order as the index arrays.
train_vectors = normalized_TF_IDF_matrix[train_Ix]
test_vectors = normalized_TF_IDF_matrix[test_Ix]

In [19]:
# Preview the training vectors (one row per entry of train_Ix).
train_vectors

array([[-0.0038498 , -0.0038498 , -0.0038498 , ..., -0.0038498 ,
        -0.0038498 ,  0.09975314],
       [-0.003268  , -0.003268  , -0.003268  , ..., -0.003268  ,
        -0.003268  , -0.003268  ],
       [-0.00354295, -0.00354295, -0.00354295, ..., -0.00354295,
        -0.00354295, -0.00354295],
       ...,
       [-0.00234925, -0.00234925, -0.00234925, ..., -0.00234925,
        -0.00234925, -0.00234925],
       [-0.00397128, -0.00397128, -0.00397128, ..., -0.00397128,
        -0.00397128, -0.00397128],
       [-0.0029042 , -0.0029042 , -0.0029042 , ..., -0.0029042 ,
        -0.0029042 , -0.0029042 ]], shape=(40000, 2000))

In [20]:
# Preview the test vectors (one row per entry of test_Ix).
test_vectors

array([[-0.00509787, -0.00509787, -0.00509787, ..., -0.00509787,
        -0.00509787, -0.00509787],
       [-0.00373814, -0.00373814, -0.00373814, ..., -0.00373814,
        -0.00373814, -0.00373814],
       [-0.00449679, -0.00449679, -0.00449679, ..., -0.00449679,
        -0.00449679, -0.00449679],
       ...,
       [-0.003316  , -0.003316  , -0.003316  , ..., -0.003316  ,
        -0.003316  , -0.003316  ],
       [-0.00366195, -0.00366195, -0.00366195, ..., -0.00366195,
        -0.00366195, -0.00366195],
       [-0.00298504, -0.00298504, -0.00298504, ..., -0.00298504,
        -0.00298504, -0.00298504]], shape=(10000, 2000))

In [25]:
# Pairwise dot products between every training row and every test row.
# The result has one row per training vector and one column per test vector.
# If all rows are unit-norm (as the norm check earlier in the notebook
# suggests), each dot product is the cosine similarity of the two vectors.
# NOTE: for 40000 train x 10000 test rows this is a dense 4e8-element matrix
# (about 3.2 GB if the dtype is float64).
cosine_train_test_vectors = train_vectors @ test_vectors.T


In [26]:
# Cosine distance = 1 - cosine similarity, element-wise on the
# (train x test) similarity matrix.
# The signed similarity is used on purpose: wrapping it in np.abs() would give
# anti-correlated vectors (similarity near -1) a distance near 0, making them
# look like nearest neighbours. With the signed value the distances lie in
# [0, 2]: 0 = same direction, 1 = orthogonal, 2 = opposite direction.
distances_from_test_to_train = 1 - cosine_train_test_vectors

In [27]:
# Persist the distance matrix so later steps can reload it instead of
# recomputing the full train x test product.
with open('./distances_from_test_to_train.pckl', mode='wb') as distances_file:
    pickle.dump(distances_from_test_to_train, distances_file)

In [28]:
# Column 0 holds the distance from every training vector to the first test
# vector (rows index the training set, columns index the test set).
distances_from_test_to_train[:, 0]

array([0.98374036, 0.98699907, 0.98797618, ..., 0.93380536, 0.98259355,
       0.99487187], shape=(40000,))