In [16]:
import ast
import math
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring the last slot.

    Only indices ``0 .. len(row1) - 2`` are compared, so the final element of
    ``row1`` never contributes to the result (presumably it holds a class
    label rather than a feature -- confirm against callers).

    Parameters
    ----------
    row1, row2 : indexable sequences of numbers
        ``row2`` must be indexable at every position that is read from
        ``row1``.

    Returns
    -------
    float
        Square root of the summed squared differences.
    """
    feature_count = len(row1) - 1
    squared_sum = 0.0
    idx = 0
    while idx < feature_count:
        delta = row1[idx] - row2[idx]
        squared_sum += delta ** 2
        idx += 1
    return math.sqrt(squared_sum)

def get_neighbors(train, test_row, num_neighbors):
    """Return the training vectors closest to ``test_row``.

    Distance is ``np.linalg.norm(train_row - test_row)`` (Euclidean / L2).

    Parameters
    ----------
    train : pandas.DataFrame
        Training table. The value stored at column *position* 1 of each row
        is used as that row's vector (assumes it is numeric and compatible
        with ``test_row`` for subtraction -- TODO confirm against caller).
    test_row : scalar or array-like
        Query vector.
    num_neighbors : int
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Up to ``num_neighbors`` values taken from column position 1, ordered
        from nearest to farthest. Shorter than ``num_neighbors`` when
        ``train`` has fewer rows; empty when ``num_neighbors <= 0``.
    """
    distances = []
    for i in range(len(train)):
        # Explicit positional lookup. ``train.iloc[i][1]`` depended on the
        # integer-key positional fallback of ``Series.__getitem__``, which
        # pandas has deprecated and which breaks for string column names.
        train_row = train.iloc[i, 1]
        distances.append((train_row, np.linalg.norm(train_row - test_row)))
    # Sort on the distance only: keeps ties in row order and never compares
    # the vectors themselves.
    distances.sort(key=lambda pair: pair[1])
    # Slice rather than index so asking for more neighbours than there are
    # rows does not raise IndexError; clamp at 0 so a negative request still
    # yields an empty list instead of a "[:-k]" slice.
    limit = max(num_neighbors, 0)
    return [row for row, _ in distances[:limit]]

def predict_classification(train, test_row, num_neighbors):
    """Predict a label for ``test_row`` by majority vote of its neighbours.

    The neighbours come from ``get_neighbors``; the last element of each one
    is taken as its vote (presumably the class label is stored in the final
    slot of every neighbour -- confirm against the training data layout).

    Parameters
    ----------
    train : training table forwarded to ``get_neighbors``.
    test_row : query vector forwarded to ``get_neighbors``.
    num_neighbors : int
        Number of neighbours that take part in the vote.

    Returns
    -------
    The vote value that occurs most often among the neighbours.
    """
    votes = []
    for neighbor in get_neighbors(train, test_row, num_neighbors):
        votes.append(neighbor[-1])
    distinct_votes = set(votes)
    # Pick the distinct value with the highest occurrence count.
    return max(distinct_votes, key=votes.count)

In [2]:
# Load the review dataset and keep only the label column and the cleaned-text
# column; every later cell works from this two-column frame.
data = pd.read_csv('w2v_yelp.csv')
data = data[['Review_Labels', 'cleaned']]
# Disabled: take the first 10% of rows as a smaller working set.
#data_new = data.iloc[:int(0.1*len(data))]

In [3]:
# Preview the first rows of the full frame (the subsample preview is disabled).
#data_new.head()
data.head()

Unnamed: 0,Review_Labels,cleaned
0,0,someon ha work mani museum wa eager visit thi ...
1,0,actual horrifi thi place still busi year old s...
2,2,love deagan realli atmospher cozi festiv shrim...
3,0,dismal lukewarm defrostedtast texmex glop mumb...
4,2,oh happi day final cane near casa ye gripe dri...


In [4]:
# Split the frame into inputs and targets by column position:
# position 1 is 'cleaned' (text), position 0 is 'Review_Labels' (target).
# Disabled: the same extraction from the 10% subsample.
# X = data_new.iloc[:, 1].values
# y = data_new.iloc[:, 0].values
X = data.iloc[:, 1].values
y = data.iloc[:, 0].values

In [5]:
# Bag-of-words features; binary=True records term presence instead of counts.
# NOTE(review): the vocabulary is fit on the whole corpus *before* the
# train/test split, so terms seen only in test documents still shape the
# feature space. Consider fitting on the training portion only.
# NOTE(review): X is rebound from raw text to the vectorized matrix, so
# re-running this cell alone would feed that matrix back into fit_transform;
# re-run the cell that defines X first.
cv = CountVectorizer(binary=True)
X = cv.fit_transform(X)

In [6]:
# Hold out 30% of the samples for testing; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [7]:
# k-nearest-neighbours classifier voting over 7 neighbours; every other
# setting is left at the scikit-learn default.
knn = KNeighborsClassifier(n_neighbors = 7)

In [8]:
# Fit the classifier on the training split only.
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=7)

In [9]:
# Predicted labels for the held-out test split.
y_pred = knn.predict(X_test)

In [10]:
# Test-split confusion matrix (rows: true class, columns: predicted class)
# followed by per-class precision / recall / F1.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 2494    62  4141]
 [  432    77  2830]
 [  833    42 19089]]
              precision    recall  f1-score   support

           0       0.66      0.37      0.48      6697
           1       0.43      0.02      0.04      3339
           2       0.73      0.96      0.83     19964

    accuracy                           0.72     30000
   macro avg       0.61      0.45      0.45     30000
weighted avg       0.68      0.72      0.66     30000



In [17]:
# Overall fraction of correctly classified test samples, rounded to 3 places.
print("Test Accuracy: ", round(metrics.accuracy_score(y_test, y_pred), 3))

Test Accuracy:  0.722


In [22]:
# NOTE(review): with one label per sample, micro-averaged recall is the same
# number as accuracy (it prints the identical 0.722), so it adds no new
# information. average="macro" or the per-class report exposes the weak
# minority-class recall.
print("Test Recall: ", round(metrics.recall_score(y_test, y_pred, average="micro"), 3))

Test Recall:  0.722


In [23]:
# NOTE(review): micro-averaged F1 for single-label multiclass predictions
# equals accuracy (identical 0.722 printed); prefer average="macro" or
# "weighted" if the goal is to reflect per-class performance.
print("Test F1: ", round(metrics.f1_score(y_test, y_pred, average="micro"), 3))

Test F1:  0.722


In [13]:
# Predicted labels for the training split itself; the train-set metric cells
# read this variable.
train_pred = knn.predict(X_train)

In [18]:
# Fraction of correctly classified training samples, rounded to 3 places.
print("Train Accuracy: ", round(metrics.accuracy_score(y_train, train_pred), 3))

Train Accuracy:  0.781


In [19]:
# Training-split confusion matrix (rows: true class, columns: predicted
# class) followed by per-class precision / recall / F1.
print(confusion_matrix(y_train,train_pred))
print(classification_report(y_train,train_pred))

[[ 8240   120  6901]
 [ 1343   561  5922]
 [  952    93 45867]]
              precision    recall  f1-score   support

           0       0.78      0.54      0.64     15261
           1       0.72      0.07      0.13      7826
           2       0.78      0.98      0.87     46912

    accuracy                           0.78     69999
   macro avg       0.76      0.53      0.55     69999
weighted avg       0.78      0.78      0.74     69999



In [24]:
# NOTE(review): micro-averaged recall on single-label multiclass data equals
# accuracy (identical 0.781 printed); use average="macro" or the per-class
# report to see per-class recall.
print("Train Recall: ", round(metrics.recall_score(y_train, train_pred, average="micro"), 3))

Train Recall:  0.781


In [25]:
# NOTE(review): micro-averaged F1 on single-label multiclass data equals
# accuracy (identical 0.781 printed); prefer average="macro" or "weighted"
# to reflect per-class performance.
print("Train F1: ", round(metrics.f1_score(y_train, train_pred, average="micro"), 3))

Train F1:  0.781
