In [42]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from sklearn import utils
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Load the pre-cleaned Yelp reviews and keep only the label and text columns.
# Done as one chained expression so the full frame is never bound to a name.
df = pd.read_csv('w2v_yelp.csv')[['Review_Labels', 'cleaned']]

In [4]:
# Report how many reviews were loaded.
row_count = df.shape[0]
print("total: %d ." % row_count)

total: 99999 .


In [5]:
# Preview ten random rows; the fixed seed keeps the preview identical across
# Restart & Run All instead of changing on every execution.
df.sample(10, random_state=42)

Unnamed: 0,Review_Labels,cleaned
20019,2,thi place awesom mean blind remot control open...
29954,2,bone wing mild deep dish pizza sooooo good ser...
98798,2,ah sushi bloor long known mani year eat shiita...
43002,2,daughter love maria elena alway pleas alway fe...
86526,2,love thi place work hard good unlik fight styl...
75491,2,wa fortun enough get invit grand open event ne...
10078,2,like dark chocol thi place went whim friend bi...
19558,2,everyon friendli great servic warm welcom plac...
96498,0,veri disappoint valentin day dinner thi ha fav...
3814,0,send steak back time unaccept medium rare diff...


In [21]:
# Sanity-check the tokenizer on the first review.
# Select the column by name and the row by position: `df.iloc[0][1]` relied on
# an integer key being treated as a position on a string-labelled Series, which
# pandas has deprecated, and it silently depends on the column order.
text = df['cleaned'].iloc[0]
token = nltk.word_tokenize(text)
print(token)

['someon', 'ha', 'work', 'mani', 'museum', 'wa', 'eager', 'visit', 'thi', 'galleri', 'recent', 'trip', 'la', 'vega', 'saw', 'would', 'show', 'infam', 'egg', 'hous', 'faberg', 'virginia', 'museum', 'fine', 'art', 'vmfa', 'knew', 'go', 'tuck', 'away', 'near', 'gelateria', 'garden', 'galleri', 'pretti', 'much', 'hidden', 'view', 'real', 'estat', 'agent', 'would', 'call', 'cozi', 'charm', 'basic', 'ani', 'euphem', 'small', 'said', 'still', 'see', 'wonder', 'art', 'galleri', 'ani', 'size', 'whi', 'two', 'ask', 'let', 'tell', 'price', 'thi', 'rel', 'inexpens', 'la', 'vega', 'attract', 'complet', 'top', 'space', 'amount', 'art', 'fit', 'bit', 'much', 'kid', 'friendli', 'serious', 'bring', 'secur', 'train', 'properli', 'show', 'curat', 'design', 'team', 'collabor', 'exhibit', 'definit', 'flow', 'mean', 'visitor', 'view', 'art', 'certain', 'sequenc', 'whether', 'histor', 'period', 'cultur', 'signific', 'thi', 'audio', 'guid', 'usual', 'develop', 'arriv', 'galleri', 'could', 'tell', 'start', 'se

In [28]:
# Tokenise every cleaned review; each cell of `cut_review` holds the token list
# returned by nltk for that review.
df['cut_review'] = df['cleaned'].apply(nltk.word_tokenize)

In [31]:
def _as_tagged_document(row):
    """Wrap one review row as a TaggedDocument tagged with its Review_Labels value."""
    return TaggedDocument(words=row['cut_review'], tags=[row['Review_Labels']])


# 70/30 split, stratified on the label column so both parts keep the same
# class proportions; the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df.Review_Labels.values,
)

# Row-wise conversion of each split into a Series of TaggedDocument objects.
train_tagged = train.apply(_as_tagged_document, axis=1)
test_tagged = test.apply(_as_tagged_document, axis=1)

In [32]:
# Inspect the tagged document whose original row label is 0 (label-based
# lookup, not "first row of the training split").
train_tagged.loc[0]

TaggedDocument(words=['someon', 'ha', 'work', 'mani', 'museum', 'wa', 'eager', 'visit', 'thi', 'galleri', 'recent', 'trip', 'la', 'vega', 'saw', 'would', 'show', 'infam', 'egg', 'hous', 'faberg', 'virginia', 'museum', 'fine', 'art', 'vmfa', 'knew', 'go', 'tuck', 'away', 'near', 'gelateria', 'garden', 'galleri', 'pretti', 'much', 'hidden', 'view', 'real', 'estat', 'agent', 'would', 'call', 'cozi', 'charm', 'basic', 'ani', 'euphem', 'small', 'said', 'still', 'see', 'wonder', 'art', 'galleri', 'ani', 'size', 'whi', 'two', 'ask', 'let', 'tell', 'price', 'thi', 'rel', 'inexpens', 'la', 'vega', 'attract', 'complet', 'top', 'space', 'amount', 'art', 'fit', 'bit', 'much', 'kid', 'friendli', 'serious', 'bring', 'secur', 'train', 'properli', 'show', 'curat', 'design', 'team', 'collabor', 'exhibit', 'definit', 'flow', 'mean', 'visitor', 'view', 'art', 'certain', 'sequenc', 'whether', 'histor', 'period', 'cultur', 'signific', 'thi', 'audio', 'guid', 'usual', 'develop', 'arriv', 'galleri', 'could',

In [33]:
# Number of CPUs reported by the OS; passed to Doc2Vec as its worker count.
cores = multiprocessing.cpu_count()

In [34]:
from gensim.models import Doc2Vec
from tqdm import tqdm

# Doc2Vec configured with dm=0 and negative sampling (negative=5, hs=0);
# words seen fewer than 2 times are dropped and frequent-word downsampling
# is turned off (sample=0).
model_dbow = Doc2Vec(
    dm=0,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)

# Build the vocabulary from the training documents only; tqdm shows progress
# while the documents are materialised into a list.
model_dbow.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 69999/69999 [00:00<00:00, 2643589.82it/s]


In [35]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 69999/69999 [00:00<00:00, 2603064.89it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3597254.07it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3485700.72it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3558321.24it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3656434.76it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3667213.16it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3624834.38it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3100318.75it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3670651.82it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3733431.91it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3612479.98it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3612257.75it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3655660.80it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3601136.84it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3238905.71it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3583117.75it/s]
100%|██████████| 69999/69999 [00:00<00:00, 3716277.68it/

CPU times: user 2min 44s, sys: 9.43 s, total: 2min 53s
Wall time: 1min 11s


In [36]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Build classifier inputs from tagged documents.

    For every document in ``tagged_docs.values`` this takes the first tag as
    the target and asks ``model`` to infer a vector for the document's words.

    Parameters
    ----------
    model : object exposing ``infer_vector(words, epochs=...)``
        The trained document-embedding model.
    tagged_docs : object exposing ``.values``
        Iterable container of documents, each with ``.words`` and ``.tags``.
    epochs : int, default 20
        Number of inference passes per document. Passed as ``epochs`` because
        the older ``steps`` keyword is no longer accepted by gensim 4.

    Returns
    -------
    (targets, regressors) : tuple of tuples
        ``targets[i]`` is ``doc.tags[0]`` and ``regressors[i]`` is the inferred
        vector of the i-th document. Both are empty tuples for empty input.
    """
    targets = []
    regressors = []
    for doc in tagged_docs.values:
        targets.append(doc.tags[0])
        regressors.append(model.infer_vector(doc.words, epochs=epochs))
    # Tuples keep the return shape the same as the former zip(*...) unpacking.
    return tuple(targets), tuple(regressors)
 
# Infer a vector for every training and test review with the trained model;
# the y_* tuples hold the labels and the X_* tuples hold the inferred vectors.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [38]:
# k-nearest-neighbours classifier configured to use 11 neighbours.
knn = KNeighborsClassifier(n_neighbors = 11)

In [39]:
# Fit the classifier on the inferred training vectors and their labels.
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=11)

In [40]:
# Predict a label for each inferred test vector.
y_pred = knn.predict(X_test)

In [43]:
# Evaluate on the held-out split: confusion matrix first, then the
# per-class precision / recall / F1 report.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(cm)
print(report)

[[  693    84  5810]
 [  323    43  2984]
 [ 1640   217 18206]]
              precision    recall  f1-score   support

           0       0.26      0.11      0.15      6587
           1       0.12      0.01      0.02      3350
           2       0.67      0.91      0.77     20063

    accuracy                           0.63     30000
   macro avg       0.35      0.34      0.32     30000
weighted avg       0.52      0.63      0.55     30000

