In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import spacy

from sklearn.model_selection import train_test_split

# Apply seaborn's default theme to every matplotlib figure drawn in this notebook.
sns.set()

In [2]:
# Load the tab-separated restaurant reviews: free-text `Review` plus a binary `Liked` label.
reviews = pd.read_csv('datasets/Restaurant_Reviews.tsv', delimiter='\t')

# Peek at the first rows to confirm the file parsed as expected.
reviews.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# Load spaCy's large English model; its static word vectors are what doc.vector
# is built from further down (300 dimensions per review, see vectors.shape).
# Note: it is not the only model with vectors -- en_core_web_md ships them too --
# but the small model (en_core_web_sm) has no static word vectors.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Vectorize each review. spaCy's doc.vector defaults to the average of the
# document's token vectors, so only the tokenizer and the vocab vectors are needed.
#
# nlp.disable_pipes() called with no names disables nothing, so the rest of the
# pipeline would still run on every review. Pass every component name explicitly
# so they are all switched off inside the block (they are restored on exit).
with nlp.disable_pipes(*nlp.pipe_names):
    # nlp.pipe streams the texts through in batches instead of one call per review.
    vectors = np.array([doc.vector for doc in nlp.pipe(reviews['Review'])])

In [5]:
# Inspect the resulting array (numpy abbreviates the repr of large arrays).
vectors

array([[ 0.01136781,  0.27647015, -0.1290165 , ...,  0.05219183,
        -0.08301983,  0.21277668],
       [-0.10861234,  0.202228  , -0.16707234, ..., -0.199966  ,
        -0.1376802 ,  0.05753095],
       [-0.16005023,  0.06799311, -0.09114511, ..., -0.10335435,
        -0.01798667,  0.13927835],
       ...,
       [-0.01175436,  0.24295382, -0.21152273, ...,  0.03168291,
         0.14142464,  0.15712544],
       [ 0.03936952,  0.12797432, -0.08757241, ...,  0.02857021,
         0.032859  ,  0.10750771],
       [ 0.05028109,  0.16872905, -0.1099451 , ...,  0.01070325,
        -0.03202983,  0.0659922 ]], dtype=float32)

In [6]:
# One row per review, one column per embedding dimension (300 for this model).
vectors.shape

(1000, 300)

In [7]:
# Row count should match vectors.shape[0]: one vector per review.
reviews.shape

(1000, 2)

In [8]:
# Loading en_core_web_lg and re-vectorizing is expensive, so bundle the review
# vectors with their class labels in one frame that can be saved once and then
# reused directly whenever a model is trained.
# Feature columns keep their integer names (0..299); the target goes in column 'y'.
df = pd.DataFrame(vectors).assign(y=reviews['Liked'])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,0.011368,0.276470,-0.129017,-0.145498,-0.020452,0.027504,0.080474,-0.223450,-0.019728,2.027245,...,-0.033329,-0.142600,-0.083450,0.189887,-0.037975,-0.062776,0.052192,-0.083020,0.212777,1
1,-0.108612,0.202228,-0.167072,-0.187088,0.110493,0.095584,0.102028,-0.099292,-0.104873,1.951306,...,0.039084,-0.204419,-0.182488,0.095534,0.136218,0.072892,-0.199966,-0.137680,0.057531,0
2,-0.160050,0.067993,-0.091145,-0.146044,-0.000933,0.132031,0.146753,-0.153840,-0.054418,2.018404,...,0.094099,-0.053949,-0.072085,0.017531,0.092239,0.029666,-0.103354,-0.017987,0.139278,0
3,-0.030826,0.165351,-0.101266,-0.071269,0.011178,0.016436,0.021951,-0.230690,-0.059459,2.294125,...,0.109200,0.022392,0.075466,0.103716,-0.110551,-0.011041,-0.002226,-0.055208,0.107982,1
4,-0.021087,0.121627,-0.005618,-0.124052,0.144309,0.048415,0.079971,-0.088834,-0.030797,2.276237,...,0.144246,0.042506,-0.027031,0.050316,-0.122555,0.130821,-0.042905,0.109157,0.134107,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.105668,0.129936,-0.126518,-0.253778,0.007856,0.203998,0.136657,-0.022887,-0.071488,2.127359,...,0.040706,-0.179072,-0.026428,-0.000929,0.078157,-0.015047,-0.214913,0.138623,0.233714,0
996,-0.076485,0.191955,-0.057647,-0.200645,-0.044750,0.008917,-0.011535,-0.083520,-0.193808,1.977075,...,0.128426,-0.032170,0.137648,-0.052984,0.087580,0.090400,-0.108720,0.106425,-0.052925,0
997,-0.011754,0.242954,-0.211523,-0.138888,0.035266,0.000046,0.140355,-0.239413,-0.039087,2.473000,...,0.059792,-0.038341,-0.023911,0.200149,0.012663,-0.068889,0.031683,0.141425,0.157125,0
998,0.039370,0.127974,-0.087572,-0.122931,0.145440,0.072317,-0.035612,-0.096539,0.024743,2.171975,...,0.096099,-0.109966,0.017718,0.015680,-0.052448,-0.088792,0.028570,0.032859,0.107508,0


In [9]:
# Persist the full vectorized dataset (300 feature columns + 'y') for reuse.
# index=False keeps the row index out of the file, so it does not come back as an extra column.
df.to_csv('datasets/vectorized_reviews.csv', index=False)

In [10]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# shuffle -- and therefore the split -- reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    reviews['Liked'],
    test_size=0.2,
    random_state=42,
)

In [32]:
# put X_train and y_train in a pandas dataframe & save to .csv file
# this way they can be used in other notebooks and it will always be the same split
# it will be consistent for training multiple models with different algorithms

In [33]:
# Combine the training features and labels in one frame.
# y_train still carries the shuffled row labels from `reviews`, while the new
# frame is indexed 0..n-1, so attach the raw label values positionally rather
# than letting pandas align on the index.
train = pd.DataFrame(X_train).assign(y=y_train.to_numpy())
X_train.shape

(800, 300)

In [34]:
# Write the training split (features + 'y') to disk so other notebooks reuse the same split.
train.to_csv('datasets/train.csv', index=False)

In [35]:
# put X_test and y_test in a pandas dataframe & save to .csv file
# this way they can be used in other notebooks

In [36]:
# Combine the held-out features and labels in one frame.
# As with the training split, y_test keeps its shuffled row labels, so the
# label values are attached positionally to the freshly indexed frame.
test = pd.DataFrame(X_test).assign(y=y_test.to_numpy())
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,-0.006481,0.207454,-0.219204,-0.099184,0.115625,-0.102487,0.015963,-0.126446,0.020083,2.444522,...,0.001127,-0.042989,-0.078605,0.195282,-0.111977,-0.150446,-0.113138,0.210248,0.057089,1
1,-0.002517,0.086505,-0.083503,-0.070633,0.216877,0.134762,0.022478,-0.043002,0.003376,2.289262,...,0.067882,-0.000234,0.000882,-0.009255,-0.018682,0.140099,-0.119847,0.006624,0.212024,1
2,0.086993,0.215541,-0.120345,-0.138272,0.26152,0.039318,0.115829,-0.286253,-0.012244,2.460836,...,0.059883,0.022384,0.095368,0.033024,0.028911,0.051183,-0.124663,0.078999,0.152478,1
3,0.002377,0.152428,-0.030801,0.018144,0.035084,0.207823,-0.008598,-0.194996,0.030936,1.444026,...,0.074016,-0.172355,-0.061356,-0.030822,-0.150044,0.006634,-0.215307,0.091737,0.063013,1
4,-0.08672,0.079271,-0.191686,-0.044966,0.005959,0.081476,0.086774,-0.17887,0.055954,2.110137,...,0.001845,-0.104912,-0.01945,0.059875,0.028149,0.040704,-0.101082,-0.073624,-0.038044,1


In [37]:
# Write the test split (features + 'y') to disk so other notebooks evaluate on the same rows.
test.to_csv('datasets/test.csv', index=False)

In [47]:
# Sanity check: copy `train` with every NaN-containing row removed. If the copy
# has the same shape as `train`, no feature or label went missing when the
# frame was assembled.
tmp = train.dropna()

In [48]:
# Shape after dropping NaN rows; compare with train.shape in the next cell.
tmp.shape

(800, 301)

In [49]:
# Shape before dropping NaN rows; equal to tmp.shape when train has no missing values.
train.shape

(800, 301)