# In this session we will look at how to convert words to vectors using Word2Vec and the gensim library, trained on Amazon phone reviews

In [26]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install gensim
# %pip install python-Levenshtein

In [27]:
# Libraries
import numpy as np
import pandas as pd
import gensim   # "gensim" is a popular NLP Library

# The Amazon Product Review Dataset

In [28]:
# Dataset download:
# http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz
# After downloading, open a shell (e.g. Git Bash), cd into the download directory,
# and run "gunzip <filename>" to unpack the .gz archive.
# lines=True makes pandas parse every line of the file as its own JSON object.
df = pd.read_json(
    "reviews_Cell_Phones_and_Accessories_5.json",
    lines=True,
)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [29]:
# Dimensions of the loaded review frame as (rows, columns)
df.shape

(194439, 9)

In [30]:
# Peek at the full text of the first review (row label 0)
df["reviewText"][0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

# Preprocessing

In [31]:
# Tokenize every review with gensim's simple_preprocess, which lowercases the
# text, strips punctuation and drops tokens that are too short or too long.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

# Gensim Model Creation

In [32]:
# Set up an untrained Word2Vec model; no corpus is passed here, so the
# vocabulary scan and the training are done as explicit later steps.
model = gensim.models.Word2Vec(
    workers=4,     # number of worker threads used for training
    min_count=2,   # ignore words whose total frequency in the corpus is below 2
    window=10,     # maximum distance between the current and the predicted word
)

# Scan the tokenized reviews once to build the vocabulary;
# progress_per sets how often progress is reported while scanning.
model.build_vocab(review_text, progress_per=1000)

In [33]:
# Train on the tokenized reviews.
# model.epochs is left at its default (5); total_examples reuses the
# corpus_count stored on the model.
model.train(
    review_text,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

(61506509, 83868975)

In [34]:
# Persist the trained model to disk so it can be reloaded later without retraining
model.save("./word2vec_model.model")

# Experiment on The Model

In [35]:
# Words whose vectors are closest to "bad", returned as (word, similarity) pairs.
# Antonyms can rank highly too, since they tend to appear in similar contexts.
model.wv.most_similar("bad")

[('terrible', 0.7003341317176819),
 ('shabby', 0.620646059513092),
 ('good', 0.6069127917289734),
 ('horrible', 0.5855830907821655),
 ('funny', 0.5579283237457275),
 ('awful', 0.5565418601036072),
 ('legit', 0.555342435836792),
 ('okay', 0.5540140867233276),
 ('disappointing', 0.5323582887649536),
 ('cheap', 0.5277473330497742)]

In [36]:
# Similarity score between two near-synonyms
model.wv.similarity("cheap", "inexpensive")

0.49837542

In [37]:
# Similarity score between two positive adjectives
model.wv.similarity("great", "nice")

0.67907214