In [1]:
# Importing gensim library for word embedding techniques
import gensim
import pandas as pd

**Below:** when `lines=True` is specified, pandas treats each line in the file as a separate JSON object. This lets pandas read a JSON Lines (JSONL) file correctly into a DataFrame, where each line in the file corresponds to one row in the DataFrame.

In [2]:
# Load the reviews file; it stores one JSON object per line, hence lines=True.
df = pd.read_json(
    "reviews_Cell_Phones_and_Accessories_5.json",
    lines=True,
)
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [3]:
# (number of rows, number of columns) of the loaded reviews DataFrame
df.shape

(194439, 9)

In [4]:
# This is the column we are interested in: the free-text body of each review
df.reviewText

0         They look good and stick good! I just don't li...
1         These stickers work like the review says they ...
2         These are awesome and make my phone look so st...
3         Item arrived in great time and was in perfect ...
4         awesome! stays on, and looks great. can be use...
                                ...                        
194434    Works great just like my original one. I reall...
194435    Great product. Great packaging. High quality a...
194436    This is a great cable, just as good as the mor...
194437    I really like it becasue it works well with my...
194438    product as described, I have wasted a lot of m...
Name: reviewText, Length: 194439, dtype: object

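**Now we need to preprocess the text. `gensim.utils.simple_preprocess` splits each review into tokens, strips punctuation, and drops very short tokens such as 'a' and 'I'. Note that it does *not* remove stop words like 'and', 'or', 'is', 'the', nor does it convert words to their root forms like 'running' to 'run' — as the output below shows, those words are kept as they are. Stop-word removal and stemming/lemmatization would require an extra step.**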

**In addition, all words are converted to lowercase.**

In [5]:
# Run gensim's simple_preprocess over every review; the result is a Series
# holding one list of lowercase tokens per review.
review_text = df["reviewText"].apply(
    gensim.utils.simple_preprocess
)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [6]:
# Create the Word2Vec model. No corpus is passed here; the vocabulary is built
# and the model is trained in the following cells.
#   window    - size of the context window around each word
#   min_count - minimum frequency threshold for a word to be kept
#   workers   - number of worker threads used to train the model
model = gensim.models.Word2Vec(window=10, min_count=2, workers=6)

In [7]:
# Build the model's vocabulary from the tokenized reviews.
# progress_per: how many words to process before showing/updating the progress.
model.build_vocab(
    review_text,
    progress_per=1000,
)

In [8]:
# Number of training epochs configured on the model (reused by the training cell)
model.epochs

5

In [9]:
# Number of reviews counted for the corpus; passed to train() as total_examples
model.corpus_count

194439

In [10]:
# Train on the tokenized reviews, reusing the corpus size and the epoch count
# already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(61506729, 83868975)

In [11]:
# Saving the trained Word2Vec model to disk (path is relative to the notebook's
# working directory) so it can be reused without retraining
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

In [12]:
# Finding the words most similar to "woman" in the vocabulary
model.wv.most_similar("woman")

[('women', 0.7599573135375977),
 ('men', 0.7441850304603577),
 ('man', 0.7225392460823059),
 ('guy', 0.7118485569953918),
 ('student', 0.6657068729400635),
 ('girl', 0.664975106716156),
 ('young', 0.6372599005699158),
 ('ladies', 0.6335774660110474),
 ('lady', 0.6325311064720154),
 ('child', 0.6312819123268127)]

In [13]:
# Sanity check: the similarity of a word with itself ("great" vs. "great")
# should be approximately 1.0
model.wv.similarity(w1="great", w2="great")

0.9999998

In [14]:
# Similarity between two words with opposite meanings: "good" and "bad"
model.wv.similarity(w1="good", w2="bad")


0.58550805

In [15]:
# Similarity between the singular and plural forms "app" and "apps"
model.wv.similarity(w1="app", w2="apps")

0.7551212