In [1]:
# !pip install gensim
# !pip install python-Levenshtein
import gensim
import pandas as pd

Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [2]:
# Location of the extracted reviews file, relative to the notebook's working
# directory. Change this one constant if the dataset is stored elsewhere.
DATA_PATH = "Cell_Phones_and_Accessories_5.json"

# lines=True: the file contains one JSON object per line (JSON Lines format).
df = pd.read_json(DATA_PATH, lines=True)
df.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"


In [3]:
# (rows, columns) of the loaded reviews DataFrame
df.shape

(194439, 9)

In [5]:
# Full text of the first review (df.head() shows it truncated)
df.reviewText[0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

# Pre-processing

#### Simple Preprocessing & Tokenization
The first step in any data science task is to clean the data. For NLP, this typically means converting all words to lower case, trimming whitespace, and removing punctuation. We will do the same here.

Additionally, we can also remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and convert words to their root forms like 'running' to 'run'.

In [7]:
# gensim ships a ready-made tokenizer, simple_preprocess, which we can use here.
# To see what it does, run it on the text of the first review.
gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"
)
# What happens to the text:
# - it is split into a list of word tokens
# - every token is lower-cased (e.g. "They" -> "they", "Siri" -> "siri")
# - punctuation such as "!" is dropped
# - very short tokens are dropped: "I" and "a" disappear, and "don't" / "won't"
#   become "don" / "won" because the apostrophe splits the word and the
#   single-letter "t" is discarded (simple_preprocess defaults to min_len=2)

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [9]:
# Tokenize every review: simple_preprocess is applied to each entry of the
# reviewText column, producing a pandas Series in which every element is a
# list of tokenized words.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

# Training the Word2Vec Model

Train the model on the reviews. Use a window of size 10, i.e. up to 10 words before the current word and 10 words after it. Words that occur fewer than 2 times in the whole corpus are ignored; configure this using the min_count parameter.

The workers parameter defines how many CPU threads to use for training.

#### Initialize the model

In [10]:
## Create the Word2Vec model with gensim (an NLP library). No corpus is passed
## here, so building the vocabulary and training happen in separate calls.
model = gensim.models.Word2Vec(
    window=10, # maximum distance (in words) between the target word and a context word, on either side
    min_count=2, # ignore words that occur fewer than 2 times in the whole corpus (a word-frequency cut-off, not a sentence-length limit)
    workers=4 # number of worker threads used for training
)

### Build vocabulary

In [11]:
# Scan the tokenized reviews once to collect the vocabulary.
# progress_per sets how often gensim reports scanning progress through its
# logger; it does not draw a progress bar in the notebook.
model.build_vocab(review_text, progress_per=1000)

In [12]:
# Number of training epochs; not set in the constructor, so this is gensim's default of 5
model.epochs

5

In [13]:
# Number of sentences (here: one per review) counted by build_vocab;
# this value is passed to train() as total_examples
model.corpus_count

194439

In [14]:
# Train on the tokenized reviews. train() needs total_examples and epochs to be
# given explicitly; both are taken from the model itself.
# Per the gensim docs, the returned pair is
# (effective word count after dropping unknown words, total raw word count).
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(61508205, 83868975)

## Save the Model
Save the model so that it can be reused in other applications

In [15]:
# Write the trained model to the current working directory; it can be restored
# later with gensim.models.Word2Vec.load() using the same path.
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

#### Finding Similar Words and Similarity between words
https://radimrehurek.com/gensim/models/word2vec.html

In [16]:
# Nearest neighbours of "bad" in the embedding space, returned as
# (word, cosine similarity) pairs, most similar first.
# Note that "good" ranks highly too: Word2Vec places words that are used in
# similar contexts close together, which is not the same as sharing a meaning.
model.wv.most_similar("bad")

[('terrible', 0.6527751088142395),
 ('shabby', 0.6282874345779419),
 ('horrible', 0.5923579931259155),
 ('good', 0.5865124464035034),
 ('okay', 0.5444843769073486),
 ('legit', 0.5426936745643616),
 ('ok', 0.5391709804534912),
 ('crappy', 0.5390287637710571),
 ('awful', 0.5363757610321045),
 ('mad', 0.5183902382850647)]

In [17]:
# Cosine similarity between the vectors of two specific words
model.wv.similarity("cheap", "inexpensive")

0.5206865

In [18]:
# Cosine similarity between the word vectors of "great" and "good"
model.wv.similarity("great", "good")

0.7852415