In [1]:
# !pip install gensim
# !pip install python-Levenshtein
import gensim
import pandas as pd

In [2]:
# Load the review dump; lines=True makes pandas parse the file as one JSON object per line.
reviews_path = "D:\\Data Science\\Code basics\\py-master\\deep-learning-keras-tf-tutorial-master\\42_word2vec_gensim\\Sports_and_Outdoors_5.json"
df = pd.read_json(reviews_path, lines=True)
df.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"


In [3]:
# (number of rows, number of columns) of the loaded reviews frame
df.shape

(296337, 9)

In [4]:
# Peek at the raw text of the review stored under row label 0
df["reviewText"][0]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

In [5]:
# gensim ships a ready-made text preprocessor, simple_preprocess.
# To see what it does, run it on a single sample review text first.
gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"
)
# As the output below shows, the call:
#   - splits the text into a list of word tokens,
#   - lower-cases everything ("They" -> "they", "Siri" -> "siri"),
#   - drops punctuation such as "!" and ".",
#   - drops one-letter tokens such as "I"; the apostrophe also splits
#     "don't" / "won't", so only "don" / "won" survive.

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [6]:
# Tokenize every review by running simple_preprocess over the whole reviewText column.
# The result is a pandas Series in which each element is a list of word tokens.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [7]:
## gensim is an NLP library; create a Word2Vec model (no corpus is passed here, so nothing is trained yet)
model = gensim.models.Word2Vec(
    window=10, # context window: up to 10 words on either side of the target word
    min_count=2, # vocabulary cutoff: ignore words that occur fewer than 2 times in the whole corpus (this is NOT a per-sentence word limit)
    workers=4 # how many worker threads to use for training
)

In [8]:
# Scan the tokenized reviews once to build the vocabulary; this step does not train the model.
# progress_per controls how often gensim reports progress during the scan (here every 1000 items).
# NOTE(review): progress is reported through gensim's logging, not a progress bar - confirm the exact unit in the gensim docs.
model.build_vocab(review_text, progress_per=1000)

In [9]:
model.epochs # number of training epochs; none was passed to Word2Vec above, so this is the default (5, see output)

5

In [10]:
# corpus_count is the number of sentences (here: reviews) that build_vocab saw;
# it is the value passed to train() as total_examples
model.corpus_count

296337

In [11]:
# Train the embeddings on the tokenized reviews for model.epochs passes.
# total_examples tells gensim how many sentences a single pass contains.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91335052, 121496535)

In [12]:
# Words whose learned vectors are closest to "bad"; each result is a (word, similarity score) pair.
# NOTE(review): "good" also appears in the list - presumably because antonyms occur in similar contexts.
model.wv.most_similar("bad")

[('terrible', 0.6901170015335083),
 ('shabby', 0.6550760865211487),
 ('horrible', 0.6299969553947449),
 ('funny', 0.5555282235145569),
 ('chintzy', 0.5481809377670288),
 ('ech', 0.5437890887260437),
 ('upset', 0.536017894744873),
 ('lame', 0.5280051231384277),
 ('awful', 0.5165954828262329),
 ('good', 0.5142667889595032)]

In [13]:
# Cosine similarity between the vectors of two specific words
model.wv.similarity("cheap", "inexpensive")

0.52744764

In [14]:
# Cosine similarity between two near-synonyms
model.wv.similarity("great", "good")

0.7837087

In [15]:
# Cosine similarity between two loosely related words
model.wv.similarity("slow", "steady")

0.37644565