In [1]:
!pip install gensim # NLP library for topic modelling, document indexing and similarity retrieval with large corpora



In [3]:
! pip install python-Levenshtein #contains functions for fast computation of:Levenshtein distance,string similarity,...

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.21.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.21.0
  Downloading Levenshtein-0.21.0-cp39-cp39-win_amd64.whl (101 kB)
Collecting rapidfuzz<4.0.0,>=2.3.0
  Downloading rapidfuzz-3.0.0-cp39-cp39-win_amd64.whl (1.8 MB)
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.21.0 python-Levenshtein-0.21.0 rapidfuzz-3.0.0


In [4]:
import pandas as pd
import gensim

# Load Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [7]:
# Local, uncompressed copy of the reviews file; `lines=True` tells pandas to parse
# the file as one JSON object per line.
reviews_path = 'Cell_Phones_and_Accessories_5.json'
df = pd.read_json(reviews_path, lines=True)
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"
...,...,...,...,...,...,...,...,...,...
194434,A1YMNTFLNDYQ1F,B00LORXVUE,eyeused2loveher,"[0, 0]",Works great just like my original one. I reall...,5,This works just perfect!,1405900800,"07 21, 2014"
194435,A15TX8B2L8B20S,B00LORXVUE,Jon Davidson,"[0, 0]",Great product. Great packaging. High quality a...,5,Great replacement cable. Apple certified,1405900800,"07 21, 2014"
194436,A3JI7QRZO1QG8X,B00LORXVUE,Joyce M. Davidson,"[0, 0]","This is a great cable, just as good as the mor...",5,Real quality,1405900800,"07 21, 2014"
194437,A1NHB2VC68YQNM,B00LORXVUE,Nurse Farrugia,"[0, 0]",I really like it becasue it works well with my...,5,I really like it becasue it works well with my...,1405814400,"07 20, 2014"


In [10]:
# We are going to train a Word2Vec model using only the reviewText column.

## Preprocessing

In [13]:
# Look at the raw text of a single review (row label 1) before any preprocessing.
df.loc[1, 'reviewText']

'These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)'

In [18]:
# simple_preprocess converts a document into a list of lowercase tokens,
# ignoring tokens that are too short or too long.
sample_review = df['reviewText'][1]
gensim.utils.simple_preprocess(sample_review)

['these',
 'stickers',
 'work',
 'like',
 'the',
 'review',
 'says',
 'they',
 'do',
 'they',
 'stick',
 'on',
 'great',
 'and',
 'they',
 'stay',
 'on',
 'the',
 'phone',
 'they',
 'are',
 'super',
 'stylish',
 'and',
 'can',
 'share',
 'them',
 'with',
 'my',
 'sister']

In [21]:
# Tokenise every review with the same preprocessing shown on the sample above;
# the result is a Series holding one list of tokens per review.
tokenize = gensim.utils.simple_preprocess
review_text = df['reviewText'].apply(tokenize)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

# Word2Vec Model
Train the model on the reviews. Use a window of size 10, i.e. up to 10 words before the current word and 10 words after it. Only words that occur at least 2 times in the whole corpus should be considered; configure this using the `min_count` parameter. `workers` defines how many CPU threads are used for training.

In [23]:
# Create the (still untrained) Word2Vec model; no corpus is passed here, so the
# vocabulary is built and the model is trained in separate steps.
model = gensim.models.Word2Vec(
    window=10,    # context: up to 10 words on each side of the current word
    min_count=2,  # ignore words whose total frequency in the corpus is below 2
    workers=4,    # number of worker threads used for training
)

In [24]:
# Build the vocabulary from a sequence of sentences (here: the tokenised reviews).
# This must be run before training.
model.build_vocab(review_text)

In [26]:
# Inspect the number of training epochs configured on the model and the number of
# sentences (reviews) counted while building the vocabulary.
(model.epochs, model.corpus_count)

(5, 194439)

In [27]:
# Update the model's neural weights from the sequence of tokenised reviews,
# reusing the sentence count and epoch setting already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(61503830, 83868975)

### Finding Similar Words and Similarity between words
After training, the model starts to capture how words are used in the reviews, so we can query it for similar words and for the similarity between two words.

In [28]:
# Similarity score between the learned vectors of two words.
model.wv.similarity('bad', 'good')

0.5855808

In [35]:
# The 10 words (the default `topn`) whose vectors are closest to that of 'good',
# each paired with its similarity score.
model.wv.most_similar('good', topn=10)

[('decent', 0.8113340139389038),
 ('great', 0.7846213579177856),
 ('nice', 0.709064781665802),
 ('fantastic', 0.6993702054023743),
 ('superb', 0.6416525840759277),
 ('excellent', 0.6229591369628906),
 ('outstanding', 0.6127968430519104),
 ('terrific', 0.6042064428329468),
 ('awesome', 0.6038155555725098),
 ('wonderful', 0.5899102687835693)]