# Word2Vec: Training Word Embeddings with the gensim NLP Library

In [1]:
# pip install gensim
# Gensim is an open-source library for unsupervised topic modeling and natural language processing,
# using modern statistical machine learning.

In [2]:
# conda install -c conda-forge python-levenshtein
# (installed directly from the command line)
# The Levenshtein Python C extension module contains functions for fast computation of:
#   - Levenshtein (edit) distance and edit operations
#   - string similarity
#   - approximate median strings, and generally string averaging
#   - string sequence and set similarity

In [3]:
import numpy as np
import pandas as pd
import gensim
# import Levenshtein

In [4]:
# Load the Amazon mobile-phone reviews and preview the first five rows.
df = pd.read_csv('DataSets/amazon_mobile_reviews.csv')
df.head()

Unnamed: 0,product_name,brand_name,price,rating,reviews,review_votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [5]:
# Demo: split a sample review into individual word tokens with
# gensim.utils.simple_preprocess. In the output the tokens are lower-cased
# and the single-letter word "I" is absent.
# The same function is applied to the whole reviews column in a later cell.
gensim.utils.simple_preprocess("I feel so LUCKY to have found this ")

['feel', 'so', 'lucky', 'to', 'have', 'found', 'this']

In [6]:
# Count the rows whose review text is missing.
df["reviews"].isna().sum()

62

In [7]:
# Keep only the rows that have review text. dropna(subset=...) replaces the
# non-idiomatic `isna() == False` mask and selects the same rows, preserving
# the original index labels.
clean_df = df.dropna(subset=["reviews"])
# Sanity check: no missing reviews should remain (expected value: 0).
clean_df.reviews.isna().sum()

0

In [8]:
# Tokenise every review. Despite its name, review_df is a pandas Series
# (one list of tokens per review), not a DataFrame.
review_df = clean_df["reviews"].apply(gensim.utils.simple_preprocess)
review_df

0         [feel, so, lucky, to, have, found, this, used,...
1         [nice, phone, nice, up, grade, from, my, panta...
2                                           [very, pleased]
3         [it, works, good, but, it, goes, slow, sometim...
4         [great, phone, to, replace, my, lost, phone, t...
                                ...                        
413835                 [another, great, deal, great, price]
413836                                                 [ok]
413837    [passes, every, drop, test, onto, porcelain, t...
413838    [returned, it, because, it, did, not, meet, my...
413839    [only, downside, is, that, apparently, verizon...
Name: reviews, Length: 413778, dtype: object

In [9]:
# Create a Word2Vec model with gensim. No corpus is passed here, so the
# vocabulary is built and training is run explicitly in the cells below.
model = gensim.models.Word2Vec(
    window=10,     # context window size
    min_count=2,   # minimum word frequency threshold (see gensim docs)
    workers=4,     # number of worker threads
)

In [10]:
# Build the vocabulary from the tokenised reviews; progress_per controls how
# often progress is reported.
model.build_vocab(review_df, progress_per=1000)

In [11]:
# Number of training epochs stored on the model. The constructor call above
# did not set it, so this is the library-provided value (5 in the output).
model.epochs

5

In [12]:
# Corpus size recorded on the model; the value shown (413778) equals the
# number of non-null reviews in review_df.
model.corpus_count

413778

In [13]:
# Train on the tokenised reviews, reusing the corpus size and epoch count
# already stored on the model. The call returns a pair of counts, shown below.
model.train(
    review_df,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(57180036, 78161600)

In [14]:
# Nearest neighbours of 'bad' in the learned embedding space.
# NOTE(review): 'good' ranks first in the output; presumably because the
# embeddings reflect shared context rather than sentiment, so antonyms can
# score as highly similar. Verify before using this for sentiment tasks.
model.wv.most_similar('bad')

[('good', 0.6558669209480286),
 ('poor', 0.6330470442771912),
 ('awful', 0.6116631627082825),
 ('terrible', 0.6111147403717041),
 ('weak', 0.6092661619186401),
 ('horrible', 0.5855326056480408),
 ('wack', 0.547021746635437),
 ('sucks', 0.5196340084075928),
 ('disappointing', 0.5165459513664246),
 ('lame', 0.5052154660224915)]

In [15]:
# Nearest neighbours of 'great' in the learned embedding space; the output
# pairs each word with a similarity score.
model.wv.most_similar('great')

[('fantastic', 0.7682809233665466),
 ('good', 0.7477265000343323),
 ('perfect', 0.7203373908996582),
 ('awesome', 0.7076297402381897),
 ('wonderful', 0.67986661195755),
 ('superb', 0.6790093183517456),
 ('nice', 0.6633250713348389),
 ('excellent', 0.6426056623458862),
 ('decent', 0.6194793581962585),
 ('amazing', 0.6176655888557434)]