In [1]:
# Disabled install commands, kept for reference only; this cell does nothing when run.
# !pip install gensim
# !pip install python-Levenshtein

In [2]:
!pip install resampy
!pip install gensim
!pip install python-Levenshtein


Collecting python-Levenshtein
  Using cached python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Using cached levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Using cached rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Using cached python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Using cached levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
Using cached rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.27.1 python-Levenshtein-0.27.1 rapidfuzz-3.14.1


In [3]:
import gensim
import pandas as pd

### Reading and Exploring the Dataset
The dataset used here is a subset of Amazon reviews from the Sports & Outdoors category. The data is stored as a gzip-compressed JSON Lines file (one review per line) and can be read directly with pandas by passing `lines=True`.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz

In [5]:
# Load the review dump into a DataFrame; lines=True tells pandas to parse
# the file as one JSON record per line.
df = pd.read_json(
    '/content/reviews_Sports_and_Outdoors_5.json.gz',
    lines=True,
)

In [6]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

(296337, 9)

### Simple Preprocessing & Tokenization
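The first step in any data science task is to clean the data.
For NLP, this typically means converting all words to lower case, trimming whitespace, and removing punctuation.
We do the same here with gensim's `simple_preprocess`, which lowercases and tokenizes each review and also drops single-character tokens and numbers (note that 'I' and '32' are missing from the tokenized output below).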

Additionally, we could remove stop words such as 'and', 'or', 'is', 'the', 'a', 'an' and reduce words to their root forms (e.g. 'running' to 'run'); this notebook does neither, so those words remain in the tokenized output.

In [7]:
# Tokenize every review: simple_preprocess is applied to each raw review
# string and yields a list of lowercase tokens per review.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)

In [8]:
# Display the tokenized reviews (pandas truncates the long Series when rendering it).
review_text

Unnamed: 0,reviewText
0,"[this, came, in, on, time, and, am, veru, happ..."
1,"[had, factory, glock, tool, that, was, using, ..."
2,"[if, you, don, have, punch, or, would, like, t..."
3,"[this, works, no, better, than, any, punch, yo..."
4,"[purchased, this, thinking, maybe, need, speci..."
...,...
296332,"[this, is, water, bottle, done, right, it, is,..."
296333,"[if, you, re, looking, for, an, insulated, wat..."
296334,"[this, hydracentials, sporty, oz, double, insu..."
296335,"[as, usual, received, this, item, free, in, ex..."


In [9]:
# Inspect the token list produced for the review with index label 0.
review_text.loc[0]

['this',
 'came',
 'in',
 'on',
 'time',
 'and',
 'am',
 'veru',
 'happy',
 'with',
 'it',
 'haved',
 'used',
 'it',
 'already',
 'and',
 'it',
 'makes',
 'taking',
 'out',
 'the',
 'pins',
 'in',
 'my',
 'glock',
 'very',
 'easy']

In [10]:
# Show the raw text of the same review (index label 0) to compare it with its tokens.
df.loc[0, "reviewText"]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

### Training the Word2Vec Model

Train the model on the tokenized reviews. Use a window of size 10, i.e. up to 10 words before the current word and up to 10 words after it. Words that occur fewer than 2 times in the whole corpus are ignored; this is configured with the `min_count` parameter.

The `workers` parameter defines how many CPU threads are used for training.

#### Initialize the model

In [11]:
# Create an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary is built and training is started explicitly in later cells.
#   window=10   -> context window size
#   min_count=2 -> ignore words that occur fewer than 2 times in the corpus
#   workers=4   -> number of worker threads used for training
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)


#### Build Vocabulary

In [12]:
# Scan the tokenized reviews to collect the model's vocabulary;
# progress_per=1000 controls how often progress is logged.
model.build_vocab(
    review_text,
    progress_per=1000,
)

#### Train the Word2Vec Model

In [13]:
# Train on the same corpus that was used to build the vocabulary, reusing
# the example count and epoch count already stored on the model. The call's
# return value is the last expression, so the cell displays it.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91337376, 121496535)

### Finding Similar Words and Similarity between words
https://radimrehurek.com/gensim/models/word2vec.html

In [14]:
# List the 10 vocabulary words whose vectors are closest to "awful",
# each paired with its similarity score.
model.wv.most_similar(positive=["awful"], topn=10)

[('horrible', 0.6977453827857971),
 ('ugly', 0.688739538192749),
 ('terrible', 0.6725580096244812),
 ('overpowering', 0.6188647747039795),
 ('horrendous', 0.5743656158447266),
 ('unusual', 0.5650439262390137),
 ('anomaly', 0.5612172484397888),
 ('funny', 0.5515046119689941),
 ('oreo', 0.5499078035354614),
 ('authentic', 0.5472955107688904)]

In [15]:
# Similarity score between the learned vectors for "good" and "great".
model.wv.similarity("good", "great")

0.7826075

In [16]:
# Similarity score between the learned vectors for "slow" and "steady".
model.wv.similarity("slow", "steady")

0.3734824