In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import gensim
from tqdm import tqdm

In [2]:
reviews_dataset = pd.read_csv('SentimentReviews.csv')
reviews_dataset.head()

Unnamed: 0.1,Unnamed: 0,brand,categories,manufacturer,reviews.rating,reviews.text,label,reviews.text_reduce
0,0,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",Universal Music Group / Cash Money,5,i love this album. it's very good. more to the...,positive,i love this album. it's very good. more to the...
1,1,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,5,Good flavor. This review was collected as part...,positive,Good flavor. This review was collected as part...
2,2,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,5,Good flavor.,positive,Good flavor.
3,3,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,1,I read through the reviews on here before look...,negative,I read through the reviews on here before look...
4,4,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,1,My husband bought this gel for us. The gel cau...,negative,My husband bought this gel for us. The gel cau...


In [3]:
reviews_dataset.drop(columns=['Unnamed: 0', 'brand', 'categories', 'manufacturer', 'reviews.text', 'label'], inplace=True)

In [4]:
reviews_dataset['reviews.rating'] = reviews_dataset['reviews.rating'].replace([5,4,3,2,1],['Positive', 'Positive', 'OK', 'Negative', 'Negative'])

In [5]:
reviews_dataset.head()

Unnamed: 0,reviews.rating,reviews.text_reduce
0,Positive,i love this album. it's very good. more to the...
1,Positive,Good flavor. This review was collected as part...
2,Positive,Good flavor.
3,Negative,I read through the reviews on here before look...
4,Negative,My husband bought this gel for us. The gel cau...


In [6]:
dataset = list(reviews_dataset['reviews.text_reduce'].values)

In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
corpus = []

for sentences in dataset:
    review = re.sub('[^a-zA-Z0-9]', ' ', sentences)
    review = review.lower()
    tokens = word_tokenize(review)
    
    review = [lemmatizer.lemmatize(word) for word in tokens if word not in list(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus

['love album good hip hop side current pop sound hype listen everyday gym give 5star rating way metaphor crazy',
 'good flavor review collected part promotion',
 'good flavor',
 'read review looking buying one couple lubricant ultimately disappointed even live review read starter neither boyfriend could notice sort enhanced captivating sensation notice however messy consistency reminiscent liquid vaseline difficult clean pleasant especially since lacked captivating sensation expecting disappointed paid much lube use could use normal personal lubricant 1 le money 2 le mess',
 'husband bought gel u gel caused irritation felt like burning skin recommend gel',
 'boyfriend bought spice thing bedroom highly disappointed product bought one absolutely love ky mine thought would similar affect absolutely nothing buy',
 'bought earlier today excited check based product description expecting something like regular ky fan expected left little disappointed',
 'bought product husband try impressed t

In [32]:
words = []

for sent in corpus:
    tokens = word_tokenize(sent)
    words.append(tokens)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Explaination
Word2Vec word embedding is actually a key value pair generator

In [45]:
word2vec = gensim.models.Word2Vec(words, window=5, min_count=2)

In [75]:
print(f"{type(word2vec)} and {type(word2vec.wv)}")

<class 'gensim.models.word2vec.Word2Vec'> and <class 'gensim.models.keyedvectors.KeyedVectors'>


In [79]:
print(word2vec.wv['good'])
# The good key/ word in the vocubulary is having this vector
print()
# Shape of the vector assosiated with every word in the vocubulary
print(word2vec.wv['good'].shape)

[-1.7185261   1.0284404  -1.4655803   0.11658531 -0.6850655   0.19774058
 -0.26923662  1.7252489   0.21765614  0.56984615  0.81728977 -0.24802677
  0.15716285 -0.6800941  -0.5398448   0.41284123  1.2184249  -0.33948818
  1.1167039  -0.44440275 -0.09633159 -0.02157155 -2.0933948  -0.31144232
  0.26399466 -1.3307065   0.21741992 -0.4374115  -0.31249267  0.42049402
 -0.18356466 -0.12452838 -0.74160033  1.0053277  -1.2186155   0.6640162
 -0.11497999 -0.5833646   0.60507923  0.53850895  0.9559871   0.06577678
 -0.23007411  0.09638242  0.2756641   1.4093093  -0.562354    1.6006969
  0.5192513   1.0747559   0.28277925  0.9533813  -0.34668905  0.04445101
  0.95150495 -1.1836061  -0.1479237  -0.9950482   0.44974664  0.9716098
  0.02240847 -1.1363729   1.0421133   0.3547091  -1.5177124  -1.462502
  0.27926084 -1.0676948  -0.81132936  1.520548    0.93635213  1.8477347
  0.32910645 -0.9201583  -0.13229074  0.79785347  0.8906264  -0.71870315
  0.14169954  0.35252658  0.45864815  0.6137815  -1.30936

In [53]:
len(word2vec.wv.index_to_key)

14369

In [56]:
word2vec.corpus_count

70867

In [57]:
word2vec.epochs

5

In [66]:
word2vec.wv.similar_by_word('product')

[('moisturizer', 0.6717165112495422),
 ('conditioner', 0.6115230321884155),
 ('cream', 0.5563006401062012),
 ('moisturizers', 0.5556017756462097),
 ('brand', 0.5458189845085144),
 ('condition', 0.5370082259178162),
 ('lotion', 0.5198933482170105),
 ('bodywash', 0.4905184209346771),
 ('clay', 0.48338696360588074),
 ('result', 0.4815327823162079)]

In [71]:
print(f"""The cosine similarity between the words 'good'|'product' and 'bad'|'product' is {word2vec.wv.similarity('good','product')} and {word2vec.wv.similarity('bad','product')}""")

The cosine similarity between the words 'good'|'product' and 'bad'|'product' is 0.02275700680911541 and -0.010990194045007229


In [80]:
words

[['love',
  'album',
  'good',
  'hip',
  'hop',
  'side',
  'current',
  'pop',
  'sound',
  'hype',
  'listen',
  'everyday',
  'gym',
  'give',
  '5star',
  'rating',
  'way',
  'metaphor',
  'crazy'],
 ['good', 'flavor', 'review', 'collected', 'part', 'promotion'],
 ['good', 'flavor'],
 ['read',
  'review',
  'looking',
  'buying',
  'one',
  'couple',
  'lubricant',
  'ultimately',
  'disappointed',
  'even',
  'live',
  'review',
  'read',
  'starter',
  'neither',
  'boyfriend',
  'could',
  'notice',
  'sort',
  'enhanced',
  'captivating',
  'sensation',
  'notice',
  'however',
  'messy',
  'consistency',
  'reminiscent',
  'liquid',
  'vaseline',
  'difficult',
  'clean',
  'pleasant',
  'especially',
  'since',
  'lacked',
  'captivating',
  'sensation',
  'expecting',
  'disappointed',
  'paid',
  'much',
  'lube',
  'use',
  'could',
  'use',
  'normal',
  'personal',
  'lubricant',
  '1',
  'le',
  'money',
  '2',
  'le',
  'mess'],
 ['husband',
  'bought',
  'gel',
  'u

Now for every sentence in the words array, we need to find the average cbow vector, which will be used for the sequential model. 

In [136]:
def avg_word2vec(doc):
    vectors = [list(word2vec.wv[word]) for word in doc if word in word2vec.wv.index_to_key]
    sum_vect = np.zeros(shape=(100,))
    for vect in vectors:
        sum_vect += vect
    avg = sum_vect/len(words[0])
    return avg

In [137]:
X = []

for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|███████████████████████████████████| 70867/70867 [00:32<00:00, 2172.83it/s]


In [139]:
X = np.array(X)