In [1]:
import numpy as np
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import  one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
reviews = [
    'i loves this food',
    'great stuff',
    'food tastes like shit',
    'shitty customer service',
    'meh...experience could have been better',
    'no better restaurant in town',
    'words couldnt begin to describe the marvellous experience'
]

sentiment = np.array([1,1,0,0,0,1,1])

**Using Supervised Learning**

In [3]:
one_hot('cool, job',70) # assigns a unique number to each word based off a prescribed vocabulary size

[50, 26]

In [4]:
vocab_size = 50
encoded_review = [one_hot(item,vocab_size) for item in reviews]
encoded_review

[[48, 33, 13, 29],
 [31, 10],
 [29, 3, 16, 11],
 [24, 13, 24],
 [39, 28, 21, 46, 31, 37],
 [5, 37, 6, 37, 10],
 [13, 3, 8, 36, 2, 9, 9, 28]]

In [5]:
# we need to apply padding to the end of some sentences because not all the sentences are of equal length
max_length = 8 # the max length of the longest sequence of encoded reviews
padded_reviews = pad_sequences(encoded_review,padding='post',maxlen=max_length)
padded_reviews

array([[48, 33, 13, 29,  0,  0,  0,  0],
       [31, 10,  0,  0,  0,  0,  0,  0],
       [29,  3, 16, 11,  0,  0,  0,  0],
       [24, 13, 24,  0,  0,  0,  0,  0],
       [39, 28, 21, 46, 31, 37,  0,  0],
       [ 5, 37,  6, 37, 10,  0,  0,  0],
       [13,  3,  8, 36,  2,  9,  9, 28]], dtype=int32)

In [6]:
embedding_vector_size = 5 # the embedding vector that would chracterize each word
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_size,input_length=max_length,name='embedding'))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))

In [7]:
model.compile(
    metrics=['accuracy'],
    optimizer='adam',
    loss='binary_crossentropy'
)

In [8]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 5)              250       
                                                                 
 flatten (Flatten)           (None, 40)                0         
                                                                 
 dense (Dense)               (None, 1)                 41        
                                                                 
Total params: 291
Trainable params: 291
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.fit(padded_reviews,sentiment,epochs=50,verbose=0)

<keras.callbacks.History at 0x7f80301c1210>

In [10]:
weights = model.get_layer('embedding').get_weights()[0]
len(weights)

50

In [11]:
weights[10] # the weights for the 15th encoded review

array([ 0.00474573, -0.04492563,  0.02518594,  0.04958897, -0.00197494],
      dtype=float32)

In [12]:
weights[15]

array([-0.02769392,  0.00410155,  0.01230291,  0.02313756, -0.02883825],
      dtype=float32)

In [13]:
! pip install gensim
! pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 4.6 MB/s 
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149873 sha256=b4afd9c43c17a8b8b4d4eab1135a4ebcea1cbcc3e31d5b60d50fe8ab3f29d827
  Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.2


Using Self Supervised Methods like Word2Vec

In [14]:
import pandas as pd
import gensim

In [15]:
reviews = []
with open ("Cell_Phones_&_Accessories.txt") as review:
  for line in review:
    if 'text' in line:
      line = line.strip(" ")
      line = line.split(':')
      reviews.append(line[1])
reviews_dict = {'Reviews':reviews}
reviews_data = pd.DataFrame(reviews_dict)

In [16]:
reviews_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47979 entries, 0 to 47978
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Reviews  47979 non-null  object
dtypes: object(1)
memory usage: 375.0+ KB


In [17]:
reviews_data.head()

Unnamed: 0,Reviews
0,"First of all, the company took my money and s..."
1,Great product- tried others and this is a ten...
2,works real good....a little hard to set up......
3,The price was right for this cable ($11.95+$4...
4,this is NOT a DATA CABLE this is only a USB c...


In [18]:
# simple preprocessing 
reviews_data['Prep_Reviews'] = reviews_data['Reviews'].apply(gensim.utils.simple_preprocess)

In [19]:
reviews_data.head()

Unnamed: 0,Reviews,Prep_Reviews
0,"First of all, the company took my money and s...","[first, of, all, the, company, took, my, money..."
1,Great product- tried others and this is a ten...,"[great, product, tried, others, and, this, is,..."
2,works real good....a little hard to set up......,"[works, real, good, little, hard, to, set, up,..."
3,The price was right for this cable ($11.95+$4...,"[the, price, was, right, for, this, cable, com..."
4,this is NOT a DATA CABLE this is only a USB c...,"[this, is, not, data, cable, this, is, only, u..."


In [20]:
# our word to vec model
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4
)

In [21]:
# building our vocabulary
model.build_vocab(reviews_data['Prep_Reviews'],progress_per=1000)

In [23]:
model.train(reviews_data['Prep_Reviews'], total_examples=model.corpus_count,epochs=model.epochs)

(13831981, 18987295)

In [24]:
model.save("./Word2Vec.model")

In [26]:
# computing similarity scores between selected words and ones in our training set
model.wv.most_similar("bad")

[('good', 0.5857530832290649),
 ('terrible', 0.5851250290870667),
 ('poor', 0.576062023639679),
 ('horrible', 0.5547113418579102),
 ('awful', 0.5019140243530273),
 ('lousy', 0.4841746687889099),
 ('fuzzy', 0.4778565466403961),
 ('big', 0.47333118319511414),
 ('frustrated', 0.4658671021461487),
 ('ok', 0.4614107012748718)]

In [32]:
model.wv.similarity("apple",'samsung')

0.33523843