In [1]:
!pip install tensorflow-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.0 MB)
[K     |████████████████████████████████| 578.0 MB 18 kB/s 
Collecting tensorflow-estimator<2.11,>=2.10.0
  Downloading tensorflow_estimator-2.10.0-py2.py3-none-any.whl (438 kB)
[K     |████████████████████████████████| 438 kB 25.1 MB/s 
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting tensorboard<2.11,>=2.10
  Downloading tensorboard-2.10.0-py3-none-any.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 29.5 MB/s 
Collecting keras<2.11,>=2.10.0
  Downloading keras-2.10.0-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 14.5 MB/s 
Installing collected packages: tensorflow-estimator, tensorboard, keras, gast, tensorflow-gpu
  Attempting uninstall: tensorflow-estimator
    F

In [2]:
import tensorflow as tf
# Print the version of the TensorFlow build the kernel actually imported.
print(tf.__version__)

2.10.0


In [3]:
from tensorflow.keras.preprocessing.text import one_hot

In [23]:
# To learn word vectors (word2vec-style) with Keras, first encode every word as an integer index with `one_hot`, then pass those indices through an Embedding layer.

In [4]:
# Toy corpus: seven short sentences used by every later cell.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'I would like to travel',
]

In [5]:
# Display the corpus as the cell's output.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'I would like to travel']

In [9]:
# Size of the index space handed to `one_hot`. It is a hyperparameter, not a
# count of the distinct words in the corpus.
# `one_hot` hashes each word to an integer in [1, voc_size), so the indices in
# the next cell's output are neither alphabetical nor in order of appearance.
# Index 0 is never given to a word, and two different words may collide on the
# same index; a voc_size far above the ~20 distinct words used here keeps
# collisions unlikely.
#
# Conceptually, each index stands for a one-hot vector of length voc_size.
# With a vocabulary of 4 and the illustrative assignment
# the - 0, glass - 1, of - 2, milk - 3, 'the glass of milk' would be:
# [1,0,0,0]
# [0,1,0,0]
# [0,0,1,0]
# [0,0,0,1]
# Keras never builds these vectors; it returns only the integer indices.
voc_size = 500

In [12]:
# Encode each sentence as a list of integer word indices in [1, voc_size).
# NOTE(review): per the Keras docs, `one_hot` relies on Python's built-in `hash`,
# which is not stable across runs, so the exact indices may change after a
# kernel restart -- confirm before depending on the numbers shown below.
one_hot_resp = [one_hot(sentence, voc_size) for sentence in sent]
one_hot_resp

[[245, 137, 361, 59],
 [245, 137, 361, 172],
 [245, 267, 361, 27],
 [493, 114, 222, 242, 74],
 [493, 114, 222, 242, 93],
 [86, 245, 30, 361, 237],
 [493, 391, 25, 97, 282]]

## Word2Vec

In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [15]:
# Bring every encoded sentence to a fixed length of sent_length by inserting zeros.
# padding='pre' puts the zeros in front of the words; 'post' would put them after.
sent_length = 8
embedded_docs = pad_sequences(one_hot_resp, maxlen=sent_length, padding='pre')
embedded_docs

array([[  0,   0,   0,   0, 245, 137, 361,  59],
       [  0,   0,   0,   0, 245, 137, 361, 172],
       [  0,   0,   0,   0, 245, 267, 361,  27],
       [  0,   0,   0, 493, 114, 222, 242,  74],
       [  0,   0,   0, 493, 114, 222, 242,  93],
       [  0,   0,   0,  86, 245,  30, 361, 237],
       [  0,   0,   0, 493, 391,  25,  97, 282]], dtype=int32)

In [25]:
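# Keras can learn word vectors (in the spirit of word2vec) with its Embedding layer.
# The Embedding layer is trained as part of a deep-learning model, whereas Gensim is a standalone machine-learning library that trains word2vec separately from the model.
# When the downstream model is an LSTM, we prefer the Keras Embedding layer over Gensim, so that the word vectors are learned together with the rest of the network.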

In [17]:
# Number of features in each word vector.
dim = 10

# One-layer model: the Embedding layer maps each of the voc_size possible
# indices to a dim-dimensional vector, for inputs of sent_length indices.
model = Sequential([Embedding(voc_size, dim, input_length=sent_length)])
model.compile(optimizer='adam', loss='mse')

In [20]:
# Show the layer stack and parameter counts (voc_size * dim weights for the embedding).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             5000      
                                                                 
Total params: 5,000
Trainable params: 5,000
Non-trainable params: 0
_________________________________________________________________


In [21]:
# First padded sentence: the leading zeros are padding, the rest are word indices.
embedded_docs[0]

array([  0,   0,   0,   0, 245, 137, 361,  59], dtype=int32)

In [27]:
model.predict(embedded_docs)

# With dim = 10, every position in a sentence comes back as a vector of 10 features.
# With dim = 100, every position would instead come back as a vector of 100 features.
# The padding zeros in embedded_docs are embedded too: index 0 is an ordinary row of the embedding table, which is why the leading padded positions of a sentence all produce the same vector in the output.
# That vector is not hidden: it should be readable as model.layers[0].get_weights()[0][0] (TODO confirm), and Embedding(..., mask_zero=True) exists so that downstream layers can treat index 0 as padding.
# NOTE(review): no training call is visible in this part of the notebook, so these look like the layer's initial weights rather than learned word vectors -- confirm.



array([[[-0.0137907 , -0.0223021 , -0.04769836, -0.00351093,
          0.03452355,  0.00874112, -0.04502822,  0.03693154,
         -0.00686888, -0.03162664],
        [-0.0137907 , -0.0223021 , -0.04769836, -0.00351093,
          0.03452355,  0.00874112, -0.04502822,  0.03693154,
         -0.00686888, -0.03162664],
        [-0.0137907 , -0.0223021 , -0.04769836, -0.00351093,
          0.03452355,  0.00874112, -0.04502822,  0.03693154,
         -0.00686888, -0.03162664],
        [-0.0137907 , -0.0223021 , -0.04769836, -0.00351093,
          0.03452355,  0.00874112, -0.04502822,  0.03693154,
         -0.00686888, -0.03162664],
        [ 0.0365336 , -0.0460367 , -0.03142039, -0.02069883,
          0.02856798,  0.01409954, -0.01687599,  0.03492972,
          0.01923418,  0.02435598],
        [-0.01635777, -0.02715552,  0.04863297, -0.04636285,
          0.03903886,  0.02146456,  0.0410204 ,  0.04171983,
          0.00306548,  0.04985717],
        [ 0.02827134, -0.00559012, -0.00114548,  0.0