## Training own Word2Vec

In [1]:
# Toy training corpus for Word2Vec: one inner list per sentence, one string per token.
tokenized_sentences = [
    line.split()
    for line in (
        'Hello Welcome to AI Program by John',
        'Hello Glad to have you here',
        'Hello Welcome back',
        'Welcome to DS Program by John',
    )
]

In [2]:
#Training Word2vec model
from gensim.models import Word2Vec

In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library deprecation warnings, so
# calls that newer library versions no longer support will not be flagged.
warnings.filterwarnings('ignore')

In [7]:
# Train a Word2Vec model on the toy corpus.
#   min_count=1 keeps every word that occurs at least once. With min_count=2,
#     any word seen fewer than two times would be dropped from the vocabulary.
#   size=200 gives each word a 200-dimensional vector.
#   seed=1, workers=1 make training repeatable: the seed is pinned explicitly and
#     a single worker thread avoids ordering jitter from thread scheduling.
#     NOTE(review): gensim's docs say full run-to-run reproducibility on Python 3
#     also needs PYTHONHASHSEED to be fixed — confirm if exact vectors matter.
model = Word2Vec(sentences=tokenized_sentences, min_count=1, size=200,
                 seed=1, workers=1)

In [8]:
print(model) # size = 200 features/characteristics per word (set by size=200 at training time)

Word2Vec(vocab=13, size=200, alpha=0.025)


In [9]:
# Gather every token in the trained model's vocabulary, then display the list.
words = [token for token in model.wv.vocab]
words

['Hello',
 'Welcome',
 'to',
 'AI',
 'Program',
 'by',
 'John',
 'Glad',
 'have',
 'you',
 'here',
 'back',
 'DS']

In [10]:
# Show the learned embedding for the token 'AI': one float per feature, 200 in
# total (the size chosen when the model was trained).
# The lookup goes through model.wv, the same accessor used to list the
# vocabulary. Indexing the Word2Vec object directly is deprecated in gensim 3.x
# and removed in gensim 4.0.
print(model.wv['AI'])

[ 1.14093139e-03 -1.47150701e-03  7.06094725e-04 -3.38953862e-04
 -7.76328088e-04 -1.67729321e-03  2.22085346e-03 -1.17279589e-04
  9.55864438e-04 -1.94632681e-03  2.41367248e-04 -2.33507110e-03
 -2.40847724e-03 -1.15489622e-03 -1.93518435e-03 -1.38115836e-03
 -7.09308486e-04  1.09901011e-04 -3.84359068e-04 -2.30878289e-03
  1.62658258e-03 -2.88849173e-04  9.21105093e-05  2.72227218e-04
  6.09109702e-05  7.57335685e-04  7.96799315e-04 -1.79839681e-03
  1.14045455e-03 -9.06046887e-04 -4.97762579e-04 -5.97522245e-04
 -1.83894392e-03  2.46354402e-03  1.63559732e-03  3.50400980e-04
 -2.32593878e-03  1.13409408e-03 -3.12731107e-04  2.98231113e-04
  2.16151879e-04  2.49255623e-04  6.55717275e-04  2.08188058e-03
 -9.70693422e-04  1.02843815e-05  1.13773427e-03  1.27800822e-03
  5.93503355e-04 -1.91714367e-04 -2.08732137e-03  2.43461970e-03
 -1.98970200e-03 -2.28993059e-03 -1.13298791e-03 -2.11226754e-03
  2.40247790e-03 -2.32187752e-03  1.22716487e-03 -1.46973506e-03
 -2.17695534e-03  2.21938

In [12]:
# 'Bottle' never appears in the training sentences, so this lookup raises
# KeyError on purpose: a self-trained model has no vector for unseen words.
print(model.wv['Bottle'])

KeyError: ignored

In [13]:
# Compare the vector of 'Hello' with the vectors of the other vocabulary words
# and list the closest ones with their similarity scores.
# Called on model.wv because the model-level most_similar() is deprecated in
# gensim 3.x and removed in gensim 4.0.
model.wv.most_similar('Hello')

[('Program', 0.08773721009492874),
 ('by', 0.07919643819332123),
 ('John', 0.057381052523851395),
 ('to', 0.0523456446826458),
 ('back', 0.042167533189058304),
 ('AI', 0.036395810544490814),
 ('here', -0.010298669338226318),
 ('you', -0.010443344712257385),
 ('DS', -0.02518000453710556),
 ('Welcome', -0.03465665876865387)]

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): no visible cell reads from /content/drive — presumably meant for
# the pretrained vectors file loaded later; confirm or remove.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 'Apple' is not in the training sentences, so this raises KeyError on purpose:
# similarity queries only work for words the model saw during training.
# Called on model.wv because the model-level most_similar() is deprecated in
# gensim 3.x and removed in gensim 4.0.
model.wv.most_similar('Apple')

KeyError: ignored

## LP 

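Training our own model from scratch has a big limitation: the model only knows the words it saw during training, so it cannot produce a vector for any new (out-of-vocabulary) word — this is why the lookups for 'Bottle' and 'Apple' above raise a `KeyError`. That is the disadvantage of training our own network on a small corpus.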

## -------------------------------------------------------

## Without Word2vec how can we create our own word embeddings?

##### Keras to the rescue: the Keras `Embedding` layer.

In [14]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [15]:
# Toy corpus for the classifier: six short sentences, one raw string each.
sentences = [
    'Hello how are you',
    'how are you',
    'whats going on',
    'I am doing great',
    'You are awesome',
    'I really love you so much',
]

In [16]:
# One binary label per entry of `sentences`, in the same order.
# NOTE(review): the meaning of 1 vs 0 is not stated anywhere — presumably
# question/greeting vs statement; confirm.
sent_labels = array([1,1,1,0,0,0])

In [17]:
# Size of the integer range that one_hot hashes words into. This is the number
# of available word indices, not the embedding dimension.
vocab_size = 50
# Turn each sentence into a list of integers, one per word.
# The encoding is hash-based, so different words can share an index (in the
# recorded run, 'Hello' and 'you' both map to the same value).
encoded_sent = [one_hot(text, vocab_size) for text in sentences]
encoded_sent

[[46, 45, 30, 46],
 [45, 30, 46],
 [5, 33, 32],
 [4, 34, 32, 37],
 [46, 30, 20],
 [4, 41, 18, 46, 1, 25]]

In [18]:
# The network needs every input to have the same length, so left-pad ('pre')
# each encoded sentence with zeros up to 6 entries — the word count of the
# longest training sentence.
padded_sent = pad_sequences(
    encoded_sent,
    maxlen=6,
    padding='pre',
)
print(padded_sent)

[[ 0  0 46 45 30 46]
 [ 0  0  0 45 30 46]
 [ 0  0  0  5 33 32]
 [ 0  0  4 34 32 37]
 [ 0  0  0 46 30 20]
 [ 4 41 18 46  1 25]]


In [39]:
# Small binary classifier: embedding -> flatten -> single sigmoid unit.
mymodel = Sequential([
    # Map each of the vocab_size word indices to a 50-dimensional vector.
    # input_length=6 is the padded sentence length (6 tokens per input).
    Embedding(vocab_size, output_dim=50, input_length=6),
    # Collapse the per-token vectors into one flat feature vector.
    Flatten(),
    # One sigmoid output because the target is a binary (0/1) label.
    Dense(1, activation='sigmoid'),
])

In [40]:
# Configure training: Adam optimizer, binary cross-entropy loss (pairs with the
# single sigmoid output), and accuracy reported as the metric.
mymodel.compile(optimizer='adam',loss = 'binary_crossentropy',metrics = ['accuracy'])

In [41]:
# Train for 30 passes over the padded sentences and their labels.
# NOTE(review): this cell was last run as In [41], after the prediction cell
# (In [28]) rebound `padded_sent` to a 2-row array while `sent_labels` still has
# 6 entries — presumably the cause of the recorded ValueError. Confirm with
# Restart & Run All.
mymodel.fit(padded_sent,sent_labels,epochs = 30)

ValueError: ignored

In [24]:
#Evaluate the model; with the compile settings used here the result is [loss, accuracy].
# Uses the same `padded_sent` / `sent_labels` passed to fit(), so this measures
# performance on the training data, not on held-out data.
mymodel.evaluate(padded_sent,sent_labels,verbose = 1)



[0.5470131039619446, 1.0]

In [25]:
# Re-run the evaluation, unpack loss and accuracy, and print accuracy as a percentage.
model_loss,model_accuracy = mymodel.evaluate(padded_sent,sent_labels,verbose = 1)
print('Accuracy : %f' %(model_accuracy*100))

Accuracy : 100.000000


### The Prediction part

In [26]:
# Sentences to score. Both also appear in the training list `sentences`
# (the first is labelled 1 there, the second 0).
sent_for_pred = ['Hello how are you',
                 'I am doing great']

In [27]:
vocab_size = 50 # Not a dimension: the hashing range given to one_hot; same value as used for the training sentences
# Encode the prediction sentences the same way as the training sentences so the
# word indices line up with what the network saw.
encoded = [one_hot(i,vocab_size)for i in sent_for_pred]
encoded

[[46, 45, 30, 46], [4, 34, 32, 37]]

In [28]:
# Pad the prediction inputs to length 6, the input length the network was built with.
# NOTE(review): this rebinds `padded_sent`, the name that also holds the 6-row
# training matrix used by the fit/evaluate cells. Re-running those cells after
# this one pairs 2 input rows with 6 labels. Consider a separate name
# (e.g. padded_pred) here and in the predict cell.
padded_sent = pad_sequences(encoded,maxlen = 6,padding='pre')
print(padded_sent)

[[ 0  0 46 45 30 46]
 [ 0  0  4 34 32 37]]


In [29]:
# Run the network on the padded inputs; each row of the result is the sigmoid
# output for one input sentence.
predict_x = mymodel.predict(padded_sent)
predict_x


array([[0.59334815],
       [0.44221407]], dtype=float32)

## LP
Here the vocabulary is very small, so the model's performance won't be that great.

## -------------------------------------------------------------

## 3. Using Pretrained Word2vec model

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Load pretrained word vectors from a binary word2vec-format file.
# NOTE(review): the path is relative to the current working directory and the
# recorded run ended in FileNotFoundError — presumably the file lives on the
# mounted Drive; confirm and point this at its actual location.
pretrained_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary = True)

FileNotFoundError: ignored

In [None]:
# Ask the pretrained vectors for the words closest to 'data' and print them.
# The result is bound to `result` before printing; previously the return value
# was discarded and print(result) referred to a name that was never assigned.
result = pretrained_model.most_similar('data')
print(result)