# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint
### Not for Grading

#  Word2Vec Similarity

In [None]:
#@title Case Study Walkthrough
#@markdown Word2Vec Similarity
from IPython.display import HTML

# Markup for an inline HTML5 player pointing at the walkthrough recording.
walkthrough_markup = """<video width="520" height="440" controls>
  <source src="https://cdn.talentsprint.com/talentsprint/archives/sc/aiml/aiml_2018_b7_hyd/preview_videos/word2vec_similarity.mp4">
</video>
"""

# Leaving the HTML object as the last expression makes the notebook render it.
HTML(walkthrough_markup)

This experiment aims to build an understanding of Word2Vec representations by visualizing them.


In [None]:
# Fetch the pretrained GoogleNews vectors; -nc skips the download when the archive is already present.
! wget -nc https://cdn.talentsprint.com/talentsprint1/archives/sc/aiml/experiment_related_data/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.rar
# -o+ overwrites without prompting, so re-running the cell cannot hang on an interactive question.
# The archive is referenced relative to the working directory, where wget saved it.
! unrar e -o+ AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.rar
# The URL is quoted so the shell never treats '?' as a glob; -O names the output file directly
# (no follow-up rename needed) and overwrites any earlier copy.
! wget -O AIML_DS_WORD2VEC2D_STD.pkl.zip "https://www.dropbox.com/s/fm7nvhyvekhaka4/AIML_DS_WORD2VEC2D_STD.pkl.zip?dl=1"
# -o overwrites existing files without prompting.
! unzip -o AIML_DS_WORD2VEC2D_STD.pkl.zip
    

### Importing required packages

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import gensim
import matplotlib.pyplot as plt
import pickle

### Loading Word2vec pretrained model

In [None]:
# Read the binary word2vec file extracted by the download cell; `limit` caps how
# many vectors are read from the file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin',
    binary=True,
    limit=500000,
)

In [None]:
# Sample vocabulary used by the plots further down.
words = [
    'man',
    'woman',
    'king',
    'queen',
]

### Visualising and plotting the reduced word2vec representations

In [None]:
def words_plot(words, word_pairs, elev=20, azim=32, dimensions = 3, lines = True,
               perplexity = None, random_state = None):
  """Project word vectors with t-SNE and draw them as a labelled scatter plot.

  Each word is looked up in the module-level `model`, the vectors are reduced
  with `TSNE`, and every point is drawn in red with the word as its text label.

  Args:
    words: sequence of words to plot; each must be a key of `model`.
    word_pairs: iterable of word groups; when `lines` is true, one line is
      drawn through the points of each group. Every word in a group must
      also appear in `words`.
    elev, azim: camera elevation/azimuth, used only for the 3-D plot.
    dimensions: 3 gives a 3-D plot; any other value gives a 2-D plot.
    lines: whether to connect the words of each entry in `word_pairs`.
    perplexity: t-SNE perplexity. When None it is set to
      min(30, len(words) - 1), because scikit-learn requires the perplexity
      to be smaller than the number of samples.
    random_state: forwarded to `TSNE`; pass an int for a reproducible layout.

  Raises:
    ValueError: if fewer than two words are given, or a word in
      `word_pairs` is missing from `words`.
  """
  words = list(words)
  if len(words) < 2:
    raise ValueError('words_plot needs at least two words to run t-SNE')

  embeddings = np.array([model[word] for word in words])

  n_components = 3 if dimensions == 3 else 2
  if perplexity is None:
    # 30 is the TSNE default; small word lists need a smaller value.
    perplexity = min(30, len(words) - 1)
  reduced = TSNE(n_components=n_components, perplexity=perplexity,
                 random_state=random_state).fit_transform(embeddings)

  fig = plt.figure(figsize=(10,10))
  if n_components == 3:
    ax = fig.add_subplot(111, projection='3d')
  else:
    ax = fig.add_subplot(111)

  # One red marker plus a text label per word; `point` holds 2 or 3 coordinates.
  for word, point in zip(words, reduced):
    ax.scatter(*point, c='r', marker='o')
    ax.text(*point, word)

  if lines:
    for pair in word_pairs:
      rows = [words.index(word) for word in pair]
      # Draw once per pair, after all of its points are collected;
      # the transpose yields one coordinate array per axis.
      ax.plot(*reduced[rows].T)

  ax.set_xlabel('X')
  ax.set_ylabel('Y')
  if n_components == 3:
    ax.set_zlabel('Z')

  ax.grid(True)
  if n_components == 3:
    ax.view_init(elev=elev, azim=azim)
  plt.show()

In [None]:
def plot_values(values, labels, figsize = (8,4), c = None):
    """Scatter-plot 2-D points and annotate each one with its label.

    Args:
        values: iterable of points; element 0 of each point is used as x and
            element 1 as y.
        labels: sequence of labels; labels[i] annotates the i-th point.
        figsize: size passed to `plt.figure`.
        c: optional sequence of colours, one per label. When None or empty,
            every point is drawn in the first colour of the matplotlib
            colour cycle ('C0').

    Raises:
        IndexError: if `values` or a non-empty `c` has fewer entries than
            `labels`.
    """
    points = [(value[0], value[1]) for value in values]

    # The old default (`c = []`) was indexed per label and therefore raised
    # IndexError whenever the caller relied on it; fall back to one colour.
    if c is None or len(c) == 0:
        colours = ['C0'] * len(labels)
    else:
        colours = c

    plt.figure(figsize=figsize)
    for i, label in enumerate(labels):
        px, py = points[i]
        plt.scatter(px, py, color=colours[i])
        plt.annotate(label,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()


# The .pkl file is an already-trained file containing the two-dimensional
# representation of each word.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# when it comes from the trusted course download.
with open('AIML_DS_WORD2VEC2D_STD.pkl', 'rb') as pkl_file:
    # The context manager closes the file handle once loading is done.
    two_dim_model = pickle.load(pkl_file)

# Each entry unpacks as (vector, word); index the vectors by their word.
wv_labels = {word: vec for vec, word in two_dim_model}

colors = ['blue'] * len(wv_labels)

plot_values(wv_labels.values(), list(wv_labels.keys()), figsize = (16, 9), c = colors)

In [None]:
# Plot only a hand-picked subset of the words from the 2-D lookup table.
wv_list = [
    'king', 'queen', 'man', 'woman',
    'Germany', 'France', 'Berlin', 'Paris',
    'best', 'good', 'strong', 'strongest',
]
wv_new_labels = {word: wv_labels[word] for word in wv_list}

colors = ['green'] * len(wv_new_labels)
plot_values(wv_new_labels.values(), list(wv_new_labels), c = colors)

### Representing Man, Woman, King, Queen

In [None]:
# The word list is the pairs flattened in order: man, woman, king, queen.
word_pairs = [['man', 'woman'], ['king', 'queen']]
words = [word for pair in word_pairs for word in pair]
words_plot(words, word_pairs, lines=False, dimensions=2)

### Representing Countries and their capitals

In [None]:
# (country, capital) pairs; the flat word list keeps the same order.
word_pairs = [
    ['Spain', 'Madrid'],
    ['Italy', 'Rome'],
    ['Germany', 'Berlin'],
    ['Turkey', 'Ankara'],
    ['Russia', 'Moscow'],
    ['Canada', 'Ottawa'],
    ['Japan', 'Tokyo'],
    ['Vietnam', 'Hanoi'],
    ['China', 'Beijing'],
]
words = list(np.asarray(word_pairs).ravel())
words_plot(words, word_pairs, lines=False, dimensions=2)

### Finding the nearest or most similar words of a word using Word2vec

In [None]:
# For every query word, list the neighbours returned by the model together with
# their similarity scores, numbered from 1.
words = ['France','JESUS', 'XBOX', 'Reddish', 'Scratched', 'MB']
for word in words:
  print('Top 10 similar words for {} are:'.format(word))
  neighbours = model.most_similar(word)
  for rank, (neighbour, score) in enumerate(neighbours, start=1):
    print("\t {}. {} (similarity is {})".format(rank, neighbour, score) )

### Verify Clusters in the Word2vec from the following link:

https://projector.tensorflow.org/

### Understand the semantics preserved by Word2vec by choosing the words along the x and y axes to represent the other words in that coordinate system

https://lamyiowce.github.io/word2viz/