<a href="https://colab.research.google.com/github/134130U/Natural-Language-processing/blob/master/word_vectors_Mbaye_Babou.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import io, sys
import numpy as np
from heapq import *

In [0]:
def load_vectors(filename):
    """Load word vectors from a text file into a dict.

    The first line of the file must contain exactly two integers (parsed
    here only to validate the header). Every following line is split on
    single spaces after stripping trailing whitespace: the first token is
    used as the word and the remaining tokens are parsed as floats.

    Parameters
    ----------
    filename : str
        Path of the vector file, read as UTF-8.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy`` array of floats. If a word occurs
        on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If the header is not two integers or a component is not a float.
    """
    data = {}
    # The context manager guarantees the handle is closed, even on errors.
    with io.open(filename, 'r', encoding='utf-8', newline='\n') as fin:
        # The header values are not needed afterwards, but parsing them
        # makes a file without a valid header fail immediately.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.asarray(list(map(float, tokens[1:])))
    return data

In [0]:
# Print a small banner, then read the pre-trained word vectors from disk

for banner_line in ('', ' ** Word vectors ** ', ''):
    print(banner_line)

word_vectors = load_vectors('wiki.en.vec')



 ** Word vectors ** 



In [0]:
## Cosine similarity between two vectors u and v

def cosine(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    The dot product ``u.dot(v)`` is divided by the product of the two
    Euclidean norms. No guard is applied for zero-norm inputs.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return u.dot(v) / norm_product

## nearest_neighbor (defined in a later cell) returns the word whose
## vector is the nearest neighbor of the vector x.
## Its exclude_words list can be used to leave some words out of
## the nearest-neighbor search.

In [0]:
# Cosine similarity between 'apple' and a few other words

apple_vec = word_vectors['apple']

sim_apples = cosine(apple_vec, word_vectors['apples'])
print('similarity(apple, apples) = %.3f' % sim_apples)

sim_banana = cosine(apple_vec, word_vectors['banana'])
print('similarity(apple, banana) = %.3f' % sim_banana)

sim_tiger = cosine(apple_vec, word_vectors['tiger'])
print('similarity(apple, tiger) = %.3f' % sim_tiger)

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [0]:
## Functions for nearest neighbors

def nearest_neighbor(x, word_vectors, exclude_words=None):
    """Return the word whose vector is most cosine-similar to ``x``.

    Parameters
    ----------
    x : numpy array
        Query vector.
    word_vectors : dict
        Maps words to vectors.
    exclude_words : iterable of str, optional
        Words that must not be returned. Defaults to no exclusions.

    Returns
    -------
    str
        The best-scoring word, or ``''`` when no candidate reaches a score
        of at least -1.0 (for example when every word is excluded).
    """
    # Build the exclusion set once: membership tests run for every word in
    # the vocabulary, and a fresh set avoids a shared mutable default.
    excluded = set(exclude_words or ())

    best_score = -1.0
    best_word = ''
    for word, vector in word_vectors.items():
        if word in excluded:
            continue
        score = cosine(x, vector)
        # `>=` means that among equal scores the word seen last wins.
        # A NaN score compares False and is therefore skipped.
        if score >= best_score:
            best_word = word
            best_score = score

    return best_word

## This function returns the words corresponding to the
## K nearest neighbors of vector x.

def knn(x, vectors, k, exclude=None):
    """Return the ``k`` words most cosine-similar to ``x``.

    Parameters
    ----------
    x : numpy array
        Query vector.
    vectors : dict
        Maps words to vectors.
    k : int
        Number of neighbors wanted. Values <= 0 give an empty list.
    exclude : iterable of str, optional
        Words to leave out of the search. It is never modified.

    Returns
    -------
    list of (score, word)
        Sorted by decreasing score. Contains fewer than ``k`` entries when
        fewer candidates are available.
    """
    # Copy into a set so neither the caller's list nor a shared default is
    # mutated, and so membership tests are cheap.
    excluded = set(exclude or ())

    def _scored_candidates():
        for word, vector in vectors.items():
            if word in excluded:
                continue
            score = cosine(x, vector)
            # NaN (e.g. from a zero-norm vector) cannot be ordered.
            if np.isnan(score):
                continue
            yield score, word

    # One pass over the vocabulary; nlargest keeps only k items in its heap.
    return nlargest(k, _scored_candidates())

In [0]:
# Closest word to 'cat', ignoring 'cat' itself and its plural
cat_neighbor = nearest_neighbor(word_vectors['cat'], word_vectors, ['cat','cats'])
print('The nearest neighbor of cat is: ' + cat_neighbor)

The nearest neighbor of cat is: dog


In [0]:
# Nearest neighbors of the word 'cat'

cat_vec = word_vectors['cat']

print('The nearest neighbor of cat is: ' +
      nearest_neighbor(cat_vec, word_vectors, ['cat', 'cats']))

# Run the vocabulary-wide search once and reuse the result for printing.
knn_cat = knn(cat_vec, word_vectors, 5, ['cat', 'cats'])
print('')
print('cat')
print('--------------')
for score, word in knn_cat:
    print(word + '\t%.3f' % score)

The nearest neighbor of cat is: dog

cat
--------------
dog	0.638
pet	0.573
rabbit	0.549
dogs	0.538
pig	0.458


In [0]:
def analogy(a, b, c, word_vectors):
    """Solve the analogy "a is to b as c is to ?".

    The three input words are lower-cased. Every other word ``w`` is scored
    by the cosine similarity between ``vec(b) - vec(a)`` and
    ``vec(w) - vec(c)``; the best-scoring word is returned.

    Parameters
    ----------
    a, b, c : str
        Words of the analogy; their lower-cased forms must be keys of
        ``word_vectors``.
    word_vectors : dict
        Maps words to numpy vectors.

    Returns
    -------
    str or None
        The best candidate, or ``None`` if no word other than the inputs
        produces a comparable score.

    Raises
    ------
    KeyError
        If a lower-cased input word is missing from ``word_vectors``.
    """
    a, b, c = a.lower(), b.lower(), c.lower()

    # These do not depend on the candidate word, so compute them once
    # instead of on every iteration of the vocabulary scan.
    input_words = {a, b, c}
    target_offset = word_vectors[b] - word_vectors[a]
    c_vector = word_vectors[c]

    max_cosine_sim = -np.inf
    best_word = None
    for word, vector in word_vectors.items():
        # The input words themselves are not valid answers.
        if word in input_words:
            continue

        cos_sim = cosine(target_offset, vector - c_vector)
        # A NaN score compares False here and is therefore skipped.
        if cos_sim > max_cosine_sim:
            max_cosine_sim = cos_sim
            best_word = word
    return best_word

In [0]:
# Word analogies: "a is to b as c is to ?"

print('')

royal_answer = analogy('man', 'king', 'woman', word_vectors)
print('King - Man + Woman = ' + royal_answer)

country_answer = analogy('paris', 'france', 'rome', word_vectors)
print('france - paris + rome = ' + country_answer)


King - Man + Woman = queen
france - paris + rome = italy


In [0]:
## Biases in word vectors: how close is 'genius' to 'man' and to 'woman'?

print('')

sim_genius_man = cosine(word_vectors['man'], word_vectors['genius'])
print('similarity(genius, man) = %.3f' % sim_genius_man)

sim_genius_woman = cosine(word_vectors['woman'], word_vectors['genius'])
print('similarity(genius, woman) = %.3f' % sim_genius_woman)


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [0]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    """Return mean cosine(w, a) over ``A`` minus mean cosine(w, b) over ``B``.

    Parameters
    ----------
    w : str
        Target word; must be a key of ``vectors``.
    A, B : sequence of str
        Attribute word lists; every word must be a key of ``vectors``.
    vectors : dict
        Maps words to numpy vectors. All lookups go through this argument,
        not through any notebook-level global.

    Returns
    -------
    float
        Positive when ``w`` is on average closer to ``A`` than to ``B``.

    Raises
    ------
    ZeroDivisionError
        If ``A`` or ``B`` is empty.
    KeyError
        If a word is missing from ``vectors``.
    """
    w_vector = vectors[w]
    w_A = [cosine(w_vector, vectors[a]) for a in A]
    w_B = [cosine(w_vector, vectors[b]) for b in B]
    strength = (sum(w_A) / len(A)) - (sum(w_B) / len(B))
    return strength

## Word embedding association test between:
##   - two sets of target words X and Y
##   - two sets of attribute words A and B

def weat(X, Y, A, B, vectors):
    """Return the summed association strength of ``X`` minus that of ``Y``.

    Each word's association strength with respect to the attribute sets
    ``A`` and ``B`` is obtained from ``association_strength``; the totals
    for the two target sets are then subtracted.
    """
    total_x = sum(association_strength(word, A, B, vectors) for word in X)
    total_y = sum(association_strength(word, A, B, vectors) for word in Y)
    return total_x - total_y

In [0]:
## Replication of one experiment from the paper
## "Semantics derived automatically from language corpora contain human-like biases"
## by Caliskan, Bryson and Narayanan (2017)

# Target word sets
career = [
    'executive', 'management', 'professional', 'corporation',
    'salary', 'office', 'business', 'career',
]
family = [
    'home', 'parents', 'children', 'family',
    'cousins', 'marriage', 'wedding', 'relatives',
]

# Attribute word sets (first names)
male = [
    'john', 'paul', 'mike', 'kevin',
    'steve', 'greg', 'jeff', 'bill',
]
female = [
    'amy', 'joan', 'lisa', 'sarah',
    'diana', 'kate', 'ann', 'donna',
]

print('')
weat_score = weat(career, family, male, female, word_vectors)
print('Word embedding association test: %.3f' % weat_score)


Word embedding association test: 0.847
