# Working with pre-trained word vectors

## Loading the vectors

We download a small set of French word vectors trained with **Skip-Gram**.

The training corpus is the entire French Wikipedia and the vocabulary corresponds to the 100,000 most frequent words. Each word in the vocabulary is mapped to a vector $u \in \mathbb{R}^{300}$.

You'll find word vectors in other languages (for much larger vocabularies) over here: https://fasttext.cc/docs/en/crawl-vectors.html#models

# Importing packages

In [None]:
import numpy as np
from numpy.linalg import norm
import pandas as pd

## Read the vocabulary and load the vectors 

In [None]:
# Unzip the archive containing the subset of embedded words (the zip file must already be in the working directory)
! unzip wiki.fr.100k.vec.zip

In [None]:
#! wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
#! unzip wiki-news-300d-1M.vec.zip

In [None]:
## Read the vocabulary and the vectors
# Column 0 holds the word, columns 1..300 hold the vector components.
# skiprows=1 skips the first line of the file (presumably the "<count> <dim>"
# header of the .vec format -- confirm against the file).
data = pd.read_csv(
    "wiki.fr.100k.vec",
    sep=" ",
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are ordinary text
    header=None,
    skiprows=1,
    # Keep every word as text: without these two options pandas turns tokens
    # such as "nan", "null" or "NA" into float NaN, which later breaks
    # string operations on the vocabulary.
    dtype={0: str},
    keep_default_na=False,
)
vocabulary = data[0].tolist()
vectors = data[range(1, 301)].values

## From words to vectors

In [None]:
# Look up the vector of a word; unknown words yield None
def get_vector(word):
    """Return a copy of the embedding of `word`, or None if it is not in the vocabulary.

    The argument is converted to `str` and lower-cased before the lookup in
    the module-level `vocabulary` list; the matching row of `vectors` is
    returned as a new numpy array.
    """
    key = str(word).lower()
    try:
        row = vocabulary.index(key)
    except ValueError:
        # list.index raises ValueError when the word is absent
        return None
    return np.array(vectors[row])

## Getting similar words

We usually measure the similarity between word vectors in terms of the angle between them. More precisely, we rely on the cosine of this angle: $\cos(\theta_{u_1, u_2}) = \frac{u_1 \cdot u_2}{||u_1|| \times ||u_2||}$

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_most_similar_words(vector, n=10):
    """Return the `n` vocabulary words most cosine-similar to `vector`.

    Parameters:
        vector: either a word (str), which is first mapped to its vector
            with `get_vector`, or a 1-D numpy array.
        n: number of words to return.

    Returns:
        A numpy array with the `n` best-ranked words, most similar first.
        No word is filtered out, so a word query is ranked against its own
        vocabulary entry as well.

    Raises:
        ValueError: if a word is given that is not in the vocabulary.
    """
    if isinstance(vector, str):
        word = vector
        vector = get_vector(word)
        if vector is None:
            raise ValueError(f"{word!r} is not in the vocabulary")
    # cosine_similarity expects 2-D inputs: one query row against all vectors
    s = cosine_similarity(vector.reshape(1, -1), vectors)
    # negate so that argsort ranks the highest similarity first
    sorted_ids = np.argsort(-s)[0]
    return np.array([vocabulary[i] for i in sorted_ids[:n]])

get_most_similar_words("fréjus", 5)

## Cosine Kernel

In [None]:
## If we want to use another kernel (instead of cosine kernel) we can define it as follows
# Applying the kernel between two vectors
def f(x, y):
    """Evaluate exp(-sum((x - y)**2 / (x + y))) for two vectors.

    This has the form of the exponential chi-squared kernel with gamma = 1.
    Coordinates where both the squared difference and the sum are zero
    (i.e. both inputs are 0 there) contribute 0 to the sum instead of the
    NaN that 0/0 would produce.

    Parameters:
        x, y: array-likes of the same shape.

    Returns:
        The kernel value as a numpy float.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    squared_diff = (x - y) ** 2
    total = x + y
    # Replace the denominator by 1 only where the term is 0/0, so that the
    # term evaluates to 0; every other coordinate is left untouched.
    both_zero = (squared_diff == 0) & (total == 0)
    safe_total = np.where(both_zero, 1.0, total)
    return np.exp(-np.sum(squared_diff / safe_total))

# g evaluates the kernel f between the vector x and each vector of l
def g(x, l):
    """Return a numpy array holding f(x, y) for every y in `l`, in order."""
    kernel_values = list(map(lambda y: f(x, y), l))
    return np.array(kernel_values)

### A small example applying the cosine function between two vectors

In [None]:
# build the two example vectors
Vector1 = np.array([2, 1, 2, 3, 2, 9])
Vector2 = np.array([3, 4, 2, 4, 5, 5])

print("Vector1:", Vector1)
print("Vector2:", Vector2)

# cosine similarity: dot product divided by the product of the Euclidean norms
cosine_value = (Vector1 @ Vector2) / (norm(Vector1) * norm(Vector2))
print("Cosine Similarity between Vector1 and Vector2:", cosine_value)

### An example of applying the cosine between a single vector on the one hand and a list of vectors on the other hand

In [None]:
# define a matrix whose rows are vectors, and a single query vector
Vectors = np.array([[2, 1, 2], [3, 2, 9], [-1, 2, -3]])
Vector1 = np.array([3, 4, 2])
print("Vectors:\n", Vectors)
# label corrected: this line prints Vector1, not Vectors
print("Vector1:\n", Vector1)
print()

# compute the cosine similarity between every row of Vectors and Vector1:
# norm(..., axis=1) gives one norm per row, so the division is element-wise
cosine_value = np.dot(Vectors, Vector1) / (norm(Vectors, axis=1) * norm(Vector1))
print("Cosine Similarity:\n", cosine_value)

In [None]:
def cosine(A, B):
    """Cosine similarity between `A` and the vector `B`.

    Parameters:
        A: a 2-D array whose rows are vectors (one similarity per row is
            returned), or a single 1-D vector (a scalar is returned).
        B: a 1-D vector, or a word (str) that is first mapped to its vector
            with `get_vector`.

    Returns:
        np.dot(A, B) divided by the product of the norms.

    Raises:
        ValueError: if `B` is a word that is not in the vocabulary.
    """
    if isinstance(B, str):
        word = B
        B = get_vector(word)
        if B is None:
            raise ValueError(f"{word!r} is not in the vocabulary")
    # axis=-1 takes the norm along the last axis: one norm per row for a
    # matrix, and the plain vector norm when A is 1-D
    return np.dot(A, B) / (norm(A, axis=-1) * norm(B))

In [None]:
cosine(Vectors, Vector1)

In [None]:
print(vectors.shape, vectors[:10,1], sep="\n")

In [None]:
def get_word(vector, n=10):
    """Map a vector back to a word.

    A plain `str` argument is returned unchanged. Any other argument is
    compared to every row of `vectors` with `cosine`, and the vocabulary
    word with the highest similarity is returned. The parameter `n` is
    accepted but not used.
    """
    if type(vector) is str:
        return vector
    similarities = cosine(vectors, vector)
    # negating the scores makes argsort rank the best match first
    ranking = np.argsort(-similarities)
    return vocabulary[ranking[0]]

In [None]:
get_word(get_vector("Paris"))

In [None]:
def get_most_similar_words2(word, n=10):
    """Return up to `n` words similar to `word`, excluding the query itself.

    Parameters:
        word: a word (str) or a 1-D vector.
        n: maximum number of words to return.

    Returns:
        A list of at most `n` words, most similar first. The `n + 1`
        best-ranked words are taken, and every candidate that contains the
        reference word as a substring is dropped, so fewer than `n` words may
        be returned. For a word query the reference is the lower-cased word
        (matching the lower-casing done by `get_vector`); for a vector query
        it is the nearest vocabulary word, which is therefore never part of
        the result.

    Raises:
        ValueError: if `word` is a string that is not in the vocabulary.
    """
    is_text = isinstance(word, str)
    vector = get_vector(word) if is_text else word
    if vector is None:
        raise ValueError(f"{word!r} is not in the vocabulary")
    scores = cosine(vectors, vector)
    sorted_ids = np.argsort(-scores)
    # Determine the reference once, instead of re-running the nearest-word
    # search for every candidate.
    reference = word.lower() if is_text else vocabulary[sorted_ids[0]]
    candidates = (vocabulary[i] for i in sorted_ids[:n + 1])
    return [w for w in candidates if reference not in w][:n]

In [None]:
get_most_similar_words2("fréjus", 5)

In [None]:
get_most_similar_words2("uruguay", 5)

## Visualizing the vectors and their relationships: countries & capital cities

Exercise:
- Extract the set of vectors for the words given below
- Linearly project them in 3D using the truncated singular value decomposition
- Plot the words according to the 2nd and 3rd axes

In [None]:
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# (capital, country) pairs, stored flat: even positions hold the capital,
# odd positions the corresponding country
words = [
    'paris', 'france',
    'berlin', 'allemagne',
    'pékin', 'chine',
    'tokyo', 'japon',
    'mexico', 'mexique',
    'caracas', 'venezuela',
]
# one vector per word, in the same order as `words`
word_vectors = [get_vector(word) for word in words]
# project onto 3 components (despite its name, word_vectors_2d has 3 columns)
svd = TruncatedSVD(n_components=3, algorithm="arpack")
word_vectors_2d = svd.fit_transform(word_vectors)

# plot the 2nd and 3rd components (columns 1 and 2)
plt.scatter(word_vectors_2d[:, 1], word_vectors_2d[:, 2])
for i, word in enumerate(words):
    point = (word_vectors_2d[i, 1], word_vectors_2d[i, 2])
    plt.annotate(word, point)
    if not i % 2:
        # draw a segment between the word at an even position and the next one
        pair = word_vectors_2d[i:i+2]
        plt.plot(pair[:, 1], pair[:, 2])
plt.show()

In [None]:
# same plot as before, but using the 1st and 3rd components (columns 0 and 2)
plt.scatter(word_vectors_2d[:, 0], word_vectors_2d[:, 2])
for i, word in enumerate(words):
    point = (word_vectors_2d[i, 0], word_vectors_2d[i, 2])
    plt.annotate(word, point)
    if not i % 2:
        # draw a segment between the word at an even position and the next one
        pair = word_vectors_2d[i:i+2]
        plt.plot(pair[:, 0], pair[:, 2])
plt.show()

## Predicting the capital city from the country

Exercise :
- Based on the previous observation, think of a way to guess the capital city given a country
- Write a function called `find_capital` and try it out!

In [None]:
import numpy as np

# We compute the average difference between the vectors for capital cities and
# the vectors for their countries. `words` alternates capital, country, so
# words[i] is a capital and words[i+1] its country: each term is capital - country.
pair_differences = [
    get_vector(words[i]) - get_vector(words[i + 1])
    for i in range(0, len(words), 2)
]
# Average once over all pairs (dividing inside the accumulation loop would
# rescale the running sum at every iteration).
difference = np.mean(pair_differences, axis=0)

def find_capital(country):
    """Guess the capital of `country`.

    The module-level `difference` vector is added to the country vector and
    the two words closest to the result are retrieved with
    `get_most_similar_words`; the first one that differs from `country` is
    returned, or None when neither does.
    """
    shifted = get_vector(country) + difference
    nearest = get_most_similar_words(shifted, 2)
    return next((w for w in nearest if w != country), None)

In [None]:
find_capital("canada")

In [None]:
# One difference vector (word at even position minus the following word, i.e.
# capital - country for the `words` list) per pair, stacked row-wise.
# np.append without an axis would flatten everything into a single 1-D array.
differences = np.array([
    get_vector(words[i]) - get_vector(words[i + 1])
    for i in range(0, len(words), 2)
])
# Per-dimension mean and median over the pairs (axis=0), computed once
difference2 = np.mean(differences, axis=0)
difference3 = np.median(differences, axis=0)

def find_capital2(country):
    """Return up to three capital candidates for `country`, using `difference2`.

    The country vector is shifted by `difference2`, the 10 most similar words
    are requested from `get_most_similar_words2`, and the words containing
    `country` as a substring are discarded.
    """
    shifted = get_vector(country) + difference2
    neighbours = get_most_similar_words2(shifted, 10)
    return [w for w in neighbours if country not in w][:3]

def find_capital3(country):
    """Return up to three capital candidates for `country`, using `difference3`.

    The country vector is shifted by `difference3`, the 10 most similar words
    are requested from `get_most_similar_words2`, and the words containing
    `country` as a substring are discarded.
    """
    shifted = get_vector(country) + difference3
    neighbours = get_most_similar_words2(shifted, 10)
    return [w for w in neighbours if country not in w][:3]

In [None]:
# three small example points
A = np.array([1, 2])
B = np.array([2, 3])
C = np.array([1, 1])
# print the Euclidean distance from A to B, then from A to C
for other in (B, C):
    print(np.sqrt(np.sum((A - other) ** 2)))

In [None]:
print(find_capital2("uruguay"))
print(find_capital2("canada"))
print(find_capital2("équateur"))
print()
print(find_capital3("uruguay"))
print(find_capital3("canada"))
print(find_capital3("équateur"))

In [None]:
# One difference vector per (words[i], words[i+1]) pair, computed once and
# stacked row-wise
pair_diffs = np.array([
    get_vector(words[i]) - get_vector(words[i + 1])
    for i in range(0, len(words), 2)
])
# Per-dimension L2 aggregate over the pairs: square root of the sum of squares
l2_diff = np.sqrt(np.sum(pair_diffs ** 2, axis=0))
# Per-dimension L1 aggregate over the pairs: sum of absolute values
# (no square root -- that belongs to the L2 case only)
l1_diff = np.sum(np.abs(pair_diffs), axis=0)
print(l2_diff.shape, l1_diff.shape)

In [None]:
def find_capital_l1(country):
    """Return up to three capital candidates for `country`, using `l1_diff`.

    The country vector is shifted by `l1_diff`, the 10 most similar words are
    requested from `get_most_similar_words2`, and the words containing
    `country` as a substring are discarded.
    """
    shifted = get_vector(country) + l1_diff
    neighbours = get_most_similar_words2(shifted, 10)
    return [w for w in neighbours if country not in w][:3]

print(find_capital_l1("uruguay"))
print(find_capital_l1("canada"))
print(find_capital_l1("équateur"))

In [None]:
def find_capital_l2(country):
    """Return up to five capital candidates for `country`, using `l2_diff`.

    The country vector is shifted by `l2_diff`, the 10 most similar words are
    requested from `get_most_similar_words2`, and the words containing
    `country` as a substring are discarded.
    """
    shifted = get_vector(country) + l2_diff
    neighbours = get_most_similar_words2(shifted, 10)
    return [w for w in neighbours if country not in w][:5]

print(find_capital_l2("uruguay"))
print(find_capital_l2("canada"))
print(find_capital_l2("équateur"))

## Solving analogies

We consider analogies of the following form: *A* is to *B* as *C* is to *D*. In terms of the word vectors, we should have: $u_a - u_b = u_c - u_d$

Exercise:
- Write a function that solves such an analogy in the vector space, given *A*, *B* and *C*.
- Try it out!

In [None]:
def solve_analogy(word_a, word_b, word_c):
    """Solve "word_a is to word_b as word_c is to ?".

    From u_a - u_b = u_c - u_d we get u_d = u_c - u_a + u_b. The 10 words
    closest to that vector are retrieved with `get_most_similar_words`, and
    the first one that is not one of the three input words is returned
    (None if all ten are input words).
    """
    given = [word_a, word_b, word_c]
    vector_d = get_vector(word_c) - get_vector(word_a) + get_vector(word_b)
    ranked = get_most_similar_words(vector_d, n=10)
    return next((w for w in ranked if w not in given), None)

print(solve_analogy("homme", "femme", "roi"))
print(solve_analogy("voiture", "voitures", "camion"))
print(solve_analogy("grand", "haut", "petit"))
print(solve_analogy("ciel", "bleu", "feu"))
print(solve_analogy("chiot", "chien", "chaton"))

In [None]:
def solve_analogy2(word_a, word_b, word_c):
    """Solve "word_a is to word_b as word_c is to ?" with `get_most_similar_words2`.

    From u_a - u_b = u_c - u_d we get u_d = u_c - u_a + u_b. The words
    returned by `get_most_similar_words2` for that vector are scanned in
    order, and the first one that is not an input word is returned (None if
    there is no such word).
    """
    given = [word_a, word_b, word_c]
    vector_d = get_vector(word_c) - get_vector(word_a) + get_vector(word_b)
    ranked = get_most_similar_words2(vector_d, n=10)
    return next((w for w in ranked if w not in given), None)

print(solve_analogy2("homme", "femme", "roi"))
print(solve_analogy2("voiture", "voitures", "camion"))
print(solve_analogy2("grand", "haut", "petit"))
print(solve_analogy2("ciel", "bleu", "feu"))
print(solve_analogy2("chiot", "chien", "chaton"))

In [None]:
def solve_analogy2(word_a, word_b, word_c):
    """Solve "word_a is to word_b as word_c is to ?" with `get_most_similar_words2`.

    The target vector is u_d = u_c - u_a + u_b; the first word returned by
    `get_most_similar_words2` for it that is not an input word is the answer
    (None if there is no such word).
    """
    # NOTE: this repeats a definition of solve_analogy2 that already appears
    # earlier in this file
    excluded = [word_a, word_b, word_c]
    vector_d = get_vector(word_c) - get_vector(word_a) + get_vector(word_b)
    neighbours = get_most_similar_words2(vector_d, n=10)
    return next((w for w in neighbours if w not in excluded), None)

print(solve_analogy2("france", "paris", "canada"))

In [None]:
def solve_analogy2(word_a, word_b, word_c):
    """Solve "word_a is to word_b as word_c is to ?" with `get_most_similar_words2`.

    The target vector is u_d = u_c - u_a + u_b; the first word returned by
    `get_most_similar_words2` for it that is not an input word is the answer
    (None if there is no such word).
    """
    # NOTE: this repeats a definition of solve_analogy2 that already appears
    # earlier in this file
    excluded = [word_a, word_b, word_c]
    vector_d = get_vector(word_c) - get_vector(word_a) + get_vector(word_b)
    neighbours = get_most_similar_words2(vector_d, n=10)
    return next((w for w in neighbours if w not in excluded), None)

print(solve_analogy2("espagne", "madrid", "pérou"))