# Word Embedding Alignment

In [3]:
#import libraries
import numpy as np
import pandas as pd
import random
import gensim
from gensim.models import KeyedVectors
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

## Read Vocabularies From Embeddings and Subset Bilingual Vocabularies

#### Use this code to extract from the source and target vocabularies n words that appear in both.
#### Note: Skip this part if you already have the data in the folder "embedding/alignemnt/toy_data"

In [None]:
# Load the source- and target-language embeddings from word2vec-format files.
# NOTE(review): PATH_EMBEDDING_SOURCE and PATH_EMBEDDING_TARGET are not defined
# in the visible part of this notebook - set them before running this cell.
# read source vocabulary
source_vocabulary_embeddings = KeyedVectors.load_word2vec_format(PATH_EMBEDDING_SOURCE)
# read target vocabulary
target_vocabulary_embeddings = KeyedVectors.load_word2vec_format(PATH_EMBEDDING_TARGET)

In [None]:
# Get the list of tokens (i.e. the words in the vocabulary) of the source and
# target embeddings; iterating the vocab mapping yields the words themselves.
words_src = list(source_vocabulary_embeddings.vocab)
words_trg = list(target_vocabulary_embeddings.vocab)

# Summary Source
# The vector length is read from the first word of *this* vocabulary.
print("Number of Tokens in source: {}".format(len(words_src)))
print("Dimension of a word vector source embeddings: {}".format(len(source_vocabulary_embeddings[words_src[0]])))

# Summary Target
print("Number of Tokens in target: {}".format(len(words_trg)))
print("Dimension of a word vector target embeddings: {}".format(len(target_vocabulary_embeddings[words_trg[0]])))

In [None]:
# Create dictionaries mapping every word to its position in the vocabulary.
# NOTE(review): WORD_TO_CHECK_SOURCE and WORD_TO_CHECK_TARGET are not defined in
# the visible part of this notebook - set them before running this cell.
# Source
word_to_index_en = {word: i for i, word in enumerate(source_vocabulary_embeddings.vocab)}
print("This is the index of the word you are looking int the source: {}".format(word_to_index_en[WORD_TO_CHECK_SOURCE]))

# Target
# The target word has to be looked up in the target index, not the source one.
word_to_index_it = {word: i for i, word in enumerate(target_vocabulary_embeddings.vocab)}
print("This is the index of the word you are looking int the target: {}".format(word_to_index_it[WORD_TO_CHECK_TARGET]))

In [None]:
def toy_data(source_vocabulary, target_vocabulary, n_words):
    """Sample a toy bilingual vocabulary from the words shared by both embeddings.

    INPUT:
    source_vocabulary: embeddings exposing `index2word` and `get_vector`
    target_vocabulary: embeddings exposing `index2word` and `get_vector`
    n_words: number of shared words to draw at random

    OUTPUT:
    two dictionaries (source, target) with the same keys, each mapping a
    sampled word to its vector in the respective embedding space.
    random.sample raises ValueError when n_words exceeds the number of
    shared words."""

    # words that are spelled identically in both vocabularies
    shared_tokens = set(source_vocabulary.index2word) & set(target_vocabulary.index2word)
    candidate_pairs = [(token, token) for token in shared_tokens]

    # draw n_words (source, target) pairs without replacement
    sampled_pairs = random.sample(candidate_pairs, n_words)

    # look every sampled word up in both embedding spaces
    source_dictionary = {
        token: source_vocabulary.get_vector(str(token)) for token, _ in sampled_pairs
    }
    target_dictionary = {
        token: target_vocabulary.get_vector(str(token)) for token, _ in sampled_pairs
    }

    return source_dictionary, target_dictionary

In [None]:
# generate toy data: sample 20000 shared words from the two loaded vocabularies
# NOTE(review): the original comment labelled this "eng_pt"; the language pair
# depends on which embeddings were loaded - confirm.
source_dictionary, target_dictionary = toy_data(source_vocabulary_embeddings, target_vocabulary_embeddings, 20000)
# check that the source and target subsets have the same number of entries
print("The two subsets have equal number of entries: {}".format(len(source_dictionary.keys()) == len(target_dictionary.keys())))

## Load Data from the folder "embedding/alignemnt/toy_data"

In [5]:
# Read the pre-computed toy dictionaries from file.
# .item() unwraps the single Python object stored in each .npy file; loading
# such object arrays requires allow_pickle=True on current NumPy releases.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
dictionary_eng_it = np.load('toy_data/toy_data_eng-it_20000.npy', allow_pickle=True).item()
dictionary_it_eng = np.load('toy_data/toy_data_it-eng_20000.npy', allow_pickle=True).item()
# check that the two dictionaries contain the same set of words (keys)
print("The two dictionaries have same number of words: {}".format(dictionary_eng_it.keys() == dictionary_it_eng.keys()))

The two dictionaries have same number of words: True


### Split Train and Test Data

In [None]:
def split_toy_data_train_test(source_dictionary, target_dictionary, ratio_train_test):
    """Split paired source/target dictionaries into train and test matrices.

    INPUT:
    source_dictionary: mapping word -> vector for the source language
    target_dictionary: mapping word -> vector for the target language
    ratio_train_test: fraction (0..1) of the pairs assigned to the train split

    OUTPUT:
    source_train, source_test, target_train, target_test as numpy arrays;
    row i of a source matrix and row i of the matching target matrix belong
    to the same pair. The split is drawn with np.random.permutation, so seed
    numpy's global RNG for reproducible splits."""
    if source_dictionary.keys() == target_dictionary.keys():
        # Same key set: pair the rows by key, so a different insertion order
        # in the two dictionaries cannot silently mispair the vectors.
        shared_keys = list(source_dictionary)
        source_matrix = [source_dictionary[key] for key in shared_keys]
        target_matrix = [target_dictionary[key] for key in shared_keys]
    else:
        # Differently keyed dictionaries: keep pairing entries by position.
        source_matrix = list(source_dictionary.values())
        target_matrix = list(target_dictionary.values())

    #select random indices and the position where train ends and test begins
    indices = np.random.permutation(len(source_matrix))
    split_range = int(len(source_matrix) * ratio_train_test)
    training_idx, test_idx = indices[:split_range], indices[split_range:]

    def _take(rows, row_indices):
        # Gather the selected rows into a numpy array.
        return np.array([rows[i] for i in row_indices])

    #select by indices train and test for source and target dictionaries
    source_train = _take(source_matrix, training_idx)
    source_test = _take(source_matrix, test_idx)
    target_train = _take(target_matrix, training_idx)
    target_test = _take(target_matrix, test_idx)

    return source_train, source_test, target_train, target_test

In [None]:
# split the loaded toy dictionaries: 70% of the word pairs for training, 30% for testing
source_train, source_test, target_train, target_test = split_toy_data_train_test(dictionary_eng_it, dictionary_it_eng, 0.7)

In [None]:
# sanity check: show the shapes of the four train/test matrices
print(source_train.shape, source_test.shape, target_train.shape, target_test.shape)

### Learn Transformation to Align Languages

In [None]:
def normalized(a, axis=-1, order=2):
    """Scale a numpy array so every slice along `axis` has unit norm.

    `order` is passed to np.linalg.norm as the norm order (2 by default).
    Slices whose norm is zero are divided by 1, i.e. returned unchanged,
    instead of producing a division by zero."""
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # replace zero norms by 1 so all-zero slices stay all-zero
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

In [None]:
def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal map that aligns the source vectors to the target ones.

    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension); row i of each matrix holds the
    vectors of the i-th word pair of the bilingual dictionary.
    When normalize_vectors is True the rows are rescaled with `normalized`
    before the transformation is computed.
    Returns a matrix W of shape (embedding_dimension, embedding_dimension),
    to be applied as source @ W.
    """
    # optionally bring every training vector to unit length
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    # SVD of the cross-product between the two embedding spaces
    cross_product = source_matrix.T @ target_matrix
    left_vectors, _singular_values, right_vectors_t = np.linalg.svd(cross_product)

    # the product of the two singular-vector factors is the orthogonal alignment
    return left_vectors @ right_vectors_t

In [None]:
def apply_transform(transformation, source_test):
    """Map embeddings into the aligned space.

    Right-multiplies the embedding matrix by the transformation, i.e. it
    returns E * transform for the embeddings E given as `source_test`."""
    aligned = np.matmul(source_test, transformation)
    return aligned

In [None]:
# learn the transformation from the training pairs (rows normalized to unit length first)
transformation = learn_transformation(source_train, target_train, normalize_vectors = True)
# apply the transformation to the held-out test vectors of the source language
source_transformed = apply_transform(transformation, source_test)

### Evaluate Transformation

In [None]:
# number of transformed test vectors (rows)
source_transformed.shape[0]

In [None]:
def cosine_similarity(vec_a, vec_b):
    """Return the cosine of the angle between vec_a and vec_b.

    The dot product of the two vectors is divided by the product of their
    Euclidean norms; no special handling is done for zero vectors."""
    dot_product = np.dot(vec_a, vec_b)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return dot_product / norm_product

In [None]:
def evaluate_proximity(source_transformed, target_test, close_neighbors, metric):
    """Compute accuracy, cosine similarity and euclidean distance between the closest transformed vectors.

    Row i of source_transformed and row i of target_test are treated as the
    same word; a row counts as a match when i is among the close_neighbors
    nearest rows of target_test to source_transformed[i].

    INPUT:
    source_transformed: test set of the source language after transformation
    target_test: test set of the target language
    close_neighbors: number of neighbors to look at
    metric: metric passed to NearestNeighbors, e.g. 'euclidean' or 'cosine'

    OUTPUT (a list, in this order):
    distance_embeddings: mean euclidean distance between all aligned row pairs
    accuracy: percentage of rows whose pair is within the k neighbors, rounded to 2 decimals
    cosine_similarity: mean cosine similarity over the matching rows only
    l2_norm_distance: mean euclidean distance over the matching rows only
    The last two values are nan when no row matches."""
    #fit model to find the closest target vectors to each transformed one
    neighbors = NearestNeighbors(n_neighbors = close_neighbors, metric = metric)
    neighbors.fit(target_test)

    # a single batched query replaces one kneighbors call per row
    neighbor_indices = neighbors.kneighbors(source_transformed)[1]

    n_rows = source_transformed.shape[0]
    row_ids = np.arange(n_rows)

    #distance between the two vocabularies, pair by pair
    distance_embeddings = [np.linalg.norm(source_transformed[i] - target_test[i]) for i in row_ids]

    # a row is a hit when its own index shows up among its nearest target rows
    hits = (neighbor_indices == row_ids[:, None]).any(axis=1)
    matched_rows = np.flatnonzero(hits)

    #evaluate proximity among target and transformed vectors for the hits only
    similarity = [cosine_similarity(source_transformed[i], target_test[i]) for i in matched_rows]
    distance = [distance_embeddings[i] for i in matched_rows]

    def _mean_or_nan(values):
        # np.mean([]) also yields nan but emits a RuntimeWarning; be explicit.
        return np.mean(values) if len(values) else float('nan')

    accuracy = round(int(hits.sum()) / n_rows * 100, 2)
    return [np.mean(distance_embeddings), accuracy, _mean_or_nan(similarity), _mean_or_nan(distance)]

In [None]:
def evaluate(source_test, source_transformed, target_test):
    """Evaluate the alignment before and after the transformation.

    INPUT:
    source_test: test set of the source language (not transformed)
    source_transformed: test set of the source language after transformation
    target_test: test set of the target language

    OUTPUT:
    four dictionaries, in this order: before/cosine, before/euclidean,
    after/cosine, after/euclidean. Each maps a number of neighbors
    (1, 5, 10) to the list returned by evaluate_proximity."""
    neighbors = [1, 5, 10]

    def _scores(source, metric):
        # One evaluate_proximity result per neighborhood size.
        return {neighbor: evaluate_proximity(source, target_test, neighbor, metric)
                for neighbor in neighbors}

    # "before" always uses the raw source vectors and "after" the transformed
    # ones; the choice no longer depends on comparing a single coordinate.
    evaluation_before_cosine = _scores(source_test, 'cosine')
    evaluation_before_euclidean = _scores(source_test, 'euclidean')
    evaluation_after_cosine = _scores(source_transformed, 'cosine')
    evaluation_after_euclidean = _scores(source_transformed, 'euclidean')

    return evaluation_before_cosine, evaluation_before_euclidean, evaluation_after_cosine, evaluation_after_euclidean

In [None]:
#evaluate the source-to-target alignment before and after the transformation
evaluation_before_cosine, evaluation_before_euclidean, evaluation_after_cosine, evaluation_after_euclidean = evaluate(source_test, source_transformed, target_test)

In [None]:
#set alignment evaluation table: one row per value returned by evaluate_proximity
index = ['distance_embeddings', 'accuracy', 'cosine_similarity', 'euclidean_distance']

# (metric label, scores before the transformation, scores after it)
evaluations_by_metric = [('COSINE', evaluation_before_cosine, evaluation_after_cosine),
                         ('EUCLIDEAN', evaluation_before_euclidean, evaluation_after_euclidean)]

# Build the three-level column labels (K@k, metric, transformed or not) and
# the matching value columns together. The final labels are created directly:
# the third level repeats the same two names, which cannot be produced by
# renaming the levels of an existing MultiIndex.
columns = []
values = []
for k in (1, 5, 10):
    for metric_name, before, after in evaluations_by_metric:
        columns.append(('K@{}'.format(k), metric_name, 'non_tranformed'))
        values.append(list(before[k]))
        columns.append(('K@{}'.format(k), metric_name, 'tranformed'))
        values.append(list(after[k]))

#dump evaluation within dataframe (values holds one list per column, hence the transpose)
df_eng_it = pd.DataFrame(np.array(values).T, index=index,
                         columns=pd.MultiIndex.from_tuples(columns))
df_eng_it

In [None]:
#do not run if not needed
#save evaluation table to csv (overwrites an existing file with the same name)
# NOTE(review): the file name says "it_eng_30k" while the table is df_eng_it and
# the toy data files loaded above are the 20000-word ones - confirm the name.
df_eng_it.to_csv('evaluation_embedding_it_eng_30k.csv')

In [None]:
#read evaluation
# header=[0,1,2] reads the first three rows as a three-level column header;
# the tupleize_cols argument is no longer accepted by read_csv, so it is dropped.
df = pd.read_csv('evaluation_eng_pt_20k.csv', header=[0,1,2])
# no-op when the columns already are a MultiIndex, converts them when they are tuples
df.columns = pd.MultiIndex.from_tuples(df.columns)
# displayed only: the re-indexed frame is not assigned back to df
df.reset_index(drop=True)

In [None]:
def evaluate_rotation(source_test, source_transformed, target_test):
    """Measure the rotation row by row for three pairings:
        1) source vocabulary vs target vocabulary;
        2) source vocabulary vs its transformed version;
        3) transformed source vocabulary vs target vocabulary.

    INPUT:
    source_test: vocabulary of the source language
    source_transformed: test set of the source language after transformation
    target_test: test set of the target language

    OUTPUT:
    six lists with one value per row of source_test, in this order:
    euclidean distance for pairings 1, 2, 3, then cosine similarity for
    pairings 1, 2, 3."""
    rows = range(0, source_test.shape[0])

    def _distances(left, right):
        # euclidean distance between matching rows
        return [np.linalg.norm(left[i] - right[i]) for i in rows]

    def _similarities(left, right):
        # cosine similarity between matching rows
        return [cosine_similarity(left[i], right[i]) for i in rows]

    distance_source_target = _distances(source_test, target_test)
    distance_source_transformed = _distances(source_test, source_transformed)
    distance_transformed_target = _distances(source_transformed, target_test)

    similarity_source_target = _similarities(target_test, source_test)
    similarity_source_transformed = _similarities(source_test, source_transformed)
    similarity_transformed_target = _similarities(source_transformed, target_test)

    return (distance_source_target, distance_source_transformed, distance_transformed_target,
            similarity_source_target, similarity_source_transformed, similarity_transformed_target)

In [None]:
# evaluate the rotation: per-row distances and cosine similarities for the three pairings
distance_source_target, distance_source_transformed, distance_transformed_target, similarity_source_target, similarity_source_transformed, similarity_transformed_target = evaluate_rotation(source_test, source_transformed, target_test)

In [None]:
# store the six per-row lists returned by evaluate_rotation in a dictionary,
# keyed by the name of each measure
output_evaluation_rotation = {'distance_source_target':distance_source_target, 
                              'distance_source_transformed': distance_source_transformed, 
                              'distance_transformed_target': distance_transformed_target, 
                              'similarity_source_target': similarity_source_target, 
                              'similarity_source_transformed': similarity_source_transformed, 
                              'similarity_transformed_target': similarity_transformed_target}

In [None]:
# remember: in the file name the first language is the target, the second the source
# save the rotation evaluation results with pickle
# NOTE(review): this writes '..._pt_eng' while the loading cell below reads
# '..._eng_pt' - confirm which file is intended.
import pickle
with open('output_evaluation_rotation_pt_eng', 'wb') as f:
    pickle.dump(output_evaluation_rotation, f)

In [None]:
# load evaluation results
# NOTE: pickle.load can execute arbitrary code - only open files you created yourself.
with open('output_evaluation_rotation_eng_pt', 'rb') as f:
     data = pickle.load(f)

In [None]:
# list the measures stored in the loaded results
data.keys()

In [None]:
# Poster figure: overlaid histograms of the euclidean distance to the target
# vectors before (source) and after (transformed) the alignment, with a dashed
# vertical line near the mean of each distribution.
fig = plt.figure(figsize=(12, 10))
main_ax = fig.add_subplot(111)


main_ax.hist(distance_source_target, 
             color = "lightcoral", bins=40, alpha=.5, 
             edgecolor='black', linewidth=1, label='Source to Target')
main_ax.hist(distance_transformed_target, 
             color = "lightskyblue", bins=40, alpha=0.6, 
             edgecolor='black', linewidth=1, label='Transformed to Target')
# NOTE(review): the first line is drawn at mean - 0.05, not at the mean itself;
# presumably a visual offset - confirm it is intended.
main_ax.axvline(np.mean(distance_source_target)-.05, color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
main_ax.axvline(np.mean(distance_transformed_target), color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)

#main_ax.set_xticklabels(fontsize=15)
#main_ax.set_yticklabels(fontsize=15)
# font size shared by axis labels, ticks and legend
fs = 18

plt.xlabel('Euclidean Distance', fontsize=fs)
plt.ylabel('Frequency', fontsize=fs)

plt.xticks(fontsize=fs)
plt.yticks(fontsize=fs)


plt.legend(loc='upper right',fontsize=fs)
#plt.figlegend(loc = 'upper right', ncol=3, labelspacing=0.5) #bbox_to_anchor=(1.1, 1.05)
#plt.suptitle('Euclidean Distance:\n before and after alignment comparisons\nEnglish-Italian', fontsize=15)
# write the figure to disk (overwrites an existing file)
plt.savefig('Euclidean_Distance_word2vec_ENG_IT_Poster.png');

In [None]:
# Plot euclidean distances comparison: one large panel (source->target vs
# transformed->target) and two small panels below it sharing its x axis.
# NOTE(review): the original comment said "pt-eng" while the title and the
# output file name say English-Italian - confirm the language pair.
fig = plt.figure(figsize=(12, 10))
grid = plt.GridSpec(24, 4, hspace=0.5, wspace=0.5)
main_ax = fig.add_subplot(grid[:16, 0:])
x1_hist = fig.add_subplot(grid[-7:, :2], sharex=main_ax)
x2_hist = fig.add_subplot(grid[-7:, 2:], sharex=main_ax)

main_ax.hist(distance_source_target, 
             color = "lightcoral", bins=40, alpha=.5, 
             edgecolor='black', linewidth=1, label='distance_source_target')
main_ax.hist(distance_transformed_target, 
             color = "lightskyblue", bins=40, alpha=0.6, 
             edgecolor='black', linewidth=1, label='distance_transformed_target')
# NOTE(review): this line is drawn at mean - 0.05 (the small panels use the
# plain mean) - presumably a visual offset, confirm.
main_ax.axvline(np.mean(distance_source_target)-.05, color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
main_ax.axvline(np.mean(distance_transformed_target), color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
#main_ax.annotate(str(round(np.mean(distance_transformed_target),2)), xy=(2, 820), xytext=(4.55, 815))

# bottom-left panel: source->target vs source->transformed
x1_hist.hist(distance_source_target, 
             color = "lightcoral", bins=40, alpha=0.4, 
             edgecolor='black', linewidth=.5) #, label='source_target'
x1_hist.hist(distance_source_transformed, 
             color = "seagreen", bins=40, alpha=0.5, 
             edgecolor='black', linewidth=.5, label='distance_source_transformed')
x1_hist.axvline(np.mean(distance_source_target), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)
x1_hist.axvline(np.mean(distance_source_transformed), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)

# bottom-right panel: source->transformed vs transformed->target
x2_hist.hist(distance_source_transformed, 
             color = "seagreen", bins=40, alpha=0.5,
             edgecolor='black', linewidth=.5) #, label='source_transformed'
x2_hist.hist(distance_transformed_target, 
             color = "lightskyblue", bins=40, alpha=0.6, 
             edgecolor='black', linewidth=.5) #, label='transformed_target'
x2_hist.axvline(np.mean(distance_source_transformed), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)
x2_hist.axvline(np.mean(distance_transformed_target), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)

# one legend for the whole figure; only the histograms given a label= appear in it
plt.figlegend(loc = 'lower center', ncol=3, labelspacing=0.5) #bbox_to_anchor=(1.1, 1.05)
plt.suptitle('Euclidean Distance:\n before and after alignment comparisons\nEnglish-Italian', fontsize=15)

# write the figure to disk (overwrites an existing file)
plt.savefig('Euclidean_Distance_Word_Embeddings_word2vec_ENG_IT_30k.png');

In [None]:
# Plot euclidean distances comparison (labelled eng-pt): same three-panel
# layout as the saved English-Italian figure, but without title and without
# writing the figure to disk.
fig = plt.figure(figsize=(12, 10))
grid = plt.GridSpec(24, 4, hspace=0.5, wspace=0.5)
main_ax = fig.add_subplot(grid[:16, 0:])
x1_hist = fig.add_subplot(grid[-7:, :2], sharex=main_ax)
x2_hist = fig.add_subplot(grid[-7:, 2:], sharex=main_ax)

main_ax.hist(distance_source_target, 
             color = "lightcoral", bins=40, alpha=.5, 
             edgecolor='black', linewidth=1, label='distance_source_target')
main_ax.hist(distance_transformed_target, 
             color = "lightskyblue", bins=40, alpha=0.6, 
             edgecolor='black', linewidth=1, label='distance_transformed_target')
# NOTE(review): this line is drawn at mean - 0.05 (the small panels use the
# plain mean) - presumably a visual offset, confirm.
main_ax.axvline(np.mean(distance_source_target)-.05, color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
main_ax.axvline(np.mean(distance_transformed_target), color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
#main_ax.annotate(str(round(np.mean(distance_transformed_target),2)), xy=(2, 820), xytext=(4.55, 815))

# bottom-left panel: source->target vs source->transformed
x1_hist.hist(distance_source_target, 
             color = "lightcoral", bins=40, alpha=0.4, 
             edgecolor='black', linewidth=.5) #, label='source_target'
x1_hist.hist(distance_source_transformed, 
             color = "seagreen", bins=40, alpha=0.5, 
             edgecolor='black', linewidth=.5, label='distance_source_transformed')
x1_hist.axvline(np.mean(distance_source_target), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)
x1_hist.axvline(np.mean(distance_source_transformed), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)

# bottom-right panel: source->transformed vs transformed->target
x2_hist.hist(distance_source_transformed, 
             color = "seagreen", bins=40, alpha=0.5,
             edgecolor='black', linewidth=.5) #, label='source_transformed'
x2_hist.hist(distance_transformed_target, 
             color = "lightskyblue", bins=40, alpha=0.6, 
             edgecolor='black', linewidth=.5) #, label='transformed_target'
x2_hist.axvline(np.mean(distance_source_transformed), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)
x2_hist.axvline(np.mean(distance_transformed_target), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)

# one legend for the whole figure; only the histograms given a label= appear in it
plt.figlegend(loc = 'lower center', ncol=3, labelspacing=0.5) #bbox_to_anchor=(1.1, 1.05)
#plt.suptitle('Euclidean Distance:\n before and after alignment comparisons\nEnglish-Portuguese', fontsize=15);

In [None]:
# Plot cosine similarities: one large panel (source vs target and transformed
# vs target) and two small panels below it sharing its x axis.
fig = plt.figure(figsize=(12, 10))
grid = plt.GridSpec(24, 4, hspace=0.5, wspace=0.5)
main_ax = fig.add_subplot(grid[:16, 0:])
x1_hist = fig.add_subplot(grid[-7:, :2], sharex=main_ax)
x2_hist = fig.add_subplot(grid[-7:, 2:], sharex=main_ax)

main_ax.hist(similarity_source_target, 
             color = "lightcoral", bins=40, alpha=.5, 
             edgecolor='black', linewidth=1, label='similarity_source_target')
main_ax.hist(similarity_transformed_target, 
             color = "lightskyblue", bins=40, alpha=0.6, 
             edgecolor='black', linewidth=1, label='similarity_transformed_target')
# NOTE(review): this line is drawn at mean - 0.05 (the small panels use the
# plain mean) - presumably a visual offset, confirm.
main_ax.axvline(np.mean(similarity_source_target)-.05, color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
main_ax.axvline(np.mean(similarity_transformed_target), color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
#main_ax.annotate(str(round(np.mean(distance_transformed_target),2)), xy=(2, 820), xytext=(4.55, 815))

# bottom-left panel: source/target vs source/transformed (20 bins here, 40 above)
x1_hist.hist(similarity_source_target, 
             color = "lightcoral", bins=20, alpha=0.4, 
             edgecolor='black', linewidth=.5) #, label='source_target'
x1_hist.hist(similarity_source_transformed, 
             color = "seagreen", bins=20, alpha=0.5, 
             edgecolor='black', linewidth=.5, label='similarity_source_transformed')
x1_hist.axvline(np.mean(similarity_source_target), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)
x1_hist.axvline(np.mean(similarity_source_transformed), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)

# bottom-right panel: source/transformed vs transformed/target
x2_hist.hist(similarity_source_transformed, 
             color = "seagreen", bins=20, alpha=0.5,
             edgecolor='black', linewidth=.5) #, label='source_transformed'
x2_hist.hist(similarity_transformed_target, 
             color = "lightskyblue", bins=20, alpha=0.6, 
             edgecolor='black', linewidth=.5) #, label='transformed_target'
x2_hist.axvline(np.mean(similarity_source_transformed), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)
x2_hist.axvline(np.mean(similarity_transformed_target), color='black',alpha=0.5, linestyle='dashed', linewidth=1.3)

# one legend for the whole figure; only the histograms given a label= appear in it
plt.figlegend(loc = 'lower center', ncol=3, labelspacing=0.5) #bbox_to_anchor=(1.1, 1.05)
plt.suptitle('Cosine Similarity:\n before and after alignment comparisons\nItalian-English', fontsize=15)
#plt.savefig('Cosine_Similarity_Word_Embeddings_word2vec_ENG_IT_30k.png');

In [None]:
# Poster figure: overlaid histograms of the cosine similarity to the target
# vectors before (source) and after (transformed) the alignment, with a dashed
# vertical line near the mean of each distribution.
fig = plt.figure(figsize=(12, 10))
main_ax = fig.add_subplot(111)

# legend label spelled out in full (it used to read 'Source to Targe')
main_ax.hist(similarity_source_target, 
             color = "lightcoral", bins=40, alpha=.5, 
             edgecolor='black', linewidth=1, label='Source to Target')
main_ax.hist(similarity_transformed_target, 
             color = "lightskyblue", bins=40, alpha=0.6, 
             edgecolor='black', linewidth=1, label='Transformed to Target')
# NOTE(review): the first line is drawn at mean - 0.05, not at the mean itself;
# presumably a visual offset - confirm it is intended.
main_ax.axvline(np.mean(similarity_source_target)-.05, color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)
main_ax.axvline(np.mean(similarity_transformed_target), color='black',alpha=0.7, linestyle='dashed', linewidth=1.5)

#main_ax.set_xticklabels(fontsize=15)
#main_ax.set_yticklabels(fontsize=15)
# font size shared by axis labels, ticks and legend
fs = 18

plt.xlabel('Cosine Similarity', fontsize=fs)
plt.ylabel('Frequency', fontsize=fs)

plt.xticks(fontsize=fs)
plt.yticks(fontsize=fs)

plt.legend(loc='upper right',fontsize=fs)
#plt.figlegend(loc = 'best', ncol=1, labelspacing=0.5,fontsize=fs) #bbox_to_anchor=(1.1, 1.05)
#plt.suptitle('Euclidean Distance:\n before and after alignment comparisons\nEnglish-Italian', fontsize=15)
# write the figure to disk (overwrites an existing file)
plt.savefig('Cosine_Similarity_word2vec_ENG_IT_30k_Poster.png');

In [None]:
# apply the learned transformation to every vector of the source embeddings
transformed_eng = apply_transform(transformation, source_vocabulary_embeddings.wv.vectors)

In [None]:
# number of transformed source vectors
len(transformed_eng)

In [None]:
# words of the source vocabulary, in vocabulary iteration order
# (each word is collected itself; the list must not be appended to itself)
words_en = list(source_vocabulary_embeddings.vocab)

# transformed vectors, one entry per row of transformed_eng
vectors_en = list(transformed_eng)

In [None]:
# sanity check: there must be exactly one transformed vector per word
len(words_en) == len(vectors_en)

In [None]:
# compare one component of the first vector before and after the transformation
print(source_vocabulary_embeddings.wv.vectors[0][1])
print(vectors_en[0][1])

In [None]:
# same length check as above, repeated before saving
len(vectors_en) == len(words_en)

In [None]:
# save the list of transformed source vectors with pickle
# (overwrites an existing file with the same name)
import pickle
with open('en_embeddings_transformed_vectors_30k', 'wb') as f:
    pickle.dump(vectors_en, f)

In [None]:
# save the list of source words with pickle
# (overwrites an existing file with the same name)
import pickle
with open('en_embeddings_transformed_words_30k', 'wb') as f:
    pickle.dump(words_en, f)

In [None]:
# pair the entries of words_en with the transformed vectors, position by position
en_embeddings_transformed_1 = dict(zip(words_en, vectors_en))

In [None]:
# NOTE(review): list_data_items and column_names are not defined anywhere in
# the visible notebook, and `data` was last bound to an unpickled dictionary,
# so this cell looks like a leftover scratch snippet - confirm before running.
for item in data:
    list_data_items.append(dict(zip(column_names, item)))

In [None]:
# look up the transformed vector of a single word; the dictionary built from
# words_en / vectors_en is named en_embeddings_transformed_1
en_embeddings_transformed_1['the']

In [None]:
# apply the transformation to every vector of the target embeddings
# NOTE(review): `transformation` was learned from source_train -> target_train;
# applying the same matrix to the target vectors as well may be unintended - confirm.
transformed_it = apply_transform(transformation, target_vocabulary_embeddings.wv.vectors)

In [None]:
# words of the target vocabulary, in vocabulary iteration order
words_it = list(target_vocabulary_embeddings.vocab)

# transformed vectors, one entry per row of transformed_it
vectors_it = list(transformed_it)

# word -> transformed vector, paired position by position
it_embeddings_transformed = dict(zip(words_it, vectors_it))

In [None]:
# compare the first component of the first target vector before and after the transformation
print(target_vocabulary_embeddings.wv.vectors[0][0])
print(vectors_it[0][0])

In [None]:
# save the dictionary word -> transformed target vector with pickle
# (overwrites an existing file with the same name)
import pickle
with open('it_embeddings_transformed_30k', 'wb') as f:
    pickle.dump(it_embeddings_transformed, f)

In [None]:
# load the pickled dictionary of transformed target embeddings
# NOTE: pickle.load can execute arbitrary code - only open files you created yourself.
import pickle
with open('it_embeddings_transformed_30k', 'rb') as f:
     data = pickle.load(f)

In [None]:
# look up the transformed vector stored for one target-language word
data['finestra']

In [None]:
from gensim.models import Word2Vec

In [None]:
# create an empty Word2Vec model and build its vocabulary from the dictionary
# NOTE(review): gensim documents build_vocab_from_freq as taking a mapping
# word -> frequency count, whereas it_embeddings_transformed maps words to
# vectors - confirm this is really the call that was intended here.
model= Word2Vec()
model.build_vocab_from_freq(it_embeddings_transformed)

In [None]:
save_word2vec_format(fname, prefix='*dt_', fvocab=None, total_vec=None, binary=False, write_first_line=True)
Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility.

Parameters:	
fname (str) – The file path used to save the vectors in.
prefix (str) – Uniquely identifies doctags from word vocab, and avoids collision in case of repeated string in doctag and word vocab.
fvocab (str) – Optional file path used to save the vocabulary
binary (bool) – If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
total_vec (int) – Optional parameter to explicitly specify total no. of vectors (in case word vectors are appended with document vectors afterwards)
write_first_line (bool) – Whether to print the first line in the file. Useful when saving doc-vectors after word-vectors.

In [None]:
save_word2vec_format(fname, fvocab=None, binary=False, total_vec=None)
Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility.

Parameters:	
fname (str) – The file path used to save the vectors in.
fvocab (str) – Optional file path used to save the vocabulary.
binary (bool) – If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
total_vec (int) – Optional parameter to explicitly specify total no. of vectors (in case word vectors are appended with document vectors afterwards).

### Plot t-sne (2D)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Limit number of tokens to be visualized
limit = 150
vector_dim = 300

# Getting tokens and vectors: the first `limit` words of the vocabulary
# (fewer when the vocabulary is smaller) and their embedding vectors.
# NOTE(review): it_dictionary is not defined in the visible notebook;
# presumably the loaded Italian embeddings - confirm.
words = list(it_dictionary.vocab)[:limit]

# Stack the vectors in one step instead of growing a flat array with
# np.append on every iteration (float64, as the appended array was).
embedding = np.array([it_dictionary[word] for word in words], dtype=np.float64)

# Reshaping the embedding matrix: one row per collected word, so a vocabulary
# with fewer than `limit` words no longer makes the reshape fail
embedding = embedding.reshape(len(words), vector_dim)


def plot_with_labels(low_dim_embs, labels, filename='it_dictionary_tsne.png'):
    """Scatter 2-D embeddings, annotate every point with its label and save the figure.

    low_dim_embs: array with one (x, y) row per point
    labels: label for each of the first len(labels) rows
    filename: path the figure is written to"""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    # every label is placed at the same small offset from its point
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    for row, label in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), **annotation_style)
    plt.savefig(filename)


# Creating the tsne plot: project the embeddings to 2 dimensions
tsne = TSNE(perplexity=30.0, n_components=2, init='pca', n_iter=5000)

low_dim_embedding = tsne.fit_transform(embedding)

# Finally plotting and saving the fig (default file name of plot_with_labels)
plot_with_labels(low_dim_embedding, words)

In [None]:
# Limit number of tokens to be visualized
limit = 500
vector_dim = 300

# Getting tokens and vectors: the first `limit` words of the vocabulary
# (fewer when the vocabulary is smaller) and their embedding vectors.
# NOTE(review): eng_dictionary is not defined in the visible notebook;
# presumably the loaded English embeddings - confirm.
words = list(eng_dictionary.vocab)[:limit]

# Stack the vectors in one step instead of growing a flat array with
# np.append on every iteration (float64, as the appended array was).
embedding = np.array([eng_dictionary[word] for word in words], dtype=np.float64)

# Reshaping the embedding matrix: one row per collected word, so a vocabulary
# with fewer than `limit` words no longer makes the reshape fail
embedding = embedding.reshape(len(words), vector_dim)

def plot_with_labels(low_dim_embs, labels, filename='eng_dictionary_tsne.png'):
    """Scatter 2-D embeddings, annotate every point with its label and save the figure.

    low_dim_embs: array with one (x, y) row per point
    labels: label for each of the first len(labels) rows
    filename: path the figure is written to"""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    # every label is placed at the same small offset from its point
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    for row, label in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), **annotation_style)
    plt.savefig(filename)

# Creating the tsne plot: project the embeddings to 2 dimensions [Warning: will take time]
tsne = TSNE(perplexity=30.0, n_components=2, init='pca', n_iter=5000)

low_dim_embedding = tsne.fit_transform(embedding)

# Finally plotting and saving the fig (default file name of plot_with_labels)
plot_with_labels(low_dim_embedding, words)

In [None]:
def plot_with_labels(low_dim_embs, labels, filename='it_dictionary_tsne_3d.png'):
    """Scatter 3-D embeddings, label every point and save the figure.

    low_dim_embs: array with one (x, y, z) row per point
    labels: label for each of the first len(labels) rows
    filename: path the figure is written to"""
    # Importing Axes3D registers the '3d' projection on matplotlib releases
    # that do not register it automatically.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    fig = plt.figure(figsize=(18, 18))  # in inches
    ax = fig.add_subplot(111, projection='3d')
    for i, label in enumerate(labels):
        x, y, z = low_dim_embs[i, :]
        # draw all three components: z used to be unpacked but never plotted
        ax.scatter(x, y, z)
        ax.text(x, y, z, label, ha='right', va='bottom')
    plt.savefig(filename)


# Creating the tsne plot: project the embeddings to 3 dimensions [Warning: will take time]
tsne = TSNE(perplexity=30.0, n_components=3, init='pca', n_iter=5000)

low_dim_embedding = tsne.fit_transform(embedding)

# Finally plotting and saving the fig (default file name of plot_with_labels)
plot_with_labels(low_dim_embedding, words)