##### Inspiration from:
- https://earlyprint.org/jupyterbook/word2vec.html
- https://jalammar.github.io/illustrated-word2vec/
- https://www.sci.utah.edu/~beiwang/publications/Word_Embeddings_BeiWang_2017.pdf


# This notebook explores semantic relations between word embeddings

## Importing the libraries and setting up the notebook

In [None]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight
from src.embeddings import load_embedding
from src.data_processing import preprocess_raw_datasets, PreprocessingOptions, encode_one_hot_labels, get_value_from_tuple
from src.data_loading import load_labels
from src.constants import PATH_EMBEDDINGS_DATA_WORD2VEC, PATH_EMBEDDINGS_DATA_FASTTEXT
from src.plots import visualize_word_vectors, heatmaps_of_feature_vectors

## Data Loading

##### Load the previously computed embeddings

In [None]:
# load_labels() yields three label collections, bound here as train / dev / test.
y_train, y_dev, y_test = load_labels()

# scikit-learn "balanced" weights, computed from the training labels only.
# The dictionary is keyed by each class's position in the sorted array from np.unique.
# NOTE(review): those positions equal the label values only if the labels are 0..k-1 - confirm.
train_classes = np.unique(y_train)
per_class_weights = compute_class_weight("balanced", classes=train_classes, y=y_train)
balanced_class_weight = dict(enumerate(per_class_weights))

In [None]:
# relevant settings to identify the desired embedding
PREPROCESSING_OPTIONS = PreprocessingOptions(remove_stop_words=False, lemmatisation=False)
EMBEDDING = "word2vec" # "word2vec" or "fasttext" - please copy paste it as it is written
EMBEDDING_VERSION = "cbow" # "cbow" or "Skip_N-gram" - please copy paste it as it is written
VECTOR_SIZE = 25

In [None]:
# Load the embedding model trained in Task2_ComputeEmbeddings that matches the settings chosen above.
model = load_embedding(
    version=EMBEDDING_VERSION,
    vector_size=VECTOR_SIZE,
    embedding_type=EMBEDDING,
)

#### Exploration of semantic relationships

In [None]:
# Peek at the first entries of the embedding vocabulary.
VOCABULARY_PREVIEW_SIZE = 150
vocabulary_preview = list(model.index_to_key)[:VOCABULARY_PREVIEW_SIZE]
print(vocabulary_preview)

In [None]:
# Words selected from the dataset whose most similar words in the embedding are inspected.
words_to_test = ["blood", "dose", "anxiety", "health", "study", "paper", "tonsillectomy", "ophthalmopathy", "effects", "macronutrient", "hypocaloric", "diet"]
NUMBER_OF_NEIGHBOURS = 4  # how many most-similar words are kept per test word

# Maps each test word to its most similar words followed by the word itself;
# the visualisation cells further down read from this dictionary.
dict_of_most_similiar_words = dict()
for word in words_to_test:
    # Ask for exactly the neighbours that are kept, rather than fetching the
    # default number of results and slicing most of them away.
    most_similiar_words = model.most_similar(word, topn=NUMBER_OF_NEIGHBOURS)
    neighbours = get_value_from_tuple(most_similiar_words)
    # The test word goes last so it is drawn in the same plot as its neighbours.
    neighbours.append(word)
    dict_of_most_similiar_words[word] = neighbours

In [None]:
# One heatmap per test word, drawn from the word list stored for it in dict_of_most_similiar_words.
for probe_word in words_to_test:
    heatmap_title = f"Heatmap of feature vector values for '{probe_word}'"
    heatmaps_of_feature_vectors(
        model,
        dict_of_most_similiar_words.get(probe_word),
        title_of_plot=heatmap_title,
    )

#### Visualize word similarity

Visualizing word similarity by projecting the embedding vectors onto two dimensions with PCA

In [None]:
# Project the normalised word vectors onto their first two principal components.
# random_state pins the outcome for the PCA solvers that involve randomness,
# so the plots stay reproducible if the vector size or vocabulary changes.
PCA_RANDOM_STATE = 0
pca = PCA(n_components=2, random_state=PCA_RANDOM_STATE)
pca_results = pca.fit_transform(model.get_normed_vectors())

# Label the rows with index_to_key: it is the list whose i-th entry names the
# i-th vector, so the labels cannot drift from the row order the way the
# iteration order of the key_to_index dictionary could.
pca_df = pd.DataFrame(pca_results, index=model.index_to_key, columns=["pc1", "pc2"])

In [None]:
# An empty first argument means that all words in words_to_test are plotted.
PLOT_ALL_WORDS = ""
visualize_word_vectors(PLOT_ALL_WORDS, pca_df, words_to_test)

In [None]:
# One plot per test word, using the word list stored for it in dict_of_most_similiar_words.
for probe_word in words_to_test:
    words_in_plot = dict_of_most_similiar_words[probe_word]
    visualize_word_vectors(probe_word, pca_df, words_in_plot)

### Analogies

Here we look for analogies between words by adding and subtracting their corresponding vectors and listing the words that are most similar to the result.

In [None]:
# The two words whose vectors are combined. To subtract a third word (e.g. "health"),
# pass it in `negative` and add it to the end of analogies_list as well.
word1 = "patient"
word2 = "treatment"

nearest_to_combination = model.most_similar(positive=[word1, word2], negative=[])
analogies_list = get_value_from_tuple(nearest_to_combination)
# The query words are placed last; the heatmap title cell reads them from the end of the list.
analogies_list.extend([word1, word2])

In [None]:
# The last two entries of analogies_list are the two query words that were added together.
# If a subtracted third word is appended to the list, extend the title with a "- vec(...)" term.
first_term = analogies_list[-2]
second_term = analogies_list[-1]
title = f"Heatmap of feature vector values for analogies: vec({first_term}) + vec({second_term})"
heatmaps_of_feature_vectors(model, analogies_list, title)