# Task 2: Computing Embeddings

This notebook computes and persists Word2Vec and FastText sentence embeddings for various configurations.

# Imports and Setup

In [None]:
# Enable autoreload so that edits to imported project modules are picked up
# live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
# Inject a CSS rule that forces the notebook's .container element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))


In [None]:
# Needed to make the embedding model training deterministic;
# otherwise we cannot supply a pretrained RNN model later,
# because recomputing the embeddings would change its input data.
# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so
# setting it here likely does not change hashing in the already-running kernel
# (only in processes launched afterwards) -- confirm, and if so export the
# variable before starting Jupyter instead.
%env PYTHONHASHSEED=0
    

In [None]:
import tensorflow as tf
import numpy as np

from src.data_processing import PreprocessingOptions
from src.data_loading import load_raw_datasets, persist_preprocessed_data, load_preprocessed_data, \
                             persist_labels, load_labels, persist_embeddings
from src.embeddings import load_embedding, SentenceEmbedder, train_and_save_embedding_model


# Data Loading
Load the previously preprocessed train/dev/test data and the corresponding labels.

In [None]:
# Preprocessing variant used to locate the persisted data loaded below:
# stop-word removal and lemmatisation are both switched off.
PREPROCESSING_OPTIONS = PreprocessingOptions(
    remove_stop_words=False,
    lemmatisation=False,
)


In [None]:
# Load the preprocessed sentences for every split, in train/dev/test order.
x_preprocessed_train, x_preprocessed_dev, x_preprocessed_test = (
    load_preprocessed_data(PREPROCESSING_OPTIONS, split_name)
    for split_name in ("train", "dev", "test")
)

# Labels for the same three splits.
y_train, y_dev, y_test = load_labels()


# Word2Vec and FastText embeddings

## Train the Models

In [None]:
# Embedding family; passed as embedding_type to the train, load and persist helpers.
EMBEDDING = "word2vec" # "word2vec" or "fasttext"
# Model variant; passed as version to load_embedding and on to persist_embeddings.
EMBEDDING_VERSION = "cbow" # "cbow" or "Skip_N-gram"
# Passed as vector_size to the train and load helpers
# (presumably the dimensionality of each word vector).
VECTOR_SIZE = 25


In [None]:
%%time
TRAIN_MODEL = True

if TRAIN_MODEL:
    train_and_save_embedding_model(x_preprocessed_train, sg=0, vector_size=VECTOR_SIZE, embedding_type=EMBEDDING)
    # uncomment if Skip_N-gram is needed
#     train_and_save_embedding_model(x_preprocessed_train, sg=1, vector_size=VECTOR_SIZE, embedding_type=EMBEDDING)

model = load_embedding(version=EMBEDDING_VERSION, vector_size=VECTOR_SIZE, embedding_type = EMBEDDING)


In [None]:
# Wrap the loaded embedding model; the embedder is used below to build
# sentence vectors by concatenation and by summation.
sentence_embedder = SentenceEmbedder(model)
# Report the unknown-word percentage for the dev and test splits
# (presumably words missing from the embedding vocabulary -- confirm in SentenceEmbedder).
sentence_embedder.print_unknown_words_percentage(x_preprocessed_dev)
sentence_embedder.print_unknown_words_percentage(x_preprocessed_test)


## Constructing Sentence Vectors via Concatenation

In [None]:
# Longest sentence length in the training split; presumably a reference point
# for choosing MAX_WORDS.
# NOTE(review): the value is neither displayed nor used in the cells visible here.
longest_sentence_len = sentence_embedder.compute_longest_sentence_length(x_preprocessed_train)


In [None]:
# Passed as max_words when concatenating word vectors;
# longer sentences will be cut short.
MAX_WORDS = 50


In [None]:
%%time
x_embeddings_train = sentence_embedder.concatenate_word_vectors(x_preprocessed_train, max_words=MAX_WORDS)
x_embeddings_dev = sentence_embedder.concatenate_word_vectors(x_preprocessed_dev, max_words=MAX_WORDS)
x_embeddings_test = sentence_embedder.concatenate_word_vectors(x_preprocessed_test, max_words=MAX_WORDS)

x_embeddings_train.shape


### Save the concatenated sentence embeddings

In [None]:
# Persist the concatenated embeddings together with the configuration that produced them.
# The x_embeddings_* names are reassigned by the summation section further down,
# so run this cell before that one.
persist_embeddings(
    x_embeddings_train,
    x_embeddings_dev,
    x_embeddings_test,
    PREPROCESSING_OPTIONS,
    EMBEDDING_VERSION,
    VECTOR_SIZE,
    MAX_WORDS,
    embedding_type=EMBEDDING,
    mode="concatenation",
)


## Constructing Sentence Vectors via Summation

In [None]:
%%time
x_embeddings_train = sentence_embedder.sum_word_vectors(x_preprocessed_train)
x_embeddings_dev = sentence_embedder.sum_word_vectors(x_preprocessed_dev)
x_embeddings_test = sentence_embedder.sum_word_vectors(x_preprocessed_test)

x_embeddings_train.shape


### Save the summed sentence embeddings

In [None]:
# Persist the summed embeddings together with the configuration that produced them.
# The literal 0 sits in the position where the concatenation call passes MAX_WORDS;
# presumably a placeholder, since sum_word_vectors takes no word limit.
persist_embeddings(
    x_embeddings_train,
    x_embeddings_dev,
    x_embeddings_test,
    PREPROCESSING_OPTIONS,
    EMBEDDING_VERSION,
    VECTOR_SIZE,
    0,
    embedding_type=EMBEDDING,
    mode="summation",
)
