# This notebook runs the baseline models from task 1 on word2vec- and fastText-based sentence embeddings

In [None]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

from sklearn.linear_model import SGDClassifier
from src.hyperparameter_search import run_randomized_search
from src.evaluate import train_and_eval_clf

from src.embeddings import get_embedding_config
from src.data_processing import preprocess_raw_datasets, PreprocessingOptions, encode_one_hot_labels
from src.data_loading import load_embeddings, load_labels
from src.constants import PATH_TF_MODELS
from src.plots import plot_tf_history
from src.tf_models.linear_models import get_simple_linear_classifier
from src.tf_datasets import create_tf_datasets



# Data Loading
Load the labels and select which previously computed embeddings to use.

In [None]:
# Labels for the train / dev / test splits.
y_train, y_dev, y_test = load_labels()

# "balanced" class weights computed from the training labels only. The dict is keyed
# by the position (0, 1, ...) of each class in the sorted array of unique labels.
train_classes = np.unique(y_train)
train_class_weights = compute_class_weight("balanced", classes=train_classes, y=y_train)
balanced_class_weight = dict(enumerate(train_class_weights))


In [None]:
# Settings that identify which pre-computed embedding the cells below load.
# Copy the string values exactly as they are written here.
EMBEDDING = "word2vec"  # one of: "word2vec", "fasttext"
EMBEDDING_VERSION = "cbow"  # one of: "cbow", "Skip_N-gram"
VECTOR_SIZE = 25
PREPROCESSING_OPTIONS = PreprocessingOptions(
    remove_stop_words=False,
    lemmatisation=False,
)


# Case 1: Sentence embeddings via summation of the word vectors

In [None]:
# Load the train / dev / test embeddings built in "summation" mode.
# The fourth positional argument is the slot that receives MAX_WORDS in the
# concatenation case; 0 is passed here (presumably ignored in this mode - TODO confirm).
x_embeddings_train, x_embeddings_dev, x_embeddings_test = load_embeddings(
    PREPROCESSING_OPTIONS,
    EMBEDDING_VERSION,
    VECTOR_SIZE,
    0,
    embedding_type=EMBEDDING,
    mode="summation",
)
x_embeddings_train.shape


## Train logistic regression

In [None]:
# Logistic regression fitted with SGD: logistic loss, L2 penalty, balanced class
# weights and early stopping.
params_linear = {
    "penalty": "l2",
    # scikit-learn 1.1 renamed the "log" loss to "log_loss" and 1.3 removed the old
    # spelling, so "log" fails parameter validation on current releases.
    # (On scikit-learn < 1.1 the value has to be "log" instead.)
    "loss": "log_loss",
    "early_stopping": True,
    "class_weight": "balanced",
    "alpha": 0.000001,
}

# Fit on the training split and report results for the dev and test splits.
train_and_eval_clf(
    SGDClassifier(random_state=0, **params_linear),
    x_embeddings_train, y_train,
    x_embeddings_dev, y_dev,
    x_embeddings_test, y_test,
    "Linear Models + Summation embeddings",
)


# Case 2: Sentence embeddings via concatenation of the word vectors

In [None]:
# Passed to load_embeddings for the "concatenation" mode below; presumably the number
# of words kept per sentence - TODO confirm against load_embeddings.
MAX_WORDS = 50


In [None]:
# Load the train / dev / test embeddings built in "concatenation" mode, using
# MAX_WORDS as the fourth positional argument.
x_embeddings_train, x_embeddings_dev, x_embeddings_test = load_embeddings(
    PREPROCESSING_OPTIONS,
    EMBEDDING_VERSION,
    VECTOR_SIZE,
    MAX_WORDS,
    embedding_type=EMBEDDING,
    mode="concatenation",
)
x_embeddings_train.shape


In [None]:
# A simple logistic regression classifier expects 2-D input (batch x n_features),
# so collapse every axis after the first into one feature axis for each split.
x_embeddings_train, x_embeddings_dev, x_embeddings_test = (
    split.reshape((split.shape[0], -1))
    for split in (x_embeddings_train, x_embeddings_dev, x_embeddings_test)
)

# Width of the flattened input; used later to size the linear model.
n_input_features = x_embeddings_train.shape[1]
x_embeddings_train.shape


## Create TensorFlow Datasets

In [None]:
# The TensorFlow datasets need one-hot labels. Store them under separate names so
# the raw labels in y_train / y_dev / y_test stay intact: other cells (the class
# weight computation and the SGDClassifier evaluation) consume the raw labels, and
# overwriting them here would feed those cells one-hot data when they are re-run.
y_train_one_hot = encode_one_hot_labels(y_train)
y_dev_one_hot = encode_one_hot_labels(y_dev)
y_test_one_hot = encode_one_hot_labels(y_test)

train_dataset, dev_dataset, test_dataset = create_tf_datasets(
    x_embeddings_train, y_train_one_hot,
    x_embeddings_dev, y_dev_one_hot,
    x_embeddings_test, y_test_one_hot,
)

# The datasets are used from here on, so drop the notebook's references to the
# embedding arrays. Re-running this cell requires re-running the loading cells first.
del x_embeddings_train, x_embeddings_dev, x_embeddings_test


## Train Logistic Regression

In [None]:
# Build the linear classifier for the flattened input width, then train it on the
# training dataset with the balanced class weights, validating on the dev dataset.
model = get_simple_linear_classifier(n_input_features)
history = model.fit(
    x=train_dataset,
    epochs=2,
    validation_data=dev_dataset,
    validation_steps=30,
    class_weight=balanced_class_weight,
)
plot_tf_history(history)


In [None]:
# Evaluate the trained model on the held-out test dataset; the returned value is
# shown as the cell output.
model.evaluate(test_dataset)
