# Imports and Setup

In [None]:
# Load IPython's autoreload extension; mode 2 re-imports modules before each cell
# is executed, so edits to the project's .py files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))


In [None]:
# NLTK resources that are needed for preprocessing.
import nltk

NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger')

# nltk.download() reports a failed download by returning False instead of raising,
# so collect the failures and surface them here rather than letting a missing
# resource show up later as an error inside preprocessing.
# A warning (not an exception) is used on purpose: the resources may already be
# installed locally even when the download itself fails, e.g. when offline.
failed_nltk_resources = [resource for resource in NLTK_RESOURCES if not nltk.download(resource)]
if failed_nltk_resources:
    print(f"WARNING: could not download NLTK resources: {failed_nltk_resources}")


In [None]:
import re

import numpy as np
import sklearn
from scipy.sparse import vstack

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from src.hyperparameter_search import run_randomized_search
from src.evaluate import train_and_eval_clf
from src.embeddings import fit_tf_idf
from src.data_processing import preprocess_raw_datasets, PreprocessingOptions, get_raw_x_y
from src.data_loading import load_raw_datasets, persist_preprocessed_data, load_preprocessed_data, load_data, persist_labels, load_labels

from src.constants import PATH_DEV_DATA, PATH_TEST_DATA, PATH_TRAIN_DATA


# Data Loading & Preprocessing
Preprocessing takes some time but only needs to be done once. After the first run, set `PREPROCESSED_DATA_AVAILABLE = True` in the config below to load the persisted data instead.

## Config

In [None]:
# Preprocessing variant used throughout the notebook; it is passed to the
# preprocessing step and to the persist/load helpers for the preprocessed data.
PREPROCESSING_OPTIONS = PreprocessingOptions(remove_stop_words=False, lemmatisation=False)
# True  -> load previously persisted preprocessed data and labels.
# False -> preprocess the raw datasets from scratch and persist the results.
PREPROCESSED_DATA_AVAILABLE = False


In [None]:
if not PREPROCESSED_DATA_AVAILABLE:
    # Nothing persisted yet: preprocess the raw splits from scratch.
    train, dev, test = load_raw_datasets()
    (
        x_preprocessed_train, y_train,
        x_preprocessed_dev, y_dev,
        x_preprocessed_test, y_test,
    ) = preprocess_raw_datasets(train, dev, test, PREPROCESSING_OPTIONS)

    # Persist every preprocessed split under the current preprocessing options.
    for split_name, split_data in (
        ("train", x_preprocessed_train),
        ("dev", x_preprocessed_dev),
        ("test", x_preprocessed_test),
    ):
        persist_preprocessed_data(split_data, PREPROCESSING_OPTIONS, split_name)

    # Persist the labels as well.
    persist_labels(y_train, y_dev, y_test)

else:
    # Load the previously persisted splits for the current preprocessing options.
    x_preprocessed_train, x_preprocessed_dev, x_preprocessed_test = (
        load_preprocessed_data(PREPROCESSING_OPTIONS, split_name)
        for split_name in ("train", "dev", "test")
    )
    y_train, y_dev, y_test = load_labels()


# TF-IDF embeddings

In [None]:
# The TF-IDF model is fitted on the training split only and then applied,
# unchanged, to all three splits.
tfidf = fit_tf_idf(x_preprocessed_train)

x_tfidf_train, x_tfidf_dev, x_tfidf_test = (
    tfidf.transform(documents)
    for documents in (x_preprocessed_train, x_preprocessed_dev, x_preprocessed_test)
)

# Sanity check: every split must have exactly one feature row per label.
for features, labels in (
    (x_tfidf_train, y_train),
    (x_tfidf_dev, y_dev),
    (x_tfidf_test, y_test),
):
    assert features.shape[0] == len(labels)


# Training Classifiers

# Baseline 1: Naive Bayes Classifier

In [None]:
%%time
train_and_eval_clf(MultinomialNB(), x_tfidf_train, y_train, x_tfidf_dev, y_dev, x_tfidf_test, y_test, "Naive Bayes")


# Baseline 2: Linear SVM model
Linear classifiers are generally well suited to high-dimensional data, so they are a reasonable choice for TF-IDF embeddings.

In [None]:
%%time
train_and_eval_clf(SGDClassifier(random_state=0, max_iter=10), x_tfidf_train, y_train, x_tfidf_dev, y_dev, x_tfidf_test, y_test, "Linear SVM")


## Hyperparameter search for Linear Models
- Hinge loss: corresponds to a linear SVM.

- Log loss: corresponds to logistic regression.


In [None]:
# True  -> run the randomized search over the linear-model hyperparameters.
# False -> skip the search and use the manually set best parameters instead.
RUN_HYPERPARAMETER_SEARCH = False


In [None]:
%%time
if RUN_HYPERPARAMETER_SEARCH:
    
    distributions = {

        "loss": ["hinge", "log"],
        "penalty": ["l1", "l2", "elasticnet"],
        "alpha": [0.0001, 0.000001],
        "class_weight": ["balanced", None],
        "early_stopping": [True]

    }

    # Concatenate the train and dev sets to use for kfold
    train_and_dev_tfidf = vstack([x_tfidf_train, x_tfidf_dev])
    y_train_and_dev = np.concatenate([y_train, y_dev])
    assert train_and_dev_tfidf.shape[0] == x_tfidf_train.shape[0] + x_tfidf_dev.shape[0]
    assert len(y_train_and_dev) == len(y_train) + len(y_dev)

    results, best_params_linear, best_score = run_randomized_search(SGDClassifier(random_state=0), "Linear_Models_" + PREPROCESSING_OPTIONS.get_current_options(),
                                                                 train_and_dev_tfidf, y_train_and_dev, distributions, n_iter=25, cv=4, random_state=0, n_jobs=4)

    # display results
    display(results[[column for column in results.columns if column not in ["std_fit_time", "mean_score_time", "std_score_time", "params"]]])

else:
    # set best params manually
    best_params_linear = {
        "penalty": "l2",
        "loss": "log",
        "early_stopping": True,
        "class_weight": "balanced",
        "alpha": 0.000001
    }
    

## Best hyperparams linear classifier


In [None]:
# Evaluate an SGDClassifier configured with the selected hyperparameters.
tuned_linear_clf = SGDClassifier(random_state=0, **best_params_linear)
train_and_eval_clf(
    tuned_linear_clf,
    x_tfidf_train, y_train,
    x_tfidf_dev, y_dev,
    x_tfidf_test, y_test,
    "Linear Models",
)
