In [None]:
import io
import os
import pickle
import random as rn
import warnings
from datetime import datetime
from importlib import reload
from multiprocessing import cpu_count

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from gensim.models import Word2Vec
from gensim.models.fasttext import FastText
from IPython.display import display
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, ndcg_score, \
        label_ranking_average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import data
import models
import preprocessing

# Single seed shared by reset_seed(), the embedding trainers, the shuffle and
# the cross-validation splitter further down.
seed = 42
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

def reset_seed():
    """Re-seed every random number generator this notebook relies on.

    Reads the module-level ``seed`` and applies it to NumPy, Python's
    ``random`` module and TensorFlow. It also exports ``PYTHONHASHSEED`` and
    ``TF_DETERMINISTIC_OPS`` into the process environment.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # setting it here presumably only influences child processes - confirm.
    os.environ.update({
        "PYTHONHASHSEED": str(seed),
        "TF_DETERMINISTIC_OPS": "1",
    })
    # Same order as before: NumPy, then `random`, then TensorFlow.
    for seed_fn in (np.random.seed, rn.seed, tf.random.set_seed):
        seed_fn(seed)

## Choose notebook version

In [None]:
# Preprocessing variant this run works with. The value is interpolated into
# the cache and model file names used later in the notebook and decides which
# GloVe file gets loaded. Keep exactly one assignment uncommented.
# version = "tokenized_cased"
# version = "tokenized_no_sw_no_punct"
version = "tokenized_lemmatized_no_sw_no_punct"

## Extracting the data

In [None]:
# One-off extraction of the raw corpus archive (kept for reference):
# data.extract_data(extraction_dir="train",
#                   data_dir="data",
#                   data_zip_name="reuters-training-corpus.zip")

# One-off construction of the cached frame that is read below:
# train_df = data.get_docs_labels("train/REUTERS_CORPUS_2")
# train_df.to_pickle("train/data.pkl")

# NOTE: unpickling can execute arbitrary code - only read files you created.
train_df = pd.read_pickle("train/data.pkl")

# Labels become one 2-D array; documents stay a 1-D array.
train_labels = np.array(train_df["labels"].tolist())
train_docs = train_df["doc"].values
n_train = len(train_docs)
n_labels = len(data.CODEMAP)

# extract test_docs here

# Sanity check: array shapes plus one sample document and its label vector.
print(train_docs.shape, train_labels.shape, train_docs[2], train_labels[2],
      sep="\n")

## Preprocessing the documents

In [None]:
# On-disk cache of the preprocessed training corpus, one file per `version`.
path_to_preprocessed_train_docs = f"train/preprocessed_docs_{version}.pkl"

try:
    # NOTE: unpickling can execute arbitrary code - only load caches that this
    # notebook wrote itself.
    with open(path_to_preprocessed_train_docs, "rb") as f:
        preprocessed_train_docs = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Cache is missing or truncated/corrupt: rebuild it and persist the result.
    # Every other failure (KeyboardInterrupt, MemoryError, permission errors,
    # ...) now propagates instead of silently triggering a full re-preprocess.
    preprocessed_train_docs = preprocessing.preprocess_corpus(train_docs)
    with open(path_to_preprocessed_train_docs, "wb") as f:
        pickle.dump(preprocessed_train_docs, f)

print(preprocessed_train_docs[2])

In [None]:
# path_to_preprocessed_test_docs = f"test/preprocessed_docs_{version}.pkl"

# try:
#     with open(path_to_preprocessed_test_docs, "rb") as f:
#         preprocessed_test_docs = pickle.load(f)
# except:
#     preprocessed_test_docs = preprocessing.preprocess_corpus(test_docs)
#     with open(path_to_preprocessed_test_docs, "wb") as f:
#         pickle.dump(preprocessed_test_docs, f)

# print(preprocessed_test_docs[2])

## Representing the documents as token index sequences

In [None]:
# Corpus the tokenizer and the embedding models are fitted on (currently the
# training documents only).
docs = preprocessed_train_docs # add preprocessed_test_docs
# None requests an uncapped vocabulary; the tokenizer cell replaces it with
# the actual vocabulary size.
n_vocabulary = None

In [None]:
# No character filtering and no lower-casing: tokens are taken as they appear
# in the documents.
tokenizer = Tokenizer(lower=False, filters="", num_words=n_vocabulary)
tokenizer.fit_on_texts(docs)
word_idx = tokenizer.word_index

# Index 0 is reserved for padding, hence the +1 when no cap was requested.
n_vocabulary = len(word_idx) + 1 if n_vocabulary is None else n_vocabulary

print(n_vocabulary)

In [None]:
# Fixed sequence length; set to None to fall back to the longest document.
n_sequence = 512

sequences = tokenizer.texts_to_sequences(docs)
if n_sequence is None:
    n_sequence = max(map(len, sequences))
# Number of tokens of each document that remain after truncation.
sequence_lengths = [min(n_sequence, len(seq)) for seq in sequences]
# Pad and truncate at the end so every row has exactly n_sequence entries.
sequences = pad_sequences(sequences, maxlen=n_sequence,
                          padding="post", truncating="post")

print(n_sequence, sequences.shape, sequences[2][:100], sep="\n")

In [None]:
doc_lengths = [len(doc.split()) for doc in docs]

# Draw both histograms on identical bins so the effect of truncation shows.
for hist_values, hist_label in ((doc_lengths, "documents"),
                                (sequence_lengths, "sequences")):
    sns.distplot(hist_values,
                 bins=np.arange(0, 2500, 25),
                 kde=False,
                 label=hist_label)

# Ratio of kept sequence tokens to whitespace-separated document tokens.
total_word_coverage = np.round(np.sum(sequence_lengths) / np.sum(doc_lengths), 3)
plt.title(f"n_vocabulary={n_vocabulary}, n_sequence={n_sequence},\n"
          f"total_word_coverage={total_word_coverage}")
plt.xlabel("length")
plt.xlim(0, 1550)
plt.legend()
plt.tight_layout()
plt.show()

## Word embeddings

In [None]:
# Embedding dimensionality used for every embedding matrix. The pretrained
# vector files loaded below are read as 300-dimensional, so this stays at 300.
n_embedding = 300 # 300 required by pretrained embeddings

### FastText

In [None]:
reset_seed()

# ft_path = f"data/fasttext_{version}.model"
ft_path = f"train/fasttext_{version}.model"

try:
    ft = FastText.load(ft_path)
except FileNotFoundError:
    # No saved model yet: train one on the corpus and persist it. Only the
    # "file is missing" case falls through to training; any other load error
    # (or an interrupt) now surfaces instead of silently retraining and
    # overwriting the saved model.
    # NOTE(review): gensim documents that a fixed seed is only fully
    # reproducible with a single worker thread; workers=cpu_count() trades
    # that for speed - confirm this is intended.
    ft = FastText(sentences=[doc.split() for doc in docs], size=n_embedding,
                  min_count=1, workers=cpu_count(), seed=seed)
    ft.save(ft_path)

# Vocabulary size and the first 100 entries of the index, as a sanity check.
print(len(ft.wv.vocab))
print(ft.wv.index2entity[:100])

### Word2Vec

In [None]:
reset_seed()

# w2v_path = f"data/w2v_{version}.model"
w2v_path = f"train/w2v_{version}.model"

try:
    w2v = Word2Vec.load(w2v_path)
except FileNotFoundError:
    # No saved model yet: train one on the corpus and persist it. Only the
    # "file is missing" case falls through to training; any other load error
    # (or an interrupt) now surfaces instead of silently retraining and
    # overwriting the saved model.
    # NOTE(review): gensim documents that a fixed seed is only fully
    # reproducible with a single worker thread; workers=cpu_count() trades
    # that for speed - confirm this is intended.
    w2v = Word2Vec(sentences=[doc.split() for doc in docs], size=n_embedding,
                   min_count=1, workers=cpu_count(), seed=seed)
    w2v.save(w2v_path)

# Vocabulary size and the first 100 entries of the index, as a sanity check.
print(len(w2v.wv.vocab))
print(w2v.wv.index2entity[:100])

### Pre-trained FastText

In [None]:
def load_embeddings(fname, skip_first, n_embedding=300):
    """Read a whitespace-separated text embedding file into a dictionary.

    Every line is expected to hold a token followed by ``n_embedding``
    numeric values. The token is everything before the trailing
    ``n_embedding`` fields, so it may itself consist of several
    whitespace-separated pieces.

    Parameters
    ----------
    fname : str
        Path of the embedding file (decoded as UTF-8; undecodable bytes are
        ignored).
    skip_first : bool
        Skip the first line of the file, e.g. when it is a header rather than
        an embedding.
    n_embedding : int, default 300
        Number of vector components per line.

    Returns
    -------
    dict
        Maps each token to a ``np.float32`` array of length ``n_embedding``.
        If a token occurs more than once, the last occurrence wins.
    """
    embedding_idx = {}
    with open(fname, "r", encoding="utf-8", errors="ignore") as f:
        for i, line in enumerate(f):
            if i == 0 and skip_first:
                continue
            vals = line.rstrip().split()
            if len(vals) <= n_embedding:
                # Blank or truncated line: there is no token to store. This
                # used to create a junk "" entry.
                continue
            # Re-join multi-piece tokens with a space. Joining with "" would
            # collapse e.g. ". . ." into "..." and overwrite the vector of the
            # real "..." token.
            token = " ".join(vals[:-n_embedding])
            embedding_idx[token] = np.array(vals[-n_embedding:], dtype=np.float32)
    return embedding_idx

In [None]:
# The first line of this file is skipped (skip_first=True).
ft_pretrained = load_embeddings("data/crawl-300d-2M.vec", skip_first=True)

# Preview up to 100 tokens. zip() stops at the shorter input, so a file with
# fewer than 100 entries prints what it has instead of raising StopIteration.
token_iter = iter(ft_pretrained)
print([token for _, token in zip(range(100), token_iter)])

### Pre-trained GloVe

In [None]:
# The cased corpus variant uses the 840B vectors, every other variant the 42B
# vectors. Neither file has a header line to skip.
if version == "tokenized_cased":
    glove_pretrained = load_embeddings("data/glove.840B.300d.txt", skip_first=False)
else:
    glove_pretrained = load_embeddings("data/glove.42B.300d.txt", skip_first=False)

# Preview up to 100 tokens. zip() stops at the shorter input, so a file with
# fewer than 100 entries prints what it has instead of raising StopIteration.
token_iter = iter(glove_pretrained)
print([token for _, token in zip(range(100), token_iter)])

### Embedding matrices

In [None]:
def create_embedding_matrix(embedding_model, word_index=None, vocab_size=None,
                            embedding_dim=None):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Row ``i`` holds the vector of the token whose tokenizer index is ``i``.
    Rows of tokens missing from ``embedding_model`` stay all-zero, as does
    row 0, which no token maps to.

    Parameters
    ----------
    embedding_model : mapping or gensim model
        Anything supporting ``token in m`` and ``m[token]`` (e.g. a dict of
        pretrained vectors), or an object exposing such a mapping as ``.wv``.
    word_index : dict, optional
        Token -> index mapping. Defaults to the notebook-level ``word_idx``.
    vocab_size : int, optional
        Number of matrix rows; tokens with an index >= this are skipped.
        Defaults to the notebook-level ``n_vocabulary``.
    embedding_dim : int, optional
        Number of matrix columns. Defaults to the notebook-level
        ``n_embedding``.

    Returns
    -------
    np.ndarray
        Array of shape ``(vocab_size, embedding_dim)``.

    Also prints the number of in-range tokens without a vector, the matrix
    shape and the first 20 components of row 1.
    """
    # Fall back to the notebook globals so existing one-argument calls behave
    # exactly as before.
    if word_index is None:
        word_index = word_idx
    if vocab_size is None:
        vocab_size = n_vocabulary
    if embedding_dim is None:
        embedding_dim = n_embedding

    # Look vectors up through `.wv` when it exists, rather than relying on
    # the deprecated model-level `in` / `[]` delegation.
    vectors = getattr(embedding_model, "wv", embedding_model)

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    unknown_token_count = 0
    for token, i in word_index.items():
        if i >= vocab_size:
            # Outside the capped vocabulary: the matrix has no row for it.
            continue
        if token in vectors:
            embedding_matrix[i] = vectors[token]
        else:
            unknown_token_count += 1

    print(unknown_token_count)
    print(embedding_matrix.shape)
    print(embedding_matrix[1][:20])

    return embedding_matrix

In [None]:
# Matrix built from the FastText model `ft` loaded or trained above.
ft_embedding_matrix = create_embedding_matrix(ft)

In [None]:
# Matrix built from the Word2Vec model `w2v` loaded or trained above.
w2v_embedding_matrix = create_embedding_matrix(w2v)

In [None]:
# Matrix built from the pretrained vectors read from data/crawl-300d-2M.vec.
ft_pretrained_embedding_matrix = create_embedding_matrix(ft_pretrained)

In [None]:
# Matrix built from the pretrained GloVe vectors selected for this `version`.
glove_pretrained_embedding_matrix = create_embedding_matrix(glove_pretrained)

## Shuffling the data

In [None]:
# n_samples is forwarded to sklearn.utils.shuffle: None keeps every row, an
# integer draws that many rows (useful for quick experiments).
n_samples = None
# Only the first n_train sequences are used, so they line up with train_labels;
# both arrays are permuted together with the shared seed.
x_train, y_train = shuffle(sequences[:n_train],
                           train_labels,
                           random_state=seed,
                           n_samples=n_samples)

## Cross-evaluating the models

In [None]:
def cross_evaluate(model_initializer, batch_size=256, model_params=None,
                   save_dir="best_models_train"):
    """Run 5-fold multilabel-stratified cross-validation for one model type.

    For every fold a fresh model is built, trained with early stopping on the
    fold's validation split, scored on that split and saved to disk. The
    per-fold scores and their mean are displayed at the end.

    Reads the notebook-level ``x_train``, ``y_train``, ``n_vocabulary``,
    ``n_embedding``, ``n_sequence``, ``n_labels``, ``seed`` and ``version``.

    Parameters
    ----------
    model_initializer : callable
        Called as ``model_initializer(n_vocabulary, n_embedding, n_sequence,
        n_labels, **model_params)`` and expected to return a model providing
        ``summary``, ``fit``, ``predict`` and ``save``.
    batch_size : int, default 256
        Batch size for training and prediction.
    model_params : dict, optional
        Extra keyword arguments for ``model_initializer``; ``None`` means no
        extra arguments.
    save_dir : str, default "best_models_train"
        Directory prefix under which each fold's model is saved (pass
        ``"best_models"`` for the alternative location).

    Returns
    -------
    None
    """
    # A dict literal as default would be shared between calls.
    if model_params is None:
        model_params = {}

    def build_model():
        # Every fold needs an untrained model, so build from scratch each time.
        return model_initializer(n_vocabulary, n_embedding, n_sequence, n_labels,
                                 **model_params)

    # Throw-away instance, built only to print the architecture once.
    build_model().summary()

    cv_scores = []
    # shuffle=True makes random_state meaningful. Recent scikit-learn K-fold
    # base classes raise a ValueError when random_state is given while
    # shuffle is False.
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for train, val in mskf.split(x_train, y_train):
        model = build_model()
        es = EarlyStopping(patience=10, verbose=1, restore_best_weights=True)
        model.fit(x_train[train],
                  y_train[train],
                  batch_size=batch_size,
                  epochs=100,
                  verbose=1,
                  validation_data=(x_train[val], y_train[val]),
                  callbacks=[es])

        y_pred_prob = model.predict(x_train[val], batch_size=batch_size, verbose=1)
        # Hard label decisions by rounding each probability to 0 or 1.
        y_pred = np.round(y_pred_prob)

        # Thresholded metrics use y_pred; ranking metrics use the raw scores.
        scores = {
            "accuracy": accuracy_score(y_train[val], y_pred),
            "F1 (macro)": f1_score(y_train[val], y_pred, average="macro"),
            "F1 (micro)": f1_score(y_train[val], y_pred, average="micro"),
            "LRAP": label_ranking_average_precision_score(y_train[val],
                                                          y_pred_prob),
            "NDCG": ndcg_score(y_train[val], y_pred_prob),
            "timestamp": round(datetime.timestamp(datetime.now())),
        }
        cv_scores.append(scores)
        print(scores)

        # The timestamp and micro-F1 in the name keep folds and runs apart.
        model.save(f"{save_dir}/{model_initializer.__name__}_{version}_" +
                   f"{scores['timestamp']}_" +
                   f"{np.round(scores['F1 (micro)'], 6)}")

    cv_scores_df = pd.DataFrame(cv_scores)
    display(cv_scores_df)
    print(cv_scores_df.drop("timestamp", axis=1).mean())

In [None]:
# Re-seed all RNGs, then cross-evaluate models.cnn_bi_lstm_1 with the matrix
# built from the Word2Vec model passed in as its embedding_matrix.
reset_seed()
cross_evaluate(models.cnn_bi_lstm_1, model_params={
    "filters_1": 400, "filters_2": 500, "loss": "binary_crossentropy",
    "embedding_matrix": w2v_embedding_matrix})