In [None]:
!sudo pip install tensorflow_hub tensorflow-gpu tensorflow_text tensorflow-addons scikit-multilearn iterative-stratification

In [None]:
# Load the autoreload extension so edited modules are re-imported between cell runs.
%load_ext autoreload
# Project-provided IPython extension (aymurai dev tools).
%load_ext aymurai.devtools.magic
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the labelled sentences and keep a single row per distinct sentence text.
data = pd.read_csv('sentences-decision.csv').drop_duplicates(subset='sentence')
print(len(data))

# Build train dataset

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.utils import indexable, _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split
from itertools import chain

def multilabel_train_test_split(*arrays,
                                test_size=None,
                                train_size=None,
                                random_state=None,
                                shuffle=True,
                                stratify=None):
    """
    Train/test split for multilabel classification.

    When ``stratify`` is given, uses the iterative stratification algorithm from:
    'Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-Label Data'.
    When ``stratify`` is None, this simply delegates to sklearn's ``train_test_split``.

    Args:
        *arrays: Indexables sharing the same number of samples.
        test_size: Size of the test split; resolved by sklearn's
            ``_validate_shuffle_split`` (defaults to 0.25 when both sizes are None).
        train_size: Size of the train split, resolved the same way.
        random_state: Seed (or RandomState) controlling the shuffling. It is
            honoured on both the plain and the stratified path.
        shuffle: Must be truthy when ``stratify`` is given.
        stratify: Multilabel target matrix used as ``y`` for the stratified splitter.

    Returns:
        list: ``[a_train, a_test, b_train, b_test, ...]`` — a train/test pair for
        every input array, in input order.

    Raises:
        AssertionError: If ``stratify`` is given together with ``shuffle=False``.
    """
    if stratify is None:
        return train_test_split(*arrays, test_size=test_size, train_size=train_size,
                                random_state=random_state, stratify=None, shuffle=shuffle)

    assert shuffle, "Stratified train/test split is not implemented for shuffle=False"

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    # Turn fractional / None sizes into absolute sample counts.
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )
    # Forward the caller's seed; it used to be hard-coded, which made the
    # `random_state` argument a silent no-op for stratified splits.
    cv = MultilabelStratifiedShuffleSplit(
        test_size=n_test, train_size=n_train, random_state=random_state
    )
    train, test = next(cv.split(X=arrays[0], y=stratify))

    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )

## Dataset from private data

In [None]:
import numpy as np


x = data['sentence'].values
y = data[['decision', 'hace_lugar']].values

# Hold out 20% of the rows as the test set, stratified on both labels.
train, test = multilabel_train_test_split(data, test_size=0.2, random_state=42, stratify=y)
# Carve the validation set out of the remaining *training* rows. Splitting `test`
# here (as before) made train and val subsets of the test set, leaking test data
# into training and discarding most of the corpus.
train, val = multilabel_train_test_split(train, test_size=0.2, random_state=42)

print('train:', len(train))
print('test:', len(test))
print('val:', len(val))



In [None]:
# Per split (train, val, test — in that order) report how many sentences are
# decisions, and how many of those decisions are also 'hace_lugar'.
for _split_df in (train, val, test):
    _decisions = _split_df.query('decision')
    _granted = _split_df.query('decision and hace_lugar')
    print(f"decisiones: {len(_decisions)} from {len(_split_df)} sentences")
    print(f"hace lugar: {len(_granted)} from {len(_decisions)} decisiones")

# manual train balance

In [None]:
# class_0 = train.query("decision == 0")
# class_1 = train.query("decision == 1")
# train = pd.concat(
#     [
#         class_0.sample(len(class_1)),
#         class_1,
#     ]
# )

# print(f"decisiones: {len(train.query('decision'))} from {len(train)} sentences")
# print(f"hace lugar: {len(train.query('decision and hace_lugar'))} from {len(train.query('decision'))} decisiones")

In [None]:
import numpy as np

# For each split: raw sentence strings as inputs, and the two label columns
# ('decision', 'hace_lugar') as a 2-column target matrix.
x_train, y_train = train['sentence'].values, train[['decision', 'hace_lugar']].values
x_val, y_val = val['sentence'].values, val[['decision', 'hace_lugar']].values
x_test, y_test = test['sentence'].values, test[['decision', 'hace_lugar']].values

# training

## build vocab

In [None]:
from more_itertools import collapse
from itertools import chain

# Whitespace-tokenise every sentence of the full dataset and pool the tokens
# into a set; its size is displayed as the cell output.
text = chain.from_iterable(map(str.split, data['sentence'].values))
vocab = set(text)
len(vocab)

## class weights

In [None]:


# Weight each binary outcome by the share of the *other* outcome, computed over
# every (sample, label) cell of y_train: positives get 1 - positive_rate and
# negatives get positive_rate.
# (A per-label inverse-frequency dict used to be computed first and was
# immediately overwritten; that dead store is removed. It also divided by zero
# for any label without positive examples.)
class_weights = {1: 1 - y_train.sum() / np.prod(y_train.shape)}
class_weights[0] = 1 - class_weights[1]
class_weights

## training routine

In [None]:
from typing import Iterator

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.auto import tqdm
import tensorflow_text as text
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from more_itertools import flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, TextVectorization, LSTM, Bidirectional, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# One output unit per label column of the target matrix.
num_classes = y_train.shape[-1]

# Macro-averaged F1 across the labels, usable as a Keras metric.
f1_score = tfa.metrics.F1Score(
    name="f1_score",
    average="macro",
    num_classes=num_classes,
)

# TF-IDF bag of 1- to 3-grams over lower-cased, punctuation-stripped,
# whitespace-split text; the vocabulary is capped at the corpus vocab size.
text_vectorizer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=3,
    max_tokens=len(vocab),
    output_mode="tf_idf",
)

# Fit the vectorizer vocabulary / IDF weights on the training sentences, on CPU.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(x_train)

def get_model_tfidf(encoder_trainable: bool = False, lr: float = 1e-3) -> Model:
    """Build and compile the TF-IDF + dense multi-label classifier.

    Raw strings go through the module-level ``text_vectorizer`` (already adapted),
    then two ReLU dense layers, then a sigmoid layer with one unit per label.

    Args:
        encoder_trainable: Accepted for signature parity with ``get_model``;
            it is not used by this model.
        lr: Learning rate for AdamW. The default of 1e-3 matches the optimizer's
            own default, so existing calls behave as before.

    Returns:
        The compiled Keras model (binary cross-entropy loss, categorical accuracy
        and the module-level macro ``f1_score`` as metrics).
    """
    input_ = Input(shape=[], dtype=tf.string)
    x = text_vectorizer(input_)
    # x = Dense(1024, activation="relu")(x)
    x = Dense(512, activation="relu")(x)
    x = Dense(128, activation="relu")(x)

    # Independent sigmoid per label: this is a multi-label problem.
    output = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=[input_], outputs=output)

    adamw = tf.keras.optimizers.experimental.AdamW(learning_rate=lr)
    model.compile(
        loss="binary_crossentropy",
        optimizer=adamw,
        metrics=["categorical_accuracy", f1_score],
    )
    return model

def get_model(encoder_trainable: bool = False, lr=1e-3) -> Model:
    """Build and compile the hub-encoder + LSTM multi-label classifier.

    Args:
        encoder_trainable: Whether the TF-Hub sentence encoder is fine-tuned.
        lr: Learning rate for AdamW.

    Returns:
        The compiled Keras model (binary cross-entropy loss, categorical accuracy).
    """
    input_ = Input(shape=[], dtype=tf.string)
    x = hub.KerasLayer(
        # Alternative encoders that were tried:
        # "https://tfhub.dev/google/sentence-t5/st5-base/1",
        # "https://tfhub.dev/google/universal-sentence-encoder-large/5",
        # "https://tfhub.dev/google/universal-sentence-encoder/4",
        # "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3",
        "https://tfhub.dev/google/nnlm-es-dim128/2",
        trainable=encoder_trainable,
    )(input_)

    embed_shape = x.shape[1]
    # The LSTM layer was previously created but never applied to a tensor, so the
    # following Dense was called on a Layer object and model construction failed.
    # The encoder yields one vector per sentence (2-D), while LSTM needs
    # (batch, time, features); feed it as a length-1 sequence.
    # NOTE(review): with a single timestep the LSTM acts as a gated dense layer —
    # switch to token-level embeddings if real sequence modelling is intended.
    x = tf.keras.layers.Reshape((1, embed_shape))(x)
    # x = Bidirectional(LSTM(128))(x)
    x = LSTM(128)(x)
    # x = Dense(embed_shape, activation="relu")(x)
    # x = Dense(embed_shape, activation="relu")(x)

    # Independent sigmoid per label: this is a multi-label problem.
    output = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=[input_], outputs=output)
    # Use `learning_rate`: the experimental optimizers treat `lr` as a deprecated
    # keyword and do not apply it, so the requested rate was silently ignored.
    adamw = tf.keras.optimizers.experimental.AdamW(learning_rate=lr)
    model.compile(
        loss="binary_crossentropy",
        optimizer=adamw,
        metrics=["categorical_accuracy"],
    )
    return model


# model = get_model_tfidf(encoder_trainable=True)
# Build the hub-encoder variant with the pre-trained encoder frozen,
# then print the layer-by-layer summary.
model = get_model(encoder_trainable=False)
model.summary()


In [None]:
# Keeps the incoming learning rate for the first `warmup_epochs` epochs
# (3 by default) and decreases it exponentially after that.
def scheduler(epoch, lr, warmup_epochs=3, decay_rate=0.1):
    """Learning-rate schedule with a constant warm-up followed by exponential decay.

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate currently in use.
        warmup_epochs: Number of initial epochs during which ``lr`` is returned
            unchanged.
        decay_rate: From ``warmup_epochs`` onwards, each call multiplies ``lr``
            by ``exp(-decay_rate)``.

    Returns:
        ``lr`` unchanged during warm-up, otherwise the decayed value
        (the result of ``lr * tf.math.exp(...)``).
    """
    if epoch < warmup_epochs:
        return lr
    return lr * tf.math.exp(-decay_rate)

# Stop once validation loss has gone 15 epochs without improving, and roll the
# model back to its best weights.
# Optional extras that can be appended to the list:
#   ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True, save_format='tf')
#   tf.keras.callbacks.LearningRateScheduler(scheduler)
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=15,
    restore_best_weights=True,
)
callbacks = [early_stopping]


from tensorflow.keras.utils import Sequence
import numpy as np   

class DataGenerator(Sequence):
    """Keras ``Sequence`` serving consecutive fixed-size slices of paired inputs and targets."""

    def __init__(self, x_set, y_set, batch_size):
        """Store the inputs, the targets and the number of samples per batch."""
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches; the last one may be smaller than ``batch_size``."""
        n_batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(n_batches)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(inputs, targets)`` batch."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        return self.x[start:stop], self.y[start:stop]

# Batch both splits in chunks of 32 samples.
train_gen = DataGenerator(x_train, y_train, 32)
val_gen = DataGenerator(x_val, y_val, 32)

# NOTE(review): `class_weight` is keyed by class index; confirm Keras applies it
# as intended to these two-column multi-label targets.
fit_options = dict(
    validation_data=val_gen,
    epochs=50,
    callbacks=callbacks,
    class_weight=class_weights,
    shuffle=True,
)
model.fit(train_gen, **fit_options)



# Evaluation

In [None]:
# Display names for the two labels, in the same order as the target columns
# ('decision', 'hace_lugar') used to build y_train / y_val / y_test.
CATEGORIES = ['decision', 'hace lugar']

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('TRAIN')

# Ground truth and sigmoid scores for the training split.
reference = y_train
hypothesis = model.predict(x_train)
# A label counts as predicted when its score exceeds 0.5.
binary_hypothesis = hypothesis > 0.5

fig, subplot = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# One 2x2 confusion matrix per label, drawn side by side.
confusion = multilabel_confusion_matrix(reference, binary_hypothesis)
tick_names = ["false", "true"]
for panel, label_matrix, label_name in zip(subplot.flatten(), confusion, CATEGORIES):
    sns.heatmap(label_matrix, annot=True, fmt='d', ax=panel)
    panel.set_xlabel("hypothesis")
    panel.set_ylabel("reference")
    panel.set_xticklabels(tick_names)
    panel.set_yticklabels(tick_names)
    panel.set_title(label_name)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (rows = labels and averages).
report = classification_report(
    reference, binary_hypothesis, target_names=CATEGORIES, output_dict=True
)
pd.DataFrame(report).transpose()

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('VALIDATION')

# Ground truth and sigmoid scores for the validation split.
reference = y_val
hypothesis = model.predict(x_val)
# A label counts as predicted when its score exceeds 0.5.
binary_hypothesis = hypothesis > 0.5

fig, subplot = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# One 2x2 confusion matrix per label, drawn side by side.
confusion = multilabel_confusion_matrix(reference, binary_hypothesis)
tick_names = ["false", "true"]
for panel, label_matrix, label_name in zip(subplot.flatten(), confusion, CATEGORIES):
    sns.heatmap(label_matrix, annot=True, fmt='d', ax=panel)
    panel.set_xlabel("hypothesis")
    panel.set_ylabel("reference")
    panel.set_xticklabels(tick_names)
    panel.set_yticklabels(tick_names)
    panel.set_title(label_name)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (rows = labels and averages).
report = classification_report(
    reference, binary_hypothesis, target_names=CATEGORIES, output_dict=True
)
pd.DataFrame(report).transpose()

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('TEST')

# Ground truth and sigmoid scores for the held-out test split.
reference = y_test
hypothesis = model.predict(x_test)
# A label counts as predicted when its score exceeds 0.5.
binary_hypothesis = hypothesis > 0.5

fig, subplot = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# One 2x2 confusion matrix per label, drawn side by side.
confusion = multilabel_confusion_matrix(reference, binary_hypothesis)
tick_names = ["false", "true"]
for panel, label_matrix, label_name in zip(subplot.flatten(), confusion, CATEGORIES):
    sns.heatmap(label_matrix, annot=True, fmt='d', ax=panel)
    panel.set_xlabel("hypothesis")
    panel.set_ylabel("reference")
    panel.set_xticklabels(tick_names)
    panel.set_yticklabels(tick_names)
    panel.set_title(label_name)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (rows = labels and averages).
report = classification_report(
    reference, binary_hypothesis, target_names=CATEGORIES, output_dict=True
)
pd.DataFrame(report).transpose()

