In [None]:
# Install the third-party packages this notebook imports (versions are not pinned).
# NOTE(review): `!sudo pip` may target a different interpreter than the running kernel — confirm, or consider `%pip install`.
!sudo pip install tensorflow_hub tensorflow-gpu tensorflow_text tensorflow-addons scikit-multilearn iterative-stratification

In [None]:
# Load IPython's autoreload extension; it is switched to mode 2 (reload all modules) below.
%load_ext autoreload
# NOTE(review): project-specific extension; what it registers is not visible from this notebook.
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Label columns to predict; the commented line is the single-label variant.
CATEGORIES = ['decision', 'hace_lugar']
# CATEGORIES = ['decision']

# Read the labelled sentences and keep one row per distinct sentence text.
data = pd.read_csv('sentences-decision.csv').drop_duplicates(subset='sentence')
print(len(data))

# Histogram of space-separated token counts, bucket edges at 0, 32, ..., 288.
(
    data['sentence']
    .map(lambda sentence: len(sentence.split(' ')))
    .hist(bins=list(range(0, 320, 32)))
)

In [None]:
# Number of space-separated tokens in the first sentence.
len(data['sentence'].iat[0].split(' '))

# Build train dataset

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.utils import indexable, _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split
from itertools import chain

def multilabel_train_test_split(*arrays,
                                test_size=None,
                                train_size=None,
                                random_state=None,
                                shuffle=True,
                                stratify=None):
    """
    Train/test split for multilabel classification.

    Uses the iterative stratification algorithm from:
    'Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-Label Data'.

    Args:
        *arrays: indexables of equal length to split (same contract as
            sklearn's ``train_test_split``).
        test_size: fraction or absolute size of the test part; when both
            sizes are None the test part defaults to 0.25.
        train_size: fraction or absolute size of the train part.
        random_state: seed for the shuffle/stratification, forwarded to the
            underlying splitter.
        shuffle: must be truthy when ``stratify`` is given.
        stratify: multilabel indicator matrix to stratify on. When None the
            call is delegated unchanged to sklearn's ``train_test_split``.

    Returns:
        A list ``[a_train, a_test, b_train, b_test, ...]`` with two entries
        per input array.

    Raises:
        AssertionError: if ``stratify`` is given together with ``shuffle=False``.
    """
    if stratify is None:
        return train_test_split(*arrays, test_size=test_size, train_size=train_size,
                                random_state=random_state, stratify=None, shuffle=shuffle)

    assert shuffle, "Stratified train/test split is not implemented for shuffle=False"

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )
    # Honour the caller's seed; it used to be silently replaced by a fixed 123.
    cv = MultilabelStratifiedShuffleSplit(
        test_size=n_test, train_size=n_train, random_state=random_state
    )
    train, test = next(cv.split(X=arrays[0], y=stratify))

    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )

## Dataset from private data

In [None]:
import numpy as np


x = data['sentence'].values
y = data[CATEGORIES].values

# 80% train / 20% holdout, then the holdout is halved into test and validation.
# The second split has to be taken from the holdout: splitting `train` again
# would make every test/validation row a training row as well.
if len(CATEGORIES) > 1:
    train, holdout = multilabel_train_test_split(data, test_size=0.2, random_state=42, stratify=y)
    test, val = multilabel_train_test_split(holdout, test_size=0.5, random_state=42)
else:
    train, holdout = train_test_split(data, test_size=0.2, random_state=42, stratify=y)
    test, val = train_test_split(holdout, test_size=0.5, random_state=42)

# The three splits must partition the dataset (no row dropped or counted twice).
assert len(train) + len(test) + len(val) == len(data), "splits do not partition the dataset"

print('train:', len(train))
print('test:', len(test))
print('val:', len(val))



In [None]:
# For train, val and test (in that order): rows flagged `decision`, and how many
# of those are also flagged `hace_lugar`.
for split_frame in (train, val, test):
    decision_rows = split_frame.query('decision')
    granted_rows = split_frame.query('decision and hace_lugar')
    print(f"decisiones: {len(decision_rows)} from {len(split_frame)} sentences")
    print(f"hace lugar: {len(granted_rows)} from {len(decision_rows)} decisiones")

# manual train balance

In [None]:
# Downsample the `decision == 0` rows to the number of `decision == 1` rows,
# then rebuild `train` as sampled negatives followed by all positives.
class_1 = train.query("decision == 1")
class_0 = train.query("decision == 0")
train = pd.concat([class_0.sample(n=len(class_1), random_state=42), class_1])

n_decisions = len(train.query('decision'))
n_granted = len(train.query('decision and hace_lugar'))
print(f"decisiones: {n_decisions} from {len(train)} sentences")
print(f"hace lugar: {n_granted} from {n_decisions} decisiones")

In [None]:
import numpy as np

# Sentence strings (x_*) and label matrices (y_*, one column per CATEGORIES entry)
# for the train, validation and test splits.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    (frame['sentence'].values, frame[CATEGORIES].values)
    for frame in (train, val, test)
)

# training

## class weights

In [None]:


# Variant 1: inverse of each label's positive rate, keyed by label column index.
label_rate = y_train.sum(axis=0) / len(y_train)
class_weights = dict(enumerate(1 / label_rate))
class_weights

# Variant 2 (replaces variant 1): weight for 1 is the share of zeros over all
# label cells, weight for 0 is the complementary share.
positive_share = y_train.sum() / np.prod(y_train.shape)
class_weights = {1: 1 - positive_share}
class_weights[0] = 1 - class_weights[1]
class_weights

In [None]:
from keras.preprocessing.text import Tokenizer

# Fit the vocabulary on the training sentences only.
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(train['sentence'].values)

# Encode straight from the split DataFrames. Reading x_train/x_val/x_test here
# (which this cell also overwrites) would feed integer sequences back into the
# tokenizer whenever the cell is run a second time.
x_train = tokenizer.texts_to_sequences(train['sentence'].values)
x_val = tokenizer.texts_to_sequences(val['sentence'].values)
x_test = tokenizer.texts_to_sequences(test['sentence'].values)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_tokens = 128

# Every split gets the same treatment: sequences are cut or zero-filled at the
# end so that each row holds exactly `max_tokens` token ids.
padding_options = dict(maxlen=max_tokens, padding="post", truncating="post", value=0)

x_train = pad_sequences(x_train, **padding_options)
x_val = pad_sequences(x_val, **padding_options)
x_test = pad_sequences(x_test, **padding_options)


In [None]:
# Per-label weight pairs. In this notebook they are referenced only by the
# commented-out terms of the custom loss, which index each pair with 0 and 1.
class_weights = dict(
    decision=[1, 1],
    hace_lugar=np.zeros(2, dtype='float32'),
)

In [None]:
# class_weights['decision'].dtype

In [None]:
import tensorflow.keras.backend as K


def multilabel_weighted_binary_crossentropy(y_true, y_pred):
    """
    Binary cross-entropy on the first label column, scaled by 0.5.

    Args:
        y_true: ground-truth labels, assumed to be laid out as (sample, label)
            with column 0 being the first entry of CATEGORIES.
        y_pred: predicted probabilities in the same layout as ``y_true``.

    Returns:
        One loss value per sample.

    The term for the second label column and the per-class weights from
    ``class_weights`` are currently disabled.
    """
    y_true = K.cast(y_true, K.floatx())
    # Keep probabilities strictly inside (0, 1) so neither log() yields -inf.
    y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())

    # Slice the label column for every sample; a plain `[0]` would pick the
    # first sample of the batch instead of the first label.
    decision_true = y_true[:, 0]
    decision_pred = y_pred[:, 0]

    loss = -0.5 * (
        decision_true * K.log(decision_pred)
        + (1 - decision_true) * K.log(1 - decision_pred)
    )
    # Disabled: the same expression on column 1, optionally weighted per class.

    return loss


In [None]:
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow random generators in one call.
tf.keras.utils.set_random_seed(42)


In [None]:
# multilabel_weighted_binary_crossentropy(np.array([[0., 0.]]), np.array([[0.9, 0.8]]))

## training routine

In [None]:
from typing import Iterator

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.auto import tqdm
import tensorflow_text as text
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from more_itertools import flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Dense,
    Input,
    TextVectorization,
    LSTM,
    Bidirectional,
    Embedding,
    Dropout, Flatten
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential

# One output unit per label column.
num_classes = y_train.shape[-1]
# threshold=0.5 makes the metric binarise every label independently. Without a
# threshold F1Score only marks the arg-max label as positive, which is wrong
# for multilabel sigmoid outputs.
f1_score = tfa.metrics.F1Score(
    num_classes=num_classes, average='micro', threshold=0.5, name="f1_score"
)
embed_len = 25  # size of each token embedding vector
lstm_out = 20  # units of each LSTM direction


def get_model() -> Model:
    """
    Build and compile the sentence classifier.

    Architecture: token embedding -> bidirectional LSTM -> one sigmoid unit
    per label. Reads the module-level ``tokenizer``, ``embed_len``,
    ``max_tokens``, ``lstm_out``, ``num_classes`` and ``f1_score``.

    Returns:
        The compiled Keras model (binary cross-entropy loss, AdamW optimizer).
    """
    model = Sequential(
        [
            Embedding(
                # +1 because tokenizer ids start at 1 and 0 is the padding value.
                input_dim=len(tokenizer.word_index) + 1,
                output_dim=embed_len,
                input_length=max_tokens,
            ),
            # Dropout(0.5),
            # LSTM(lstm_out),
            Bidirectional(LSTM(lstm_out)),
            # Flatten(),
            # Dense(50),
            Dense(num_classes, activation="sigmoid"),
        ]
    )

    adamw = tf.keras.optimizers.experimental.AdamW()
    model.compile(
        loss="binary_crossentropy",
        # loss=multilabel_weighted_binary_crossentropy,
        optimizer=adamw,
        # binary_accuracy thresholds each label independently; categorical_accuracy
        # compares arg-max positions and is meaningless for multilabel targets.
        metrics=["binary_accuracy", f1_score],
    )
    return model


# Build the compiled model and print its layer/parameter summary.
model = get_model()
model.summary()


In [None]:
import matplotlib.pyplot as plt
def scheduler(epoch, lr):
    """Return `lr` unchanged for epochs 0-2; from epoch 3 on, scale it by exp(-0.1)."""
    if epoch >= 3:
        return lr * tf.math.exp(-0.1)
    return lr

# Early stopping watches val_loss with a patience of 15 epochs and restores the
# best weights. Checkpointing, the LearningRateScheduler(scheduler) callback,
# class weights and the explicit shuffle flag are currently not used.
callbacks = [
    EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True),
]

history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=3,
    batch_size=64,
    callbacks=callbacks,
)




In [None]:
import matplotlib.pyplot as plt
# Keeps the learning rate unchanged while epoch < 3 and multiplies it by
# exp(-0.1) on every later epoch.
# NOTE(review): identical to the `scheduler` defined in the previous training cell;
# this second definition simply rebinds the same name.
def scheduler(epoch, lr):
  if epoch < 3:
    return lr
  else:
    return lr * tf.math.exp(-0.1)

# NOTE(review): this repeats the previous training cell on the same `model`
# object, so it trains for 3 more epochs and rebinds `history` to this second
# run only — confirm that the repetition is intended.
callbacks = [
    # ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True, save_format='tf'),
    EarlyStopping(patience=15, monitor="val_loss", restore_best_weights=True),
    # tf.keras.callbacks.LearningRateScheduler(scheduler)
]


history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    validation_data=(x_val, y_val),
    epochs=3,
    callbacks=callbacks,
    # class_weight=class_weights,
    # shuffle=True,
)




In [None]:

# Left panel: loss per epoch; right panel: F1 per epoch (train vs. validation).
fig, subplot = plt.subplots(1, 2, figsize=(10, 4))

for panel, metric in zip(subplot, ('loss', 'f1_score')):
    panel.plot(history.history[metric], label='train')
    panel.plot(history.history['val_' + metric], label='val')
    panel.legend()

# Evaluation

In [None]:
import pandas as pd
# confusion_matrix is needed by the single-category branch below; it used to be
# missing from this cell's imports (NameError on a fresh kernel).
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix

print('TRAIN')

# Model scores, one column per category; a label counts as predicted above 0.5.
hypothesis = model.predict(x_train)
reference = y_train
predicted = hypothesis > 0.5

# One heatmap per category; squeeze=False keeps `subplot` an array even when
# there is a single category.
fig, subplot = plt.subplots(
    1, len(CATEGORIES), figsize=(5 * len(CATEGORIES), 4), squeeze=False
)

if len(CATEGORIES) > 1:
    confusion = multilabel_confusion_matrix(reference, predicted)
else:
    confusion = [confusion_matrix(reference, predicted)]
for ax, matrix, cat in zip(subplot.flatten(), confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

plt.tight_layout()

target_names = CATEGORIES if len(CATEGORIES) > 1 else [0, 1]
report = classification_report(reference, predicted, output_dict=True, target_names=target_names)
pd.DataFrame(report).T

In [None]:
import pandas as pd
# confusion_matrix is needed by the single-category branch below; it used to be
# missing from this cell's imports (NameError on a fresh kernel).
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix

print('VALIDATION')

# Model scores, one column per category; a label counts as predicted above 0.5.
hypothesis = model.predict(x_val)
reference = y_val
predicted = hypothesis > 0.5

# One heatmap per category; squeeze=False keeps `subplot` an array even when
# there is a single category.
fig, subplot = plt.subplots(
    1, len(CATEGORIES), figsize=(5 * len(CATEGORIES), 4), squeeze=False
)

if len(CATEGORIES) > 1:
    confusion = multilabel_confusion_matrix(reference, predicted)
else:
    confusion = [confusion_matrix(reference, predicted)]
for ax, matrix, cat in zip(subplot.flatten(), confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

plt.tight_layout()

target_names = CATEGORIES if len(CATEGORIES) > 1 else [0, 1]
report = classification_report(reference, predicted, output_dict=True, target_names=target_names)
pd.DataFrame(report).T

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix

print("TEST")

# Model scores, one column per category; a label counts as predicted above 0.5.
hypothesis = model.predict(x_test)
reference = y_test
predicted = hypothesis > 0.5

# One heatmap per category; squeeze=False keeps `subplot` an array even when
# there is a single category (previously two axes were always created).
fig, subplot = plt.subplots(
    1, len(CATEGORIES), figsize=(5 * len(CATEGORIES), 4), squeeze=False
)

if len(CATEGORIES) > 1:
    confusion = multilabel_confusion_matrix(reference, predicted)
else:
    confusion = [confusion_matrix(reference, predicted)]
for ax, matrix, cat in zip(subplot.flatten(), confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt="d", ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

plt.tight_layout()

target_names = CATEGORIES if len(CATEGORIES) > 1 else [0, 1]
report = classification_report(reference, predicted, output_dict=True, target_names=target_names)
pd.DataFrame(report).T


In [None]:
# Copy of the test split with one score column per category (pred_<category>).
# Scores are recomputed from x_test so the result does not depend on which
# evaluation cell assigned `hypothesis` last.
a = test.copy()
a[[f"pred_{cat}" for cat in CATEGORIES]] = model.predict(x_test)

In [None]:
# Let wide frames and long sentences render in full: up to 1000 columns, a
# 1000-character display width and no truncation of cell contents.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", None)


In [None]:
# Test rows labelled `decision` that scored below 0.5. Show at most 10 of them:
# asking for more rows than exist makes `.sample` raise. Seeded for repeatable output.
missed_decisions = a.query('decision == 1 and pred_decision < 0.5')
missed_decisions.sample(min(10, len(missed_decisions)), random_state=42)

In [None]:
# Install LIME, used by the explanation cells below (version is not pinned).
# NOTE(review): `!sudo pip` may target a different interpreter than the running kernel — confirm, or consider `%pip install`.
!sudo pip install lime

In [None]:


def make_predictions(X_batch_text):
    """
    Score a batch of raw sentences with the trained model.

    The sentences go through the module-level `tokenizer`, are padded or
    truncated at the end to `max_tokens` ids (pad value 0), and are then
    passed to `model.predict`, whose output is returned as is.
    """
    token_ids = tokenizer.texts_to_sequences(X_batch_text)
    padded_ids = pad_sequences(
        token_ids, maxlen=max_tokens, padding="post", truncating="post", value=0
    )
    return model.predict(padded_ids)

In [None]:
# Smoke-test on a real sentence. The name `text` cannot be used here: until the
# LIME cell further down assigns it, `text` is still the `tensorflow_text`
# module imported as `text`.
make_predictions([test['sentence'].iloc[0]])

In [None]:
from lime import lime_text
import numpy as np

explainer = lime_text.LimeTextExplainer(class_names=CATEGORIES, verbose=True)

# Pick one test sentence with a fixed seed so the same example is explained on every run.
rng = np.random.RandomState(1)
idx = rng.randint(1, len(test))

text = test['sentence'].iloc[idx]
preds = make_predictions([text])

print(text)
print("Prediction : ", preds > 0.5)
print("Actual :     ", y_test[idx])

# explain_instance takes the raw string (not a list), and `labels` lists the
# output columns to explain — here every category — not the true 0/1 targets.
explanation = explainer.explain_instance(
    text,
    classifier_fn=make_predictions,
    labels=tuple(range(len(CATEGORIES))),
)
explanation.show_in_notebook()
