In [None]:
!sudo pip install tensorflow_hub tensorflow-gpu tensorflow_text tensorflow-addons scikit-multilearn iterative-stratification

In [None]:
%load_ext autoreload
%load_ext aymurai.devtools.magic
%autoreload 2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled sentences from CSV. Later cells read the 'sentence',
# 'decision' and 'hace_lugar' columns of this frame.
data = pd.read_csv('sentences-decision.csv')

def get_category(pair):
    """Collapse a (decision, hace_lugar) pair of flags into one category id.

    Args:
        pair: Iterable whose first two items are the `decision` and
            `hace_lugar` flags, e.g. a tuple or a DataFrame row. Items are
            read by position through iteration, so label-indexed rows work
            without relying on integer-key fallback.

    Returns:
        0 when decision == 0 (hace_lugar is not inspected),
        1 when decision == 1 and hace_lugar == 0,
        2 when decision == 1 and hace_lugar == 1.

    Raises:
        ValueError: for any other combination of values.
    """
    values = list(pair)
    decision = values[0]

    if decision == 0:
        return 0

    if decision == 1:
        hace_lugar = values[1]
        if hace_lugar == 0:
            return 1
        if hace_lugar == 1:
            return 2

    # Raising a plain string is itself a TypeError in Python 3; use a real
    # exception and include the offending values to ease debugging.
    raise ValueError(f"not valid: {tuple(values)!r}")

# Derive the single 3-way target from the two binary label columns.
data['category'] = data[['decision', 'hace_lugar']].apply(get_category, axis=1)

# Keep only the first occurrence of every distinct sentence.
data = data.drop_duplicates(subset='sentence')
print(len(data))

# Distribution of sentence lengths (space-separated tokens), 32-token wide bins.
words_per_sentence = data['sentence'].map(lambda sentence: len(sentence.split(' ')))
words_per_sentence.hist(bins=list(range(0, 320, 32)))

In [None]:
data

# Build train / validation / test datasets

In [None]:
import numpy as np


x = data['sentence'].values
y = data['category'].values

# 80% of the rows go to train; the remaining 20% is held out.
train, holdout = train_test_split(data, test_size=0.2, random_state=42, stratify=y)

# Halve the hold-out into test and validation. This split is stratified as
# well, so both evaluation sets keep the class proportions of the full data
# (the first split already was; the second one previously was not).
test, val = train_test_split(
    holdout,
    test_size=0.5,
    random_state=42,
    stratify=holdout['category'].values,
)

# Sanity check: the three splits partition the de-duplicated data.
assert len(train) + len(test) + len(val) == len(data)

for split_name, split in (('train', train), ('test', test), ('val', val)):
    print(f'{split_name}:', len(split))



In [None]:
# Report how many training sentences fall into each category.
n_train = len(train)
for cat in (0, 1, 2):
    n_cat = len(train.query(f'category == {cat}'))
    print(f"cat {cat}: {n_cat} from {n_train} sentences")

In [None]:
import numpy as np

# Inputs are the raw sentences, targets are the integer category ids.
x_train, y_train = train['sentence'].values, train['category'].values
x_val, y_val = val['sentence'].values, val['category'].values
x_test, y_test = test['sentence'].values, test['category'].values

# Training

## Class weights

In [None]:
from sklearn.utils import class_weight

# "balanced" weights are inversely proportional to the class frequencies in
# the training labels.
class_ids = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    "balanced", classes=class_ids, y=y_train
)

# Key the mapping by the actual class label rather than by its position in
# the array, so the weights stay aligned with the labels even if a class id
# is missing from the training split.
class_weights = {
    int(class_id): float(weight)
    for class_id, weight in zip(class_ids, balanced_weights)
}
class_weights

In [None]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(lower=True)
# The vocabulary is learned from the training sentences only.
tokenizer.fit_on_texts(train['sentence'].values)

# Encode straight from the split frames instead of overwriting x_* with their
# own transformed values: the cell can then be re-run safely (re-encoding
# already-encoded sequences would fail).
x_train = tokenizer.texts_to_sequences(train['sentence'].values)
x_val = tokenizer.texts_to_sequences(val['sentence'].values)
x_test = tokenizer.texts_to_sequences(test['sentence'].values)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every encoded sentence is cut or zero-filled at the end to this length.
max_tokens = 128

# Shared padding settings so the three splits are treated identically.
pad_options = dict(maxlen=max_tokens, padding="post", truncating="post", value=0)

x_train = pad_sequences(x_train, **pad_options)
x_val = pad_sequences(x_val, **pad_options)
x_test = pad_sequences(x_test, **pad_options)


## Training routine

In [None]:
from typing import Iterator

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.auto import tqdm
import tensorflow_text as text
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from more_itertools import flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Dense,
    Input,
    TextVectorization,
    LSTM,
    Bidirectional,
    Embedding,
    Dropout, Flatten
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential

# y_train is a 1-D array of integer class ids, so `y_train.shape[-1]` would be
# the number of samples; count the distinct labels instead.
num_classes = int(np.unique(y_train).size)
f1_score = tfa.metrics.F1Score(num_classes=num_classes, average='micro', name="f1_score")
embed_len = 25  # size of each token embedding vector
lstm_out = 20  # units of each LSTM direction


def get_model() -> Model:
    """Build and compile the sentence classifier.

    Architecture: token embedding -> bidirectional LSTM -> 3-way softmax.
    Reads the module-level `tokenizer`, `embed_len`, `max_tokens` and
    `lstm_out`.

    Returns:
        A compiled Sequential model (AdamW optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    # One extra row because id 0 is used as the padding value.
    vocab_size = len(tokenizer.word_index) + 1

    model = Sequential()
    model.add(
        Embedding(
            input_dim=vocab_size,
            output_dim=embed_len,
            input_length=max_tokens,
        )
    )
    # Variants tried and left disabled by the author: Dropout(0.5), a plain
    # LSTM(lstm_out), Flatten() and an intermediate Dense(50).
    model.add(Bidirectional(LSTM(lstm_out)))
    model.add(Dense(3, activation="softmax"))

    # 'rmsprop' was the alternative optimizer considered.
    model.compile(
        optimizer=tf.keras.optimizers.experimental.AdamW(),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# Instantiate the classifier and print its layer-by-layer summary.
model = get_model()
model.summary()


In [None]:
import matplotlib.pyplot as plt
# This function keeps the incoming learning rate while `epoch < 3`
# and decreases it exponentially after that.
def scheduler(epoch, lr):
  """Return the learning rate to use for `epoch`.

  Args:
    epoch: Epoch number the rate is requested for.
    lr: Learning rate currently in use.

  Returns:
    `lr` unchanged while `epoch < 3`; otherwise `lr * exp(-0.1)`, i.e. the
    rate shrinks by a constant factor on every later call.
  """
  if epoch < 3:
    return lr
  else:
    return lr * tf.math.exp(-0.1)

# Stop when val_loss has not improved for 15 epochs and restore the best weights.
# Callbacks left disabled by the author, kept here for reference:
#   ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True, save_format='tf')
#   tf.keras.callbacks.LearningRateScheduler(scheduler)
early_stopping = EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True)
callbacks = [early_stopping]

# `shuffle` is left at its default (an explicit shuffle=True was commented out).
fit_options = dict(
    batch_size=64,
    epochs=50,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
    class_weight=class_weights,
)
history = model.fit(x_train, y_train, **fit_options)




In [None]:

fig, subplot = plt.subplots(1, 2, figsize=(10, 4))

# One panel per tracked quantity, each with its train and validation curve.
# Titles and axis labels are set so the figure can be read on its own.
for ax, metric in zip(subplot, ('loss', 'accuracy')):
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history[f'val_{metric}'], label='val')
    ax.set_title(metric)
    ax.set_xlabel('epoch')
    ax.set_ylabel(metric)
    ax.legend()

fig.tight_layout()

# Evaluation

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix

print('VAL')

# Predicted class id = position of the largest softmax output per sentence.
hypothesis = model.predict(x_val).argmax(axis=1)
reference = y_val

# Fix the label set so the matrix is always 3x3 and matches the three tick
# labels below, even when a class is absent from this split.
confusion = confusion_matrix(reference, hypothesis, labels=[0, 1, 2])
print(confusion)

tick_labels = ["None", "desicion/no_hace_lugar", "descion/hace_lugar"]

fig, ax = plt.subplots(1, 1, figsize=(10, 6))
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)
ax.set_title('VAL')

fig.tight_layout()

# Per-class precision / recall / F1 as a table (one row per class and average).
report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix

print('TEST')

# Predicted class id = position of the largest softmax output per sentence.
hypothesis = model.predict(x_test).argmax(axis=1)
reference = y_test

# Fix the label set so the matrix is always 3x3 and matches the three tick
# labels below, even when a class is absent from this split.
confusion = confusion_matrix(reference, hypothesis, labels=[0, 1, 2])
print(confusion)

tick_labels = ["None", "desicion/no_hace_lugar", "descion/hace_lugar"]

fig, ax = plt.subplots(1, 1, figsize=(10, 6))
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)
# The title previously read 'VAL' (copy-paste slip) although this is the test split.
ax.set_title('TEST')

fig.tight_layout()

# Per-class precision / recall / F1 as a table (one row per class and average).
report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix

print('TRAIN')

# Predicted class id = position of the largest softmax output per sentence.
hypothesis = model.predict(x_train).argmax(axis=1)
reference = y_train

# Fix the label set so the matrix is always 3x3 and matches the three tick
# labels below, even when a class is absent from this split.
confusion = confusion_matrix(reference, hypothesis, labels=[0, 1, 2])
print(confusion)

tick_labels = ["None", "desicion/no_hace_lugar", "descion/hace_lugar"]

fig, ax = plt.subplots(1, 1, figsize=(10, 6))
sns.heatmap(confusion, annot=True, fmt='d', ax=ax)
ax.set_xlabel("hypothesis")
ax.set_ylabel("reference")
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)
ax.set_title('TRAIN')

fig.tight_layout()

# Per-class precision / recall / F1 as a table (one row per class and average).
report = classification_report(reference, hypothesis, output_dict=True)
pd.DataFrame(report).T

In [None]:
# Predict on the test split explicitly instead of reusing `hypothesis`, which
# at this point holds the 1-D predictions of whichever evaluation cell ran
# last (the train split when the notebook is run top to bottom).
test_categories = model.predict(x_test).argmax(axis=1)

a = test.copy()
a["pred_category"] = test_categories
# Invert get_category: category 0 -> decision 0; categories 1 and 2 ->
# decision 1; only category 2 has hace_lugar == 1.
a["pred_decision"] = (test_categories > 0).astype(int)
a["pred_hace_lugar"] = (test_categories == 2).astype(int)

In [None]:
# Widen the pandas display so long sentences and all columns are shown in full.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", None)


In [None]:
# Sentences labelled as a decision that the model predicted as "no decision".
missed_decisions = a.query('decision == 1 and pred_decision < 0.5')
# Cap the sample size: .sample(10) raises when fewer than 10 rows match.
# A fixed seed keeps the displayed rows stable across runs.
missed_decisions.sample(min(10, len(missed_decisions)), random_state=42)

In [None]:
!sudo pip install lime

In [None]:


def make_predictions(X_batch_text):
    """Run the trained model on a batch of raw sentences.

    Args:
        X_batch_text: Iterable of strings to classify.

    Returns:
        The output of `model.predict` for the batch, one row per input text.
    """
    token_ids = tokenizer.texts_to_sequences(X_batch_text)
    # Same length handling as at training time: cut / zero-fill at the end.
    padded_ids = pad_sequences(
        token_ids, maxlen=max_tokens, padding="post", truncating="post", value=0
    )
    return model.predict(padded_ids)

In [None]:
# Smoke-test the wrapper on one real test sentence. `text` is not usable here:
# on a fresh kernel it still refers to the `tensorflow_text` module imported
# as `text`, and only becomes a sentence in a later cell.
make_predictions([test['sentence'].iloc[0]])

In [None]:
from lime import lime_text
import numpy as np

# Display names for category ids 0, 1 and 2 (the values produced by
# get_category). They are defined here because no `CATEGORIES` name exists
# anywhere in the notebook.
class_names = ["None", "decision/no_hace_lugar", "decision/hace_lugar"]
explainer = lime_text.LimeTextExplainer(class_names=class_names, verbose=True)

# Pick one reproducible test sentence to explain.
rng = np.random.RandomState(1)
idx = rng.randint(1, len(test))
text = test['sentence'].iloc[idx]
actual = int(y_test[idx])

# Softmax outputs for the selected sentence (one row, one column per class).
preds = make_predictions([text])

print(text)
# Report the predicted class id so it is directly comparable with the label.
print("Prediction : ", preds.argmax(axis=1)[0])
print("Actual :     ", actual)

# LIME expects the raw string itself (not a list) and an iterable of the
# label ids to explain.
explanation = explainer.explain_instance(
    text, classifier_fn=make_predictions, labels=(actual,)
)
explanation.show_in_notebook()
