In [105]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_colwidth', None)
import random
import scikitplot as skplt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from transformers import DefaultDataCollator

from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

In [106]:
# Read the feature and label CSVs, then keep only the column each one contributes.
statements_frame = pd.read_csv('../datasets/future_statements_dataset/X_train.csv')
labels_frame = pd.read_csv('../datasets/future_statements_dataset/y_train.csv')

X_train = statements_frame["statement"]  # statement texts
y_train = labels_frame["future"]         # target labels

In [107]:
# Single source of truth for the batch size, so that 'BATCH_SIZE' and 'NUM_STEPS'
# below can never drift apart.
BATCH_SIZE = 64

# Hyper-parameters and free-text experiment notes for this run.
# NOTE(review): several entries (e.g. 'EPOCHS', 'LR_SCHEDULE', 'FREEZING', 'CALLBACKS')
# are descriptive only; the training cell calls model.fit with epochs=3 — confirm
# these notes still describe the experiment that was actually run.
params = {'MAX_LENGTH': 128,  # default max_length used by batch_encode
          'EPOCHS': 50,
          #learningrate
          'LEARNING_RATE': 5e-5,
          'FT_EPOCHS': 10,
          'OPTIMIZER': 'adam',
          'FL_GAMMA': 2.0,
          'FL_ALPHA': 0.2,
          'BATCH_SIZE': BATCH_SIZE,
          # NOTE(review): computed from X_train as loaded above, i.e. before the
          # train/validation/test split further down shrinks it.
          'NUM_STEPS': len(X_train.index) // BATCH_SIZE,
          #dropouts:
          'DISTILBERT_DROPOUT': 0.2,
          'DISTILBERT_ATT_DROPOUT': 0.2,
          'LAYER_DROPOUT': 0.2,
          'KERNEL_INITIALIZER': 'GlorotNormal',
          'BIAS_INITIALIZER': 'zeros',
          # probability a class must exceed to be predicted as 1
          'POS_PROBA_THRESHOLD': 0.90,
          'ADDED_LAYERS': 'Dense 256, Dense 32, Dropout 0.2',
          'LR_SCHEDULE': '5e-5 for 6 epochs, Fine-tune w/ adam for 2 epochs @2e-5',
          'FREEZING': 'All DistilBERT layers frozen for 6 epochs, then unfrozen for 2',
          'CALLBACKS': '[early_stopping w/ patience=0]',
          'RANDOM_STATE': 42
          }

In [108]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint name.
checkpoint_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,  # two output classes
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_59', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [109]:
def tokenize_func(examples):
    """
    Tokenize the "statement" entry of `examples` with the notebook-level `tokenizer`.
    Input:
        - examples:  object indexable by the key "statement"
                     (presumably a dataset batch or DataFrame — TODO confirm against the caller)
    Output:
        - the value returned by `tokenizer` for those statements, called with
          padding="max_length" and truncation=True
    """
    statements = examples["statement"]
    encoded = tokenizer(statements, padding="max_length", truncation=True)
    return encoded

In [110]:
def batch_encode(_tokenizer, texts, batch_size=256, max_length=params['MAX_LENGTH']):
    """
    Encode texts in batches and return the corresponding input ids and
    attention masks as tensors, ready to be fed into a pre-trained
    transformer model.
    Input:
        - _tokenizer:  Tokenizer object exposing `batch_encode_plus`
        - texts:       List of strings where each string represents a text
        - batch_size:  Positive integer controlling number of texts tokenized per call
        - max_length:  Integer sequence length (in tokens) handed to the tokenizer;
                       each text is padded/truncated to it
    Output:
        - input_ids:       the encoded texts as a tf.Tensor object
        - attention_mask:  the texts' attention masks as a tf.Tensor object
    Raises:
        - ValueError:  if batch_size is not positive
    """
    # A negative step would make range() yield nothing and silently encode no texts.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    input_ids = []
    attention_mask = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = _tokenizer.batch_encode_plus(batch,
                                              max_length=max_length,
                                              padding='max_length',
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_token_type_ids=False
                                              )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])

    if not input_ids:
        # Converting an empty Python list would give a float32 tensor of shape (0,);
        # return integer tensors with the same rank as the non-empty case instead.
        empty = tf.zeros((0, max_length or 0), dtype=tf.int32)
        return empty, empty

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [111]:
# Join the statements and their labels column-wise into a single DataFrame.
# NOTE(review): `train` is not referenced by any later cell visible here — confirm it is still needed.
train = pd.concat([X_train, y_train], axis=1)

In [112]:
# Both splits are seeded from params['RANDOM_STATE'] so the partitions, and every
# metric computed on them, are reproducible across kernel restarts.
# NOTE: this cell rebinds X_train / y_train; re-running it without re-loading the
# data splits the already-reduced training set again.

# Create train/test split (20% of all rows held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=params['RANDOM_STATE'])

# Create train/validation split (20% of the remaining rows held out for validation)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=params['RANDOM_STATE'])

In [113]:
# Restore the original row order within each split, then renumber the index from 0.
X_train = X_train.sort_index().reset_index(drop=True)
X_valid = X_valid.sort_index().reset_index(drop=True)
X_test = X_test.sort_index().reset_index(drop=True)
y_train = y_train.sort_index().reset_index(drop=True)
y_valid = y_valid.sort_index().reset_index(drop=True)
y_test = y_test.sort_index().reset_index(drop=True)

# Report the size and class balance of every split.
for split_label, split_X, split_y in (('Training data:   ', X_train, y_train),
                                      ('Validation data: ', X_valid, y_valid),
                                      ('Test data:       ', X_test, y_test)):
    print(split_label, len(split_X.index), ' rows. Negatives:', (split_y==0).sum(), 'Positives:', (split_y==1).sum())

Training data:    1600  rows. Negatives: 810 Positives: 790
Validation data:  400  rows. Negatives: 180 Positives: 220
Test data:        500  rows. Negatives: 260 Positives: 240


In [114]:
# Turn each split's statements into (input ids, attention mask) tensors via batch_encode.
train_texts = X_train.tolist()
valid_texts = X_valid.tolist()
test_texts = X_test.tolist()

X_train_ids, X_train_attention = batch_encode(tokenizer, train_texts)
X_valid_ids, X_valid_attention = batch_encode(tokenizer, valid_texts)
X_test_ids, X_test_attention = batch_encode(tokenizer, test_texts)

In [115]:
# Four hand-picked rows, selected identically from statements and labels,
# used for a quick look at the tokenizer output.
short_rows = [1, 2, 501, 502]

X_train_short = X_train[short_rows]
y_train_short = y_train[short_rows]

In [116]:
# Preview the tokenizer output for the four sample statements. With padding=True the
# shorter sequences are zero-padded to the longest one in this batch (see output).
tokenizer(X_train_short.to_list(), padding=True, truncation=True)

{'input_ids': [[101, 1996, 2783, 2095, 2038, 2042, 2053, 6453, 2000, 2008, 6517, 5418, 1998, 9272, 3613, 2000, 2022, 1996, 4539, 1997, 4491, 1024, 1999, 5712, 1010, 2119, 8956, 1998, 3097, 10342, 2031, 2042, 9416, 1025, 1999, 6921, 1010, 2019, 2886, 2001, 2566, 22327, 9250, 2114, 1037, 14334, 3345, 1025, 1998, 1010, 1999, 1996, 2845, 4657, 1010, 2045, 2031, 2042, 1037, 5164, 1997, 4491, 1010, 1996, 6745, 2108, 1996, 2082, 9288, 1999, 2022, 14540, 2319, 1010, 2073, 2336, 3062, 6778, 2000, 2019, 15741, 2012, 21735, 1012, 102], [101, 2005, 1996, 2034, 2051, 1999, 2256, 2381, 1010, 2256, 2455, 6645, 2031, 2042, 4225, 1010, 1998, 2027, 2024, 2085, 2108, 9530, 23633, 2135, 17183, 2906, 12921, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2256, 2784, 17005, 2005, 2017, 2004, 1037, 2270, 3275, 1998, 1037, 17689, 14306, 2015, 2149, 2008, 2023, 5219, 2104, 2

In [117]:
# Labels of the same sample rows, shown for comparison with the tokenized statements.
y_train_short.to_list()

[0, 0, 1, 1]

In [118]:
# Early-stopping callback watching validation loss (lower is better) with no patience,
# configured to restore the best weights it has seen.
# NOTE(review): the training cell does not pass this callback to model.fit.
early_stopping_config = dict(
    monitor='val_loss',
    mode='min',
    min_delta=0,
    patience=0,
    restore_best_weights=True,
)
early_stopping = tf.keras.callbacks.EarlyStopping(**early_stopping_config)

In [119]:
# Directory whose presence marks "already trained", and the checkpoint prefix
# the weights are written to / restored from.
CHECKPOINT_DIR = './checkpoints/'
CHECKPOINT_PATH = './checkpoints/my_checkpoint'

# The model emits raw logits, hence from_logits=True; labels are integer class ids.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=params['LEARNING_RATE']),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.metrics.SparseCategoricalAccuracy()],
)

if not os.path.exists(CHECKPOINT_DIR):
    print('Train model...')
    model.fit(
        x=[X_train_ids, X_train_attention],
        y=y_train.to_numpy(),
        epochs=3,
        validation_data=([X_valid_ids, X_valid_attention], y_valid.to_numpy()),
        verbose=1,
    )
    model.save_weights(CHECKPOINT_PATH)
else:
    print('Already trained weights available...')
    # Actually restore the saved weights; without this the following cells would
    # evaluate a model whose classification head is still randomly initialised.
    model.load_weights(CHECKPOINT_PATH)

Already trained weights available...


In [None]:
# Score the model on the held-out test split and report accuracy as a percentage.
test_inputs = [X_test_ids, X_test_attention]
loss, acc = model.evaluate(x=test_inputs, y=y_test, verbose=1)

accuracy_percent = 100 * acc
print("Restored model, accuracy: {:5.2f}%".format(accuracy_percent))



In [None]:
# Reuse predictions left in the kernel by an earlier run of this cell. On a fresh
# kernel `y_pred` does not exist yet, so look it up in the namespace instead of
# evaluating the bare name (which raised NameError).
# NOTE: cached predictions are not refreshed if the model is retrained — delete
# `y_pred` (or restart the kernel) to force a new prediction pass.
if globals().get('y_pred') is None:
    y_pred = model.predict([X_test_ids, X_test_attention]
                           , verbose=1)

# The first element of the prediction output is taken as the logits
# (assumes model.predict returns an indexable model-output object — TODO confirm).
prediction_logits = y_pred[0]
# Softmax across axis 1 turns each row of logits into per-class probabilities.
prediction_probs = tf.nn.softmax(prediction_logits,axis=1).numpy()
# Element-wise threshold: 1 where a class probability exceeds the configured cut-off.
y_pred_thresh = np.where(prediction_probs > params['POS_PROBA_THRESHOLD'], 1, 0)

In [None]:
# Hard predictions: column 1 of the thresholded array, i.e. 1 where the
# positive-class probability exceeded params['POS_PROBA_THRESHOLD'].
y_pred_t = pd.Series(y_pred_thresh[:, 1])

# Get evaluation results
accuracy = accuracy_score(y_test, y_pred_t)
# ROC AUC is a ranking metric over scores, so it is computed from the continuous
# positive-class probabilities rather than from the already-thresholded 0/1 labels.
auc_roc = roc_auc_score(y_test, prediction_probs[:, 1])

print(accuracy)
print(auc_roc)

In [None]:
# plot_confusion_matrix takes (y_true, y_pred): true labels go on the rows and
# predictions on the columns, matching the axis labels set below.
skplt.metrics.plot_confusion_matrix(y_test.to_list(),
                                    y_pred_t.tolist(),
                                    figsize=(6, 6),
                                    text_fontsize=14)
plt.title(label='Test Confusion Matrix', fontsize=20, pad=17)
plt.xlabel('Predicted Label', labelpad=14)
plt.ylabel('True Label', labelpad=14)

# savefig does not create missing directories, so make sure the target exists first.
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/future_statements_confusionmatrix.png', dpi=300.0, transparent=False)

In [None]:
#model.save('saved_model/my_model2', save_format='h5')

In [None]:
#data_collator = DefaultDataCollator(return_tensors="tf")