In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_colwidth', None)
import random
import scikitplot as skplt
import tensorflow as tf

from sklearn.model_selection import train_test_split

from transformers import DefaultDataCollator

from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

2022-08-07 09:38:49.449481: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-07 09:38:49.449589: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
X_train = pd.read_csv('../datasets/future_statements_dataset/X_train.csv')["statement"]
y_train = pd.read_csv('../datasets/future_statements_dataset/y_train.csv')["future"]

In [4]:
params = {'MAX_LENGTH': 128,
          'EPOCHS': 50,
          #learningrate
          'LEARNING_RATE': 5e-5,
          'FT_EPOCHS': 10,
          'OPTIMIZER': 'adam',
          'FL_GAMMA': 2.0,
          'FL_ALPHA': 0.2,
          'BATCH_SIZE': 64,
          'NUM_STEPS': len(X_train.index) // 64,
          #dropouts:
          'DISTILBERT_DROPOUT': 0.2,
          'DISTILBERT_ATT_DROPOUT': 0.2,
          'LAYER_DROPOUT': 0.2,
          'KERNEL_INITIALIZER': 'GlorotNormal',
          'BIAS_INITIALIZER': 'zeros',
          'POS_PROBA_THRESHOLD': 0.5,
          'ADDED_LAYERS': 'Dense 256, Dense 32, Dropout 0.2',
          'LR_SCHEDULE': '5e-5 for 6 epochs, Fine-tune w/ adam for 2 epochs @2e-5',
          'FREEZING': 'All DistilBERT layers frozen for 6 epochs, then unfrozen for 2',
          'CALLBACKS': '[early_stopping w/ patience=0]',
          'RANDOM_STATE': 42
          }

In [19]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_77', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [6]:
def tokenize_func(examples):
    return tokenizer(examples["statement"], padding="max_length", truncation=True)

In [7]:
def batch_encode(_tokenizer, texts, batch_size=256, max_length=params['MAX_LENGTH']):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed
    into a pre-trained transformer model.
    Input:
        - _tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""

    input_ids = []
    attention_mask = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = _tokenizer.batch_encode_plus(batch,
                                              max_length=max_length,
                                              padding='max_length',
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_token_type_ids=False
                                              )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [8]:
train = pd.concat([X_train, y_train], axis=1)

In [9]:
tokenizer(X_train[0])

{'input_ids': [101, 2057, 2342, 2000, 2417, 2098, 24695, 9731, 2000, 1996, 6481, 1998, 5682, 2005, 2029, 2023, 3029, 2001, 2631, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
X_train[0]

'We need to rededicate ourselves to the principles and purposes for which this Organization was founded.'

In [11]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

# Create train/validation split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2)

In [12]:
# Sort index
X_train.sort_index(inplace=True)
X_valid.sort_index(inplace=True)
X_test.sort_index(inplace=True)
y_train.sort_index(inplace=True)
y_valid.sort_index(inplace=True)
y_test.sort_index(inplace=True)

# Reset index
X_train.reset_index(drop=True, inplace=True)
X_valid.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_valid.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print('Training data:   ', len(X_train.index), ' rows. Negatives:', (y_train==0).sum(), 'Positives:', (y_train==1).sum())
print('Validation data: ', len(X_valid.index), ' rows. Negatives:', (y_valid==0).sum(), 'Positives:', (y_valid==1).sum())
print('Test data:       ', len(X_test.index), ' rows. Negatives:', (y_test==0).sum(), 'Positives:', (y_test==1).sum())

Training data:    1600  rows. Negatives: 808 Positives: 792
Validation data:  400  rows. Negatives: 197 Positives: 203
Test data:        500  rows. Negatives: 245 Positives: 255


In [13]:
# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist())

# Encode X_valid
X_valid_ids, X_valid_attention = batch_encode(tokenizer, X_valid.tolist())

# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist())

In [14]:
X_train_attention

<tf.Tensor: shape=(1600, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>

In [15]:
X_train_short = X_train[[1, 2, 501, 502]]
y_train_short = y_train[[1, 2, 501, 502]]

In [16]:
tokenizer(X_train_short.to_list(), padding=True, truncation=True)

{'input_ids': [[101, 2034, 1010, 1996, 6284, 2313, 1011, 2029, 1999, 13884, 2003, 2025, 2312, 1010, 2561, 2989, 2053, 2062, 2084, 2531, 1010, 2199, 1011, 14087, 3229, 2000, 2049, 2087, 8995, 3791, 1998, 5344, 1996, 4487, 11493, 2618, 29397, 1997, 2172, 1997, 2049, 3019, 4915, 11427, 2000, 1996, 3439, 3627, 1997, 11424, 16136, 20152, 1010, 1998, 2087, 3728, 2000, 1037, 2828, 1997, 23226, 20181, 2008, 2001, 16021, 6132, 13043, 2000, 2529, 13372, 1012, 102], [101, 2057, 2024, 2036, 8794, 2005, 1996, 6958, 2094, 2597, 2008, 2035, 5014, 2442, 2022, 7183, 1010, 2008, 3785, 2442, 2022, 2580, 2005, 1996, 2709, 1997, 2035, 1997, 2216, 2040, 2031, 2042, 10016, 1998, 2008, 1996, 16757, 1999, 11491, 1998, 2777, 11631, 14713, 2442, 2022, 12361, 1996, 2157, 2000, 2166, 1998, 2000, 4071, 1997, 2929, 1998, 2035, 2942, 1010, 2120, 1998, 3412, 22467, 1012, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2009, 2097, 2022, 14203, 2005, 2033, 2000, 23120, 2588, 1996, 3809, 3171, 3471, 5307, 1996, 4975, 3032, 1998, 1996, 

In [17]:
y_train_short.to_list()

[0, 0, 1, 1]

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  mode='min',
                                                  min_delta=0,
                                                  patience=0,
                                                  restore_best_weights=True)

In [21]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

model.fit(x=[X_train_ids, X_train_attention]
          , y=y_train.to_numpy()
          , epochs=3
          #, batch_size=params['BATCH_SIZE']
          #, steps_per_epoch=params['NUM_STEPS']
          , validation_data=([X_valid_ids, X_valid_attention], y_valid.to_numpy())
          #, callbacks=[early_stopping]
          #, verbose=2
         )

Epoch 1/3
 6/50 [==>...........................] - ETA: 16:03 - loss: 0.6751 - sparse_categorical_accuracy: 0.5365

KeyboardInterrupt: 

In [None]:
model.save_weights('./checkpoints/my_checkpoint')

In [24]:
model.load_weights('./checkpoints/my_checkpoint')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f308453fc40>

In [None]:
loss, acc = model.evaluate(test_images, test_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

In [26]:
model.save('saved_model/my_model2', save_format='h5')

NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [None]:
loss, acc = model.evaluate(test_images, test_labels, verbose=2)

In [None]:
data_collator = DefaultDataCollator(return_tensors="tf")