In [None]:
import random
import pandas as pd
import numpy as np
from scipy.special import softmax
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, classification_report, confusion_matrix)
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, AutoConfig


In [None]:
# Constants
PRETRAINED_MODEL_NAME = 'roberta-base'  # checkpoint name passed to the tokenizer, config and model loaders
LABELS_NUMBER = 2  # forwarded as num_labels when the classifier config is built
MAX_LENGHT = 512  # tokenizer max_length / padding target (spelling kept: other cells reference this name)
EPOCHS_NUMBER = 3  # epochs used for every model.fit call
N_PREDICTIONS_TO_SHOW = 10  # how many randomly sampled validation predictions get printed

In [None]:
# Load data
# Both CSV files are read from the current working directory. Later cells read
# the "text" and "target" columns of train_data and the "text" column of test_data.
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [None]:
# Functions for data preparation
tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

def prepare_sequence(text):
    """
    Tokenize one text into fixed-length RoBERTa inputs.

    Args:
        text: The raw string to encode.

    Returns:
        The tokenizer's encoding for ``text``; callers read its
        ``'input_ids'`` and ``'attention_mask'`` entries, each of length
        ``MAX_LENGHT``.
    """
    prepared_sequence = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=MAX_LENGHT,
                            padding='max_length',
                            # Without explicit truncation, padding='max_length' only pads:
                            # a text longer than MAX_LENGHT tokens would come back longer
                            # than every other sequence, which breaks stacking the
                            # encodings into a tensor and exceeds the model's input size.
                            truncation=True,
                            return_attention_mask=True
                            )
    return prepared_sequence

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
def map_example_to_dict(input_ids, attention_masks, label):
    """
    Pair the keyword inputs of TFRobertaForSequenceClassification with a label.

    Returns a ``(features, label)`` tuple in which ``features`` is a dict
    holding ``input_ids`` and ``attention_mask``.
    """
    features = dict(input_ids=input_ids, attention_mask=attention_masks)
    return features, label

def encode_examples(texts_and_labels):
    """
    Tokenize every (text, label) pair and wrap the result in a TF dataset.

    Args:
        texts_and_labels: Iterable of ``(text, label)`` pairs.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, label)`` tuples
        as produced by ``map_example_to_dict``. The dataset is not batched.
    """
    encoded_pairs = [(prepare_sequence(text), label) for text, label in texts_and_labels]

    input_ids_list = [encoding['input_ids'] for encoding, _ in encoded_pairs]
    attention_mask_list = [encoding['attention_mask'] for encoding, _ in encoded_pairs]
    # Each label is wrapped in its own list, so it is sliced as a length-1 vector
    label_list = [[label] for _, label in encoded_pairs]

    tensor_slices = (input_ids_list, attention_mask_list, label_list)
    # Slice per example, then reshape each slice into (features dict, label)
    return tf.data.Dataset.from_tensor_slices(tensor_slices).map(map_example_to_dict)


In [None]:
# Split data
X = train_data["text"]
y = train_data["target"]
# Hold out 10% of the labelled rows for validation; random_state pins the split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=1)
# Materialise (text, label) pairs, the form encode_examples iterates over
train_dataset = list(zip(X_train, y_train))
val_dataset = list(zip(X_val, y_val))

In [None]:
# Prepare datasets
# Neither dataset is batched here; consumers must call .batch(...) themselves.
# The training set is shuffled with a 10000-element buffer; validation order is kept.
ds_train_encoded = encode_examples(train_dataset).shuffle(10000)
ds_val_encoded = encode_examples(val_dataset)

In [None]:
# Count the training rows overall and per target value (1 = positive, 0 = negative)
class_counts = y_train.value_counts()
n_training_examples = len(X_train)
n_positive_training_examples = class_counts[1]
n_negative_training_examples = class_counts[0]

print(f'Number examples in training dataset: {n_training_examples}')
print(f'Number of positive examples in training dataset: {n_positive_training_examples}')
print(f'Number of negative examples in training dataset: {n_negative_training_examples}')

Number examples in training dataset: 6851
Number of positive examples in training dataset: 2947
Number of negative examples in training dataset: 3904


In [None]:
# Inverse-frequency class weights: each class gets (1 / class count) * total / 2,
# so the rarer class contributes more to the loss.
class_weight = {
    0: (1 / n_negative_training_examples) * n_training_examples / 2.0,
    1: (1 / n_positive_training_examples) * n_training_examples / 2.0,
}
weight_for_0, weight_for_1 = class_weight[0], class_weight[1]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.88
Weight for class 1: 1.16


In [None]:
# Define model
def get_model():
    """
    Build a RoBERTa sequence classifier from the pretrained checkpoint.

    The checkpoint's config is loaded with ``num_labels=LABELS_NUMBER`` and
    handed to ``TFRobertaForSequenceClassification.from_pretrained``.
    """
    return TFRobertaForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL_NAME,
        config=AutoConfig.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=LABELS_NUMBER),
    )

model = get_model()
# NOTE(review): this optimizer instance is not referenced by any later cell shown
# in this notebook; the tuning loop builds its own Adam per learning rate.
optimizer = tf.keras.optimizers.Adam()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [None]:
# Hyperparameter tuning
# Grid search over batch size and learning rate, scored by validation ROC AUC.
batch_sizes = [4, 8, 16]
learning_rates = [1e-5, 2e-5, 3e-5]

best_auc = 0
best_batch_size = None
best_learning_rate = None
best_weights = None  # weights of the best-scoring trial, restored after the search

for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        # Start every trial from the pretrained checkpoint. Reusing one model
        # instance would let each trial continue from the previous trial's
        # fine-tuned weights, making the AUC scores incomparable.
        model = get_model()

        # Compile the model
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

        # Train the model
        model.fit(ds_train_encoded.batch(batch_size), epochs=EPOCHS_NUMBER,
                  verbose=0, class_weight=class_weight)

        # Evaluate the model. The validation dataset is stored unbatched, and
        # Keras expects a dataset to yield batches, so batch it here.
        val_predictions = model.predict(ds_val_encoded.batch(batch_size))
        val_probabilities = softmax(val_predictions[0], axis=1)
        # Binary target: score with the probability of the positive class
        auc_score = roc_auc_score(y_val, val_probabilities[:, 1])

        # Check if this combination is the best so far
        if auc_score > best_auc:
            best_auc = auc_score
            best_batch_size = batch_size
            best_learning_rate = learning_rate
            best_weights = model.get_weights()

# Leave `model` holding the best trial's weights (not simply the last trial's)
# so that the evaluation and prediction cells below use the selected model.
if best_weights is not None:
    model.set_weights(best_weights)

print(f'Best AUC: {best_auc}')
print(f'Best Batch Size: {best_batch_size}')
print(f'Best Learning Rate: {best_learning_rate}')

Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


In [None]:
# Get predictions in the validation dataset
# ds_val_encoded is stored unbatched, but Keras treats every dataset element as
# one batch, so it must be batched before predict. Batching keeps element order,
# so the predictions stay aligned with y_val.
val_predictions = model.predict(ds_val_encoded.batch(best_batch_size))
val_probabilities = softmax(val_predictions[0], axis=1)
y_val_predictions = np.argmax(val_probabilities, axis=1).flatten()  # index of the most probable class per row

In [None]:
# Score the validation predictions: confusion matrix, ROC AUC and per-class report
error_matrix = confusion_matrix(y_val, y_val_predictions)
area_under_the_curve = roc_auc_score(y_val, val_probabilities[:,1:2], multi_class="ovr")
classification_metrics = classification_report(y_val, y_val_predictions)

print(f'Area under the ROC curve: {area_under_the_curve}')
print(f'Classification metrics:\n{classification_metrics}')

# Draw the confusion matrix as an annotated heatmap on an explicit axes object
ax = plt.axes()
sns.heatmap(error_matrix, annot=True, fmt="d", ax=ax)
ax.set_title('Confusion matrix Validation set')

In [None]:
# Show some predictions in the validation dataset
# The text must come from X_val: index i addresses the validation split, which is
# also what y_val and y_val_predictions are aligned to. (The test texts are a
# different set of rows and have no ground truth.)
n_to_show = min(N_PREDICTIONS_TO_SHOW, len(val_dataset))  # random.sample raises if k > population
for i in random.sample(range(len(val_dataset)), k=n_to_show):
    print(f'\nText:       {X_val.values[i]}')
    print(f'Ground truth: {"Real disaster" if y_val.values[i]==1 else "Not real disaster"}')
    print(f'Predicted:    {"Real disaster" if y_val_predictions[i]==1 else "Not real disaster"}')

In [None]:
def encode_test_examples(texts):
    """
    Tokenize unlabeled texts and wrap the result in a TF dataset.

    Args:
        texts: Iterable of raw strings.

    Returns:
        A ``tf.data.Dataset`` whose elements are the feature dicts produced by
        ``map_test_example_to_dict``. The dataset is not batched.
    """
    encodings = [prepare_sequence(text) for text in texts]

    input_ids_list = [encoding['input_ids'] for encoding in encodings]
    attention_mask_list = [encoding['attention_mask'] for encoding in encodings]

    # Slice per example, then reshape each slice into the model's input dict
    unlabeled = tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list))
    return unlabeled.map(map_test_example_to_dict)

def map_test_example_to_dict(input_ids, attention_masks):
    """
    Build the keyword-input dict of TFRobertaForSequenceClassification.

    Returns a dict holding ``input_ids`` and ``attention_mask``; there is no
    label because the examples are unlabeled.
    """
    return dict(input_ids=input_ids, attention_mask=attention_masks)


In [None]:
# Encode the unlabeled test texts and batch them for prediction
X_test = test_data["text"]
test_dataset = list(X_test)
# No BATCH_SIZE constant exists in this notebook; reuse the batch size selected by
# the tuning cell. Batch size only affects inference throughput, not the predictions.
ds_test_encoded = encode_test_examples(test_dataset).batch(best_batch_size)

In [None]:
# Predict on the test set, turn the first model output into per-class
# probabilities, and keep the index of the most probable class for each row
test_predictions = model.predict(ds_test_encoded)
test_probabilities = softmax(test_predictions[0], axis=1)
y_test_predictions = test_probabilities.argmax(axis=1).flatten()

In [None]:
# Display the predicted class index for every test row
y_test_predictions