# RNN - transfer learning

Own exploration of tranfer learning for classification tasks.\
Content inspired by https://www.geeksforgeeks.org/fine-tuning-bert-model-for-sentiment-analysis/

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# preprocess text
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# #for transformers
# import transformers
# from transformers import BertTokenizerFast, BertForSequenceClassification, BertModel

# # deep ML
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.optimizers import Adam
# # from tensorflow.keras import layers

[nltk_data] Downloading package stopwords to /home/dorota/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dorota/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2024-01-28 11:12:58.863735: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 11:12:58.863820: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 11:12:58.865097: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 11:12:58.874479: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized 

## Help functions and settings

In [2]:
def count_words_in_df(text):
    '''Count total number of words in dataframe'''
    rows = text.apply(lambda row: row.split(' ')) # create a list of words for each row
    word_count = [word for row in rows for word in row] # concatinate all words from whole df into one list
    word_uniqe = set(word_count)
    return len(word_count), len(word_uniqe)


def remove_stop_words_and_lemmatize(text, lemmatizer):
    '''Removes stop words and lemmatizes text (nouns and verbs)'''
    words = text.split(' ')
    words_cleaned = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))] # lemmatize nouns
    words_cleaned = [lemmatizer.lemmatize(word, pos='v') for word in words_cleaned] # lemmatize verbs
    text_cleaned = (' ').join(words_cleaned)
    return text_cleaned


def plot_training_history(train_losses, train_accuracies, val_losses, val_accuracies, test_loss, test_accuracy):
    plt.figure(figsize=(10, 3))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='train')
    plt.plot(val_losses, label='val')
    plt.title(f'Loss (test loss = {test_loss:.4f})')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')

    plt.subplot(1, 2, 2)
    plt.plot(train_accuracies, label='train')
    plt.plot(val_accuracies, label='val')
    plt.title(f'Accuracy (test accuracy = {test_accuracy:.4f})')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')

    plt.legend()
    plt.show()


def display_confusion_matrix(y_pred, y_test, labels_dict):
    labels_list = list(labels_dict.keys())
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    # Confusion Matrix
    cm = confusion_matrix(y_test_classes, y_pred_classes)
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels_list, yticklabels=labels_list)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

In [3]:
pd.set_option('display.max_colwidth', None)

## Data and data pre- processing
* lemmatize
* remove stop words
* one-hot encode labels
* train|val|test split

*For EDA of the data see Lab_RNN_base*

In [136]:
df = pd.read_csv('./data/emotions.csv')
df = df.head(5000)

In [137]:
lemmatizer = WordNetLemmatizer()
df.text = df.text.apply(lambda row_text: remove_stop_words_and_lemmatize(row_text, lemmatizer))

In [194]:
labels_dict = {"anger": 0, "fear": 1, "joy": 2, "love": 3, "sadness": 4, "surprise": 5}

df.label = df.label.replace(labels_dict) # str -> int
y_array = df.label.to_numpy() # pd.Series -> array
y_array = to_categorical(y_array, len(labels_dict)) # one-hot encoding NOTE: no one-hot encoding for Huggingface - transformers Trainer

In [195]:
X_array = df.text.to_numpy()

In [196]:
X_trainval, X_test, y_trainval, y_test =train_test_split(X_array, y_array, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val =train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=42)

X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((3000,), (1000,), (1000,), (3000, 6), (1000, 6), (1000, 6))

---
# Huggingface - transformers Trainer
https://medium.com/grabngoinfo/transfer-learning-for-text-classification-using-hugging-face-transformers-trainer-13407187cf89

In [159]:
import numpy as np
import tensorflow as tf
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import evaluate #!pip install transformers datasets evaluate

In [160]:
# make data into dataset to be able to handle larger amounts of data
train_dataset = Dataset.from_dict({'text': X_train, 'label': y_train}) # NOTE y data is not to be one-hot encoded
val_dataset = Dataset.from_dict({'text': X_val, 'label': y_val})
test_dataset = Dataset.from_dict({'text': X_test, 'label': y_test})

In [161]:
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [162]:
dataset['train'], dataset['train'][0]

(Dataset({
     features: ['text', 'label'],
     num_rows: 3000
 }),
 {'text': 'feel rude say better men', 'label': 0})

In [163]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Mapping between special tokens and their IDs.
print(f'The unknown token is {tokenizer.unk_token} and the ID for the unkown token is {tokenizer.unk_token_id}.')
print(f'The seperator token is {tokenizer.sep_token} and the ID for the seperator token is {tokenizer.sep_token_id}.')
print(f'The pad token is {tokenizer.pad_token} and the ID for the pad token is {tokenizer.pad_token_id}.')
print(f'The sentence level classification token is {tokenizer.cls_token} and the ID for the classification token is {tokenizer.cls_token_id}.')
print(f'The mask token is {tokenizer.mask_token} and the ID for the mask token is {tokenizer.mask_token_id}.')

In [164]:
# function to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["text"], max_length=35, truncation=True, padding='max_length')

In [165]:
# tokenize the train, val and test datasets
train_dataset_tokenized = dataset['train'].map(tokenize_dataset)
val_dataset_tokenized = dataset['validation'].map(tokenize_dataset)
test_dataset_tokenized = dataset['test'].map(tokenize_dataset)

Map: 100%|██████████| 3000/3000 [00:01<00:00, 2483.80 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2236.38 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2681.04 examples/s]


In [166]:
train_dataset_tokenized, val_dataset_tokenized, test_dataset_tokenized

(Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 3000
 }),
 Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1000
 }),
 Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1000
 }))

In [167]:
model_hugging = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [177]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./transfer_learning_transformer/",          
    logging_dir='./transfer_learning_transformer/logs',            
    logging_strategy='epoch',
    logging_steps=100,    
    num_train_epochs=3,              
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=32,  
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

In [178]:
# List all evaluation metrics/models in Hugging Face
# evaluate.list_evaluation_modules()

# Function to compute the metric
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [179]:
# Set up trainer
trainer = Trainer(
    model=model_hugging,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=val_dataset_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

In [180]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.5664,1.523262,0.416
2,1.4363,1.367639,0.518
3,1.3158,1.311843,0.531


TrainOutput(global_step=282, training_loss=1.4395047992679244, metrics={'train_runtime': 2223.1567, 'train_samples_per_second': 4.048, 'train_steps_per_second': 0.127, 'total_flos': 161880779340000.0, 'train_loss': 1.4395047992679244, 'epoch': 3.0})

In [181]:
# Predictions
y_test_predict = trainer.predict(test_dataset_tokenized)
#print(y_test_predict)

In [182]:
y_test_predict.predictions[:3]

array([[-0.3267856 , -0.44103605,  1.7728944 ,  0.03134485,  0.06998684,
        -1.0767158 ],
       [-0.2656088 , -0.45882598,  0.9377078 , -0.3954832 ,  0.8267701 ,
        -1.7086649 ],
       [-0.37199643, -0.34076005,  1.7501726 ,  0.31945994, -0.04927007,
        -1.2711248 ]], dtype=float32)

In [183]:
# Predicted probabilities
y_test_probabilities = tf.nn.softmax(y_test_predict.predictions)
y_test_probabilities[:3]

<tf.Tensor: shape=(3, 6), dtype=float32, numpy=
array([[0.07437343, 0.06634367, 0.60715175, 0.10640252, 0.11059459,
        0.03513397],
       [0.10809388, 0.0891021 , 0.36007655, 0.09492867, 0.32226655,
        0.02553229],
       [0.07059461, 0.07283453, 0.5894104 , 0.14095068, 0.0974832 ,
        0.02872665]], dtype=float32)>

In [184]:
# Predicted labels
y_test_pred_labels = np.argmax(y_test_probabilities, axis=1)
y_test_pred_labels[:20]

array([2, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 2, 2])

In [185]:
# Actual labels
y_test_actual_labels = y_test_predict.label_ids
y_test_actual_labels[:20]

array([2, 3, 2, 2, 4, 4, 3, 4, 4, 5, 2, 3, 4, 0, 1, 4, 4, 0, 2, 2])

In [186]:
# evaluate model
trainer.evaluate(test_dataset_tokenized)

{'eval_loss': 1.2866343259811401,
 'eval_accuracy': 0.555,
 'eval_runtime': 66.046,
 'eval_samples_per_second': 15.141,
 'eval_steps_per_second': 0.485,
 'epoch': 3.0}

In [187]:
# Load and compute f1 metric
metric_f1 = evaluate.load("f1")
metric_f1.compute(predictions=y_test_pred_labels, references=y_test_actual_labels, average='weighted')

# Load and compute recall metric
metric_recall = evaluate.load("recall")
metric_recall.compute(predictions=y_test_pred_labels, references=y_test_actual_labels, average='weighted')

{'recall': 0.555}

In [None]:
# # Save tokenizer and model
# tokenizer.save_pretrained('./transfer_learning_transformer/')
# trainer.save_model('./transfer_learning_transformer/')

# # Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("./transfer_learning_transformer/")
# loaded_model = AutoModelForSequenceClassification.from_pretrained('./transfer_learning_transformer/')

## Hugging face
https://huggingface.co/docs/transformers/training\
https://huggingface.co/learn/nlp-course/chapter3/1?fw=tf\

In [28]:
# tokenize
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model_inputs = tokenizer(X_array.tolist(), padding='longest', truncation=True, return_tensors="tf") # tensor flow tensor returned
model_inputs

{'input_ids': <tf.Tensor: shape=(100, 33), dtype=int32, numpy=
array([[  101,  2514, 18138, ...,     0,     0,     0],
       [  101,  4921,  2063, ...,     0,     0,     0],
       [  101,  2036,  2514, ...,     0,     0,     0],
       ...,
       [  101,  5674,  2619, ...,     0,     0,     0],
       [  101,  4921,  2063, ...,     0,     0,     0],
       [  101,  3685,  2514, ...,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(100, 33), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(100, 33), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [33]:
model_TFhugging = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)

outputs = model_TFhugging(model_inputs)
print(outputs.logits)

# kernel crashes with 20000 samples -> data should be made into a dataset 

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tf.Tensor(
[[ 3.77859890e-01  3.31305638e-02 -4.14384872e-01  2.68686920e-01
  -2.25081965e-01 -3.92402738e-01]
 [ 1.47919461e-01 -9.85136181e-02 -7.66087174e-02  5.12365937e-01
  -3.03294241e-01 -3.16262960e-01]
 [ 1.58609882e-01 -1.00193717e-01 -1.99917838e-01  4.08303022e-01
  -2.44259611e-01 -3.46224546e-01]
 [ 2.42585778e-01  2.58706883e-03 -3.20779413e-01  2.95459986e-01
  -2.36818656e-01 -3.80875170e-01]
 [ 2.74985135e-01  2.92812437e-02 -3.78858566e-01  3.66466701e-01
  -2.82129407e-01 -3.62889767e-01]
 [ 2.56998003e-01 -2.88348608e-02 -2.88001716e-01  4.25435901e-01
  -1.65954813e-01 -3.19993556e-01]
 [ 1.65628806e-01 -1.00769818e-01 -1.51799291e-01  4.70328122e-01
  -3.11065435e-01 -2.85241604e-01]
 [ 2.19614893e-01 -1.32952351e-02 -2.22404927e-01  3.30655545e-01
  -1.78189039e-01 -3.96509230e-01]
 [ 4.01930720e-01  6.27020895e-02 -4.48277056e-01  1.74111009e-01
  -1.85422987e-01 -4.34740454e-01]
 [ 7.03607053e-02 -3.06887925e-02 -5.29971719e-03  5.84378242e-01
  -3.82330775e

In [34]:
predictions = tf.math.softmax(outputs.logits, axis=-1)
print(predictions)

tf.Tensor(
[[0.24582675 0.17414665 0.11131703 0.2204022  0.13451627 0.11379111]
 [0.18918979 0.14786765 0.1511424  0.27237973 0.12048641 0.11893395]
 [0.19895288 0.15358657 0.13900922 0.25538218 0.13297997 0.12008906]
 [0.21903628 0.17230026 0.12469518 0.2309293  0.13561675 0.11742225]
 [0.2222584  0.17384025 0.11558393 0.24354999 0.12732285 0.11744449]
 [0.21135485 0.15880954 0.1225524  0.2501289  0.13846058 0.11869378]
 [0.19576076 0.14997892 0.14251757 0.2654941  0.12153462 0.12471398]
 [0.20989755 0.16628605 0.13490888 0.23454799 0.14100786 0.11335171]
 [0.2547862  0.18148918 0.10887679 0.20287806 0.14160918 0.11036064]
 [0.1702389  0.15387695 0.15783376 0.28463858 0.10825735 0.12515436]
 [0.17564364 0.15290438 0.15229082 0.2881763  0.1050859  0.1258989 ]
 [0.18243852 0.15103796 0.15570231 0.27339303 0.10982525 0.12760302]
 [0.19000988 0.14489272 0.14519207 0.26702508 0.1194166  0.13346367]
 [0.22889699 0.17446333 0.12042102 0.22753339 0.13810222 0.11058301]
 [0.16831738 0.15241136

In [35]:
model_TFhugging.config.id2label

{0: 'LABEL_0',
 1: 'LABEL_1',
 2: 'LABEL_2',
 3: 'LABEL_3',
 4: 'LABEL_4',
 5: 'LABEL_5'}

---
# keras_nlp

In [188]:
import keras
import keras_nlp
from keras_nlp.models import BertTokenizer
import numpy as np

classifier_keras_nlp = keras_nlp.models.BertClassifier.from_preset(
    "bert_base_en",
    num_classes=6,
    preprocessor=None,
)

Using TensorFlow backend


In [189]:
classifier_keras_nlp.summary()

In [190]:
classifier_keras_nlp.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(1e-5),
    jit_compile=True,
    metrics=["accuracy"],
)
# Access backbone programmatically (e.g., to change `trainable`).
classifier_keras_nlp.backbone.trainable = False

In [None]:
# # This code results in tokenized sequences but gives only 'token_ids'
# tokenizer = BertTokenizer.from_preset('bert_base_en')

# max_length = 40

# X_train_tokenized = tokenizer(X_train.tolist())
# X_val_tokenized = tokenizer(X_val.tolist())
# X_test_tokenized = tokenizer(X_test.tolist())

In [191]:
# tokenizer returning 'token_ids', 'segment_ids', 'padding_mask'
preprocessor = keras_nlp.models.BertPreprocessor.from_preset('bert_base_en')

X_train_preprocessed = preprocessor(X_train.tolist())
X_val_preprocessed = preprocessor(X_val.tolist())
X_test_preprocessed = preprocessor(X_test.tolist())

In [192]:
X_test_preprocessed

{'token_ids': <tf.Tensor: shape=(1000, 512), dtype=int32, numpy=
 array([[  101,  1631,  2816, ...,     0,     0,     0],
        [  101,  1631,  2489, ...,     0,     0,     0],
        [  101,  1253,  1631, ...,     0,     0,     0],
        ...,
        [  101,  1631,  1176, ...,     0,     0,     0],
        [  101,  1631,  1463, ...,     0,     0,     0],
        [  101, 13280, 25906, ...,     0,     0,     0]], dtype=int32)>,
 'segment_ids': <tf.Tensor: shape=(1000, 512), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>,
 'padding_mask': <tf.Tensor: shape=(1000, 512), dtype=bool, numpy=
 array([[ True,  True,  True, ..., False, False, False],
        [ True,  True,  True, ..., False, False, False],
        [ True,  True,  True, ..., False, False, False],
        ...,
        [ True,  Tr

In [None]:
history = classifier_keras_nlp.fit(X_train_preprocessed, y_train, epochs=5, validation_data=(X_val_preprocessed, y_val), verbose=1)
train_loss, train_accuracy, val_loss, val_accuracy = history.history.values()
test_loss, test_accuracy = classifier_keras_nlp.evaluate(X_test_preprocessed, y_test, verbose=2)
plot_training_history(train_loss, train_accuracy, val_loss, val_accuracy, test_loss, test_accuracy)

# kernel crashes at too many samples -> data into datasets

---