# 0. Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import warnings

# Silence three known UserWarnings (the SciPy/NumPy version check and the
# tensorflow-io plugin loading messages) so they do not clutter the output.
# Each pattern is matched against the start of the warning message.
for _ignored_message in (
    "A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy",
    "unable to load libtensorflow_io_plugins.so",
    "file system plugins are not loaded",
):
    warnings.filterwarnings("ignore", message=_ignored_message, category=UserWarning)

In [None]:
# Hugging Face library
from transformers import AutoTokenizer, TFAutoModel

In [None]:
# Hugging Face library
from datasets import Dataset, DatasetDict

In [None]:
# Accuracy metrics from Scikit-Learn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [None]:
import tensorflow as tf

from tensorflow.keras.optimizers import AdamW

from tensorflow.keras.losses import SparseCategoricalCrossentropy

from tensorflow.keras.layers import Input, Dense, GlobalMaxPooling1D

from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

# 1. Load Datasets

In [None]:
def load_data(file_path):
    """Read a header-less, tab-separated file into a DataFrame.

    Args:
        file_path: Path (or any source ``pd.read_csv`` accepts) of the TSV file.

    Returns:
        A DataFrame whose columns are named ``sentiment`` and ``text``.
    """
    column_names = ['sentiment', 'text']
    frame = pd.read_csv(file_path, sep='\t', header=None, names=column_names)
    return frame


# Locations of the three pre-split TSV files on the Kaggle input volume
train_path, test_path, val_path = (
    '/kaggle/input/sentiment/train_bal_vdg_27_11.tsv',
    '/kaggle/input/sentiment/test_bal_vdg_27_11.tsv',
    '/kaggle/input/sentiment/valid_bal_vdg_27_11.tsv',
)

# One DataFrame per split, loaded in the order train, test, validation
df_train, df_test, df_val = (
    load_data(path) for path in (train_path, test_path, val_path)
)

In [None]:
# The sparse categorical cross-entropy loss expects integer class ids,
# so every sentiment string is mapped to an integer label.
encoded_dict = {'NEG': 0, 'NEU': 1, 'POS': 2}

# An unknown sentiment value raises KeyError rather than passing silently.
for _frame in (df_train, df_test, df_val):
    _frame['label'] = _frame['sentiment'].apply(encoded_dict.__getitem__)



In [None]:
# Widen the column display so the text is readable, then preview a few rows
pd.options.display.max_colwidth = 150
df_train.head()

In [None]:
# Both these functions can have as input a single label/id or a list of them

def label2id(label):
    """Map sentiment label(s) to their integer class id(s).

    Args:
        label: A single label, or a list, tuple or NumPy array of labels.

    Returns:
        The id of a single label, or a list of ids (in input order) when a
        sequence was given.

    Raises:
        KeyError: If a label is not a key of ``encoded_dict``.
    """
    if isinstance(label, (list, tuple, np.ndarray)):
        # A distinct loop variable keeps the parameter from being shadowed.
        return [encoded_dict[item] for item in label]
    return encoded_dict[label]

def id2label(id):
    """Map integer class id(s) back to their sentiment label(s).

    Args:
        id: A single class id, or a list, tuple or NumPy array of ids.

    Returns:
        The label of a single id, or a list of labels (in input order) when a
        sequence was given.

    Raises:
        KeyError: If an id is not a value of ``encoded_dict``.
    """
    # NOTE: the parameter name shadows the builtin `id`; it is kept so that
    # existing keyword callers (`id2label(id=...)`) keep working.
    encoded_dict_inv = {value: key for key, value in encoded_dict.items()}

    # Accept tuples and NumPy arrays too, so callers need not call .tolist() first.
    if isinstance(id, (list, tuple, np.ndarray)):
        return [encoded_dict_inv[item] for item in id]
    return encoded_dict_inv[id]

In [None]:
# Wrap each pandas DataFrame in a Hugging Face Dataset and group the three
# splits into a single DatasetDict.
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test, df_val)
)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset,
})

print(dataset)

In [None]:
# Removing duplicates

def _first_occurrence_indices(texts):
    """Return the row positions of the first occurrence of each distinct text."""
    seen_texts = set()
    keep_indices = []
    for row_index, text in enumerate(texts):
        if text not in seen_texts:
            seen_texts.add(text)
            keep_indices.append(row_index)
    return keep_indices


# Deduplicated version of every split, keyed by split name
updated_datasets = {}

# Check for and remove duplicates in each split
for split in dataset.keys():
    split_data = dataset[split]

    # All texts of the split, in row order
    text_column = split_data['text']

    keep_indices = _first_occurrence_indices(text_column)

    # Select by the original row positions of the unique texts. Selecting
    # range(len(unique)) instead would keep the first N rows, duplicates
    # included, and drop unique rows at the end of the split.
    updated_datasets[split] = split_data.select(keep_indices)

    # Print the number of removed duplicates
    duplicate_count = len(text_column) - len(keep_indices)
    print(f"Duplicates removed in {split} split: {duplicate_count}\n")

# Update the dataset dictionary with the filtered datasets
dataset.update(updated_datasets)

# Print the updated dataset information
for split in dataset.keys():
    split_data = dataset[split]
    print(f"{split}: {len(split_data['text'])} rows")

print(dataset)

# 2. Preprocess data

In [None]:
# Load the pretrained encoder (from its PyTorch weights, hence from_pt=True)
# and the tokenizer that belongs to the same checkpoint.
checkpoint = 'Twitter/twhin-bert-base'
bert = TFAutoModel.from_pretrained(checkpoint, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
max_length = 256


def tokenize_text(dataset):
    """Tokenize the ``text`` field of *dataset* into fixed-length TF tensors.

    Args:
        dataset: A mapping with a ``text`` entry (an example, as passed by
            ``Dataset.map``).

    Returns:
        The tokenizer output, padded/truncated to ``max_length`` and without
        token type ids.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        return_token_type_ids=False,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
        verbose=True,
    )
    encoded = tokenizer(text=dataset['text'], **tokenizer_options)
    return encoded

In [None]:
# Tokenize every split; map is called without batching, so tokenize_text
# receives one example at a time. preprocess_data later squeezes the extra
# axis that each tokenized example carries.
encoded_dataset = dataset.map(tokenize_text)

In [None]:
# Display the tokenized DatasetDict (notebook cell output)
encoded_dataset

In [None]:
# Drop the raw string columns; the remaining columns hold the tokenizer
# outputs and the integer label.
columns_to_drop = ['sentiment', 'text']
encoded_dataset = encoded_dataset.remove_columns(columns_to_drop)

encoded_dataset

In [None]:
def preprocess_data(encoded_dataset, data_type):
    """Extract one split's model inputs and labels as NumPy arrays.

    Args:
        encoded_dataset: Mapping from split name to a mapping with the
            ``input_ids``, ``attention_mask`` and ``label`` columns.
        data_type: Name of the split to extract (e.g. ``'train'``).

    Returns:
        A tuple ``(input_ids, attention_mask, label)`` of NumPy arrays; the
        first two have their singleton axis 1 removed.

    Raises:
        ValueError: If axis 1 of ``input_ids`` or ``attention_mask`` does not
            have length 1.
    """
    def _column_without_axis_1(column_name):
        # Each stored example carries a singleton axis at position 1; drop it.
        stacked = np.array(encoded_dataset[data_type][column_name])
        return stacked.squeeze(axis=1)

    input_ids = _column_without_axis_1('input_ids')
    attention_mask = _column_without_axis_1('attention_mask')
    label = np.array(encoded_dataset[data_type]['label'])

    return input_ids, attention_mask, label

def main_processing(encoded_dataset):
    """Extract the arrays of the train, validation and test splits.

    Args:
        encoded_dataset: Mapping with ``'train'``, ``'validation'`` and
            ``'test'`` splits, as accepted by ``preprocess_data``.

    Returns:
        A 9-tuple: ``(input_ids, attention_mask, label)`` for train, then for
        validation, then for test.
    """
    collected = []
    for split_name in ('train', 'validation', 'test'):
        collected.extend(preprocess_data(encoded_dataset, split_name))
    return tuple(collected)

# Usage: unpack the nine arrays (ids, mask and labels for train/validation/test)
(
    input_ids_train, attention_mask_train, label_train,
    input_ids_val, attention_mask_val, label_val,
    input_ids_test, attention_mask_test, label_test,
) = main_processing(encoded_dataset)


# 3. Defining the model

In [None]:
# Two integer inputs of fixed length `max_length`; the layer names match the
# dict keys used when calling fit/predict.
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# First output of the BERT main layer — presumably the per-token hidden states
# (last_hidden_state); confirm against the transformers documentation.
embeddings = bert.bert(input_ids, attention_mask = input_mask)[0]
# Max over the sequence axis. NOTE(review): no mask is passed to the pooling
# layer, so padded positions appear to take part in the max — confirm intended.
out = GlobalMaxPooling1D(name="GlobalMaxPooling1d")(embeddings)
out = Dense(128, activation='relu',name="Dense_relu")(out)


# Three-unit softmax head, one unit per sentiment class
y = Dense(3, activation='softmax',name="Dense_softmax")(out)
    
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Mark the layer at index 2 as trainable. NOTE(review): presumably this is the
# BERT encoder; the index depends on layer order — verify with the layer listing.
model.layers[2].trainable = True

In [None]:
# List every layer with its index (handy for index-based access to model.layers)
for position, keras_layer in enumerate(model.layers):
    print(f"Layer {position}: {keras_layer.name}")

In [None]:
# AdamW optimizer settings used for fine-tuning
adamw_settings = {
    "learning_rate": 1e-5,
    "epsilon": 1e-08,
    "weight_decay": 0.001,
    "name": "AdamW",
}
optimizer = AdamW(**adamw_settings)

In [None]:
def scheduler(epoch,lr):
    """Learning-rate schedule meant for ``LearningRateScheduler``.

    Keeps the rate unchanged while ``epoch < 2`` and multiplies the current
    rate by ``exp(-0.1)`` at every later epoch.

    Args:
        epoch: Epoch index supplied by the callback.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged for the first two epochs, otherwise the decayed rate
        as a plain Python ``float``.
    """
    if epoch < 2:
        return lr
    # Return a Python float rather than a tf.Tensor: the callback contract is
    # "returns a float", and newer Keras versions reject tensor outputs.
    return float(lr * np.exp(-0.1))
    
# Callback that asks `scheduler` for the learning rate during training
lr_scheduler = LearningRateScheduler(scheduler)

In [None]:
# The model ends in a softmax layer, so the loss receives probabilities
# (from_logits=False) together with integer class labels.
loss_settings = dict(
    name="sparse_categorical_crossentropy",
    reduction="auto",
    ignore_class=None,
    from_logits=False,
)
loss = SparseCategoricalCrossentropy(**loss_settings)

In [None]:
# Stop once val_loss has not improved for 4 epochs and roll the model back
# to the weights of its best epoch.
early_stop_settings = dict(
    monitor="val_loss",
    mode="auto",
    patience=4,
    min_delta=0,
    baseline=None,
    start_from_epoch=0,
    restore_best_weights=True,
    verbose=0,
)
early_stop = EarlyStopping(**early_stop_settings)

In [None]:
# Wire the loss, optimizer and accuracy metric into the model
tracked_metrics = ['sparse_categorical_accuracy']
model.compile(loss=loss, optimizer=optimizer, metrics=tracked_metrics)

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

The output shape is shown as "(None, ...)" in the model summary above because the batch-size dimension is not fixed when the model is defined. In Keras (as in most deep learning frameworks), the batch dimension is left as "None" at definition time and is determined dynamically during training or inference from the input data.

input_ids and attention_mask:

Shape: (None, 256)
Explanation: These input layers receive the tokenized text. The (None, 256) shape means that the model expects every input sequence to be padded or truncated to exactly 256 tokens (`max_length`), while the batch size can vary (indicated by "None").

bert (TFBertMainLayer):

Output Shape: (None, 256, 768)
Explanation: This is the output shape of the BERT model. It processes input sequences and produces embeddings for each token in the sequence. The first dimension "None" represents the batch size, the second dimension "256" represents the sequence length, and the third dimension "768" represents the size of the hidden representation for each token.

GlobalMaxPooling1d (GlobalMaxPooling1D):

Output Shape: (None, 768)
Explanation: This layer performs global max-pooling over the token embeddings generated by BERT. It takes the maximum value across the sequence length dimension (256) for each of the 768 hidden units, resulting in a fixed-size representation for each input example. The "None" batch dimension remains unspecified.

Dense_relu (Dense):

Output Shape: (None, 128)
Explanation: This is a fully connected (dense) layer with 128 output units. It takes the output from the global max-pooling layer and transforms it into a lower-dimensional space. The "None" batch dimension indicates variable batch size.

dropout_37 (Dropout):

Output Shape: (None, 128)
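Explanation: Dropout is a regularization technique where a fraction of input units is randomly set to zero during each update, helping to prevent overfitting. The "None" batch dimension remains unspecified. Note: the model built in section 3 does not contain a Dropout layer, so this entry will not appear in its summary.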

dense_1 (Dense):

Output Shape: (None, 32)
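Explanation: This is another fully connected layer with 32 output units. It further reduces the dimensionality of the data. The "None" batch dimension indicates variable batch size. Note: the model built in section 3 does not contain this 32-unit layer; its 128-unit dense layer feeds the final 3-unit layer directly.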

Dense_softmax (Dense):

Output Shape: (None, 3)
Explanation: This is the final dense layer with 3 output units and a softmax activation. It produces the class probabilities for the 3 sentiment classes (NEG, NEU, POS). The "None" batch dimension remains unspecified.

# 4. Training

In [None]:
# Inputs are passed as dicts keyed by the names of the model's Input layers
train_inputs = {'input_ids': input_ids_train, 'attention_mask': attention_mask_train}
val_inputs = {'input_ids': input_ids_val, 'attention_mask': attention_mask_val}

history = model.fit(
    x=train_inputs,
    y=label_train,
    validation_data=(val_inputs, label_val),
    batch_size=16,
    epochs=15,
    callbacks=[early_stop, lr_scheduler],
)

In [None]:
# Training vs. validation loss per epoch
for history_key, curve_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# 5. Metrics

In [None]:
# Class probabilities for the test split; the predicted class is the argmax
test_inputs = {'input_ids': input_ids_test, 'attention_mask': attention_mask_test}
predicted = model.predict(test_inputs)
predicted_labels = np.argmax(predicted, axis=1)

In [None]:
# Turn the predicted class ids into their string labels and peek at the first 7
predicted_labels = id2label(predicted_labels.tolist())
predicted_labels[:7]

In [None]:
# Turn the true class ids into string labels as well. label_test is rebound
# from an array of ids to a list of strings, so this cell cannot be re-run
# without regenerating label_test first.
label_test = id2label(label_test.tolist())
label_test[:7]

In [None]:
# Show the first three test texts to compare against the labels above
dataset['test']['text'][0:3]

In [None]:
# Per-class precision, recall and F1 on the test split
report = classification_report(y_true=label_test, y_pred=predicted_labels)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Fixed class order shared by the matrix and both heatmap axes
class_names = ['NEG', 'NEU', 'POS']

# Compute the confusion matrix
conf_matrix = confusion_matrix(label_test, predicted_labels, labels=class_names)

# Create a heatmap for the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


## 5.1 Accuracy Score

In [None]:
# Accuracy = (TP + TN) / (P + N): the number of correctly classified tweets
# over the total number of tweets.
accuracy = accuracy_score(y_true=label_test, y_pred=predicted_labels)

print(accuracy)

## 5.2 Precision Score

In [None]:
# Precision = TP / (TP + FP): given that a class was predicted, how likely is
# it that the tweet really belongs to that class? One value per class.
precision = precision_score(
    y_true=label_test,
    y_pred=predicted_labels,
    labels=['NEG','NEU','POS'],
    average=None,
)

print(precision)

## 5.3 Recall (sensitivity) Score

In [None]:
# Recall = TP / (TP + FN): the ability of the estimator to find all the tweets
# of a given class. One value per class.
recall = recall_score(
    y_true=label_test,
    y_pred=predicted_labels,
    labels=['NEG','NEU','POS'],
    average=None,
)

print(recall)

## 5.4 F1 Score

In [None]:
# F1 = 2 * (precision * recall) / (precision + recall). One value per class.
f1score = f1_score(
    y_true=label_test,
    y_pred=predicted_labels,
    labels=['NEG','NEU','POS'],
    average=None,
)

print(f1score)

# 6. Push To Hub

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; needed before pushing
notebook_login()

In [None]:
from huggingface_hub import push_to_hub_keras

# Upload the trained Keras model to its Hub repository
hub_repo_id = 'FedeBerto/Griffith-Sentiment'
push_to_hub_keras(model, hub_repo_id)

In [None]:
from huggingface_hub import from_pretrained_keras

# Reload the model from the Hub; this replaces the in-memory `model`.
# NOTE(review): the model embeds a transformers layer — confirm it
# deserializes without extra custom objects.
hub_repo_id = 'FedeBerto/Griffith-Sentiment'
model = from_pretrained_keras(hub_repo_id)