In [27]:
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification, DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The three split files share one layout: no header row, ';' between the
# sentence and its emotion label.
_SPLIT_READ_OPTIONS = dict(sep=';', header=None, names=['text', 'emotion'])

train_df = pd.read_csv('../data/train.txt', **_SPLIT_READ_OPTIONS)
test_df = pd.read_csv('../data/test.txt', **_SPLIT_READ_OPTIONS)
validation_df = pd.read_csv('../data/validation.txt', **_SPLIT_READ_OPTIONS)

# All splits stacked into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(
    [train_df, test_df, validation_df],
    ignore_index=True,
)

In [3]:
# Encoder for the 'emotion' column; it is fitted on the training labels and
# applied to every split in the next cell.
label = LabelEncoder()

In [4]:
# This cell overwrites the 'emotion' column in place, so it must be safe to
# re-run: fitting again on already-encoded integers would silently replace
# label.classes_ (the emotion names) with 0..n-1 and break later reporting.
# A column that is already integer-typed is therefore treated as encoded.
if not pd.api.types.is_integer_dtype(train_df['emotion']):
    # Learn the label -> id mapping from the training split only.
    label.fit(train_df['emotion'])

for split_df in (train_df, test_df, validation_df):
    if not pd.api.types.is_integer_dtype(split_df['emotion']):
        # Replace the raw labels with their integer ids on the original frame.
        split_df['emotion'] = label.transform(split_df['emotion'])

In [5]:
# Alternative (heavier) backbone, kept for reference:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label.classes_))

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Size the classification head from the fitted label encoder instead of a
# hard-coded 6, so the head always matches the number of encoded emotions.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label.classes_),
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [6]:
def convert_data_to_examples(train, val, data_column, label_column):
    """Turn every row of two DataFrames into an ``InputExample``.

    Args:
        train: DataFrame holding the training rows.
        val: DataFrame holding the held-out rows.
        data_column: Name of the column whose value becomes ``text_a``.
        label_column: Name of the column whose value becomes ``label``.

    Returns:
        A ``(train_examples, val_examples)`` pair; each is the result of a
        row-wise ``DataFrame.apply`` and holds one ``InputExample`` per row.
    """
    def _row_to_example(row):
        # Single-sentence task: no guid and no second text segment.
        return InputExample(
            guid=None,
            text_a=row[data_column],
            text_b=None,
            label=row[label_column],
        )

    train_examples = train.apply(_row_to_example, axis=1)
    val_examples = val.apply(_row_to_example, axis=1)
    return train_examples, val_examples

In [7]:
# Wrap the training and validation frames as InputExample series, taking the
# sentence from the 'text' column and the target from the 'emotion' column.
train_InputExamples, val_InputExamples = convert_data_to_examples(train_df, validation_df, 'text', 'emotion')

In [8]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with a ``text_a`` attribute (the text to
            encode) and a ``label`` attribute.
        tokenizer: Hugging Face tokenizer; it is called on each text and must
            return ``"input_ids"`` and ``"attention_mask"``.
        max_length: Each text is truncated or padded to exactly this many
            tokens.

    Returns:
        A dataset whose elements are ``(features, label)`` where ``features``
        is a dict with int32 vectors ``"input_ids"`` and ``"attention_mask"``
        and ``label`` is an int64 scalar.
    """
    features = []
    for example in examples:
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        # token_type_ids are not requested because the features built below
        # only carry input_ids and attention_mask.
        encoded = tokenizer(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            padding="max_length",
            truncation=True,
        )
        features.append(
            InputFeatures(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                label=example.label,
            )
        )

    def gen():
        # Replays the pre-tokenized features each time the dataset is iterated.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                },
                f.label,
            )

    # `output_signature` supersedes the deprecated output_types/output_shapes
    # arguments; dtypes and shapes are the same as before.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=[None], dtype=tf.int32),
                "attention_mask": tf.TensorSpec(shape=[None], dtype=tf.int32),
            },
            tf.TensorSpec(shape=[], dtype=tf.int64),
        ),
    )

In [9]:
train_examples = list(train_InputExamples)
train_data = convert_examples_to_tf_dataset(train_examples, tokenizer)
# A shuffle buffer smaller than the dataset only mixes neighbouring examples,
# so size it to the whole training set (at least 1, which shuffle() requires).
# repeat(2) makes one full iteration of `train_data` cover the examples twice.
train_data = (
    train_data
    .shuffle(max(len(train_examples), 1))
    .batch(32)
    .repeat(2)
)



In [10]:
# Validation examples are tokenized once and served in fixed batches of 32;
# no shuffling or repetition is applied.
val_data = convert_examples_to_tf_dataset(
    list(val_InputExamples),
    tokenizer,
).batch(32)

In [11]:
# Fine-tuning setup: Adam with a small learning rate, a loss that consumes the
# model's raw logits with integer class ids, and plain accuracy as the metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
crossentropy_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy()

model.compile(
    optimizer=adam_optimizer,
    loss=crossentropy_loss,
    metrics=[accuracy_metric],
)



In [12]:
# Keep the returned History object so the per-epoch loss and accuracy remain
# available afterwards (history.history) instead of being thrown away.
history = model.fit(
    train_data,
    epochs=3,
    validation_data=val_data,
    verbose=1,
)

Epoch 1/3
   1000/Unknown - 2438s 2s/step - loss: 0.3265 - sparse_categorical_accuracy: 0.8838

2025-02-04 05:32:45.466000: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Epoch 2/3


2025-02-04 05:33:39.079542: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Epoch 3/3


<tf_keras.src.callbacks.History at 0x381f16b50>

In [15]:
import joblib

In [40]:
# A joblib pickle of a Keras/Transformers model is tied to the exact library
# versions that wrote it. Also export model and tokenizer in the native
# Hugging Face format, which can be restored with `from_pretrained`.
model.save_pretrained('../models/distilbert_model')
tokenizer.save_pretrained('../models/distilbert_model')

# Pickle kept for backward compatibility with anything that joblib.load()s it.
# NOTE(review): only unpickle this file from a trusted source.
joblib.dump(model, '../models/distilbert_model.pkl')



['../models/distilbert_model.pkl']

In [21]:
# Print the layer-by-layer architecture and parameter counts of the classifier.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  4614      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66958086 (255.42 MB)
Trainable params: 66958086 (255.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
# Walk the top-level layers: report whether each is trainable, then list the
# name and shape of every weight it owns.
for keras_layer in model.layers:
    print(keras_layer.name, keras_layer.trainable)
    for variable in keras_layer.weights:
        print(variable.name, variable.shape)

distilbert True
tf_distil_bert_for_sequence_classification/distilbert/embeddings/word_embeddings/weight:0 (30522, 768)
tf_distil_bert_for_sequence_classification/distilbert/embeddings/position_embeddings/embeddings:0 (512, 768)
tf_distil_bert_for_sequence_classification/distilbert/embeddings/LayerNorm/gamma:0 (768,)
tf_distil_bert_for_sequence_classification/distilbert/embeddings/LayerNorm/beta:0 (768,)
tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/q_lin/kernel:0 (768, 768)
tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/q_lin/bias:0 (768,)
tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/k_lin/kernel:0 (768, 768)
tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/k_lin/bias:0 (768,)
tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/v_lin/kernel:0 (768, 768)
tf_distil_bert_for_sequence_classification/di

In [18]:
# Only the test examples are needed here. Taking just the second element of
# the returned pair avoids rebinding `train_InputExamples` in an evaluation cell.
test_InputExamples = convert_data_to_examples(train_df, test_df, 'text', 'emotion')[1]

# Tokenize the test split and serve it in fixed batches of 32, unshuffled.
test_data = convert_examples_to_tf_dataset(list(test_InputExamples), tokenizer)
test_data = test_data.batch(32)



In [19]:
# Score the model on the batched test split. With the loss and the single
# metric configured at compile time, `results` is [loss, accuracy].
results = model.evaluate(test_data)
print(results)

[0.20194397866725922, 0.9294999837875366]


2025-02-04 20:48:24.453303: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


In [20]:
# Name the two entries of `results` before printing them on separate lines.
test_loss, test_accuracy = results[0], results[1]
print('Test - Loss:', test_loss, '\nTest - Accuracy:', test_accuracy)

Test - Loss: 0.20194397866725922 
Test - Accuracy: 0.9294999837875366


In [26]:
import numpy as np


# Raw test sentences and their integer-encoded gold labels.
X_test = test_df['text'].values
y_test = test_df['emotion'].values

# Tokenize the whole test set in one call, padding to the longest sentence
# and returning TensorFlow tensors.
X_test_sequences = tokenizer(
    X_test.tolist(),
    padding=True,
    truncation=True,
    return_tensors="tf",
)

# Despite its name, `y_pred_prob` is the model's output object; its `.logits`
# hold one raw score per class, and the arg-max picks the predicted class id.
y_pred_prob = model.predict(X_test_sequences)
y_pred = np.argmax(y_pred_prob.logits, axis=1)

# Per-class precision / recall / F1, labelled with the original emotion names.
report = classification_report(
    y_test,
    y_pred,
    target_names=label.classes_,
)
print(report)

              precision    recall  f1-score   support

       anger       0.92      0.92      0.92       275
        fear       0.87      0.95      0.91       224
         joy       0.95      0.95      0.95       695
        love       0.84      0.81      0.83       159
     sadness       0.97      0.96      0.96       581
    surprise       0.84      0.73      0.78        66

    accuracy                           0.93      2000
   macro avg       0.90      0.89      0.89      2000
weighted avg       0.93      0.93      0.93      2000

