# Fine-Tune DistilBERT for Multi-Class Text Classification Using TensorFlow and Keras

In [216]:
## Import required packages
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tf_keras
from plotly.offline import iplot
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification


## Preprocess Data

### Split into Train, Validation, Test using Stratified Sampling

In [None]:
# Load the full labelled dataset; the cell reads the 'request', 'category'
# and 'stratify_col' columns from it.
root_path = 'data/full_dataset.csv'
df = pd.read_csv(root_path)

# Fail fast on unlabeled rows: cat.codes encodes a missing category as -1,
# which is not a valid class index for the sparse cross-entropy loss used in training.
assert df['category'].notna().all(), "Found rows with a missing 'category' label"

# Encode the 'category' column into numerical labels
df['encoded_text'] = df['category'].astype('category').cat.codes
print(f"Unique categories: {df['category'].nunique()}")
print(f"Encoded labels: {df['encoded_text'].unique()}")

# Separate columns for splitting
data_texts = df['request'].to_list()  # 'request' is the text data
data_labels = df['encoded_text'].to_list()  # Encoded class labels
stratify_values = df['stratify_col'].to_list()  # Stratification column

# Split the data into Train/Validation sets with stratification.
# The stratification values are split as well so the second split can reuse them.
train_texts, val_texts, train_labels, val_labels, train_stratify, val_stratify = train_test_split(
    data_texts, data_labels, stratify_values,
    test_size=0.2, stratify=stratify_values, random_state=0
)

# Split the Train set further into Train/Test with stratification.
# test_size=0.1 applies to the remaining 80%, i.e. roughly 8% of the full dataset.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts, train_labels,
    test_size=0.1, stratify=train_stratify, random_state=0
)

# Sanity checks: the three splits partition the dataset and texts/labels stay aligned
assert len(train_texts) + len(val_texts) + len(test_texts) == len(df)
assert len(train_texts) == len(train_labels)
assert len(val_texts) == len(val_labels)
assert len(test_texts) == len(test_labels)

Unique categories: 5
Encoded labels: [2 1 3 4 0]


### View test/train/validation Splits

In [223]:
# Rebuild the code -> category-name lookup from the same categorical encoding
category_names = df['category'].astype('category').cat.categories
label_mapping = {code: name for code, name in enumerate(category_names)}

print("\nLabel Mapping (Encoded -> Category):")
for encoded, category in label_mapping.items():
    print(f"{encoded}: {category}")

# Size of each split, one per line
print(
    "\nFinal dataset information:",
    f"Train set size: {len(train_texts)}",
    f"Validation set size: {len(val_texts)}",
    f"Test set size: {len(test_texts)}",
    sep="\n",
)

# First three texts and labels of each split
print(
    f"Example train_texts: {train_texts[:3]}",
    f"Example train_labels: {train_labels[:3]}",
    f"Example val_texts: {val_texts[:3]}",
    f"Example val_labels: {val_labels[:3]}",
    f"Example test_texts: {test_texts[:3]}",
    f"Example test_labels: {test_labels[:3]}",
    sep="\n",
)


Label Mapping (Encoded -> Category):
0: Facilities Management
1: Finance
2: HR
3: IT Support
4: Marketing

Final dataset information:
Train set size: 3596
Validation set size: 1000
Test set size: 400
Example train_texts: ['I’m gathering details about rewards for long-term employees and was hoping you could provide some insight. Let me know if you need further specifics from me.', 'Do you have the latest version of the diversity and inclusion policies handbook? I need it for a new hire orientation.', 'Could you share detailed insights on the performance metrics for keyword research for PPC campaigns? I’d like to use this data for our planning.']
Example train_labels: [2, 2, 4]
Example val_texts: ['Could you outline the steps to optimize our launching retargeting ads approach? Any case studies or examples would be helpful.', 'Need access to server maintenance.', 'Insights on customer retention strategies performance needed.']
Example val_labels: [4, 3, 4]
Example test_texts: ['I’m exper

### Optional: Save test/train/val to CSV

In [224]:
def _split_frame(texts, labels):
    """Pair one split's texts with their encoded labels in a two-column DataFrame."""
    return pd.DataFrame({'request': texts, 'label': labels})


# One DataFrame per split
train_df = _split_frame(train_texts, train_labels)
val_df = _split_frame(val_texts, val_labels)
test_df = _split_frame(test_texts, test_labels)

# Write each split to its own CSV file, leaving out the index column
for split_df, csv_path in (
    (train_df, "data/train.csv"),
    (val_df, "data/validation.csv"),
    (test_df, "data/test.csv"),
):
    split_df.to_csv(csv_path, index=False)


## Tokenize Data For DistilBERT Model

In [217]:
# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with the same options: truncation and padding enabled
tokenize_options = {'truncation': True, 'padding': True}

train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

In [218]:
def _batched_dataset(encodings, labels, batch_size=32):
    """Build a batched tf.data.Dataset of (feature dict, label) pairs."""
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)


# Training and validation inputs, both in batches of 32
train_dataset = _batched_dataset(train_encodings, train_labels)
val_dataset = _batched_dataset(val_encodings, val_labels)

## Define and Train Model

In [None]:
# Derive the number of output classes from the data instead of hard-coding it,
# so the classification head always matches the label encoding.
num_labels = int(df['category'].nunique())

# Seed TensorFlow's global generator so the newly initialised classification head
# starts from the same weights on every run (as far as the global seed allows).
tf.random.set_seed(0)

# Model setup
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
opt = tf_keras.optimizers.legacy.Adam(learning_rate=5e-5)
loss = tf_keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # Raw logits expected
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])


# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    verbose=1
)

# Evaluate the model on the validation set; evaluate() returns [loss, accuracy]
# in the order given to compile().
evaluation_results = model.evaluate(val_dataset)
print(f"Validation Loss: {evaluation_results[0]}")
print(f"Validation Accuracy: {evaluation_results[1]}")
print(history.history.keys())

# Evaluate on the held-out test split as well. It is tokenized and batched here
# with the same settings as the train/validation data, because the validation set
# was already monitored during training and is not an independent estimate.
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(32)
test_results = model.evaluate(test_dataset)
print(f"Test Loss: {test_results[0]}")
print(f"Test Accuracy: {test_results[1]}")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Loss: [0.0016522674122825265, 1.0]


## Save Model

In [198]:
save_directory = "./saved_models"

# Store the code <-> category-name mapping in the model config before saving, so the
# saved model carries the real category names instead of only numeric class ids.
# The mapping is rebuilt from the same categorical encoding used to create the labels.
id2label = {
    int(code): str(name)
    for code, name in enumerate(df['category'].astype('category').cat.categories)
}
model.config.id2label = id2label
model.config.label2id = {name: code for code, name in id2label.items()}

# Write the fine-tuned weights and config
model.save_pretrained(save_directory)

# Kept as the last expression so the notebook displays the written tokenizer file paths
tokenizer.save_pretrained(save_directory)

('./saved_models/tokenizer_config.json',
 './saved_models/special_tokens_map.json',
 './saved_models/vocab.txt',
 './saved_models/added_tokens.json')