In [1]:
from transformers import TFAutoModel, AutoTokenizer
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
import pickle

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Locations (relative to the working directory) used when saving and reloading
# the fine-tuned model, its tokenizer and the fitted label encoder.
model_save_path = 'model'
tokenizer_save_path = 'tokenizer'
label_encoder_save_path = 'label_encoder.pkl'

# Tokenizer for the same "bert-base-uncased" checkpoint the classifier is initialised from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Generating synthetic training data with time-period suffixes ("today", "this month", "this year")

In [2]:
# Time-period phrases appended to every sentence. Each phrase produces one full
# copy of the source data, so the output has len(df) * len(TIME_SUFFIXES) rows.
TIME_SUFFIXES = (' today', ' this month', ' this year')


def with_time_suffix(frame, suffix):
    """Return a copy of ``frame`` with ``suffix`` appended to its 'sentence' column.

    All other columns are carried over unchanged and ``frame`` itself is not modified.
    """
    return frame.assign(sentence=frame['sentence'] + suffix)


df = pd.read_csv('chatbot4.csv')

# One suffixed copy per phrase, stacked in TIME_SUFFIXES order with a fresh 0..n-1 index.
extended_df = pd.concat(
    [with_time_suffix(df, suffix) for suffix in TIME_SUFFIXES],
    ignore_index=True,
)

extended_df.to_csv('data_with_time.csv', index=False)


Model Training

In [3]:
# --- Training configuration ---------------------------------------------------
SEED = 42             # shared by the train/validation split and TensorFlow's RNGs
VAL_FRACTION = 0.2    # share of rows held out for validation
MAX_LENGTH = 128      # token cap per sentence; longer inputs are truncated
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
EPOCHS = 3

# Seed Python, NumPy and TensorFlow so the dataset shuffle order and the newly
# initialised classifier weights are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(SEED)

# NOTE(review): this trains on the original 'chatbot4.csv', not on the augmented
# 'data_with_time.csv' written by the synthetic-data cell - confirm which is intended.
df = pd.read_csv('chatbot4.csv')

# Map the string labels to integer class ids; the fitted encoder is reused at
# inference time to map predictions back to label names.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])

num_unique_labels = len(label_encoder.classes_)

# Stratified split keeps the class proportions the same in both partitions.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    df['sentence'].tolist(),
    encoded_labels,
    test_size=VAL_FRACTION,
    random_state=SEED,
    stratify=encoded_labels,
)

# padding=True pads each call to its own longest sentence, so the train and
# validation tensors may have different sequence lengths.
train_encodings = tokenizer(
    train_sentences, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="tf"
)
val_encodings = tokenizer(
    val_sentences, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="tf"
)

# One-hot targets to match the CategoricalCrossentropy loss configured below.
train_labels_onehot = tf.keras.utils.to_categorical(train_labels, num_classes=num_unique_labels)
val_labels_onehot = tf.keras.utils.to_categorical(val_labels, num_classes=num_unique_labels)

# The shuffle buffer spans the whole training set, giving a full shuffle per epoch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels_onehot))
    .shuffle(len(train_sentences))
    .batch(BATCH_SIZE)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels_onehot))
    .batch(BATCH_SIZE)
)

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_unique_labels)

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# from_logits=True: the loss is given the model's raw logits and applies softmax itself.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.fit(train_dataset, validation_data=val_dataset, epochs=EPOCHS)

model.safetensors: 100%|██████████| 440M/440M [00:38<00:00, 11.6MB/s] 





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3

Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2ce35def3d0>

Saving the model

In [9]:

# Persist everything inference needs: tokenizer files, model weights/config,
# and the fitted label encoder (pickled) for mapping class ids back to labels.
tokenizer.save_pretrained(tokenizer_save_path)

model.save_pretrained(model_save_path)

with open(label_encoder_save_path, 'wb') as file:
    file.write(pickle.dumps(label_encoder))

Loading the model

In [4]:
# Restore the three artefacts written by the save step.
# Caution: unpickling runs code embedded in the file, so only load a
# label-encoder pickle that this notebook (or another trusted source) produced.
with open(label_encoder_save_path, 'rb') as file:
    label_encoder = pickle.loads(file.read())

tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)

model = TFBertForSequenceClassification.from_pretrained(model_save_path)




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [6]:
def predict_class(sentence, model, tokenizer, label_encoder):
    """Classify ``sentence`` and report the probability of the winning class.

    Args:
        sentence: Text to classify. It is passed straight to ``tokenizer``; only
            the first row of the resulting batch is scored.
        model: Callable that accepts the tokenizer's output dict and returns an
            object with a ``.logits`` attribute.
        tokenizer: Tokenizer called with padding, truncation to 128 tokens and
            TensorFlow tensor output.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps a class
            index back to its label.

    Returns:
        A ``(predicted_class, confidence_score)`` tuple: the decoded label and the
        softmax probability assigned to it.
    """
    encoded = tokenizer(sentence, padding=True, truncation=True, max_length=128, return_tensors="tf")
    raw_scores = model(encoded.data).logits

    # Normalise over the class axis, then keep only the first row of the batch.
    first_row_probs = tf.nn.softmax(raw_scores, axis=-1)[0]

    best_idx = tf.argmax(first_row_probs, axis=-1).numpy()
    confidence_score = first_row_probs[best_idx].numpy()

    # inverse_transform works on sequences, so wrap the index and unwrap the result.
    predicted_class = label_encoder.inverse_transform([best_idx])[0]
    return predicted_class, confidence_score

# Demo query. A prediction whose confidence is below 0.70 is reported as
# "ambiguous" instead of the model's label.
predicted_class, confidence_score = predict_class(
    "who is our worst employee", model, tokenizer, label_encoder
)
predicted_class = "ambiguous" if confidence_score < 0.70 else predicted_class
print(f"The predicted class for the sentence is: {predicted_class} with a confidence score of {confidence_score}")

The predicted class for the sentence is: sales_report with a confidence score of 0.9802234172821045
