In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

In [2]:
df = pd.read_excel("../data/chatbot_conversation_history.xlsx" )

In [3]:
df.head()

Unnamed: 0,UserID,Start Time,End Time,Session ID,Query,Label,Language
0,45,2024-07-01 12:01:27,2024-07-01 16:35:13,S001,How to install Product3?,product3,English
1,48,2024-07-01 23:54:02,2024-07-01 23:55:02,S002,What is the documentation for Product1?,product1,English
2,65,2024-07-01 10:40:30,2024-07-01 16:32:16,S003,Can you help me?,smalltalk,French
3,68,2024-07-01 18:22:07,2024-07-01 18:23:07,S004,Help with Product2 issues.,product2,English
4,68,2024-07-01 00:30:15,2024-07-01 03:51:26,S005,Troubleshooting Product3.,product3,English


In [4]:
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(df["Label"])

In [5]:
df.head()

Unnamed: 0,UserID,Start Time,End Time,Session ID,Query,Label,Language,label_encoded
0,45,2024-07-01 12:01:27,2024-07-01 16:35:13,S001,How to install Product3?,product3,English,2
1,48,2024-07-01 23:54:02,2024-07-01 23:55:02,S002,What is the documentation for Product1?,product1,English,0
2,65,2024-07-01 10:40:30,2024-07-01 16:32:16,S003,Can you help me?,smalltalk,French,3
3,68,2024-07-01 18:22:07,2024-07-01 18:23:07,S004,Help with Product2 issues.,product2,English,1
4,68,2024-07-01 00:30:15,2024-07-01 03:51:26,S005,Troubleshooting Product3.,product3,English,2


In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_encoded'])

In [7]:
# Tokenize the queries using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the data
train_encoded_inputs = tokenizer(
    train_df['Query'].tolist(), 
    padding=True, 
    truncation=True, 
    return_tensors='pt', 
    max_length=128
)




In [8]:
# Prepare the dataset for the BERT model
train_input_ids = train_encoded_inputs['input_ids']
train_attention_masks = train_encoded_inputs['attention_mask']
# labels = torch.tensor(df['label_encoded'].values)
train_labels = torch.tensor(train_df['label_encoded'].values, dtype=torch.long)

# Display the shapes of the input tensors
print(train_input_ids.shape, train_attention_masks.shape, train_labels.shape)

torch.Size([800, 13]) torch.Size([800, 13]) torch.Size([800])


In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks, labels=None):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx]
        }
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item



# Create the train and test datasets
train_dataset = CustomDataset(train_input_ids, train_attention_masks, train_labels)
# test_dataset = CustomDataset(X_test, attention_masks_test, y_test)


In [12]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(df['label_encoded'].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:

model.to(device)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for the results
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
)


In [14]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
    # eval_dataset=test_dataset
)


In [15]:
# Train the model
trainer.train()


Step,Training Loss
10,1.3719
20,1.335
30,1.2935
40,1.2095
50,1.1228
60,0.9627
70,0.84
80,0.6611
90,0.4666
100,0.3728


TrainOutput(global_step=300, training_loss=0.33841514579330884, metrics={'train_runtime': 610.6969, 'train_samples_per_second': 3.93, 'train_steps_per_second': 0.491, 'total_flos': 16033617849600.0, 'train_loss': 0.33841514579330884, 'epoch': 3.0})

In [20]:
# Tokenize the test data
test_encoded_inputs = tokenizer(
    test_df['Query'].tolist(), 
    padding=True, 
    truncation=True, 
    return_tensors='pt', 
    max_length=128
)

test_input_ids = test_encoded_inputs['input_ids']
test_attention_masks = test_encoded_inputs['attention_mask']

# Create the test dataset
test_dataset = CustomDataset(test_input_ids, test_attention_masks)

# Predict labels for the test set
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(batch_predictions)

# Decode the predicted labels
predicted_labels = le.inverse_transform(predictions)

# Add predicted labels to the test dataframe
test_df['Predicted_Label'] = predicted_labels

# Print the test dataframe with predicted labels
print(test_df[['Query', 'Predicted_Label']])

# Optionally, save the test dataframe with predicted labels to an Excel file
test_df.to_excel('../data/chatbot_test_predictions.xlsx', index=False)

                                Query Predicted_Label
123          How to install Product3?        product3
632          How to install Product2?        product2
984                How's the weather?       smalltalk
66       Product1 installation guide.        product1
581         Troubleshooting Product3.        product3
..                                ...             ...
788  Guide for Product2 installation.        product2
932                     Good morning!       smalltalk
610                  Can you help me?       smalltalk
11                   Can you help me?       smalltalk
757           Product3 documentation.        product3

[200 rows x 2 columns]


In [22]:
model.save_pretrained('models/bert_model')
tokenizer.save_pretrained('models/bert_tokenizer')

print("Model and tokenizer saved successfully.")

Model and tokenizer saved successfully.
