In [None]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import (BinaryAccuracy, FalsePositives, FalseNegatives, TruePositives,
                                       TrueNegatives, Precision, Recall, AUC, binary_accuracy,Accuracy,
                                       TopKCategoricalAccuracy, CategoricalAccuracy,SparseCategoricalAccuracy)
from tensorflow.keras.optimizers import Adam
from datasets import load_dataset
from transformers import (BertTokenizerFast,TFBertTokenizer,BertTokenizer,RobertaTokenizerFast,
                          DataCollatorWithPadding,TFRobertaForSequenceClassification,TFBertForSequenceClassification,
                          TFBertModel,create_optimizer,TFDebertaForSequenceClassification,DebertaTokenizerFast)

In [6]:
# Number of examples per batch; read when the tf.data pipeline is built and
# when the number of training steps is computed.
BATCH_SIZE = 16

## Data preparation

In [None]:
import kagglehub

# Fetch the latest version of the Bitext customer-support dataset and keep
# the location that kagglehub reports for the downloaded files.
dataset_handle = "bitext/bitext-gen-ai-chatbot-customer-support-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# Load the CSV from the directory kagglehub just downloaded into (`path`)
# instead of assuming a copy already sits in the current working directory,
# so the notebook runs from a fresh kernel without manual file copying.
csv_name = "Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"
csv_candidates = sorted(pathlib.Path(path).rglob(csv_name))
# Fall back to the bare file name so a manually placed copy in the working
# directory still loads exactly as before.
csv_file = str(csv_candidates[0]) if csv_candidates else csv_name
dataset=load_dataset("csv", data_files=csv_file)

In [None]:
# Display the loaded dataset object to inspect its splits and columns.
dataset

In [None]:
# Show the first raw example of the 'train' split.
dataset['train'][0]

In [None]:
# Build the intent vocabulary and its name -> integer id lookup.
# sorted() makes the mapping deterministic: iterating a bare set of strings
# can yield a different order in every Python process (string hash
# randomisation), which would silently change which class id each intent
# receives between kernel restarts and invalidate any saved model.
intents = sorted(set(dataset['train']['intent']))
dict_intents = {intent: idx for idx, intent in enumerate(intents)}
print(dict_intents)
print(len(intents))

In [22]:
def preprocess(dataset):
    """Replace an example's intent name with its integer class id.

    Args:
        dataset: one example, indexable by 'instruction' and 'intent'.

    Returns:
        A dict holding the unchanged 'instruction' value and, under
        'intent', the id looked up in the module-level ``dict_intents``.
    """
    instruction = dataset['instruction']
    intent_id = dict_intents[dataset['intent']]
    return {'instruction': instruction, 'intent': intent_id}

In [None]:
# Run the intent-name -> id conversion over the loaded dataset.
prep_dataset = dataset.map(function=preprocess)

In [None]:
# Show the first example after preprocessing ('intent' is now an integer id).
prep_dataset['train'][0]

In [None]:
# Checkpoint name used for the tokenizer here and for the classifier that is
# loaded from the same `model_id` later in the notebook.
model_id = "microsoft/deberta-base"
tokenizer = DebertaTokenizerFast.from_pretrained(
    model_id
)

In [26]:
def tokenizer_function(dataset):
    """Tokenize the 'instruction' text of an example.

    Args:
        dataset: an example (or batch of examples) indexable by
            'instruction'.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    # truncation=True caps the encoding at the tokenizer's configured maximum
    # length, so an unusually long instruction cannot exceed what the model
    # accepts; shorter texts are encoded exactly as before.
    return tokenizer(dataset['instruction'], truncation=True)

In [None]:
# Tokenize every preprocessed example.
tokenized_dataset = prep_dataset.map(function=tokenizer_function)

In [None]:
# Display the tokenized dataset object to check which columns were added.
tokenized_dataset

In [None]:
# Show the first tokenized example of the 'train' split.
tokenized_dataset['train'][0]

In [31]:
# Collator that pads the examples of a batch with this tokenizer and returns
# TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

In [32]:
# Shuffle ONCE, with a fixed seed, on the Hugging Face side and leave the
# tf.data pipeline itself unshuffled. With shuffle=True the pipeline is
# reshuffled on every iteration, so the take()/skip() split applied to
# `tf_dataset` further down would select different examples on each pass and
# validation batches would leak into training.
tf_dataset = tokenized_dataset["train"].shuffle(seed=42).to_tf_dataset(
    columns = ['input_ids', 'attention_mask', 'intent'],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

In [33]:
def swap_position(dataset):
    """Split a batch mapping into a ``(features, label)`` pair.

    Args:
        dataset: mapping with 'input_ids', 'attention_mask' and 'intent'.

    Returns:
        A tuple whose first item is a dict with only 'input_ids' and
        'attention_mask', and whose second item is the 'intent' value.
    """
    features = {key: dataset[key] for key in ('input_ids', 'attention_mask')}
    return features, dataset['intent']

In [34]:
# Reshape every batch into (features, label). This rebinds `tf_dataset`, so
# re-running this cell would apply swap_position a second time.
tf_dataset = tf_dataset.map(map_func=swap_position)

In [35]:
# 90/10 split by batch count: the leading batches are used for training and
# the remainder for validation. take()/skip() only give disjoint splits when
# `tf_dataset` yields its batches in the same order on every iteration.
num_train_batches = int(0.9*len(tf_dataset))
train_dataset = tf_dataset.take(num_train_batches)
val_dataset = tf_dataset.skip(num_train_batches)

In [None]:
# Print a single validation batch to check the (features, label) layout.
for sample_batch in val_dataset.take(1):
    print(sample_batch)

## Modeling

In [None]:
# Load the classifier from the same checkpoint as the tokenizer, with one
# output label per distinct intent.
num_intents = len(intents)
model = TFDebertaForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_intents,
)
model.summary()

### Training

In [38]:
num_epochs = 2
# Count the batches of the split that is actually trained on. Deriving this
# from the full tokenized set would also count the validation batches and
# make the learning-rate schedule longer than the real training run.
batches_per_epoch = len(train_dataset)
total_train_steps = batches_per_epoch*num_epochs

In [39]:
# Build the optimizer together with its learning-rate schedule: initial rate
# 2e-5, no warmup steps, sized for `total_train_steps` steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)

In [40]:
# Attach the optimizer and track accuracy. No loss is passed here.
# NOTE(review): presumably the transformers model falls back to its built-in
# loss in that case; confirm against the library version in use.
model.compile(
    metrics=['accuracy'],
    optimizer=optimizer,
)

In [None]:
# Reuse `num_epochs` rather than repeating the literal, so the number of
# epochs trained always matches the one used to size the learning-rate
# schedule.
history = model.fit(
    train_dataset,
    validation_data = val_dataset,
    epochs=num_epochs
)

In [None]:
# Training vs. validation loss per epoch; curves are drawn in legend order.
for loss_key in ('loss', 'val_loss'):
    plt.plot(history.history[loss_key])

plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch; curves are drawn in legend order.
for accuracy_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[accuracy_key])

plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## Evaluation

### Confusion matrix

In [None]:
# Collect per-batch logits and ground-truth labels over the validation set.
predicted = []
labels = []

# The loop variable is named `features` (not `input`) so the builtin input()
# is not shadowed for the rest of the notebook.
for features, label in val_dataset:
    predicted.append(model(**features).logits)
    # .numpy() must be CALLED: appending the bare attribute would store the
    # bound method object instead of the label values.
    labels.append(label.numpy())

In [None]:
# Dump the raw per-batch logits, then the per-batch labels.
for collected in (predicted, labels):
    print(collected)

In [None]:
# Predicted class ids and labels for every batch except the last one.
# NOTE(review): the last batch is presumably left out because it may be
# smaller than the others and could not be stacked with them; confirm.
leading_pred_ids = tf.argmax(predicted[:-1], axis=-1)
print(leading_pred_ids.numpy())
print(labels[:-1])

In [None]:
# Flatten labels into one 1-D array: all batches but the last are handled
# together, the last batch separately, then both parts are joined.
leading_labels = np.array(labels[:-1]).flatten()
trailing_labels = np.array(labels[-1]).flatten()
print(np.concatenate([leading_labels, trailing_labels]))

# Same layout for the predicted class ids (argmax over the last axis).
leading_preds = np.argmax(predicted[:-1], axis = -1).flatten()
trailing_preds = np.argmax(predicted[-1], axis = -1).flatten()
print(np.concatenate([leading_preds, trailing_preds]))

In [None]:
# `lab` holds the ground-truth class ids and `pred` the predicted class ids.
# The names matter: the following cell calls confusion_matrix(lab, pred),
# whose first argument is the ground truth, and labels the heatmap rows
# 'Actual' and columns 'Predicted'. Assigning them the other way round
# transposes the matrix relative to those axis labels.
# Flattening batch by batch also copes with a smaller final batch.
lab = np.concatenate([np.asarray(batch).ravel() for batch in labels])
pred = np.concatenate([np.argmax(batch, axis=-1).ravel() for batch in predicted])

In [None]:
# confusion_matrix treats its first argument as the true labels and its
# second as the predictions, so `lab` must hold the ground truth and `pred`
# the model outputs for the 'Actual'/'Predicted' axis labels to be right.
cm = confusion_matrix(y_true=lab, y_pred=pred)
print(cm)

# Large square canvas so the per-cell annotations stay readable.
plt.figure(figsize=(16, 16))
sns.heatmap(data=cm, annot=True)
plt.title('Confusion matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')

## Testing

In [None]:
# Hand-written queries used as a quick smoke test of the fine-tuned model.
sample_queries = [
    "Please how do i go about the account creation? ",
    "After setting up my account, i feel like i need to change it. How do i go about that?",
    "how do i know how much i need to pay?",
    "purchased a product, which i now want to change",
]
inputs = tokenizer(sample_queries, padding=True, return_tensors="tf")

# Highest-scoring class id for each query.
logits = model(**inputs).logits
outputs = tf.argmax(logits, axis=-1).numpy()

In [None]:
# Print the predicted class ids for the sample queries.
print(outputs)

In [None]:
# Inverse lookup: class id -> intent name, following the order of `intents`.
reverse_dict_intents = dict(enumerate(intents))
print(reverse_dict_intents)

In [None]:
# Translate each predicted class id back to its intent name.
for predicted_id in outputs:
    print(reverse_dict_intents[predicted_id])