### Text-Classification with RAG Base LLM Model

In [80]:
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from qdrant_client.http import models
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_fixed,
)
from pydantic import BaseModel
import instructor
import openai
from dotenv import load_dotenv
import os
from sklearn.model_selection import train_test_split
import time

In [36]:
# Sentence-embedding model used to vectorize the texts
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Qdrant client backed by an in-memory store
qdrant_client = QdrantClient(":memory:")
# NOTE(review): size=384 is presumably the output dimension of the embedding model above - confirm
qdrant_client.create_collection(
    'text-classification',
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)
# Labelled dataset (columns used below: 'text' and 'label_text')
data_csv = pd.read_csv('../data/text-classification/train.csv')
# Keep 20% of the rows aside for evaluation; the seed is fixed so the split is repeatable
train_csv, test_csv = train_test_split(data_csv, test_size=0.2, random_state=42)
# Embed all training texts in a single call
embeddings = embedding_model.encode(train_csv['text'].tolist(), show_progress_bar=True)
# One point per training row: the dataframe index label is the point id, the
# embedding is looked up by the row's position, and the payload carries both
# the label and the original text.
points = [
    models.PointStruct(
        id=row_label,
        vector=embeddings[train_csv.index.get_loc(row_label)].tolist(),
        payload={'label_text': row['label_text'], "text": row['text']},
    )
    for row_label, row in train_csv.iterrows()
]
qdrant_client.upload_points(collection_name='text-classification', points=points)

Batches: 100%|██████████| 200/200 [00:06<00:00, 31.95it/s]


In [37]:
def qdrant_search(query_text,top_k=5):
    """Search the 'text-classification' collection for `query_text`.

    The text is encoded with the module-level `embedding_model` and the
    resulting vector is passed to `qdrant_client.search`.

    Args:
        query_text: Text to look up.
        top_k: Maximum number of hits requested from Qdrant.

    Returns:
        Whatever `qdrant_client.search` returns for the query.
    """
    return qdrant_client.search(
        collection_name='text-classification',
        query_vector=embedding_model.encode(query_text),
        limit=top_k,
    )

In [38]:
# Distinct label values found in the training split
category_labels = train_csv["label_text"].unique()
# Render the labels as a dash-bulleted list, one label per line
categories_list = "- {}".format("\n- ".join(category_labels))
# System prompt shared by every classification request (and by the fine-tuning examples)
system_prompt = f"""
You are an agent that is specialized in classification tasks.\n
Along with the input text, you are provided with the top 10 documents retrieved from a Retrieval-Augmented Generation (RAG) model. 
Use this information to classify the input text into one of the following categories:
{categories_list}
Note: The documents are included in the user's message for context.
"""

In [61]:
category_labels

array(['iot', 'not toxic', 'play_music',
       'balance_not_updated_after_cheque_or_cash_deposit',
       'weather_query', 'weather', 'news', 'exchange_rate', 'alarm_set',
       'takeaway_query', 'play', 'datetime', 'automatic_top_up',
       'cancel_transfer', 'extra_charge_on_statement', 'general',
       'why_verify_identity', 'datetime_query', 'toxic', 'news_query',
       'alarm', 'wrong_amount_of_cash_received', 'top_up_limits',
       'transfer_not_received_by_recipient',
       'card_payment_not_recognised', 'audio_volume_mute', 'music_query',
       'takeaway', 'lost_or_stolen_card', 'card_not_working',
       'card_linking', 'pending_top_up', 'iot_coffee', 'pin_blocked',
       'music', 'pending_cash_withdrawal', 'alarm_remove',
       'unable_to_verify_identity', 'card_arrival',
       'getting_virtual_card', 'exchange_via_app', 'iot_hue_lightoff',
       'fiat_currency_support', 'supported_cards_and_currencies',
       'edit_personal_details', 'music_likeness',
       'ca

In [106]:
from typing import Literal

from pydantic import field_validator


# Structured response model handed to the LLM call via `response_model`.
# No class docstring on purpose: pydantic would include it in the JSON schema
# description, which would change what is sent to the model.
class CategoryModel(BaseModel):
    # The allowed values are fixed at class-creation time from the training labels.
    category: Literal[tuple(category_labels.tolist())]

    # Without the decorators below pydantic never calls this method, so the
    # check was dead code; it is now registered as a validator for `category`.
    @field_validator("category")
    @classmethod
    def check_category(cls, value):
        """Return `value` unchanged if it is a known category label.

        Raises:
            ValueError: if `value` is not one of `category_labels`.
        """
        if value not in category_labels:
            raise ValueError(f"{value} is not a valid category")
        return value
# Load environment variables through python-dotenv before reading the key
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
# The OpenAI client is patched with instructor; the classification call below
# relies on this to pass a `response_model`.
openai_client = instructor.patch(
    openai.OpenAI(api_key=openai_api_key)
)
# Retry policy for the OpenAI call (e.g. rate limiting): at most 2 attempts,
# with a fixed 60-second pause between them.
@retry(stop=stop_after_attempt(2), wait=wait_fixed(60))
def classify_query_text(query_text,use_rag = True,chat_model="gpt-3.5-turbo") -> str:
    """Classify `query_text` into one of the known categories with an LLM.

    Args:
        query_text: The text to classify.
        use_rag: When true, the 10 closest training examples returned by
            `qdrant_search` (text plus label) are prepended to the user message.
        chat_model: Name of the OpenAI chat model to call.

    Returns:
        The `category` field of the `CategoryModel` produced by the call.

    Any exception raised by the OpenAI call propagates to the `retry` wrapper.
    """
    if not use_rag:
        user_message = f"""
        Input text to classify: {query_text}
        """
    else:
        sample_documents = []
        for hit in qdrant_search(query_text, top_k=10):
            sample_documents.append({
                "Text": hit.payload['text'],
                "Label": hit.payload['label_text'],
            })

        user_message = f"""
        Reference Documents from RAG Model: {sample_documents}\n\n
        Input text to classify: {query_text}
        """

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]

    # Low temperature and a fixed seed are requested to keep runs comparable.
    chat_completion = openai_client.chat.completions.create(
        messages=conversation,
        model=chat_model,
        response_model=CategoryModel,
        temperature=0.2,
        max_tokens=1000,
        seed=42,
    )

    return chat_completion.category

In [107]:
from sklearn.metrics import classification_report
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Define a function to apply classification
def classify_text(text, use_rag, chat_model):
    """Forward the three positional arguments to `classify_query_text` and return its result."""
    predicted_category = classify_query_text(text, use_rag, chat_model)
    return predicted_category

# Define a function to apply multi-threading and calculate accuracy
def classify_and_calculate_accuracy(test_csv, use_rag=True, chat_model="gpt-3.5-turbo"):
    """Classify every row of `test_csv` in parallel and print evaluation metrics.

    Side effects: adds (or overwrites) the columns 'predicted_category' and
    'accuracy' on the `test_csv` dataframe passed in, and prints the accuracy
    and the classification report.

    Args:
        test_csv: Dataframe with a 'text' column to classify and a
            'label_text' column holding the expected label.
        use_rag: Forwarded to `classify_text` for every row.
        chat_model: Forwarded to `classify_text` for every row.

    Returns:
        The text report produced by `classification_report`.
    """
    def _predict(text):
        # Bind the per-run settings once instead of building two throwaway
        # lists of repeated values just to feed executor.map.
        return classify_text(text, use_rag, chat_model)

    # The calls are network-bound, so 12 worker threads overlap the waiting time.
    # executor.map yields results in input order, so predictions line up with rows.
    with ThreadPoolExecutor(max_workers=12) as executor:
        predictions = list(
            tqdm(
                executor.map(_predict, test_csv['text']),
                total=len(test_csv),
                desc="Predicting",
            )
        )
    test_csv['predicted_category'] = predictions

    # Row-wise correctness flag, then the share of correct rows
    test_csv['accuracy'] = test_csv['label_text'] == test_csv['predicted_category']
    print(f"Accuracy: {test_csv['accuracy'].sum() / len(test_csv)}")

    # zero_division=0 reports 0.0 for labels with no predicted or no true
    # samples, which is what the default does too, but without emitting an
    # UndefinedMetricWarning for each such label.
    print("Classification Report:")
    c_r = classification_report(
        test_csv['label_text'],
        test_csv['predicted_category'],
        zero_division=0,
    )
    print(c_r)

    return c_r

In [73]:
# Evaluate with the function defaults: use_rag=True and the base "gpt-3.5-turbo" model
classify_and_calculate_accuracy(test_csv)

Predicting: 100%|██████████| 1599/1599 [01:46<00:00, 15.06it/s]

Accuracy: 0.5128205128205128
Classification Report:
                                                  precision    recall  f1-score   support

                                       age_limit       0.85      1.00      0.92        11
                                           alarm       0.04      0.03      0.03        35
                                     alarm_query       0.08      0.08      0.08        12
                                    alarm_remove       0.50      0.75      0.60         4
                                       alarm_set       0.24      0.35      0.29        17
                                           audio       0.00      0.00      0.00        30
                               audio_volume_down       0.12      0.20      0.15         5
                               audio_volume_mute       0.35      0.60      0.44        10
                              audio_volume_other       0.00      0.00      0.00         1
                                 audio_volume_u


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'                                                  precision    recall  f1-score   support\n\n                                       age_limit       0.85      1.00      0.92        11\n                                           alarm       0.04      0.03      0.03        35\n                                     alarm_query       0.08      0.08      0.08        12\n                                    alarm_remove       0.50      0.75      0.60         4\n                                       alarm_set       0.24      0.35      0.29        17\n                                           audio       0.00      0.00      0.00        30\n                               audio_volume_down       0.12      0.20      0.15         5\n                               audio_volume_mute       0.35      0.60      0.44        10\n                              audio_volume_other       0.00      0.00      0.00         1\n                                 audio_volume_up       0.21      0.20      0.21        

## How to Finetune

### Prepare Data For Finetuning

In [75]:
# create training file for fine tuning , the data needs to be preprocessed in a specific way to finetune a model
import json
# One chat-format example per training row: the shared system prompt, the raw
# text as the user turn and the expected label as the assistant turn.
# NOTE(review): the user turn here is the bare text, while classify_query_text
# wraps it as "Input text to classify: ..." at inference time - confirm this
# mismatch between training and inference prompts is intended.
training_data = [
    {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": record.text},
            {"role": "assistant", "content": record.label_text},
        ]
    }
    for _, record in train_csv.iterrows()
]
# Destination of the JSONL training file
training_file_name = "../data/text-classification/new_training_data.jsonl"
def prepare_data(dictionary_data, final_file_name):
    """Write `dictionary_data` to `final_file_name` in JSON Lines format.

    Each entry is serialized with `json` and followed by a newline. The file
    is created or truncated even when `dictionary_data` is empty.

    Args:
        dictionary_data: Iterable of JSON-serializable entries.
        final_file_name: Path of the file to write.
    """
    serialized_lines = (json.dumps(entry) + '\n' for entry in dictionary_data)
    with open(final_file_name, 'w') as outfile:
        outfile.writelines(serialized_lines)
# Write the chat-format examples to the JSONL training file
prepare_data(training_data, training_file_name)

### Upload Data For Finetuning

In [76]:
# Upload the training file to OpenAI for fine-tuning.
# The file is opened in a `with` block so the handle is closed once the upload
# call returns (it was previously left open).
# NOTE: despite its name, `training_file_id` holds the whole object returned by
# the API; the identifier itself is `training_file_id.id`.
with open(training_file_name, "rb") as training_file:
    training_file_id = openai_client.files.create(
        file=training_file,
        purpose="fine-tune",
    )
print(f"Training File ID: {training_file_id}")

Training File ID: FileObject(id='file-dl40RvUHbTR5KWh3OD7WcFZk', bytes=14200775, created_at=1717400849, filename='new_training_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


### Create Finetuning Job

Some Hyperparameters for fine tuning you can change are:

- batch_size : Number of examples in each batch. A larger batch size means that model parameters are updated less frequently, but with lower variance.

- learning_rate_multiplier: Scaling factor for the learning rate. A smaller learning rate may be useful to avoid overfitting.

- n_epochs: The number of epochs to train the model for. An epoch refers to one full cycle through the training dataset.

In [86]:
# Start a fine-tuning job on OpenAI from the uploaded training file
response = openai_client.fine_tuning.jobs.create(
    training_file=training_file_id.id,
    model="gpt-3.5-turbo",
    hyperparameters={"n_epochs": 1},  # a single pass over the training data
)
# Keep the job id (needed to poll the job later) and the initial status
job_id, status = response.id, response.status
print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-K2hZChUK3QqLbjDZxI3XUgQj.
Training Response: FineTuningJob(id='ftjob-K2hZChUK3QqLbjDZxI3XUgQj', created_at=1717403916, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=1, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-bx8UmNWyClGMb1kBUdofHdpY', result_files=[], seed=1336267681, status='validating_files', trained_tokens=None, training_file='file-dl40RvUHbTR5KWh3OD7WcFZk', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
Training Status: validating_files


Fine-tuning takes roughly 70–80 minutes on ~6k training rows.

In [None]:
def check_fine_tuning_job_status(openai_client, job_id, poll_interval_seconds=10):
    """Block until the fine-tuning job reaches a terminal status, printing progress.

    After the job has finished, the number of fine-tuning jobs returned by
    `openai_client.fine_tuning.jobs.list()` is printed as well.

    Args:
        openai_client: Client exposing `fine_tuning.jobs.retrieve` and
            `fine_tuning.jobs.list`.
        job_id: Identifier of the fine-tuning job to watch.
        poll_interval_seconds: Pause between two status checks.

    Returns:
        None. All information is reported through `print`.
    """
    # "cancelled" is terminal too; without it a cancelled job would be polled forever.
    terminal_statuses = ("succeeded", "failed", "cancelled")

    status = openai_client.fine_tuning.jobs.retrieve(job_id).status
    while status not in terminal_statuses:
        print(f"Job not in terminal status: {status}. Waiting.")
        time.sleep(poll_interval_seconds)
        status = openai_client.fine_tuning.jobs.retrieve(job_id).status
        print(f"Status: {status}")

    print(f"Finetune job {job_id} finished with status: {status}")

    # Informational: how many fine-tuning jobs the listing call returns
    result = openai_client.fine_tuning.jobs.list()
    print(f"Found {len(result.data)} finetune jobs.")

# Poll the fine-tuning job created earlier until it has finished
check_fine_tuning_job_status(openai_client, job_id)


### Text-Classification without RAG and Fine-Tuned LLM Model 

In [98]:
# Read the name of the resulting model from the job record; it is passed as
# `chat_model` in the evaluation runs below.
fine_tuned_model = openai_client.fine_tuning.jobs.retrieve(job_id).fine_tuned_model
print(fine_tuned_model)

ft:gpt-3.5-turbo-0125:personal::9VySD7OR


In [108]:
# Fine-tuned model, no retrieved documents in the prompt
classify_and_calculate_accuracy(test_csv,use_rag=False,chat_model=fine_tuned_model)

Predicting: 100%|██████████| 1599/1599 [01:31<00:00, 17.56it/s]

Accuracy: 0.6823014383989994
Classification Report:
                                                  precision    recall  f1-score   support

                                       age_limit       1.00      1.00      1.00        11
                                           alarm       0.50      0.03      0.05        35
                                     alarm_query       0.45      0.83      0.59        12
                                    alarm_remove       0.50      1.00      0.67         4
                                       alarm_set       0.44      0.94      0.60        17
                                           audio       0.00      0.00      0.00        30
                               audio_volume_down       0.36      0.80      0.50         5
                               audio_volume_mute       0.43      1.00      0.61        10
                              audio_volume_other       1.00      1.00      1.00         1
                                 audio_volume_u


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'                                                  precision    recall  f1-score   support\n\n                                       age_limit       1.00      1.00      1.00        11\n                                           alarm       0.50      0.03      0.05        35\n                                     alarm_query       0.45      0.83      0.59        12\n                                    alarm_remove       0.50      1.00      0.67         4\n                                       alarm_set       0.44      0.94      0.60        17\n                                           audio       0.00      0.00      0.00        30\n                               audio_volume_down       0.36      0.80      0.50         5\n                               audio_volume_mute       0.43      1.00      0.61        10\n                              audio_volume_other       1.00      1.00      1.00         1\n                                 audio_volume_up       0.58      0.93      0.72        

### Text Classification with RAG and Finetuned LLM Model

In [109]:
# Fine-tuned model with the retrieved documents added to the prompt
classify_and_calculate_accuracy(test_csv,use_rag=True,chat_model=fine_tuned_model)

Predicting: 100%|██████████| 1599/1599 [01:39<00:00, 16.08it/s]

Accuracy: 0.5859912445278299
Classification Report:
                                                  precision    recall  f1-score   support

                                       age_limit       1.00      1.00      1.00        11
                                           alarm       0.07      0.03      0.04        35
                                     alarm_query       0.43      0.75      0.55        12
                                    alarm_remove       0.50      1.00      0.67         4
                                       alarm_set       0.28      0.41      0.33        17
                                           audio       0.00      0.00      0.00        30
                               audio_volume_down       0.31      0.80      0.44         5
                               audio_volume_mute       0.39      0.70      0.50        10
                              audio_volume_other       0.00      0.00      0.00         1
                                 audio_volume_u


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'                                                  precision    recall  f1-score   support\n\n                                       age_limit       1.00      1.00      1.00        11\n                                           alarm       0.07      0.03      0.04        35\n                                     alarm_query       0.43      0.75      0.55        12\n                                    alarm_remove       0.50      1.00      0.67         4\n                                       alarm_set       0.28      0.41      0.33        17\n                                           audio       0.00      0.00      0.00        30\n                               audio_volume_down       0.31      0.80      0.44         5\n                               audio_volume_mute       0.39      0.70      0.50        10\n                              audio_volume_other       0.00      0.00      0.00         1\n                                 audio_volume_up       0.50      0.87      0.63        

In [110]:
# Base "gpt-3.5-turbo" model without RAG (neither fine-tuning nor retrieved documents)
classify_and_calculate_accuracy(test_csv,use_rag=False,chat_model="gpt-3.5-turbo")

Predicting:  40%|████      | 645/1599 [00:54<02:02,  7.77it/s]Incomplete output detected, should increase max_tokens
Incomplete output detected, should increase max_tokens
Predicting: 100%|██████████| 1599/1599 [02:04<00:00, 12.79it/s]

Accuracy: 0.3902439024390244
Classification Report:
                                                  precision    recall  f1-score   support

                                       age_limit       1.00      0.27      0.43        11
                                           alarm       0.56      0.14      0.23        35
                                     alarm_query       0.50      0.75      0.60        12
                                    alarm_remove       0.50      0.75      0.60         4
                                       alarm_set       0.41      0.82      0.55        17
                                           audio       0.00      0.00      0.00        30
                               audio_volume_down       0.30      0.60      0.40         5
                               audio_volume_mute       0.41      0.70      0.52        10
                              audio_volume_other       0.20      1.00      0.33         1
                                 audio_volume_u


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'                                                  precision    recall  f1-score   support\n\n                                       age_limit       1.00      0.27      0.43        11\n                                           alarm       0.56      0.14      0.23        35\n                                     alarm_query       0.50      0.75      0.60        12\n                                    alarm_remove       0.50      0.75      0.60         4\n                                       alarm_set       0.41      0.82      0.55        17\n                                           audio       0.00      0.00      0.00        30\n                               audio_volume_down       0.30      0.60      0.40         5\n                               audio_volume_mute       0.41      0.70      0.52        10\n                              audio_volume_other       0.20      1.00      0.33         1\n                                 audio_volume_up       0.61      0.73      0.67        

### Finetuned Model Without RAG Classification Report

| Class                                          | Precision | Recall | F1-Score | Support |
|------------------------------------------------|-----------|--------|----------|---------|
| age_limit                                      | 1.00      | 1.00   | 1.00     | 11      |
| alarm                                          | 0.50      | 0.03   | 0.05     | 35      |
| alarm_query                                    | 0.45      | 0.83   | 0.59     | 12      |
| alarm_remove                                   | 0.50      | 1.00   | 0.67     | 4       |
| alarm_set                                      | 0.44      | 0.94   | 0.60     | 17      |
| audio                                          | 0.00      | 0.00   | 0.00     | 30      |
| audio_volume_down                              | 0.36      | 0.80   | 0.50     | 5       |
| audio_volume_mute                              | 0.43      | 1.00   | 0.61     | 10      |
| audio_volume_other                             | 1.00      | 1.00   | 1.00     | 1       |
| audio_volume_up                                | 0.58      | 0.93   | 0.72     | 15      |
| automatic_top_up                               | 1.00      | 1.00   | 1.00     | 13      |
| balance_not_updated_after_cheque_or_cash_deposit | 0.90      | 1.00   | 0.95     | 19      |
| calendar_query                                 | 0.50      | 1.00   | 0.67     | 1       |
| cancel_transfer                                | 1.00      | 0.92   | 0.96     | 13      |
| card_acceptance                                | 1.00      | 1.00   | 1.00     | 6       |
| card_arrival                                   | 0.75      | 0.79   | 0.77     | 19      |
| card_delivery_estimate                         | 0.56      | 0.62   | 0.59     | 8       |
| card_linking                                   | 0.95      | 1.00   | 0.98     | 20      |
| card_not_working                               | 1.00      | 0.70   | 0.82     | 10      |
| card_payment_fee_charged                       | 0.92      | 0.79   | 0.85     | 14      |
| card_payment_not_recognised                    | 1.00      | 0.89   | 0.94     | 19      |
| ...                                            | ...       | ...    | ...      | ...     |
| **accuracy**                                   |           |        | **0.68** | 1599    |
| **macro avg**                                  | 0.61      | 0.63   | 0.57     | 1599    |
| **weighted avg**                               | 0.69      | 0.68   | 0.62     | 1599    |

### Finetuned Model with RAG Classification Report
| Class                                          | Precision | Recall | F1-Score | Support |
|------------------------------------------------|-----------|--------|----------|---------|
| age_limit                                      | 1.00      | 1.00   | 1.00     | 11      |
| alarm                                          | 0.07      | 0.03   | 0.04     | 35      |
| alarm_query                                    | 0.43      | 0.75   | 0.55     | 12      |
| alarm_remove                                   | 0.50      | 1.00   | 0.67     | 4       |
| alarm_set                                      | 0.28      | 0.41   | 0.33     | 17      |
| audio                                          | 0.00      | 0.00   | 0.00     | 30      |
| audio_volume_down                              | 0.31      | 0.80   | 0.44     | 5       |
| audio_volume_mute                              | 0.39      | 0.70   | 0.50     | 10      |
| audio_volume_other                             | 0.00      | 0.00   | 0.00     | 1       |
| audio_volume_up                                | 0.50      | 0.87   | 0.63     | 15      |
| automatic_top_up                               | 1.00      | 1.00   | 1.00     | 13      |
| balance_not_updated_after_cheque_or_cash_deposit | 0.95      | 1.00   | 0.97     | 19      |
| calendar                                       | 0.00      | 0.00   | 0.00     | 0       |
| calendar_query                                 | 0.00      | 0.00   | 0.00     | 1       |
| cancel_transfer                                | 0.92      | 0.92   | 0.92     | 13      |
| card_acceptance                                | 1.00      | 1.00   | 1.00     | 6       |
| card_arrival                                   | 0.84      | 0.84   | 0.84     | 19      |
| card_delivery_estimate                         | 0.70      | 0.88   | 0.78     | 8       |
| card_linking                                   | 0.95      | 1.00   | 0.98     | 20      |
| card_not_working                               | 0.89      | 0.80   | 0.84     | 10      |
| card_payment_fee_charged                       | 0.88      | 1.00   | 0.93     | 14      |
| ...                                            | ...       | ...    | ...      | ...     |
| **accuracy**                                   |           |        | **0.59** | 1599    |
| **macro avg**                                  | 0.46      | 0.48   | 0.46     | 1599    |
| **weighted avg**                               | 0.54      | 0.59   | 0.55     | 1599    |

### Base Model With RAG For Few-Shot
| Class                                          | Precision | Recall | F1-Score | Support |
|------------------------------------------------|-----------|--------|----------|---------|
| age_limit                                      | 0.85      | 1.00   | 0.92     | 11      |
| alarm                                          | 0.04      | 0.03   | 0.03     | 35      |
| alarm_query                                    | 0.08      | 0.08   | 0.08     | 12      |
| alarm_remove                                   | 0.50      | 0.75   | 0.60     | 4       |
| alarm_set                                      | 0.24      | 0.35   | 0.29     | 17      |
| audio                                          | 0.00      | 0.00   | 0.00     | 30      |
| audio_volume_down                              | 0.12      | 0.20   | 0.15     | 5       |
| audio_volume_mute                              | 0.35      | 0.60   | 0.44     | 10      |
| audio_volume_other                             | 0.00      | 0.00   | 0.00     | 1       |
| audio_volume_up                                | 0.21      | 0.20   | 0.21     | 15      |
| automatic_top_up                               | 1.00      | 0.92   | 0.96     | 13      |
| balance_not_updated_after_cheque_or_cash_deposit | 0.79      | 1.00   | 0.88     | 19      |
| calendar                                       | 0.00      | 0.00   | 0.00     | 0       |
| calendar_query                                 | 0.00      | 0.00   | 0.00     | 1       |
| cancel_transfer                                | 0.92      | 0.92   | 0.92     | 13      |
| card_acceptance                                | 0.86      | 1.00   | 0.92     | 6       |
| card_arrival                                   | 0.81      | 0.68   | 0.74     | 19      |
| card_delivery_estimate                         | 0.64      | 0.88   | 0.74     | 8       |
| card_linking                                   | 1.00      | 1.00   | 1.00     | 20      |
| card_not_working                               | 0.80      | 0.80   | 0.80     | 10      |
| card_payment_fee_charged                       | 1.00      | 0.93   | 0.96     | 14      |
| ...                                            | ...       | ...    | ...      | ...     |
| **accuracy**                                   |           |        | **0.51** | 1599    |
| **macro avg**                                  | 0.41      | 0.43   | 0.41     | 1599    |
| **weighted avg**                               | 0.52      | 0.51   | 0.50     | 1599    |

### Base Model without RAG Classification Report
| Class                                          | Precision | Recall | F1-Score | Support |
|------------------------------------------------|-----------|--------|----------|---------|
| age_limit                                      | 1.00      | 0.27   | 0.43     | 11      |
| alarm                                          | 0.56      | 0.14   | 0.23     | 35      |
| alarm_query                                    | 0.50      | 0.75   | 0.60     | 12      |
| alarm_remove                                   | 0.50      | 0.75   | 0.60     | 4       |
| alarm_set                                      | 0.41      | 0.82   | 0.55     | 17      |
| audio                                          | 0.00      | 0.00   | 0.00     | 30      |
| audio_volume_down                              | 0.30      | 0.60   | 0.40     | 5       |
| audio_volume_mute                              | 0.41      | 0.70   | 0.52     | 10      |
| audio_volume_other                             | 0.20      | 1.00   | 0.33     | 1       |
| audio_volume_up                                | 0.61      | 0.73   | 0.67     | 15      |
| automatic_top_up                               | 1.00      | 0.92   | 0.96     | 13      |
| balance_not_updated_after_cheque_or_cash_deposit | 0.78      | 0.95   | 0.86     | 19      |
| calendar_query                                 | 0.00      | 0.00   | 0.00     | 1       |
| cancel_transfer                                | 1.00      | 0.46   | 0.63     | 13      |
| card_acceptance                                | 0.86      | 1.00   | 0.92     | 6       |
| card_arrival                                   | 0.33      | 0.21   | 0.26     | 19      |
| card_delivery_estimate                         | 0.15      | 0.25   | 0.19     | 8       |
| card_linking                                   | 1.00      | 0.70   | 0.82     | 20      |
| card_not_working                               | 0.60      | 0.60   | 0.60     | 10      |
| card_payment_fee_charged                       | 0.83      | 0.36   | 0.50     | 14      |
| card_payment_not_recognised                    | 0.64      | 0.37   | 0.47     | 19      |
| ...                                            | ...       | ...    | ...      | ...     |
| **accuracy**                                   |           |        | **0.39** | 1599    |
| **macro avg**                                  | 0.46      | 0.41   | 0.37     | 1599    |
| **weighted avg**                               | 0.57      | 0.39   | 0.37     | 1599    |

| Technique                          | Accuracy | Macro Avg Precision | Macro Avg Recall | Macro Avg F1-Score | Weighted Avg Precision | Weighted Avg Recall | Weighted Avg F1-Score |
|------------------------------------|----------|---------------------|------------------|--------------------|------------------------|---------------------|-----------------------|
| Finetuned Model without RAG        | 0.68     | 0.61                | 0.63             | 0.57               | 0.69                   | 0.68                | 0.62                  |
| Finetuned Model with RAG           | 0.59     | 0.46                | 0.48             | 0.46               | 0.54                   | 0.59                | 0.55                  |
| Base Model with RAG    | 0.51     | 0.41                | 0.43             | 0.41               | 0.52                   | 0.51                | 0.50                  |
| Base Model without RAG  | 0.39     | 0.46                | 0.41             | 0.37               | 0.57                   | 0.39                | 0.37                  |