## Playing around with Intent Recognition

In [None]:
%pip install python-dotenv pandas openai ipywidgets litellm tenacity

In [None]:
from dotenv import load_dotenv
from pathlib import Path
load_dotenv()
import os
import pandas as pd
from tenacity import retry


In [None]:
# Download and unpack the MASSIVE 1.0 archive into ./notebooks/data.
# `mkdir -p` lets this work on a fresh checkout, and `--fail` makes curl exit
# non-zero on an HTTP error instead of saving the error page as the tarball.
!mkdir -p ./notebooks/data && cd ./notebooks/data && \
    curl --fail https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.0.tar.gz --output amazon-massive-dataset-1.0.tar.gz && \
    tar -xzvf amazon-massive-dataset-1.0.tar.gz && \
    ls

In [None]:
# Path of the en-US file inside the extracted dataset archive.
file_path = "./notebooks/data/1.0/data/en-US.jsonl"

# The file is JSON Lines (one record per line), hence lines=True.
df = pd.read_json(path_or_buf=file_path, lines=True)
df

### Collect the intents from a 30% sample of the test partition

In [None]:
# Keep only the test partition. Exact equality is used on purpose:
# `.str.contains("test")` is a regex/substring match and would also select any
# partition whose name merely contains "test".
df_sampled = df[df["partition"] == "test"]
print(len(df_sampled))

# Work on a 30% sample; the fixed random_state keeps the same rows across reruns.
df_sampled = df_sampled.sample(frac=0.3, random_state=5)
print(len(df_sampled))

# Candidate labels: the distinct intents that occur in the sample.
intents = set(df_sampled['intent'])
# Utterances to classify, in the same row order as df_sampled.
test_set = df_sampled['utt']
print(len(intents))
print(len(test_set))
print(test_set)

print(intents)

In [None]:
import litellm
from litellm import completion
import json

from tenacity import stop_after_attempt, wait_exponential

# Model name passed to litellm's completion() for every request.
# Alternatives that can be swapped in:
# model = "groq/mixtral-8x7b-32768"
# model = "claude-3-haiku-20240307"
model = "gpt-3.5-turbo"

# Turn litellm's verbose flag off.
litellm.set_verbose = False

# to call the configured LLM
def call_gpt(entry: str) -> str:
    """Classify one utterance into one of the known intents via an LLM tool call.

    Sends ``entry`` as the user message to the module-level ``model`` through
    ``completion``, offering a single ``identify_intent`` tool whose ``intent``
    argument is restricted to the labels in the module-level ``intents``.

    This function is deliberately best-effort: it never raises for ordinary
    failures so that one bad request does not abort a whole batch run.

    Args:
        entry: The utterance to classify.

    Returns:
        The intent chosen by the model, or one of the sentinel strings
        ``"No tool call"`` (the reply contained no tool call),
        ``"No intent argument found"`` (the tool-call arguments could not be
        parsed or had no ``"intent"`` key) or ``"Error"`` (anything else
        failed, e.g. the request itself).
    """
    import logging

    try:
        response = completion(
            model=model,
            temperature=0.0,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "identify_intent",
                        "description": "Identify the intent of the message using the best match from the provided enum list",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "intent": {
                                    "type": "string",
                                    "description": "The intent of the user message, what is the message about.",
                                    # Sorted so the request payload is the same on
                                    # every run; set iteration order for strings
                                    # changes between interpreter sessions.
                                    "enum": sorted(intents),
                                },
                            },
                            "required": ["intent"],
                        },
                    },
                },
            ],
            messages=[
                {
                    "role": "system",
                    "content": "You are an intent classification system. Your goal is to identify the intent of the message.",
                },
                {"role": "user", "content": str(entry)},
            ],
        )
        tool_calls = response.choices[0].message.tool_calls
        if not tool_calls:
            return "No tool call"
        try:
            return json.loads(tool_calls[0].function.arguments)["intent"]
        except Exception:
            # The arguments are model-generated text, so any parsing failure
            # is treated the same way.
            return "No intent argument found"
    except Exception as exc:
        # `Exception` rather than a bare `except` so Ctrl-C / kernel interrupt
        # still stops a long run; the failure is logged so that an "Error"
        # result can be diagnosed afterwards.
        logging.getLogger(__name__).warning("call_gpt failed for %r: %s", entry, exc)
        return "Error"


# single message test
call_gpt("brighten up the lighting")

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import time

def call_gpt_with_index(index_entry):
    """Classify the entry of an ``(index, entry)`` pair and return ``(index, result)``.

    Carrying the index alongside the result lets callers that collect results
    out of order put them back into the original order.
    """
    position, utterance = index_entry
    return position, call_gpt(utterance)

if "claude" in model:
    # Sequential path with a short pause before each request
    # (presumably to stay under the provider's rate limit; confirm).
    from tqdm import tqdm

    generated_intents = []
    for utterance in tqdm(test_set, 'processing'):
        time.sleep(0.1)
        generated_intents.append(call_gpt(utterance))
else:
    # Concurrent path: fan the requests out over a thread pool, collect them
    # as they finish, then restore the original order using the carried index.
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = [
            executor.submit(call_gpt_with_index, pair)
            for pair in enumerate(test_set)
        ]
        finished = tqdm(as_completed(futures), total=len(test_set), desc='Processing')
        results_with_index = sorted(
            (done.result() for done in finished),
            key=lambda item: item[0],
        )
        generated_intents = [label for _, label in results_with_index]

print(len(generated_intents), generated_intents)

In [None]:
# Attach the predictions (in row order) next to the gold labels.
df_sampled['generated_intent'] = generated_intents

# Provider-prefixed model names such as "groq/mixtral-8x7b-32768" contain "/",
# which would be read as a sub-directory that does not exist and make
# to_csv fail. Names without "/" produce the same filename as before.
safe_model_name = model.replace("/", "_")
df_sampled.to_csv(f"./notebooks/data/1.0/data/en-US-labeled-{safe_model_name}.csv")
df_sampled

In [None]:
# Count exact matches between the gold intent and the predicted intent,
# printing every mismatch together with its DataFrame index label.
counter = 0
for idx, actual, predicted in zip(
    df_sampled.index, df_sampled['intent'], df_sampled['generated_intent']
):
    if actual == predicted:
        counter += 1
        continue
    print(
        f"This is the predicted one - {predicted} and it's the actual intent {actual} at index - {idx}"
    )

print(counter)
total = len(df_sampled)
print(f"Ratio - {counter} out of {total} are correct, accuracy is = {(counter / total) * 100} %.")