In [None]:
import pandas as pd
from typing import List, Literal, Dict
from pydantic import BaseModel, Field, field_validator, model_validator, ValidationError
from openai import OpenAI
import time
import os
from dotenv import find_dotenv, load_dotenv
import logging
import tomli
import re
from sentence_transformers import SentenceTransformer


In [None]:

# Configure the root logger once for the notebook: records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)


# Load api key (Currently blank)

In [None]:

# Read the API key from the environment (optionally populated from a local
# .env file) so the secret never has to be pasted into the notebook.
# NOTE(review): the variable name OPENROUTER_API_KEY is an assumption - adjust
# it if your deployment uses a different name.
load_dotenv(find_dotenv())
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
    # Fail here with an actionable message instead of a NameError / auth error later.
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it or add it to a .env file before running this cell."
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1/",
    api_key=api_key
)

# Choose model

In [None]:
# Model identifier sent as the `model` argument of every classification request.
MODEL_NAME = 'openai/gpt-4o-mini'

# Load Sample data

In [None]:
def create_sample_data():
    """Build the ten-ticket demo table used by the rest of the notebook.

    Returns:
        pd.DataFrame: one row per ticket with the columns ``Row_ID``,
        ``ProblemDescription``, ``Resolution`` and ``CustomerFeedback``.
        ``Row_ID`` runs from 1 to 10 in the order the tickets are listed.
    """
    text_columns = ("ProblemDescription", "Resolution", "CustomerFeedback")

    # Each tuple is (problem description, resolution, customer feedback).
    ticket_texts = [
        (
            "I want to know how much my flight bonus will be for next year.",
            "Here’s a link to the bonus calculation guidelines.",
            "I wasn’t contacted and got a generic response with no specifics.",
        ),
        (
            "Why is my ticket cost higher than usual?",
            "The price reflects new seasonal rates.",
            "I don’t understand why the cost increased. No detailed explanation.",
        ),
        (
            "When will my refund for the canceled flight be processed?",
            "Refunds typically take 4-6 weeks to process.",
            "I received a generic timeline. No specific details for my case.",
        ),
        (
            "I can’t access my flight booking details.",
            "You can access them directly on the website by logging in.",
            "The website is not working, and the advice didn’t help.",
        ),
        (
            "How do I reset my password for my booking account?",
            "Follow the steps in the attached guide to reset your password.",
            "The guide wasn’t clear, I couldn’t reset my password.",
        ),
        (
            "Why hasn’t my refund been processed yet?",
            "Refunds are processed at the end of each month.",
            "I wasn’t informed why my refund was delayed.",
        ),
        (
            "Can you explain the process for claiming a flight delay compensation?",
            "Please refer to the compensation policy document linked here.",
            "The document is too complicated, and I didn’t find any clear answers.",
        ),
        (
            "I need help with making a group booking for my department.",
            "Here’s a guide for making group bookings on our website.",
            "The guide missed some important steps for large group bookings.",
        ),
        (
            "Why hasn’t my booking for an extra seat been processed?",
            "It takes up to 48 hours to confirm extra seat requests.",
            "Agent didn’t check my specific request, just gave a standard response.",
        ),
        (
            "I need help with changing my flight date.",
            "Here’s the link to change your booking online.",
            "The website process was unclear and I couldn’t change my flight.",
        ),
    ]

    # Number the tickets from 1 and pair every text with its column name.
    records = [
        {"Row_ID": row_id, **dict(zip(text_columns, texts))}
        for row_id, texts in enumerate(ticket_texts, start=1)
    ]
    return pd.DataFrame(records)


sample_data = create_sample_data()

# Pydantic models and definitions

## Category and labels for multiple option selection

In [None]:
# Closed set of main-category labels the model may assign to a ticket.
LABELS = Literal['TIME_WAITING', 'POLICY', 'SERVICE_PROCESS', 
                'QUALITY_OF_RESOLUTION', 'SELF_HELP_RESOURCES','AGENT_MANNERS', 
                'AGENT_KNOWLEDGE', 'TECHNOLOGY', 'REPEATED_FOLLOW_UP']

# Category definitions
# One definition per LABELS entry; this text is interpolated verbatim into the
# classification prompt, so wording changes here change what the model sees.

CATEGORY_DEFINITIONS = """
    'TIME_WAITING': 'Feedback that EXPLICITLY mentions long call waiting times, call queue lengths, or delays in receiving responses. This includes complaints about waiting for responses or resolution timeframes.'
    'POLICY': 'Feedback related to company policies, rules, or standard procedures that affect service delivery. This includes cases where policies are unclear, seem unfair, or limit service options.'
    'SERVICE_PROCESS': 'Feedback related to how processes in services are delivered or tasks are completed. This includes difficult processes or complicated workflows.'
    'QUALITY_OF_RESOLUTION': 'Feedback related to the customer\'s problem not being resolved, or answered. This also includes where the customer indicates the resolution was generic or incomplete.'
    'SELF_HELP_RESOURCES': 'Feedback related to QRG, website links, documentation, user guides, manuals, or other self-service materials. This includes unclear instructions, missing information, or difficult-to-use resources.'
    'AGENT_MANNERS': 'Feedback that EXPLICITLY mentions the agent\'s poor behavior towards customers. This includes specific mentions of rudeness, lack of empathy, being abrupt, dismissive, or any other unprofessional conduct. Do NOT apply this category for general complaints about resolution quality or service process.'
    'AGENT_KNOWLEDGE': 'Feedback related to the agent\'s expertise or understanding. This includes incorrect information, inability to explain clearly, or lack of technical knowledge.'
    'TECHNOLOGY': 'Feedback related to systems, software, or technical infrastructure. This includes system errors, software bugs, or lack of ease of use with digital tools.'
    'REPEATED_FOLLOW_UP': 'Feedback that EXPLICITLY mentions the customer having to follow-up multiple times on a request.'
"""


## Pydantic Models

In [None]:
class TicketClassification(BaseModel):
    '''Classification model for support tickets'''
    # NOTE: the class docstring and every Field description below are emitted
    # into the JSON schema that is sent to the model as `response_format`, so
    # they are effectively prompt text - edit them deliberately.

    # One or more main categories drawn from the closed LABELS set.
    Category: List[LABELS] = Field(..., description='''
        Analyze the CustomerFeedback and select one or more labels that apply to categorise the feedback. Use the ProblemDescription and Resolution to provide context.
        Choose categories that best match the customer's feedback about their experience.
    ''')
    # Free-text rationale for the chosen categories.
    justification: str = Field(..., description='Explain why you selected these categories, referencing specific aspects of the ticket.')
    # Self-reported confidence; the 0..1 range is enforced by a validator below
    # rather than by Field constraints so the generated JSON schema is unchanged.
    confidence: float = Field(..., description="Confidence score between 0 and 1.")
    # One or two free-form sub-theme labels in UPPER_SNAKE_CASE.
    Sub_Category: List[str] = Field(..., description='''
        Return one or two SUB-CATEGORY labels (sub-themes) that are most relevant to the specific topic being discussed.
        Each sub-category MUST be in UPPER_SNAKE_CASE.
        Use underscores between words, no punctuation or quotes.
        Examples:
        Feedback: I wasn’t informed why my refund was delayed.
        Sub-theme: REFUND_COMMUNICATION_GAP
        Feedback: The website process was unclear and I couldn’t change my flight.
        Sub-theme: ONLINE_BOOKING_DIFFICULTY
        
    ''')
    # Natural-language description(s) of the sub-themes.
    Sub_Category_Description: List[str] = Field(..., description="Short natural language descriptions for each sub-theme, used for clustering")
    # Identifier of the ticket this classification belongs to.
    Row_ID: int

    @field_validator('confidence')
    @classmethod
    def validate_confidence_range(cls, value):
        """Reject confidence scores outside the documented [0, 1] interval.

        Raises:
            ValueError: if ``value`` is below 0, above 1, or NaN.
        """
        # A chained comparison is False for NaN, so NaN is rejected as well.
        if not (0 <= value <= 1):
            raise ValueError(f"confidence must be between 0 and 1, got {value}.")
        return value

    @field_validator('Sub_Category')
    @classmethod
    def validate_sub_category_format(cls, value):
        """Require 1-2 sub-category labels, each in UPPER_SNAKE_CASE.

        Raises:
            ValueError: if the list has the wrong length or a label contains
                anything other than upper-case letters joined by single underscores.
        """
        if not (1 <= len(value) <= 2):
            raise ValueError("sub_category must have 1 or 2 items only.")

        pattern = r'^[A-Z]+(_[A-Z]+)*$'
        for sub in value:
            # fullmatch, not match: with match, `$` would also accept a label
            # that ends in a newline.
            if not re.fullmatch(pattern, sub):
                raise ValueError(f"Invalid Sub_Category format: {sub}. Must be UPPER_SNAKE_CASE.")
        return value

    @model_validator(mode='after')
    def check_sub_not_in_category(self):
        """Ensure no sub-category simply repeats one of the selected main categories.

        Raises:
            ValueError: if any label appears in both Category and Sub_Category.
        """
        overlap = set(self.Category) & set(self.Sub_Category)
        if overlap:
            raise ValueError(f"Sub-category must not repeat main Category label(s): {overlap}")
        return self

# Classification loop with prompt

In [None]:
def classify_single_ticket(ticket: Dict) -> TicketClassification:
    """Classify one support ticket with the LLM, retrying on failure.

    Args:
        ticket: mapping with the keys ``Row_ID``, ``ProblemDescription``,
            ``Resolution`` and ``CustomerFeedback``.

    Returns:
        The parsed ``TicketClassification``. If every attempt fails, a
        placeholder classification is returned instead (empty ``Category``,
        ``Sub_Category=['UNKNOWN']``, ``confidence=0``) whose ``justification``
        carries the last error message.

    Raises:
        KeyError: if ``ticket`` lacks one of the required keys; this is a
            caller error, so it is not retried.
    """
    max_attempts = 3

    # The prompt depends only on the ticket, so build it once instead of on every retry.
    prompt = (
        f"Analyze this support ticket and classify the CustomerFeedback into its main category and sub-category. The Field structures are as follows:\n\n"
        f"1. CUSTOMER'S INITIAL REQUEST:\n"
        f"   {ticket['ProblemDescription']}\n"
        f"   (This is what the customer initially asked for or needed help with)\n\n"
        f"2. AGENT'S RESOLUTION TO REQUEST:\n"
        f"   {ticket['Resolution']}\n"
        f"   (This is how the support agent attempted to resolve the request)\n\n"
        f"3. CUSTOMER'S FEEDBACK:\n"
        f"   {ticket['CustomerFeedback']}\n"
        f"   (This is the customer's reaction to the support ticket, reflecting their satisfaction or dissatisfaction with the support experience, how they felt about the resolution, and any other comments regarding the service received)\n\n"
        f"Ticket ID: {ticket['Row_ID']}\n\n"
        f"Instructions for Classification:\n"
        f"1. FOCUS PRIMARILY ON THE CUSTOMERFEEDBACK AS IT IS THE PRIMARY DRIVER OF THE CLASSIFICATION\n"
        f"2. FOR THE MAIN CATEGORY, YOU MUST USE THE {CATEGORY_DEFINITIONS} to guide your classification. CONSIDER THESE CAREFULLY AND ADHERE TO THE DEFINITIONS WHEN CLASSIFYING.\n"
        f"3. FOR THE SUB-CATEGORY, PLEASE KEEP THE SUB-CATEGORY LABELS IN UPPER_SNAKE_CASE AND USE UNDERSCORES BETWEEN WORDS. Select AT MOST TWO SUB-CATEGORY LABELS that are most relevant to the specific topic being discussed\n"
        f"4  FOR THE SUB-CATEGORY, DONT REPEAT the exact same label from the {CATEGORY_DEFINITIONS}, the sub-category is a more specific topic and will be used to perform CLUSTERING\n"
        f"5. Compare the initial request (ProblemDescription) with the resolution provided and USE THIS AS CONTEXT\n"
        f"6. Provide classification that reflects one or more of the following categories that best describe the customer's feedback. If multiple categories apply, please list them all (separate categories with commas):\n\n"
        f"7.For Sub_Category_Description, write a short, clear natural language sentence (1–2 lines max) that explains the core issue being described in the sub-category label — as if you were summarizing the customer's frustration or complaint to a service analyst. Make it specific, clusterable, and semantically rich (e.g., “The customer did not receive any update on the refund process after submitting the request.”)."
    )

    last_error = None
    for attempt in range(max_attempts):
        try:
            result = client.beta.chat.completions.parse(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                response_format=TicketClassification
            )
            parsed = result.choices[0].message.parsed
            if parsed is None:
                # No structured payload came back (e.g. a refusal); treat it
                # as a failed attempt instead of handing None to the caller.
                raise ValueError("Model response contained no parsed classification.")

            # The id is known locally; do not rely on the model echoing it
            # back correctly, since results are later merged on Row_ID.
            parsed.Row_ID = ticket['Row_ID']
            return parsed

        except Exception as e:
            last_error = e
            logging.warning(
                "Classification attempt %d/%d failed for Row_ID %s: %s",
                attempt + 1, max_attempts, ticket['Row_ID'], e,
            )

        # Brief pause before the next try; pointless after the final attempt.
        if attempt < max_attempts - 1:
            time.sleep(1)

    # All attempts failed: return a placeholder so the batch can continue.
    # Every required field must be supplied here, including
    # Sub_Category_Description, or building the placeholder itself would raise.
    return TicketClassification(
        Row_ID=ticket['Row_ID'],
        Category=[],
        Sub_Category=['UNKNOWN'],
        Sub_Category_Description=['Classification failed; no sub-category could be determined.'],
        confidence=0,
        justification=f"Classification failed after {max_attempts} attempts: {str(last_error)}"
    )

# Main program execution control

In [None]:
if __name__ == "__main__":
    # Driver: classify every sample ticket, join the classifications back onto
    # the input rows by Row_ID, and export the combined table to CSV.
    start_time = time.time()

    tickets = sample_data.to_dict(orient="records")
    print("sample_data")
    print(sample_data)

    # One model_dump() dict per ticket, in input order.
    classified_results = [
        classify_single_ticket(ticket).model_dump() for ticket in tickets
    ]
    results_df = pd.DataFrame(classified_results)
    print(results_df)

    # Inner join: only rows whose Row_ID appears on both sides are kept.
    input_columns = ['Row_ID', 'ProblemDescription', 'Resolution', 'CustomerFeedback']
    final_df = sample_data[input_columns].merge(results_df, on='Row_ID', how='inner')
    print(final_df)

    print("exporting to csv")
    final_df.to_csv("results_df.csv", index=False)

    print(f"Execution time: {time.time() - start_time} seconds")