# Exploring text classification using LLMs without any frameworks

This was my first attempt at exploring text classification using LLMs without any frameworks.

I was testing with DeepSeek and Mistral, mainly 8B models, as that is all my computer could handle.

The examples are based on this dataset: https://www.kaggle.com/datasets/chaudharyanshul/airline-reviews 

This is a BA airline reviews dataset from Kaggle, uploaded by Chaudhary Anshul & Muskan Raisinghani.


NOTE: If you are running this, you need to have the packages I am using installed AND Ollama with the relevant models. Please see the requirements.txt file.

In [None]:
import pandas as pd
from openai import OpenAI
import logging
import datetime
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
# Replace the placeholder with the path to your own .env file.
load_dotenv("INSERTPATHHERE")

# Read the key from the environment (populated by load_dotenv above) rather
# than hard-coding it in the source. Falls back to the previous empty string
# when the variable is not set.
# NOTE(review): OPENAI_API_KEY is assumed here as the variable name; rename it
# to whatever your .env file actually defines.
api_key = os.getenv("OPENAI_API_KEY", "")

logging.basicConfig(level=logging.INFO)

# Shared OpenAI-compatible client used by every classification call.
# Replace 'APIURL' with your provider's base URL; a local OpenAI-compatible
# server can be used as well.
client = OpenAI(base_url='APIURL', api_key=api_key)


def create_sample_data():
    """Build the small hard-coded demo DataFrame of customer-service cases.

    Returns:
        A pandas DataFrame with six rows and the columns ``Row_ID``
        (1..6), ``ProblemDescription``, ``Resolution`` and
        ``CustomerFeedback``.
    """
    text_columns = ("ProblemDescription", "Resolution", "CustomerFeedback")

    # One tuple per case, in the same order as ``text_columns``.
    cases = [
        (
            "I want to know how much my flight bonus will be for next year.",
            "Here’s a link to the bonus calculation guidelines.",
            "I wasn’t contacted and got a generic response with no specifics.",
        ),
        (
            "Why is my ticket cost higher than usual?",
            "The price reflects new seasonal rates.",
            "I don’t understand why the cost increased. No detailed explanation.",
        ),
        (
            "When will my refund for the canceled flight be processed?",
            "Refunds typically take 4-6 weeks to process.",
            "I received a generic timeline. No specific details for my case.",
        ),
        (
            "I can’t access my flight booking details.",
            "You can access them directly on the website by logging in.",
            "The website is not working, and the advice didn’t help.",
        ),
        (
            "How do I reset my password for my booking account?",
            "Follow the steps in the attached guide to reset your password.",
            "The guide wasn’t clear, I couldn’t reset my password.",
        ),
        (
            "Why hasn’t my refund been processed yet?",
            "Refunds are processed at the end of each month.",
            "I wasn’t informed why my refund was delayed.",
        ),
    ]

    # Row IDs are simply the 1-based position of each case.
    records = [
        {"Row_ID": row_id, **dict(zip(text_columns, case))}
        for row_id, case in enumerate(cases, start=1)
    ]
    return pd.DataFrame(records)


# Maps each theme name to the plain-English description that is interpolated
# into the classification prompt. The text is sent to the model verbatim, so
# the wording (including punctuation) is kept exactly as it is.
theme_descriptions = {
    "TIME_WAITING": (
        "Feedback related to long call waiting times, call queue lengths, or delays in receiving responses. "
        "This is specifically about the time taken to access customer support."
    ),
    "POLICY": (
        "Feedback related to company policies, rules, or standard procedures. "
        "This includes cases where customers feel policies are unclear, seem unfair, or limit service options."
    ),
    "SERVICE_OR_PROCESS": "Feedback related to difficult processes or complicated workflows.",
    "RESOLUTION_DID_NOT_ANSWER_QUESTION": (
        "Feedback related to customer's problem not being resolved, "
        "it can be resolutions that are generic or incomplete or do not answer the initial question asked.."
    ),
    "SELF_HELP_RESOURCES": (
        "Feedback related to QRG, websites, documentation, user guides, manuals, or other self-service materials. "
        "This includes unclear instructions, missing information, or difficult-to-use resources."
    ),
    "AGENT_MANNERS": (
        "Feedback related to agent's attitudes towards customers. "
        "This includes lack of empathy, being rude, abusive or abrupt."
    ),
    "AGENT_KNOWLEDGE": (
        "Feedback related to customers being critical of an agent's expertise or understanding. "
        "This includes incorrect information, inability to explain clearly, or lack of technical knowledge."
    ),
    "TECHNOLOGY": (
        "Feedback related to systems, software, or technical infrastructure. "
        "This includes system errors, software bugs, or lack of ease of use with digital tools."
    ),
    "REPEATED_FOLLOW_UP": (
        "Feedback related to updates on existing requests logged with customer service teams. "
        "This means lack of updates, or the need for the customer to have enquired multiple times to get a response.."
    ),
}


def classify_single_theme(row, theme, model=None):
    """Ask the chat model whether one feedback row mentions a given theme.

    Args:
        row: Mapping-like record (e.g. a pandas Series) providing the keys
            'Row_ID', 'ProblemDescription', 'Resolution' and
            'CustomerFeedback'.
        theme: Name of the theme to test for, normally a key of
            ``theme_descriptions``.
        model: Model identifier forwarded to
            ``client.chat.completions.create``.

    Returns:
        The model's reply with surrounding whitespace stripped (the prompt
        asks for 'Yes' or 'No'), or the string "Cannot classify" when the
        request fails or the reply has no text content.
    """
    text = f"Problem: {row['ProblemDescription']} Resolution: {row['Resolution']} Feedback: {row['CustomerFeedback']}"

    # Send only the description of the theme being asked about. Dumping the
    # whole dictionary into every single-theme question adds tokens and buries
    # the relevant definition among eight unrelated ones. Unknown themes keep
    # the previous behaviour of passing all descriptions.
    if theme in theme_descriptions:
        guidance = (
            f"In selecting the theme you MUST use this description of '{theme}' "
            f"to guide your selection: {theme_descriptions[theme]}"
        )
    else:
        guidance = f"In selecting the theme you MUST use the {theme_descriptions} to guide your selection"

    prompt = (
        f"Does the following text mention issues related to '{theme}'? "
        "ONLY respond with 'Yes' or 'No'.\n\n"
        f"Text: {text}.\n"
        f"{guidance}"
    )

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort per row: one failed request must not abort the whole run.
        logging.error("Error processing row %s: %s", row['Row_ID'], e, exc_info=True)
        return "Cannot classify"

    if content is None:
        # A reply without text content cannot be stripped; report it explicitly.
        logging.error("Error processing row %s: empty response content", row['Row_ID'])
        return "Cannot classify"
    return content.strip()


def main():
    """Classify the sample feedback against every theme and save a CSV.

    For each theme a new column named after the theme is added to the sample
    DataFrame, holding the value returned by ``classify_single_theme`` for each
    row. The result is written to the current working directory as
    ``feedback_analysis_results_<model>_<YYYYmmdd_HHMMSS>.csv``.
    """
    import re  # local import: only needed for the file-name clean-up below

    df = create_sample_data()

    MODEL = "openai/gpt-5-mini"  # CHANGE THIS TO USE OTHER MODELS

    themes = [
        "TIME_WAITING",
        "POLICY",
        "SERVICE_OR_PROCESS",
        "RESOLUTION_DID_NOT_ANSWER_QUESTION",
        "SELF_HELP_RESOURCES",
        "AGENT_MANNERS",
        "AGENT_KNOWLEDGE",
        "TECHNOLOGY",
        "REPEATED_FOLLOW_UP"
    ]

    for theme in themes:
        logging.info("Processing theme: %s", theme)
        # One request per row; the answers become the column for this theme.
        df[theme] = [
            classify_single_theme(row, theme, MODEL)
            for _, row in df.iterrows()
        ]

    current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # Model identifiers may contain characters that are awkward or invalid in
    # file names ('/' in OpenRouter ids, ':' in Ollama tags); replace each one
    # with an underscore.
    safe_model_name = re.sub(r"[^A-Za-z0-9._-]", "_", MODEL)
    output_path = f"feedback_analysis_results_{safe_model_name}_{current_datetime}.csv"
    df.to_csv(output_path, index=False)
    logging.info("Analysis complete. Results saved to %s", output_path)


if __name__ == "__main__":
    main()