# Text Modelling search classification script
* Script inspired and adapted from categorize_expenses_with_validation.ipynb, Thu Vu, https://github.com/thu-vu92/local-llms-analyse-finance/
* The dataset is from Kaggle; details are as follows: Anshul Chaudhary and Muskan Risinghani. (2023). Airline Reviews [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DS/4044107

* This script runs on the BA_AirlineReviews.csv dataset. No model is trained here; a pre-trained local LLM served by Ollama is used to classify the reviews.
* The model goes through each row with the prompt and the question (which is what we want to theme by) and then classifies each row as "yes" or "no" (or "Cant classify" when the answer cannot be determined). We also put the LLM's full explanation for each row in a separate column.
* At the moment we are just passing 100 rows from the dataset.
* Please DO NOT run the script without fully understanding the code. In particular, know the capabilities of your CPU and adjust the number of workers and the number of rows being passed to the model accordingly.


In [None]:
# Classifying with LangChain - multithreading (ThreadPoolExecutor)

import time
from langchain.schema import HumanMessage
import pandas as pd
from langchain_ollama import ChatOllama
from concurrent.futures import ThreadPoolExecutor
import time

# Load the reviews, keep a random sample of 100 rows, and give the unnamed
# first column of the CSV a proper name so rows can be referred to by "RowID".
df = (
    pd.read_csv("BA_AirlineReviews.csv")
    .sample(n=100)
    .rename(columns={"Unnamed: 0": "RowID"})
)
def init_chat_ollama(model_name, temperature=0.2):
    """
    Initialize a ChatOllama object.

    Args:
        model_name (str): The name of the Ollama model to use,
            e.g. "llama3.1:8b".
        temperature (float): Sampling temperature passed to ChatOllama.
            Defaults to 0.2, the value this script has always used.

    Returns:
        ChatOllama: An instance of the ChatOllama model.
    """
    return ChatOllama(model=model_name, temperature=temperature)
def classify_row_with_ollama(row, theme, chat_model):
    """
    Classifies a single row based on a theme.

    Builds a Yes/No question about ``theme`` from the row's "ReviewBody" text
    and sends it to the chat model as a single human message.

    Args:
        row: A row from the DataFrame (or any mapping) that provides the
            "ReviewBody" and "RowID" keys.
        theme (str): The theme to analyze for the row.
        chat_model: The chat model returned by init_chat_ollama. It must
            expose ``invoke`` and return an object with a ``content`` string.

    Returns:
        tuple: (RowID, response text). The response text is the model's reply
        with surrounding whitespace stripped, or "Cant classify" if the model
        call failed for any reason.

    Raises:
        KeyError: If the row has no "ReviewBody" or "RowID" entry.
    """
    # Pick up the ReviewBody text and prepare the prompt to be passed.
    text = row["ReviewBody"]
    prompt = f"Does the following text mention issues explicitly related to the '{theme}'? Respond with 'Yes' or 'No'. Text: '{text}'"

    # Best effort: any failure while talking to the model is reported and the
    # row is marked as "Cant classify" instead of aborting the whole run.
    try:
        # invoke() is the supported entry point of LangChain chat models and
        # treats a plain string as one human message. Calling the model object
        # directly (chat_model([...])) is deprecated in LangChain.
        response = chat_model.invoke(prompt)
        response_text = response.content.strip()
    except Exception as e:
        print(f"Error processing row {row['RowID']}: {e}")
        response_text = "Cant classify"

    return row['RowID'], response_text

def thread_classify_allrows(df, theme, chat_model, max_workers=10):
    """
    Uses classify_row_with_ollama to classify all rows using multi-threading.

    One task per row is submitted to a thread pool. Results are collected in
    submission order, so the new column values line up with the rows of
    ``df``.

    Args:
        df (pd.DataFrame): Input DataFrame with a "RowID" column and the
            "ReviewBody" column read by classify_row_with_ollama.
        theme (str): The theme to analyze for each row.
        chat_model: The chat model being passed to every classification call.
        max_workers (int): Number of worker threads. Defaults to 10; adjust it
            to what your machine can handle.

    Returns:
        pd.DataFrame: The same ``df`` object, modified in place, with two
        additional columns: one named after the theme holding "yes", "no" or
        "Cant classify", and one named "<theme>_EXPLAIN" holding the model's
        full response for the row.
    """
    # theme_tags collects the 'yes' / 'no' / 'Cant classify' value per row;
    # theme_tags_explain collects the full response given by the model.
    theme_tags = []
    theme_tags_explain = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Maps each submitted task to the RowID it belongs to. Dicts keep
        # insertion order, so iterating below follows the row order of df.
        async_tasks = {
            executor.submit(classify_row_with_ollama, row, theme, chat_model): row["RowID"]
            for _, row in df.iterrows()
        }

        for task, row_id in async_tasks.items():
            try:
                # The task returns (RowID, response_text).
                _, response_text = task.result()
            except Exception as e:
                print(f"Cant classify row {row_id}: {e}")
                # Keep both lists the same length as df even when a task
                # fails; otherwise the column assignment below would raise.
                response_text = "Cant classify"

            theme_tags_explain.append(response_text)

            # Anything that is not a string cannot be matched against yes/no.
            answer = response_text.lower() if isinstance(response_text, str) else ""
            if answer.startswith("yes"):
                theme_tags.append("yes")
            elif answer.startswith("no"):
                theme_tags.append("no")
            else:
                theme_tags.append("Cant classify")

    # Dynamically name the columns based on the theme.
    explain_column_name = f"{theme}_EXPLAIN"
    df[theme] = theme_tags
    df[explain_column_name] = theme_tags_explain
    return df
  
# Timing the entire process (model set-up, classification and the CSV write).
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# the wall clock returned by time.time() can jump if the system clock changes.
start_time = time.perf_counter()  # Record the start time

# Example using Llama 3.1 8B
model_name = "llama3.1:8b"
theme_to_search = "cleanliness-related issues, specifically dirty or unclean conditions"

# Initialize the ChatOllama session
chat_ollama = init_chat_ollama(model_name)

# Classify rows in the DataFrame using a thread pool
tagged_df_second_model = thread_classify_allrows(df, theme_to_search, chat_ollama)

# Save the results to a CSV
tagged_df_second_model.to_csv("cleanliness_test_second_model_with_multiprocessing.csv")

end_time = time.perf_counter()  # Record the end time

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"Total execution time: {execution_time:.2f} seconds")