# What

As discussed in https://github.com/1jamesthompson1/TAIC-report-summary/issues/138 each recommendation that TAIC issues also has a response. These responses have a few categories.

## How to do it

I have a dataset file that contains all of the recommendations from TAIC.

I also have 10 random examples of categorized recommendations.

I will start by just asking it and seeing how it does against these examples. Then I can use the examples as examples within the prompt.

## All of the modules needed

To keep things as transparent as possible I will add all of the dependencies at the top.

In [None]:
# From the engine
from engine.OpenAICaller import openAICaller

# Third party
import pandas as pd
import docx

# Built in
import os
import re
import importlib

pd.options.mode.copy_on_write = True

In [None]:
# Categories a reply can be placed in, each with the definition that is shown
# to the LLM. The order matters: the Ingrid examples are mapped onto this list
# by position.
response_categories = [
    {"category": "Accepted and Implemented", "definition": "The recommendation was accepted (wholly) and has been implemented"},
    {"category": "Accepted", "definition": "The recommendation was accepted (wholly) and is being, or will be implemented"},
    {"category": "Under consideration", "definition": "The recipient has acknowledged that the recommendation is received and will consider it."},
    {"category": "Rejected", "definition": "The recommendation will not be implemented"}
]

# Getting datasets

I will get Ingrid from TAIC to give me some examples

## TAIC recommendations dataset

In [None]:
# Get a sample where recommendation and reply is not empty
def clean_TAIC_recommendations(df):
    """Filter the raw TAIC recommendations down to usable rows.

    Rows missing a recommendation, inquiry, number or reply text are dropped
    from ``df`` itself (in place), so the caller's frame is modified as well.
    The returned frame additionally keeps only rows made on or after
    2010-01-01 whose inquiry matches the expected identifier pattern, and
    gains a ``report_id`` column built from the second and third dash-separated
    parts of the inquiry (e.g. "AO-2015-001" -> "2015_001").
    """
    required_columns = ["Recommendation", "Inquiry", "Number", "Reply Text"]
    df.dropna(subset=required_columns, inplace=True)

    made_since_2010 = df['Made'] >= '2010-01-01'
    recent = df[made_since_2010]

    # structure the inquiry to be able to match with rest of project
    inquiry_regex = r'^(((AO)|(MO)|(RO))-[12][09][987012]\d-[012]\d{2})$'
    has_valid_inquiry = recent['Inquiry'].str.match(inquiry_regex)
    cleaned = recent[has_valid_inquiry]

    def to_report_id(inquiry):
        return "_".join(inquiry.split('-')[1:3])

    cleaned['report_id'] = cleaned['Inquiry'].apply(to_report_id)
    return cleaned

# Load the raw spreadsheet of TAIC recommendations.
taic_recommendations = pd.read_excel('TAIC_recommendations_04_04_2024.xlsx')

# NOTE: cleaning drops incomplete rows from `taic_recommendations` itself
# (in place) as well as returning the filtered frame.
cleaned_taic_recommendations = clean_TAIC_recommendations(taic_recommendations)


## Ingrid responses 

In [None]:
# Get examples from Ingrid

ingrid_categories = pd.read_excel('example_recommendation_categories_ingrid_responses.xlsx')

# Replace the numeric code in the spreadsheet with the category name found at
# position code + 1 in response_categories.
# NOTE(review): confirm the +1 shift matches how the spreadsheet codes the
# categories; the coding itself is not visible from this notebook.
ingrid_categories['response_category'] = ingrid_categories['response_category'].apply(lambda x: response_categories[x+1]['category'])

# Recommendation 011/17 is overridden by hand and marked as N/A
ingrid_categories.loc[ingrid_categories['Number'] == '011/17', 'response_category'] = "N/A"

## TAIC response categorizations

In [None]:
def tables_to_dataframes(docx_file):
    """Return every table of a Word document as a DataFrame of cell texts.

    One DataFrame is produced per entry of ``document.tables``, in the order
    that attribute yields them. Each frame holds the text of every cell, row
    by row, with default integer row and column labels.
    """
    def table_to_frame(table):
        cell_texts = [[cell.text for cell in row.cells] for row in table.rows]
        return pd.DataFrame(cell_texts)

    return [table_to_frame(table) for table in docx.Document(docx_file).tables]

def process_tables(config):
    """Extract recommendation numbers and response categories from one report.

    Args:
        config: dict with ``filename`` (the .docx report to read),
            ``tables_to_read`` (positions of the tables to use) and ``id``
            (header text of the column holding the recommendation number).

    Returns:
        DataFrame with ``Number`` and ``response_category`` columns, limited
        to rows whose number starts with the ``NNN/NN`` pattern. When no
        tables are selected an empty frame with both columns is returned.
    """
    tables = tables_to_dataframes(config['filename'])

    selected_tables = []
    for table_idx in config['tables_to_read']:
        table = tables[table_idx]

        # Turn first row into column names
        table.columns = table.iloc[0]

        selected_tables.append(pd.DataFrame({
            "Number": table[config['id']],
            "response_category": table['Response'],
        }))

    if not selected_tables:
        # Nothing selected: return the expected columns instead of failing on
        # a frame that has no 'Number' column.
        return pd.DataFrame(columns=["Number", "response_category"])

    # Concatenate once; growing a frame inside the loop re-copies it each time.
    all_tables_df = pd.concat(selected_tables)

    # Clean up recommendation column
    all_tables_df['Number'] = all_tables_df['Number'].str.replace('\n', '', regex=False)

    # Remove all rows that don't match the recommendation regex. This also
    # removes the header row, which is still present as a data row.
    all_tables_df = all_tables_df[all_tables_df['Number'].str.match(r'\d{3}/\d{2}')]

    # Clean up response_category column
    all_tables_df['response_category'] = all_tables_df['response_category'].str.replace(
        'Accepted and being implemented', 'Accepted', regex=False
    )

    return all_tables_df

# Rather than come up with a smart way of detecting the right tables, each
# report was read by hand and the tables to use are listed explicitly.
# "tables_to_read" holds the positions of the tables within the document and
# "id" is the header of the column containing the recommendation number.

export_config = [
    {
        "filename": "2023-06-30.SR.Report.to.Minister.docx",
        "tables_to_read": [0, 2, 4, 6],
        "id": "Number"
    },
    {
        "filename": "2022-06-30.SR.Report.to.Minister.docx",
        "tables_to_read": [1, 3, 5],
        "id": "Rec no."
    }
]

# Combine the categorizations from all reports into a single frame
official_responses =pd.concat([process_tables(config) for config in export_config], ignore_index=True)

# Drop empty categories

official_responses = official_responses[official_responses['response_category'] != '']

## Merge examples and all recommendations together

In [None]:
# Merge ingrid and official responses
# No join keys are given, so the merge uses the columns both frames share
# (Number and response_category). The indicator column records which side
# each row came from.

example_categories = official_responses.merge(ingrid_categories[['Number', 'response_category']], how='outer', indicator='origin')

# Rows found only in the official reports are labelled 'official'; rows found
# in Ingrid's examples (including rows present on both sides) become 'ingrid'.
example_categories['origin'] = example_categories['origin'].apply(lambda x: 'official' if x == 'left_only' else 'ingrid')

example_categories

In [None]:
# Merge example categories with TAIC recommendations
# A left merge keeps every cleaned recommendation; those without an example
# are left with a missing response_category and origin.

recommendations_df = cleaned_taic_recommendations.merge(example_categories, how='left')

# Keep only the columns used in the rest of the notebook
recommendations_df = recommendations_df[['report_id', 'Number', 'Recommendation', 'Reply Text' ,'Made', 'Recipient', 'response_category', 'origin']]

# change all columns to lower case and replace spaces with _
recommendations_df.columns = recommendations_df.columns.str.lower().str.replace(' ', '_')

recommendations_df

# Performing categorization



## Assign Categories



In [None]:
def assign_response_category(response, recommendation, recommendation_num):
    """Ask the LLM which response category a reply belongs to.

    Args:
        response: Text of the reply to the recommendation.
        recommendation: Text of the recommendation. Accepted so callers can
            pass it, but it is not currently included in the prompt.
        recommendation_num: Recommendation number quoted in the prompt.

    Returns:
        The category name exactly as spelled in ``response_categories`` (or
        "N/A"), or None when the model's answer is not one of them.
    """
    categories = '\n'.join(f"{element['category']} - {element['definition']}" for element in response_categories)

    # The category count is taken from the list so the prompt cannot disagree
    # with the categories it enumerates.
    system_prompt = f"""
You are helping me put responses into categories.

These responses are to recommendations that were made in a transport accident investigation report. These recommendations are issued directly to a particular party.

There are {len(response_categories)} categories:

{categories}

However if there are responses that don't fit into any of the categories then you can put them as N/A. These may be responses that request further information or want recommendation to be sent elsewhere.

Your response should just be the name of the category with nothing else.
"""
    user_prompt = f"""
Which category is this response in?

"
{response}
"

in regards to recommendation {recommendation_num}
"""

    openai_response = openAICaller.query(
        system_prompt,
        user_prompt,
        model = "gpt-4",
        temp = 0
    )

    valid_categories = [category['category'] for category in response_categories] + ['N/A']
    canonical_names = {name.casefold(): name for name in valid_categories}

    # Tolerate harmless formatting differences in the answer (surrounding
    # whitespace or quotes, a trailing full stop, different letter case) and
    # always hand back the canonical spelling.
    if isinstance(openai_response, str):
        answer = openai_response.strip().strip('"\'.').strip().casefold()
        if answer in canonical_names:
            return canonical_names[answer]

    print(f"Did not match any of the categories - {openai_response}")
    return None

In [None]:
# Try the categorization on a reproducible random sample of 25 recommendations
sample_recommendations_df = recommendations_df.sample(25, random_state=42)

# assign_response_category expects (response, recommendation, recommendation_num)
sample_recommendations_df['response_category'] = sample_recommendations_df.apply(lambda x: assign_response_category(x['reply_text'], x['recommendation'], x['number']), axis=1)


In [None]:

# Only the recommendations that already have a known category can be used to
# check the LLM, so keep those and store its answer next to the known one.
examples_recommendations_df = recommendations_df[recommendations_df['response_category'].notna()]
examples_recommendations_df['response_category_inferred'] = examples_recommendations_df.apply(
    lambda row: assign_response_category(row['reply_text'], row['recommendation'], row['number']),
    axis=1,
)

In [None]:
# Compare the results

def printout_accuracy(df):
    """Report how often the inferred category agrees with the known one.

    Compares ``response_category`` with ``response_category_inferred``
    case-insensitively, prints the share of matching rows and a count of each
    kind of mismatch, and returns the mismatching rows.

    Args:
        df: Frame with ``response_category`` and ``response_category_inferred``
            columns. A boolean ``matching`` column is added to it in place.

    Returns:
        Copy of the mismatching rows with an extra ``change`` column of the
        form "<known> -> <inferred>".
    """
    # Compare columns case insensitive
    df['matching'] = df['response_category'].str.lower() == df['response_category_inferred'].str.lower()

    changed_df = df[~df['matching']].copy()

    total = len(df)
    if total:
        print(f"{100* (total-len(changed_df)) / total:.2f}% accuracy when comparing {total} responses")
    else:
        # Nothing to compare; avoid dividing by zero.
        print("No responses to compare")

    # A failed categorization leaves the inferred value missing. Show it as
    # "None" so it is counted in the summary instead of being dropped.
    inferred = changed_df['response_category_inferred'].fillna('None')
    changed_df['change'] = changed_df['response_category'] + ' -> ' + inferred

    value_counts = changed_df['change'].value_counts()

    # `display` only exists inside IPython/Jupyter; fall back to print elsewhere.
    try:
        show = display
    except NameError:
        show = print
    show(value_counts)

    return changed_df


# Print the accuracy summary and keep the mismatching rows, reduced to the
# columns needed to review each disagreement by hand.
mismatched_categories = printout_accuracy(examples_recommendations_df)[['report_id', 'number', 'recommendation', 'reply_text', 'response_category', 'response_category_inferred', 'origin']]

mismatched_categories


In [None]:
# Given the good results I will do a complete run through for their sake

# Categorize every reply with the LLM (one call per row). This replaces the
# example categories that were merged into response_category earlier.
recommendations_df['response_category'] = recommendations_df.apply(lambda x: assign_response_category(x['reply_text'], x['recommendation'], x['number']), axis=1)

recommendations_df[["number", "report_id", "recommendation", "reply_text", "response_category"]].to_excel('recommendation_categories.xlsx', index=False)

In [None]:
# Count the non-missing values of every column for each assigned category
recommendations_df.groupby('response_category').count()

# Unsupervised classification

I am going to try out some unsupervised classifications.

I also want to try clustering of the responses.



## Lbl2Vec

After working with this for a bit I could not get it to work, due to this error: https://github.com/sebischair/Lbl2Vec/issues/4. Until this is resolved I will have to move on to other techniques.

This technique will involve me defining some categories based off keywords. It will then learn from unlabeled texts. This unlabeled text is exactly what I have here.

Two concerns, I have a very small data set and I have quite short text in comparison.

In [None]:
# One row per class: its name and the seed keywords that describe it
labels = pd.DataFrame({
    'class_name': ["Accepted", "Under consideration", 'Rejected'],
    'keywords': [
        ['accepted', 'implemented', 'accept', 'implement'],
        ['under', 'consideration', 'consider'],
        ['rejected'],
    ],
})

# How many keywords each class has
labels['numbers_of_keywords'] = labels['keywords'].map(len)

labels

In [None]:
import gensim
importlib.reload(gensim)

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from gensim.models.doc2vec import TaggedDocument

def tokenize(doc):
    """Return the tokens of a document text string.

    The text is first passed through ``strip_tags`` to remove markup tags and
    then through ``simple_preprocess``, which yields a list of lowercase
    tokens. Accents are removed (``deacc=True``) and tokens shorter than 2 or
    longer than 15 characters are ignored.
    """
    without_tags = strip_tags(doc)
    return simple_preprocess(without_tags, deacc=True, min_len=2, max_len=15)

# Split into train and test
test_df = cleaned_taic_recommendations.sample(10, random_state=42)
# Every recommendation that was not sampled for testing is used for training
train_df = cleaned_taic_recommendations[~cleaned_taic_recommendations['Number'].isin(test_df['Number'])]

test_df['data_set_type'] = "test"
train_df['data_set_type'] = "train"

# concat train and test data
full_corpus = pd.concat([train_df,test_df]).reset_index(drop=True)


# tokenize and tag documents for Lbl2Vec training
# each document is tagged with its row position in full_corpus
full_corpus['tagged_responses'] = full_corpus.apply(lambda row: TaggedDocument(tokenize(row['Reply Text']), [str(row.name)]), axis=1)

# add doc_key column
full_corpus['doc_key'] = full_corpus.index.astype(str)

display(full_corpus)

In [None]:
from lbl2vec import Lbl2Vec

# Build the model from the keyword lists and the tagged training documents
Lbl2Vec_model = Lbl2Vec(
    keywords_list=labels['keywords'].tolist(),
    tagged_documents=full_corpus.loc[full_corpus['data_set_type'] == 'train', 'tagged_responses'],
    label_names=labels['class_name'].tolist(),
    similarity_threshold=0.43,
    min_num_docs=5,
    epochs=10,
)

# Train it
Lbl2Vec_model.fit()

## Clustering

I will just try to cluster them using normal techniques.

I tried using clustering which was a bit of fun but not necessary as I have gotten good results with LLM. 

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Prepare the reply texts for clustering: lowercase them, strip full stops and
# commas, tokenize and remove English stop words.
clustered_recommendations = taic_recommendations.copy()

clustered_recommendations = clustered_recommendations[clustered_recommendations['Made'] >= '2010-01-01']

# A missing reply is a float NaN and has no .lower(); drop such rows here
# rather than relying on an earlier cell having removed them.
clustered_recommendations = clustered_recommendations.dropna(subset=['Reply Text'])


cleaned_responses = [response.lower().replace('.', '').replace(',', '') for response in clustered_recommendations['Reply Text']]


nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
tokenized_responses = [word_tokenize(response) for response in cleaned_responses]
filtered_responses = [[word for word in tokenized_response if word not in stop_words] for tokenized_response in tokenized_responses]


# Join the remaining tokens back into one string per reply for the vectorizer
preprocessed_responses = [' '.join(filtered_response) for filtered_response in filtered_responses]


In [None]:
# Fit the TF-IDF vectorizer on the preprocessed replies and transform them
# into a matrix with one row per reply.

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_responses)


In [None]:

# Number of clusters to look for
num_clusters = 3

# Fit k-means on the TF-IDF matrix with a fixed seed and read back the
# cluster label given to each reply
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)
cluster_labels = kmeans.labels_

# Silhouette score of the resulting clustering
silhouette_avg = silhouette_score(tfidf_matrix, cluster_labels)
print("Average silhouette_score:", silhouette_avg)


In [None]:

# The labels are in the same order as the rows of clustered_recommendations,
# because the TF-IDF matrix was built from its 'Reply Text' column.
clustered_recommendations['cluster'] = cluster_labels


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce dimensionality for visualization: project the TF-IDF vectors onto
# their first two principal components (PCA is given a dense array here).
pca = PCA(n_components=2)
tfidf_matrix_pca = pca.fit_transform(tfidf_matrix.toarray())

# Visualize clusters: one point per reply, coloured by its k-means label
plt.scatter(tfidf_matrix_pca[:, 0], tfidf_matrix_pca[:, 1], c=cluster_labels, cmap='viridis')
plt.title('Clustering of Responses')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Cluster')
plt.show()


In [None]:
# Check with the examples from Ingrid

# predict() returns one label per input row, so all replies are vectorized in
# a single call and the column holds plain integer cluster ids rather than
# one-element arrays.
ingrid_categories['cluster'] = kmeans.predict(tfidf_vectorizer.transform(ingrid_categories['Reply Text']))

ingrid_categories