# What

As establised in this [notebook](./safey_themes_from_safety_issues.ipynb). BERTopic seems to be the most promising method for generating safety themes from safety issues.

There are a few problems that need to be address.
- Lots of outliers
- only 3 topics being generated
- The topics are two general
- The topic representations and clustering doesnt really make any sense.

## Modules

In [2]:
# local
from engine.OpenAICaller import openAICaller

# third parties

import regex as re
import yaml
import pandas as pd
import numpy as np

import plotly.express as px

from dotenv import load_dotenv

import voyageai
import openai
import tiktoken

from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.representation import OpenAI
from cuml.cluster import HDBSCAN
# from cuml.metrics.cluster import silhouette_score
from cuml.manifold import UMAP

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from tqdm.auto import tqdm
tqdm.pandas()

from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Alignment

# builtin
import os
from itertools import product
import multiprocessing
from collections import namedtuple
import copy


pd.options.mode.copy_on_write = True  
openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))



# Getting safety issue data

In [22]:
safety_issues_df = pd.read_csv('safety_issues.csv')

# Confirm it has the right columns report_id, si and mode

if not safety_issues_df.columns.isin(['report_id', 'si', 'mode']).any():
    print("Safety issues dataset is missing columns")
    del safety_issues_df

In [33]:
tokenizer = tiktoken.encoding_for_model('gpt-4')
safety_issues_df['si'].apply(lambda x: len(tokenizer.encode(x))).sum()

21789

# Getting potential labels

As the problems that are identfied in https://github.com/1jamesthompson1/TAIC-report-summary/issues/144#issuecomment-2132416143 are proving to be difficult. I am going to try and do some zero shot classification

In [3]:
# Reading the watchlisted safety issues from the TAIC website 2024

safety_issue_watchlist_2024 = yaml.safe_load(open('watchlist_2024.yaml', 'r'))['watchlist']

safety_issue_watchlist_2024 = pd.DataFrame(safety_issue_watchlist_2024)

safety_issue_watchlist_2024

Unnamed: 0,title,description
0,Safety for workers in the rail corridor,The Transport Accident Investigation Commissio...
1,The road-rail interface,Inquiries have highlighted safety improvements...
2,Recreational boat users: essential knowledge a...,Strategies focus on encouraging self-reliance ...
3,Recreational boat users: impairment from subst...,The effects of drugs and alcohol on cognitive ...
4,Technologies to track and to locate,Tracking and locating technologies can improve...
5,Robinson helicopters: mast bumping accidents i...,"Mast bumping in Robinson helicopters, caused b..."


In [54]:
vo = voyageai.Client()

safety_issue_watchlist_2024['si_with_desc_embedding'] = vo.embed(
    [
        f"{title}: {description}" for title, description in zip(safety_issue_watchlist_2024['title'], safety_issue_watchlist_2024['description'])
    ],
    model = 'voyage-large-2-instruct'
).embeddings
safety_issue_watchlist_2024['si_embedding'] = vo.embed(
    safety_issue_watchlist_2024['title'].tolist(),
    model = 'voyage-large-2-instruct'
).embeddings

In [56]:
safety_issue_watchlist_2024.to_pickle('safety_issue_watchlist_2024.pkl')

# Getting embeddings to be used for clustering

In [2]:
# Move embedding_files to a folder

embeddings_folder_name = 'embeddings'

os.makedirs(embeddings_folder_name, exist_ok=True)

for file in os.listdir():
    if file.endswith("embeddings.pkl"):
        os.rename(file, os.path.join(embeddings_folder_name, file))

os.listdir(embeddings_folder_name)

['voyageai_only_exact_embeddings.pkl',
 'QWEN_embeddings.pkl',
 'openai_embeddings.pkl',
 'voyageai_embeddings.pkl',
 'SFR_embeddings.pkl',
 'voyageai_reccontext_embeddings.pkl',
 'gtelarge_embeddings.pkl']

In [3]:
def column_to_2darray(column):
    return np.array([np.array(x) for x in column.to_numpy()])


embeddings_files = [os.path.join(embeddings_folder_name,file) for file in os.listdir(embeddings_folder_name) if file.endswith("embeddings.pkl")]

all_embeddings = {os.path.splitext(file)[0].replace(f"{embeddings_folder_name}/", "").replace("_embeddings", ""): pd.read_pickle(file) for file in embeddings_files}

embeddings_2darrays = {k: column_to_2darray(v['si_embedding']) for k, v in all_embeddings.items()}

In [4]:
all_embeddings = {k: v for k, v in all_embeddings.items() if k in ['voyageai', 'voyageai_reccontext']}
embeddings_2darrays = {k: embeddings_2darrays[k] for k in all_embeddings.keys()}

In [5]:
tokenizer = tiktoken.encoding_for_model('gpt-4')
all_embeddings['voyageai']['si'].apply(lambda x: len(tokenizer.encode(x))).mean()

38.90892857142857

# BERTopic models

I played around abit manually trying to find the best ones. However the search space is just too large.

I have found out what I can tweak but in ranges of reasonble values and going to let it automatically go through and searh for them.

The list of thigns to tweak is:

- UMAP and the number of components and neighbors. This is the dimension reduction step
- HDBSCAN and the min_cluster_size. This is the clusterting algorithm
- Whether it is merged from individual models or trained on all embeddings at once.
- The embeddings that it is trained on.

## Needed functions

In [9]:

openai_base_representation_model = OpenAI(
    model = 'gpt-4o',
    client = openai_client,
    chat = True,
    generator_kwargs = {'temperature': 0},
    nr_docs = 50
)

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [20]:
def runBERTopic(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, reduce_outliers=True, embeddings_array = None):

    topic_model = BERTopic(
        representation_model = representation_model,
        umap_model = umap_model,
        hdbscan_model = hdbscan_model,
        calculate_probabilities=True)

    if embeddings_name is not None:
        topics, probs = topic_model.fit_transform(
            df[docs_name],
            embeddings_array if not embeddings_array is None else column_to_2darray(df[embeddings_name]))
    else:
        topics, probs = topic_model.fit_transform(df[docs_name])
        
    if reduce_outliers:
        topics = topic_model.reduce_outliers(
            documents=df[docs_name].to_list(),
            topics=topics, 
            probabilities=probs,
            strategy="probabilities")

        topic_model.update_topics(
            df[docs_name].to_list(),
            topics=topics,
            representation_model=representation_model)
        
    df['topic'] = topics
    
    df = pd.concat([df, pd.DataFrame(probs)], axis=1)

    return topic_model, df

def assign_topics_and_probabilities(df, model, embeddings):

    cleaned_df = df[['report_id', 'si', 'mode', 'si_embedding']]

    documents = cleaned_df['si'].to_list()

    transform = model.transform(documents, embeddings)

    cleaned_df['topic'] = transform[0]

    return pd.concat([cleaned_df.reset_index(drop=True), pd.DataFrame(transform[1])], axis=1)

def run_merged_model(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, min_similarity, embeddings_array):
    mode_groups = df.groupby('mode')

    modes_dfs = [mode_groups.get_group(x).reset_index(drop=True) for x in mode_groups.groups]

    mode_arrays = [embeddings_array[rows] for i, rows in mode_groups.groups.items()]

    try:
        umap_model.n_components = min(umap_model.n_components, min([arr.shape[0] for arr in mode_arrays])-1)
    except:
        print("base model")


    models = [runBERTopic(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, False, array)[0] for df, array in zip(modes_dfs, mode_arrays)]

    merged_model = BERTopic.merge_models(models, min_similarity=min_similarity)

    merged_df = pd.concat(modes_dfs, ignore_index=True)
    
    merged_df = merged_model.get_document_info(merged_df[docs_name], merged_df)

    return merged_model, merged_df

def add_config_columns(df):

    def get_cluster_config(x):
        if isinstance(x, HDBSCAN):
            return (x.min_cluster_size, x.min_samples, x.max_cluster_size)
        elif isinstance(x, KMeans):
            return (x.n_clusters)

    df['cluster_config'] = df['hdbscan_model'].apply(get_cluster_config)

    def get_dimension_reduction_config(x):
        if isinstance(x, UMAP):    
            return (x.n_components, x.n_neighbors)
        else:
            return (None)

    df['dimension_reduction_config'] = df['umap_model'].apply(get_dimension_reduction_config)   

    return df

def hyper_parameter_search(embeddings_arrays, embeddings_dfs, UMAP_models, HDBSCAN_models, merged_ranges, current_df = None):
    print("Performing hyper parameter search of BERTo['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity']pic models...")

    ###
    ### Get model arguements ready into a dataframe
    ###

    model_types = ['merged', 'group']

    df = pd.DataFrame(
        list(product(UMAP_models, HDBSCAN_models, embeddings_dfs, model_types)),
        columns=['umap_model', 'hdbscan_model', 'embedding_type', 'model_type']
    )

    df = add_config_columns(df)

    df['merged_min_similarity'] = df['model_type'].apply(lambda x: merged_ranges['min_similarity'] if x == 'merged' else None)
    df = df.explode('merged_min_similarity', ignore_index=True)
    
    df['embedding_2darray'] = df['embedding_type'].apply(lambda x: embeddings_arrays[x])
    df['embedding_df'] = df['embedding_type'].apply(lambda x: embeddings_dfs[x])

    print(f"There are {df.shape[0]} models to run with given aruements")

    ###
    ### Compare arguments dataframe with existing results df and see what rows have already been calculated.
    ###

    # Find rows that are in df but not in current_df these are the new_rows that need to be computed

    matching_columns = ['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity']
    current_df = pd.DataFrame(columns=matching_columns) if current_df.empty else current_df

    new_rows = df.merge(current_df,
                         on=matching_columns,
                         how='left', indicator=True, suffixes=(None, "_to_delete")).query('_merge == "left_only"').drop(columns=['_merge'])
    # Delete all columns that are full of NaN
    new_rows = new_rows.dropna(axis=1, how='all')

    if new_rows.shape[0] == 0:
        print("No new models to run")
        return current_df

    ### 
    ### Run model ###
    ###

    print(f"Only {new_rows.shape[0]} new models to run")

    def run_model(row): 
        if row['model_type'] == 'group':
            return runBERTopic(
                row['embedding_df'],
                'si',
                'si_embedding',
                None,
                row['umap_model'],
                row['hdbscan_model'],
                False,
                row['embedding_2darray']) 
        else:
            return run_merged_model(
                row['embedding_df'],
                'si',
                'si_embedding',
                None,
                row['umap_model'],
                row['hdbscan_model'],
                row['merged_min_similarity'],
                row['embedding_2darray']
            )

    new_rows['model'] = new_rows.progress_apply(run_model, axis=1)

    new_rows.to_pickle('bertopic_models_temp.pkl')
    
    new_rows['embedding_df'] = new_rows['model'].apply(lambda x: x[1])

    new_rows['model'] = new_rows['model'].apply(lambda x: x[0])

    df = pd.concat([current_df, new_rows], ignore_index=True)    

    print(f"With new rows added to current df there are {df.shape[0]}")

    return df

## Performing search

The search will be done using the `perform_hyper_parameter_search` function with the results saved ina pickle file.

In [6]:
calculated_results = pd.read_pickle('hyper_parameter_search_results.pkl')

In [13]:
UMAP_models = [
    UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42)
    for n_neighbors, n_components in
    product(range(3,6), range(9,25, 2))
]+ [BaseDimensionalityReduction()]

HDBSCAN_models = [
    HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_sampe_size, max_cluster_size=max_cluster_size, metric="euclidean", cluster_selection_method='eom', prediction_data=True)
    for min_cluster_size, min_sampe_size , max_cluster_size in
    product(range(5,15,5), range(5,15,5), [15, 30, 50])
]

KMEANS_models = [
    KMeans(n_clusters=n_clusters, random_state=42)
    for n_clusters in
    range(5,20,4)
]

clustering_models = HDBSCAN_models
# remove all hdbscan_moels that have the same min_cluster_size and min_samples

clustering_models = [x for x in HDBSCAN_models if x.min_cluster_size != x.min_samples]
display(clustering_models)

results = hyper_parameter_search(
    embeddings_2darrays,
    all_embeddings,
    UMAP_models,
    clustering_models,
    merged_ranges = {'min_similarity': [e / 100 for e in list(range(90,100, 2))]},
    current_df = calculated_results)

# results.to_pickle('hyper_parameter_search_results.pkl')

results

# del calculated_results

[HDBSCAN(), HDBSCAN(), HDBSCAN(), HDBSCAN(), HDBSCAN(), HDBSCAN()]

Performing hyper parameter search of BERTo['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity']pic models...
There are 1800 models to run with given aruements
Only 600 new models to run


  0%|          | 0/600 [00:00<?, ?it/s]

base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
base model
With new rows added to current df there are 17145


Unnamed: 0,cluster_config,dimension_reduction_config,embedding_type,model_type,merged_min_similarity,umap_model,hdbscan_model,embedding_2darray,embedding_df,model
0,"(5, 10, 0)","(5, 3)",voyageai,merged,0.9,UMAP(),HDBSCAN(),"[[0.00557063240557909, 0.008644572459161282, -...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
1,"(5, 10, 0)","(5, 3)",voyageai,merged,0.92,UMAP(),HDBSCAN(),"[[0.00557063240557909, 0.008644572459161282, -...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
2,"(5, 10, 0)","(5, 3)",voyageai,merged,0.94,UMAP(),HDBSCAN(),"[[0.00557063240557909, 0.008644572459161282, -...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
3,"(5, 10, 0)","(5, 3)",voyageai,merged,0.96,UMAP(),HDBSCAN(),"[[0.00557063240557909, 0.008644572459161282, -...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
4,"(5, 10, 0)","(5, 3)",voyageai,merged,0.98,UMAP(),HDBSCAN(),"[[0.00557063240557909, 0.008644572459161282, -...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
...,...,...,...,...,...,...,...,...,...,...
17140,"(10, 5, 15)",,voyageai_reccontext,merged,0.92,<bertopic.dimensionality._base.BaseDimensional...,HDBSCAN(),"[[-0.0004589550953824073, 0.03601374477148056,...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
17141,"(10, 5, 15)",,voyageai_reccontext,merged,0.94,<bertopic.dimensionality._base.BaseDimensional...,HDBSCAN(),"[[-0.0004589550953824073, 0.03601374477148056,...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
17142,"(10, 5, 15)",,voyageai_reccontext,merged,0.96,<bertopic.dimensionality._base.BaseDimensional...,HDBSCAN(),"[[-0.0004589550953824073, 0.03601374477148056,...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."
17143,"(10, 5, 15)",,voyageai_reccontext,merged,0.98,<bertopic.dimensionality._base.BaseDimensional...,HDBSCAN(),"[[-0.0004589550953824073, 0.03601374477148056,...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_..."


In [30]:
display(results.loc[17100, ])
display(results.loc[17100, ]['model'].get_topic_info())
check_mode_cluster_distribution(results.loc[17100, ]['embedding_df'])

cluster_config                                                      (5, 10, 15)
dimension_reduction_config                                              (23, 5)
embedding_type                                                         voyageai
model_type                                                               merged
merged_min_similarity                                                      0.96
umap_model                                                               UMAP()
hdbscan_model                                                         HDBSCAN()
embedding_2darray             [[0.00557063240557909, 0.008644572459161282, -...
embedding_df                      report_id                                 ...
model                         BERTopic(calculate_probabilities=True, ctfidf_...
Name: 17100, dtype: object

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,460,-1_the_of_to_and,"[the, of, to, and, in, not, for, that, was, pi...",
1,0,12,0_the_gear_landing_to,"[the, gear, landing, to, nose, down, of, in, a...",
2,1,10,1_the_of_safety_spark,"[the, of, safety, spark, as, springs, in, defe...",
3,2,9,2_atc_their_of_the,"[atc, their, of, the, aerodrome, avsec, airpor...",
4,3,13,0_braking_the_conditions_not,"[braking, the, conditions, not, systems, brake...",
5,4,13,1_the_station_of_to,"[the, station, of, to, wellington, risk, train...",
6,5,8,2_road_level_crossings_the,"[road, level, crossings, the, and, vehicles, f...",
7,6,14,0_the_bridge_not_in,"[the, bridge, not, in, of, and, to, did, team,...",
8,7,13,1_the_easy_rider_was,"[the, easy, rider, was, to, vessel, not, for, ...",
9,8,8,2_the_pilotage_to_of,"[the, pilotage, to, of, ports, on, and, failur...",


mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,175,151,134
0,12,13,14
1,10,13,13
2,9,8,8


## Parsing results

All of the models dont have any metrics to them. But topic models in general do have ways of measuring their quality in various ways. Using something like https://github.com/MIND-Lab/OCTIS would help use the various evalulation tools.

There are certain metrics I could look at and these are: https://github.com/MIND-Lab/OCTIS#available-metrics

Here are the metrics I will look at.

| metric | description |
| ------ | ----------- |
| outliers_percent | This measures how many outliers are found when doing the clustering. It is only relevant when working with HDBSCAN as other clutsering methods don't identify outliers. |
| num_topics | It is important that there are a reasonable amount of topics created. This reasonable number is somewhere around 15. |
| topic_membership_counts_std | There is a problem of having a few themes that have most of the issues and then some reall small ones. This standard deviation combined with num_topics can help have a uniformly distributed amount of issues. | 

### DIY

These are my initial attempts at my own metrics

In [11]:
def get_stats(df):

    df['topic_membership_counts'] = df['model'].apply(lambda x: x.get_topic_info()['Count'].to_list())
    df['topic_membership_counts_std'] = df['topic_membership_counts'].apply(np.std)
    df['num_topics'] = df['topic_membership_counts'].apply(len)
    # Count percent of issues that have topic as -1 in embedding_df
    def temp(x):
        try:
            return (x['topic'] == -1).mean() * 100
        except:
            display(x)
    df['outlier_percent'] = df['embedding_df'].apply(temp)

    df['cluster_model_type'] = df['hdbscan_model'].apply(lambda x: "HDBSCAN" if isinstance(x, HDBSCAN) else "Kmeans")
    df['dimmension_reduction_type'] = df['umap_model'].apply(lambda x: "UMAP" if isinstance(x, UMAP) else "Base")

    df['id'] = df.index

    return df

In [15]:
results = get_stats(results)
results

NameError: name 'results' is not defined

### Topic Cohernece

This is the measure of much each issue is like the other issues within a topic.

As per recommendation from the creater of BERTopic I will use NPMI.

Given that OCTIS is not being actively maintained I will have to ahve a look at gensim

#### Solihouette score

This is the measure of much safety issues are like the cluster compared to other clusters. The maker of BERTopic advises against this but provides a simple implementation here  https://github.com/MaartenGr/BERTopic/issues/428#issuecomment-1027647827

Becuase it is taking so long for these too be computed I am going to filer out the undesirable ones.

In [52]:
silhouette_filtered = results.query('num_topics > 5 & outlier_percent <= 10')

silhouette_filtered

NameError: name 'results' is not defined

In [3]:
silhouette_filtered = pd.read_pickle('temp.pkl')

In [3]:
## This was here as there was problems with the IDS no matching.
def update_ids(df, df_to_be_updated):
    subset_columns = ['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity', 'cluster_model_type', 'dimmension_reduction_type']  # Specify the columns you want to use for matching

    df = df[subset_columns+["id"]]
    df_to_be_updated = df_to_be_updated[subset_columns+["id"]]

    # Merge the DataFrames based on the subset of columns
    merged_results = pd.merge(df, df_to_be_updated, on=subset_columns, how='inner')
    merged_results

    # Find ones where the id mismtaches?
    mismatched_ids = merged_results[merged_results['id_x'] != merged_results['id_y']]

    display(mismatched_ids.query('id_x == 4969 | id_y == 4969'))

    id_mapping = dict(zip(df.apply(lambda row: tuple(row[subset_columns]), axis=1), df['id']))
    
    df_to_be_updated['id'] = df_to_be_updated.apply(lambda row: id_mapping.get(tuple(row[subset_columns]), row['id']), axis=1)

    return df_to_be_updated['id']
    
display(silhouette_filtered.query('id == 4969'))
display(loaded_results.query('id in [4969, 10835]'))

loaded_results['id'] = update_ids(silhouette_filtered, loaded_results)

loaded_results.query('id in [4969, 10835]')



NameError: name 'silhouette_filtered' is not defined

In [4]:
def get_accepted_indicies(df):
    df['accepted_indicies'] = df.progress_apply(lambda row: [index for index, topic in enumerate(row['embedding_df']['topic'].tolist()) if topic != -1], axis = 1)

    return df

def silhouette_score_row(row):
    if len(row['accepted_indicies']) < 5:
        return "No accepted_indicies"
    
    X = row['reduced_embeddings']

    labels = row['embedding_df']['topic'][row['accepted_indicies']]

    if len(labels.unique()) < 2:
        return "Too few topics"

    print(row)

    return silhouette_score(X, labels)

def add_silhouette_score(df, preivously_calculated):
    if preivously_calculated is None:
        new_rows = df
    else:
        new_rows = df.merge(preivously_calculated,
                            on=['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity', 'cluster_model_type', 'dimmension_reduction_type'],
                            how='left', indicator=True, suffixes=(None, "_to_delete")).query('_merge == "left_only"').drop(columns=['_merge'])
        # Delete all columns that are full of NaN
        new_rows = new_rows.dropna(axis=1, how='all')

    print(f"Total {df.shape[0]} rows needing silhouette scores")
    if new_rows.shape[0] == 0:
        print("No new silheuttes to run")
        return preivously_calculated
    else:
        print(f"Only {new_rows.shape[0]} new silhuettes to run")

    new_rows = get_accepted_indicies(new_rows)

    new_rows['reduced_embeddings'] = new_rows.progress_apply(
        lambda row: 
        row['embedding_2darray'] if not isinstance(row['umap_model'], UMAP) else row['umap_model'].fit_transform(row['embedding_2darray'])[row['accepted_indicies']]
        if len(row['accepted_indicies']) >= 5 else
        "Too few accepted"
        ,axis = 1
    )

    
    new_rows['silhouette_score'] = new_rows.progress_apply(silhouette_score_row, axis = 1)

    if preivously_calculated is None:
        return new_rows
    else:
        return pd.concat([preivously_calculated, new_rows], ignore_index=True)


try:
    loaded_results = pd.read_pickle('silhouette_scores.pkl')
except FileNotFoundError as e:
    print(e)
    loaded_results = None


calculated_results = add_silhouette_score(silhouette_filtered, loaded_results)

calculated_results.to_pickle('silhouette_scores.pkl')

calculated_results

Total 226 rows needing silhouette scores
No new silheuttes to run


Unnamed: 0,umap_model,hdbscan_model,embedding_type,model_type,merged_min_similarity,embedding_df,model,cluster_config,dimension_reduction_config,embedding_2darray,topic_membership_counts,topic_membership_counts_std,num_topics,outlier_percent,id,accepted_indicies,reduced_embeddings,silhouette_score,cluster_model_type,dimmension_reduction_type
0,UMAP(),HDBSCAN(),openai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 5, 0)","(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[36, 97, 93, 11, 39, 25, 21, 16, 14, 12, 12, 1...",36.469924,17,1.607143,1,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14...","[[-2.5475276, -2.985186, -0.879056, -0.0050687...",0.161992,HDBSCAN,UMAP
1,UMAP(),HDBSCAN(),openai,merged,0.96,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 5, 0)","(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[55, 189, 10, 6, 39, 25, 21, 16, 14, 12, 12, 1...",40.730654,19,0.357143,3,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-2.449398, -3.0897288, -0.9104028, 0.1285584...",0.077534,HDBSCAN,UMAP
2,UMAP(),HDBSCAN(),voyageai,merged,0.9,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 5, 0)","(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 203, 22, 18, 16, 21, 10, 9, 76, 7, 2...",55.305392,13,4.464286,6,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-3.759121, 3.431325, 1.8554363, -1.8523803, ...",-0.028394,HDBSCAN,UMAP
3,UMAP(),HDBSCAN(),voyageai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 5, 0)","(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 185, 22, 18, 16, 21, 10, 9, 76, 7, 6...",48.268779,16,4.464286,7,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-3.9462743, 3.2983723, 2.3500576, -1.100832,...",-0.012372,HDBSCAN,UMAP
4,UMAP(),HDBSCAN(),voyageai,merged,0.94,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 5, 0)","(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 46, 22, 18, 16, 21, 10, 9, 37, 7, 6,...",28.783814,21,3.392857,8,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-4.651314, 3.0769386, 1.6765718, -1.3375773,...",0.030043,HDBSCAN,UMAP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10355,UMAP(),HDBSCAN(),voyageai_reccontext,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 10, 30)","(23, 5)","[[-0.0004589550953824073, 0.03601374477148056,...","[342, 23, 57, 13, 57, 7, 6, 6, 5, 23, 21]",93.792280,11,6.785714,16504,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[2.416909, 2.1022851, -0.63195777, -0.6580636...",0.150917,HDBSCAN,UMAP
10356,UMAP(),HDBSCAN(),voyageai_reccontext,merged,0.94,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 10, 30)","(23, 5)","[[-0.0004589550953824073, 0.03601374477148056,...","[342, 23, 58, 13, 11, 7, 6, 6, 5, 23, 21, 26, 19]",87.344113,13,6.428571,16505,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[1.114247, 1.7201066, -1.5075276, 0.35830307,...",0.256466,HDBSCAN,UMAP
10357,UMAP(),HDBSCAN(),voyageai_reccontext,merged,0.96,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 10, 30)","(23, 5)","[[-0.0004589550953824073, 0.03601374477148056,...","[298, 23, 13, 13, 11, 7, 6, 6, 5, 27, 26, 21, ...",63.758069,19,6.428571,16506,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-1.025646, -0.7558737, -1.3521194, -0.066558...",0.241971,HDBSCAN,UMAP
10358,UMAP(),HDBSCAN(),voyageai_reccontext,merged,0.98,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...","(5, 10, 30)","(23, 5)","[[-0.0004589550953824073, 0.03601374477148056,...","[353, 23, 13, 13, 11, 7, 6, 6, 5, 29, 21, 20, ...",80.301249,17,6.428571,16507,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-1.017009, -1.3334711, 1.6844773, 0.7099472,...",0.181686,HDBSCAN,UMAP


## Interpreting results

The goal here is going to be deciding what out of the thousands of models are the bests ones.

There are going to be a few ways that I come at the notion of "Useful accurate theme generation" and these were generated in the [Previous section](#parsing-results)

## Finding potential models

In [5]:
def check_mode_cluster_distribution(df):
    safety_issues_df_topic_mode = df.pivot_table(index='topic', columns='mode', values='si', aggfunc='count').fillna(0)
    
    return safety_issues_df_topic_mode

def visualize_mode_distrbution_across_topics(df):
    table = check_mode_cluster_distribution(df)
    df_long = table.reset_index().melt(id_vars='topic', var_name='mode', value_name='value')
    fig = px.bar(df_long, x='topic', y='value', color='mode', barmode='group', title='Distribution of Modes for Each Topic')
    fig.show()

In [6]:
def make_visualization(model, df, save = False, name = 'topic model visual'):

    array_embeddings = column_to_2darray(df['si_embedding'])

    reduced_array_embeddings = UMAP(n_neighbors=7, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    if save:

        with open(name, 'w',  encoding='utf-8') as f:
            visualization.write_html(f)

    return visualization


This is where I will provide infomration on decisions on what I have done:

- Kmeans seems to struggle to make meaning full clusters atleast from the topic representation. I looked at these models 6480, 6116, 6747 but have removed them from the running.
- 

In [7]:
# Assuming your 'lengthened_combined_topic_counts' DataFrame is available

import plotly.colors as pc

# Define a colorblind-friendly palette
colorblind_palette = pc.qualitative.Safe

fig = px.scatter(calculated_results,
                 x='silhouette_score', y='topic_membership_counts_std',
                 color='embedding_type', size='num_topics',
                 hover_data=['id', 'topic_membership_counts', 'cluster_config'],
                 color_discrete_sequence=colorblind_palette)

fig.update_layout(
    hovermode='closest'
)

fig.show()

There are many that are good. It all depends on what consititues a good topic model. I feel that one that has a small amount of outliers as well as a decent amount of topics is quite suitable.

Here are some that seem intersting

In [8]:
prompt = """
I have the following safety issues in a topic: [DOCUMENTS]

Generate a short, specific topic label that accurately reflects the safety issues described. Refer to the definitions below:

- Safety issue: A factor that can affect the safety of future operations, characteristic of an organization, system, or environment at a specific point in time.
- Safety theme: Recurring circumstances or causes across transport modes or over time, covering one or more related safety issues.

Examples of good topic labels:
- topic: Electrical Malfunctions in Aircraft
- topic: Inadequate Safety Training Procedures
- topic: Faulty Emergency Equipment Checks

Examples of bad topic labels:
- topic: Safety Issues
- topic: Various Problems
- topic: General Maintenance Concerns

Format your response as:
topic: <topic label>
"""

openai_base_representation_model = OpenAI(
    openai_client,
    prompt = prompt,
    model="gpt-4o",
    chat=True,
    nr_docs = 50)

In [9]:
/home/james/code/TAIC-report-summary/notebooks/safety themes_from_safety_issues/topic_visuals
    display(model.get_topic_info())

    return row

In [10]:
updated_rows = {i: inspect_model(row) for i, row in rows.items()}

Looking at model: 9917. Facts:
model type - group
embedding - voyageai_reccontext
dimension reduction - UMAP:(11, 4)  
clustering - HDBSCAN:(5, 5, 0)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,48,-1_Inadequate Training and Competency Validati...,[Inadequate Training and Competency Validation...,[Safety issue: The signaller was certified and...
1,0,147,0_Maritime Safety Management and Compliance Is...,[Maritime Safety Management and Compliance Iss...,[Safety issue: The operation of L'Austral's EC...
2,1,55,1_Rail Operation and Communication Failures,[Rail Operation and Communication Failures],[Safety issue: Miscommunication between the tr...
3,2,55,2_Helicopter Safety Risks and Regulatory Compl...,[Helicopter Safety Risks and Regulatory Compli...,[Safety issue: All three Robinson helicopter m...
4,3,41,3_Air Traffic Control and Aerodrome Safety Pro...,[Air Traffic Control and Aerodrome Safety Proc...,[Safety issue: Civil Aviation Rule 91.223 can ...
5,4,21,4_Inadequate Maintenance and Verification Proc...,[Inadequate Maintenance and Verification Proce...,"[Safety issue: The use of ""threat and error ma..."
6,5,21,5_Inadequate Risk Management and Safety Protoc...,[Inadequate Risk Management and Safety Protoco...,[Safety issue: A large percentage of pedestria...
7,6,20,6_Inadequate Safety Training and Non-technical...,[Inadequate Safety Training and Non-technical ...,[Safety issue: The number 4 cylinder upper spa...
8,7,18,7_Communication and Control Failures in Train ...,[Communication and Control Failures in Train O...,"[Safety issue: The train controller, who was p..."
9,8,14,8_Inadequate Safety Measures and Training in W...,[Inadequate Safety Measures and Training in We...,[Safety issue: There are a number of reasonabl...


Looking at model: 10924. Facts:
model type - merged
embedding - voyageai_reccontext
dimension reduction - UMAP:(13, 5)  
clustering - HDBSCAN:(5, 5, 0)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,80,-1_Inconsistent Compliance with Safety Procedu...,[Inconsistent Compliance with Safety Procedure...,
1,0,46,0_Inadequate Management of Safety-Critical Rol...,[Inadequate Management of Safety-Critical Role...,
2,1,23,1_Safety Issues in Air Traffic Management and ...,[Safety Issues in Air Traffic Management and E...,
3,2,19,2_Inadequate Safety Procedures and Training in...,[Inadequate Safety Procedures and Training in ...,
4,3,13,3_Inadequate Crew Fatigue Management and Autom...,[Inadequate Crew Fatigue Management and Automa...,
5,4,13,4_Flight Operations and Safety Procedures at Q...,[Flight Operations and Safety Procedures at Qu...,
6,5,13,5_Inadequate Oversight and Management in Parac...,[Inadequate Oversight and Management in Parach...,
7,6,11,6_Inadequate Verification and Safety Procedure...,[Inadequate Verification and Safety Procedures...,
8,7,8,7_Inadequate Runway Entry Procedures and Training,[Inadequate Runway Entry Procedures and Training],
9,8,8,8_Inadequate Safety Measures in Aviation and M...,[Inadequate Safety Measures in Aviation and Ma...,


Looking at model: 6658. Facts:
model type - merged
embedding - voyageai_reccontext
dimension reduction - UMAP:(19, 4)  
clustering - HDBSCAN:(10, 5, 50)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,87,-1_Design and Maintenance Failures in Aircraft...,[Design and Maintenance Failures in Aircraft a...,
1,0,38,0_Inadequate Train Control Safety Practices an...,[Inadequate Train Control Safety Practices and...,
2,1,36,1_Inconsistent Aerodrome Procedures and Commun...,[Inconsistent Aerodrome Procedures and Communi...,
3,2,27,2_Systemic Training and Compliance Failures Ac...,[Systemic Training and Compliance Failures Acr...,
4,3,18,3_Inadequate Safety Systems and Operational Pr...,[Inadequate Safety Systems and Operational Pro...,
5,4,13,4_Inadequate Operational Safety Measures in Av...,[Inadequate Operational Safety Measures in Avi...,
6,5,13,5_Inadequate Communication and Coordination in...,[Inadequate Communication and Coordination in ...,
7,6,13,6_Inadequate Oversight and Safety Standards in...,[Inadequate Oversight and Safety Standards in ...,
8,7,10,7_Maritime and Rail Operations Safety Deficien...,[Maritime and Rail Operations Safety Deficienc...,
9,8,10,8_Operational and Procedural Safety Issues at ...,[Operational and Procedural Safety Issues at Q...,


## Comparing potential models

In [13]:
def inspect_topic_assignments(model, df, num = 5):
    # Get 10 rows for each topic
    grouped = df.groupby('topic')
    group_dfs = [group.sample(min(len(group), num), random_state=42) for _, group in grouped]

    has_outliers = group_dfs[0]['topic'].values[0] == -1

    # Remove topic -1
    # group_dfs = [group for group in group_dfs if group['topic'].values[0] != -1]

    topics = model.get_topic_info()['Representation']

    counts = model.get_topic_info()['Count']
    
    printout_string = ""
    
    # if has_outliers:
    #     topics = topics[1:]
    #     counts = counts[1:]

    for i, samples, topic, count  in zip(range(0, len(group_dfs)+1), group_dfs, topics, counts):
        if has_outliers and i == 0:
            printout_string += f"Topi -1: This is the outlier topic and contains {counts[0]} safety issues\n"
        else:
            printout_string += f"Topic {i-1 if  has_outliers else i}: '{topic[0]}' containing {count} safety issues\n"
        printout_string += "Here are the safety issues inside the topic:\n\n"
        examples = ["report_id: {}: {}".format(report_id, si) for report_id, si in zip(samples['report_id'], samples['si'])]
        printout_string += "\n\n".join(examples)
        printout_string += "\n" + "="*150 + "\n"  # Separator for clarity

    return printout_string

model_id = 6658


print(inspect_topic_assignments(updated_rows[model_id]['model'], updated_rows[model_id]['embedding_df']))

Topi -1: This is the outlier topic and contains 87 safety issues
Here are the safety issues inside the topic:

report_id: 2014_005: Safety issue: There was a risk of not knowing an aircraft's capability when using standard passenger weights, and therefore of pilots operating close to the limits of their aircraft's performance.




report_id: 2010_010: Safety issue: The false green light on the verification system misled the pilots of ZK-NEB into believing that the nose landing gear was fully down and locked.

There were these recommendations made to address the safety issues.
Recommendation 027/12: "On date the Commission recommended to the Director of Civil Aviation that he urge Transport Canada to:
- note the instances of false verification of landing gear position reported for the Q300 and some related aeroplanes and the potential for a false indication to cause an accident, and
- require Bombardier Aerospace to take action to improve the reliability and dependability of the down-lo

In [15]:
mode_counts = check_mode_cluster_distribution(updated_rows[model_id]['embedding_df'])

# For each topic figure out which mode is most prevalant

mode_counts['topic_mode'] = mode_counts.apply(lambda row: row.idxmax(), axis = 1)

mode_counts['error'] = mode_counts.apply(lambda row: (sum(row[0:3])-row[int(row['topic_mode'])]) / sum(row[0:3]), axis = 1)

mode_counts

mode,0,1,2,topic_mode,error
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,9.0,0.0,0.0,0,0.0
0,35.0,0.0,0.0,0,0.0
1,43.0,0.0,0.0,0,0.0
2,27.0,1.0,2.0,0,0.1
3,19.0,0.0,0.0,0,0.0
4,13.0,0.0,2.0,0,0.133333
5,16.0,0.0,0.0,0,0.0
6,15.0,0.0,0.0,0,0.0
7,16.0,10.0,3.0,0,0.448276
8,12.0,0.0,0.0,0,0.0


In [26]:
display(updated_rows[model_id]['model'].get_topic_info())
updated_rows[model_id]['embedding_df'].groupby('topic').count()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,48,-1_Inadequate Safety Protocols and Training in...,[Inadequate Safety Protocols and Training in R...,[Safety issue: The signaller was certified and...
1,0,147,0_Maritime Safety Management System Deficiencies,[Maritime Safety Management System Deficiencies],[Safety issue: The operation of L'Austral's EC...
2,1,55,1_Inadequate Communication and Procedural Comp...,[Inadequate Communication and Procedural Compl...,[Safety issue: Miscommunication between the tr...
3,2,55,2_Helicopter Operational Safety and Compliance...,[Helicopter Operational Safety and Compliance ...,[Safety issue: All three Robinson helicopter m...
4,3,41,3_Systemic Issues in Air Traffic Control and A...,[Systemic Issues in Air Traffic Control and Ae...,[Safety issue: Civil Aviation Rule 91.223 can ...
5,4,21,4_Inadequate Aircraft Equipment Checks and Mai...,[Inadequate Aircraft Equipment Checks and Main...,"[Safety issue: The use of ""threat and error ma..."
6,5,21,5_Inadequate Rail Level Crossing Safety Measur...,[Inadequate Rail Level Crossing Safety Measure...,[Safety issue: A large percentage of pedestria...
7,6,20,6_Inadequate Safety and Maintenance Protocols ...,[Inadequate Safety and Maintenance Protocols i...,[Safety issue: The number 4 cylinder upper spa...
8,7,18,7_Ineffective Communication and Risk Managemen...,[Ineffective Communication and Risk Management...,"[Safety issue: The train controller, who was p..."
9,8,14,8_Inadequate Safety Measures for Wellington Tr...,[Inadequate Safety Measures for Wellington Tra...,[Safety issue: There are a number of reasonabl...


Unnamed: 0_level_0,report_id,si,quality,mode,recommendations,si_embedding,0,1,2,3,...,12,13,14,15,16,17,18,19,20,21
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,48,48,48,48,48,48,48,48,48,48,...,48,48,48,48,48,48,48,48,48,48
0,147,147,147,147,147,147,147,147,147,147,...,147,147,147,147,147,147,147,147,147,147
1,55,55,55,55,55,55,55,55,55,55,...,55,55,55,55,55,55,55,55,55,55
2,55,55,55,55,55,55,55,55,55,55,...,55,55,55,55,55,55,55,55,55,55
3,41,41,41,41,41,41,41,41,41,41,...,41,41,41,41,41,41,41,41,41,41
4,21,21,21,21,21,21,21,21,21,21,...,21,21,21,21,21,21,21,21,21,21
5,21,21,21,21,21,21,21,21,21,21,...,21,21,21,21,21,21,21,21,21,21
6,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
7,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
8,14,14,14,14,14,14,14,14,14,14,...,14,14,14,14,14,14,14,14,14,14


In [14]:
make_visualization(updated_rows[model_id]['model'], updated_rows[model_id]['embedding_df'])

NameError: name 'column_to_2darray' is not defined

## Trying out hierarchal model of potential models

There is a problem of there being two uses for the topic model.
1. For an investigator to find quite specifcic 'safety themes' this would be the Wellington train station example
2. The researchers trying to find more systemic system 'safety themes'. These would be like communication, oversight etc.

To solve this with one model we coudl take it and merge some of the similar topics togather to get few but more general topics.

In [12]:
target_model_id  = 6658

target_model = updated_rows[target_model_id]['model']
docs = updated_rows[target_model_id]['embedding_df']['si']
hierarchical_topics = target_model.hierarchical_topics(docs)

hierarchical_topics

100%|██████████| 19/19 [00:25<00:00,  1.32s/it]


Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
18,38,Inadequate Crewing Standards and Operational O...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",8,Operational Safety and Procedural Issues at Qu...,37,Inadequate Regulatory Oversight and Compliance...,0.832716
17,37,Inadequate Regulatory Oversight and Compliance...,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14...",34,Inadequate Training and Competency Verificatio...,36,Incomplete Safety Records for Vessel Management,0.775404
16,36,Incomplete Safety Records for Vessel Management,"[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]",27,Gaps in Regulatory Compliance and Safety Manag...,35,Inadequate Training and Procedures for Emergen...,0.676809
15,35,Inadequate Training and Procedures for Emergen...,"[14, 15, 16, 17, 18, 19]",17,Inadequate Non-Technical Skills and Safety Com...,33,Inadequate Safety Management Systems in Mariti...,0.634879
14,34,Inadequate Training and Competency Verificatio...,"[0, 1, 2, 3, 4, 5, 6, 7]",30,Deficient Safety Management and Oversight in M...,32,Inadequate Safety Procedures and Training in T...,0.624104
13,33,Inadequate Safety Management Systems in Mariti...,"[14, 15, 16, 18, 19]",28,Inadequate Safety and Risk Management Systems ...,19,Inadequate Safety Measures and Risk Assessment...,0.600795
12,32,Inadequate Safety Procedures and Training in T...,"[0, 1, 2, 3, 4, 5]",31,Deviation and Monitoring of Passage Plans in M...,4,Inadequate Safety Measures Across Transport Modes,0.563088
11,31,Deviation and Monitoring of Passage Plans in M...,"[0, 1, 2, 3, 5]",29,Inadequate Communication and Training in Rail ...,24,Systemic Safety Management Failures in Transpo...,0.509006
10,30,Deficient Safety Management and Oversight in M...,"[6, 7]",6,Inadequate Oversight and Safety Management in ...,7,Maritime and Railway Operational Safety Concerns,0.506736
9,29,Inadequate Communication and Training in Rail ...,"[1, 2, 5]",1,Communication and Procedure Misinterpretations...,25,Inadequate Crew Resource Management and Commun...,0.477288


In [14]:
print(target_model.get_topic_tree(hierarchical_topics))

.
├─■──Operational Safety and Procedural Issues at Queenstown Aerodrome ── Topic: 8
└─Inadequate Regulatory Oversight and Compliance in Transport Operations
     ├─Inadequate Training and Competency Verification in Transport Safety Systems
     │    ├─Deficient Safety Management and Oversight in Maritime and Aviation Operations
     │    │    ├─■──Inadequate Oversight and Safety Management in Aviation and Maritime Operations ── Topic: 6
     │    │    └─■──Maritime and Railway Operational Safety Concerns ── Topic: 7
     │    └─Inadequate Safety Procedures and Training in Transport Operations
     │         ├─Deviation and Monitoring of Passage Plans in Maritime Navigation
     │         │    ├─Inadequate Communication and Training in Rail and Aviation Operations
     │         │    │    ├─■──Communication and Procedure Misinterpretations in Aviation and Rail Operations ── Topic: 1
     │         │    │    └─Inadequate Crew Resource Management and Communication Procedures
     │       

In [15]:
target_model.visualize_hierarchy()

In [64]:
def merge_topics_togather(docs, model, hierachical_topics, num_output_topics = None, distance = None):
    
    if num_output_topics is None and distance is None:
        raise ValueError("Please provide either num_output_topics or distance")
    
    current_topics = set(hierachical_topics.loc[len(hierachical_topics)-1, 'Topics'].copy())
    current_index = 0
    if num_output_topics is None: num_output_topics = len(current_topics)
    if distance is None: distance = 1
    while num_output_topics < len(current_topics) or distance > hierachical_topics.loc[current_index, 'Distance']:
        current_topics = find_next_merge(hierachical_topics, current_topics, current_index)
        current_index += 1
        if current_index >= len(hierachical_topics):
            break
    
    topics_to_merge = [topic for topic in current_topics if topic > len(hierachical_topics)]
    topics_to_merge = [
        hierachical_topics[hierachical_topics['Parent_ID'] == str(topic)]['Topics'].tolist()[0]
        for topic in topics_to_merge    
    ]
    print(topics_to_merge)


    # Make temp save of model to get copy
    model.representation_model = None
    model.save("temp_model")

    new_merged_topic_model = BERTopic.load("temp_model")

    os.remove("temp_model")
    new_merged_topic_model.representation_model = openai_base_representation_model
    model.representation_model = openai_base_representation_model
    new_merged_topic_model.merge_topics(docs, topics_to_merge)

    return new_merged_topic_model

def find_next_merge(hierachical_topics, current_topics, current_index):
    # Get last row
    last_row = hierachical_topics.loc[current_index,]
    last_row_merge = last_row['Topics']
    current_topics.add(int(last_row['Parent_ID']))
    # Remove old topics
    current_topics = current_topics - set(last_row_merge)
    # Remove old merged topics
    for i, row in hierachical_topics.loc[current_index-1:,].iterrows():
        if set(row['Topics']).issubset(set(last_row_merge)):
            current_topics.discard(int(row['Parent_ID']))

    return current_topics

merged_model = merge_topics_togather(docs,target_model, hierarchical_topics, distance =  0.5)
merged_model.get_topic_info()



[[0, 3], [9, 10, 11, 12, 13], [14, 15, 16, 18], [1, 2, 5]]


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,87,-1_Inadequate Safety Protocols and Training Ac...,[Inadequate Safety Protocols and Training Acro...,[Safety issue: The operation of L'Austral's EC...
1,0,159,0_Insufficient Emergency Systems and Pilot Com...,[Insufficient Emergency Systems and Pilot Comp...,[Safety issue: The National Rail System Standa...
2,1,113,1_Inadequate Safety Management and Risk Mitiga...,[Inadequate Safety Management and Risk Mitigat...,[Safety issue: The operator had had four serio...
3,2,76,2_Communication Failures and Procedural Noncom...,[Communication Failures and Procedural Noncomp...,[Safety issue: If the issues of mechanically u...
4,3,56,3_Systemic Deficiencies in Maritime and Rail S...,[Systemic Deficiencies in Maritime and Rail Sa...,"[Safety issue: The train controller, who was p..."
5,4,13,4_Inadequate Safety Protocols and Oversight in...,[Inadequate Safety Protocols and Oversight in ...,[Safety issue: The similarity of text in adjac...
6,5,13,5_Inadequate Oversight and Compliance in Marit...,[Inadequate Oversight and Compliance in Mariti...,[Safety issue: the management of the weight an...
7,6,13,6_Inadequate Non-Technical Skills and Safety E...,[Inadequate Non-Technical Skills and Safety Eq...,[Safety issue: The operator's planned water em...
8,7,10,7_Maritime and Transport Safety Management Def...,[Maritime and Transport Safety Management Defi...,[Safety issue: The maintenance and testing pro...
9,8,10,8_Inconsistent Aircraft Approach Procedures at...,[Inconsistent Aircraft Approach Procedures at ...,[Safety issue: The visual circuit procedure pu...


In [65]:
updated = merged_model.get_document_info(docs, updated_rows[target_model_id]['embedding_df'])

merged_df = updated_rows[target_model_id]['embedding_df'].copy()
merged_df['topic'] = updated['Topic']
visualize_mode_distrbution_across_topics(updated_rows[target_model_id]['embedding_df'])
visualize_mode_distrbution_across_topics(merged_df)

## Using GPT-4o to provide feedback of models

As all of the safety issues only have a token length of 22,000 I thought about giving all of them to the topics and seeing what gpt-4o would say about each of these models.
The SI with recommendations context tyakes about 55,000 tokens.

In [15]:
prompt = f"""
I have used BERTopic to create a topic model of these safety issues to generate safety themes.

I would like you to help me with wo things.
- Firstly help me come up with better titles and descriptions of each of the topics.
- Provide a review of how good the model is at classifying these topics.

Here are all of the topics with their respective safety issues:

'''
{inspect_topic_assignments(updated_rows[model_id]['model'], updated_rows[model_id]['embedding_df'], 1000)}
'''

Your response should be in this format

### Topic titles and descirptions
[new topic titles and descriptions in yaml format. The topics and desciptions should be in the same order as the list given above. Further more all topics need to be named and given a descriptions, even if the title will stay the same.]

### Topic reviews
[Review of the topic model as a whole. How well has it dont on identifying the safety themes present.]
"""

response = openAICaller.query(
    system = """
    You are a assistant who helps evaluate topic models.
    There are two important definitions that you should use:
    - Safety issue: A factor that can affect the safety of future operations, characteristic of an organization, system, or environment at a specific point in time.
    - Safety theme: Recurring circumstances or causes across transport modes or over time, covering one or more related safety issues.
    """,
    user = prompt,
    model = 'gpt-4'
)

print(response)

### Topic titles and descriptions
```yaml
- title: Outlier Issues in Aviation
  description: This topic includes various safety issues related to the reliability and operational practices of aircraft systems, particularly focusing on landing gear performance, flight simulator realism, and unexpected mechanical failures. These could not be neatly classified into any specific category.

- title: Risk Management and Crew Resource Practices in Rail and Maritime Operations
  description: This topic addresses deficiencies in risk management, crew practices, and operational protocols across rail and maritime sectors. It includes issues such as helicopter operation beyond limits, reconciling unique rotor designs, and properly implementing training for flight in different conditions.

- title: Inconsistent Aviation Procedures and Communication Protocols
  description: This topic covers inconsistencies and ambiguities in aviation protocols and communication, including misunderstandings in air tr

In [16]:
yaml_regex = re.compile(r"```yaml\n([\s\S]+)```", flags = re.MULTILINE)
yaml_string = yaml_regex.findall(response)[0]

yaml_obj = yaml.safe_load(yaml_regex.findall(response)[0])

yaml_obj

[{'title': 'Outlier Issues in Aviation',
  'description': 'This topic includes various safety issues related to the reliability and operational practices of aircraft systems, particularly focusing on landing gear performance, flight simulator realism, and unexpected mechanical failures. These could not be neatly classified into any specific category.'},
 {'title': 'Risk Management and Crew Resource Practices in Rail and Maritime Operations',
  'description': 'This topic addresses deficiencies in risk management, crew practices, and operational protocols across rail and maritime sectors. It includes issues such as helicopter operation beyond limits, reconciling unique rotor designs, and properly implementing training for flight in different conditions.'},
 {'title': 'Inconsistent Aviation Procedures and Communication Protocols',
  'description': 'This topic covers inconsistencies and ambiguities in aviation protocols and communication, including misunderstandings in air traffic control 

## Maually looking at models

### Running it on all safety issues


I want to generate the safety themes from all of the safety issues I have available.

#### Simple minilm embeddings

This seems to of failed. I believe this is mainly due to the fact that each documents are really short.

In [None]:

topic_model, _ = runBERTopic(
    safety_issues_df, 'si', None, openai_base_representation_model, umap_model, reduce_outliers=False)

topic_model.get_topic_info()


There is a bit of a problem where the number of outliers is quite great.

I will try to merge the outliers

In [None]:
topic_model, _ = runBERTopic(
    safety_issues_df, 'si', None, openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,166,0_Rail Safety and Operational Issues in New Ze...,[Rail Safety and Operational Issues in New Zea...,[The training that drivers received for transi...
1,1,64,1_Maritime Safety and Navigation Management Is...,[Maritime Safety and Navigation Management Iss...,[The voyage planning for the time in the Snare...
2,2,36,2_Maritime Safety and Regulations Compliance I...,[Maritime Safety and Regulations Compliance Is...,[The skipper did not have the requisite knowle...
3,3,53,3_Safety and Maintenance Issues in Engineering...,[Safety and Maintenance Issues in Engineering ...,[There was a lack of clear communication and a...
4,4,53,4_Maritime and Aviation Safety Management and ...,[Maritime and Aviation Safety Management and E...,[It could not be established why the chief off...
5,5,50,5_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],[Had the controllers realised that the low clo...
6,6,27,6_Robinson Helicopter Safety and Accident Anal...,[Robinson Helicopter Safety and Accident Analy...,"[Due to their unique main rotor design, during..."
7,7,62,7_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,[The standard of pilot training and the superv...
8,8,26,8_Aircraft Landing Gear and Door Lock Failures,[Aircraft Landing Gear and Door Lock Failures],[Had the pilots known that the nose landing ge...
9,9,23,9_Deficiencies in Safety and Regulatory Compli...,[Deficiencies in Safety and Regulatory Complia...,[There were no established procedures for ente...


The main problem here is that the the distribution is not great. It seems that most of the rail are in the first topic then martime and aviation take up the rest.

#### VoyageAI embeddings

In [None]:
topic_model, voyageai_clusters_df = runBERTopic(
    voyageai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()

NameError: name 'voyageai_embeddings' is not defined

In [None]:


check_mode_cluster_distribution(voyageai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,204,8,167
1,2,177,2


This has created two topics wiht one being avaiation and martime and the other being rail.

#### OpenAI embeddings

In [None]:
topic_model, openai_clusters_df = runBERTopic(
    openai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()[['Count', 'Name']]



Unnamed: 0,Count,Name
0,203,0_Aviation Safety and Compliance Issues
1,189,1_Rail Safety and Operational Issues in New Ze...
2,142,2_Maritime Safety and Navigation Management Flaws
3,26,3_Maritime Safety and Compliance Issues of the...


In [None]:
check_mode_cluster_distribution(openai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,191.0,4.0,8.0
1,5.0,181.0,3.0
2,4.0,0.0,138.0
3,6.0,0.0,20.0


This has also made a cleanish split between modes of transport. I can eithe try to force it not to do this and/or run the model on each mode then merge the models.

In [None]:
umap_model_tweaked = UMAP(n_neighbors=4, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

topic_model, openai_clusters_tweaked_df = runBERTopic(
    openai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model_tweaked, reduce_outliers=True)

display(topic_model.get_topic_info()[['Count', 'Name']])

check_mode_cluster_distribution(openai_clusters_tweaked_df)



Unnamed: 0,Count,Name
0,167,0_Rail Safety and Operational Failures
1,115,1_Maritime Safety and Resource Management Defi...
2,50,2_Safety and Compliance in Transport and Marit...
3,51,3_Aviation Safety and Regulatory Compliance Is...
4,41,4_Helicopter Safety and Operational Issues
5,52,5_Aviation Safety and Air Traffic Control Issues
6,27,6_Safety Issues in Rail Operations
7,30,7_Aircraft Landing Gear and Maintenance Issues
8,11,8_Aviation Safety Issues Related to Door Locki...
9,16,9_Safety and Maintenance Challenges in Maritim...


mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.0,157.0,7.0
1,7.0,0.0,108.0
2,15.0,2.0,33.0
3,47.0,0.0,4.0
4,40.0,1.0,0.0
5,51.0,0.0,1.0
6,2.0,25.0,0.0
7,30.0,0.0,0.0
8,11.0,0.0,0.0
9,0.0,0.0,16.0


I will try to tune the hyper paramters and see if I can get the right sort of safety themes

In [None]:
topic_model, openai_clusters_tuned_df = runBERTopic(
    openai_embeddings,
    'si',
    'si_embedding',
    openai_base_representation_model,
    UMAP(n_neighbors=4, n_components=5, min_dist=0.0, metric='cosine', random_state=42),
    reduce_outliers=True)

topic_model.get_topic_info()[['Count', "Name","Representative_Docs"]]



Unnamed: 0,Count,Name,Representative_Docs
0,167,0_Rail Safety and Operational Issues in New Ze...,[The training that drivers received for transi...
1,115,1_Maritime Safety and Resource Management Defi...,[The standard of passage planning on board the...
2,50,2_Maritime and Aviation Safety Regulations and...,[The absence of a visual indicator in the whee...
3,51,3_Aviation Safety and Regulatory Compliance Is...,[The operator's system for training its pilots...
4,41,4_Helicopter Safety and Maintenance Issues,"[Due to their unique main rotor design, during..."
5,52,5_Aviation Safety and Operational Procedures a...,[While ATC sequences an IFR aeroplane to land ...
6,27,6_Safety Issues and Management Deficiencies in...,[The train controller made an assumption about...
7,30,7_Aircraft Landing Gear and Maintenance Issues,[Had the pilots known that the nose landing ge...
8,11,8_Aviation Safety and Equipment Malfunction,"[The use of ""threat and error management"" (TEM..."
9,16,9_Maintenance and Risk Management in Marine Sa...,[A clear placard should be placed at the contr...


### Run cluster on just one mode

It would make sense that if the clustering is finding the transport modes then splitting into the modes first might help find the themes within each mode.

In [None]:
def printout_each_modes_topics(results):
    for res in results:
        print("Cluster names: ")
        for i, count in zip(res[0].get_topic_info()['Name'], res[0].get_topic_info()['Count']):
            print(f"{count}, {i}")

#### VoyageAI

In [None]:
voyageai_modes_dfs = [voyageai_embeddings[voyageai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

In [None]:
results = [runBERTopic(df, 'si', 'si_embedding', openai_base_representation_model, umap_model) for df in voyageai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
54, 0_Aviation Safety and Operational Procedures
34, 1_Safety Challenges and Risks in Robinson Helicopter Operations
62, 2_Aviation Safety and Regulatory Compliance Issues
25, 3_Aircraft Landing Gear and Door System Failures
31, 4_Aircraft Maintenance and Safety Concerns
Cluster names: 
166, 0_Rail Safety and Management Issues
19, 1_Safety and Regulatory Issues at Rail Level Crossings
Cluster names: 
28, 0_Maritime Safety and Bridge Resource Management Deficiencies
33, 1_Maritime Safety and Management Failures
43, 2_Maritime Safety and Navigation Standards Compliance
30, 3_Maritime Safety and Emergency Response Deficiencies
22, 4_Maritime Safety Violations and Consequences aboard the Easy Rider
13, 5_Propulsion System Failures and Maintenance Issues in Marine Operations


In [None]:
checking = results[2][1]

In [None]:
merged_moode_models = BERTopic.merge_models([result[0] for result in results], min_similarity=0.9)


merged_moode_models.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,52,0_Aviation Safety and Air Traffic Control Proc...,[Aviation Safety and Air Traffic Control Proce...,
1,1,47,1_Helicopter Safety and Accident Analysis,[Helicopter Safety and Accident Analysis],
2,2,41,2_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],
3,3,33,3_Aircraft Safety and Maintenance Issues,[Aircraft Safety and Maintenance Issues],
4,4,17,4_Aviation Safety and Regulatory Compliance in...,[Aviation Safety and Regulatory Compliance in ...,
5,5,16,5_Safety and Regulatory Issues in New Zealand ...,[Safety and Regulatory Issues in New Zealand A...,
6,6,44,0_KiwiRail Safety and Compliance Issues,[KiwiRail Safety and Compliance Issues],
7,7,40,1_Rail Safety and Communication Failures,[Rail Safety and Communication Failures],
8,8,25,2_Safety and Management Issues in Rail Operations,[Safety and Management Issues in Rail Operations],
9,9,20,3_Safety and Regulatory Issues at Road-Rail Le...,[Safety and Regulatory Issues at Road-Rail Lev...,


#### OpenAI

In [None]:
openai_modes_dfs = [openai_embeddings[openai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

for df in openai_modes_dfs:
    display(df)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
201,2015_001,Parachute drop pilots were not required to wea...,0,"[0.0253401268273592, -0.02335318550467491, -0...."
202,2011_006,The council had not evaluated the effects of t...,0,"[-0.02767498977482319, 0.01624125801026821, -0..."
203,2011_006,The standard of pilot training and the supervi...,0,"[0.015297695063054562, -0.018917182460427284, ..."
204,2011_006,The CAA had had recurring concerns for the man...,0,"[0.001043604570440948, 0.00177335599437356, 0...."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_106,No procedures were in place to direct train cr...,1,"[0.017140474170446396, 0.03509647026658058, -0..."
1,2013_107,"The high incidence of brake block replacement,...",1,"[-0.0018233972368761897, 0.020808950066566467,..."
2,2013_107,The visual inspection regime for wheel-bearing...,1,"[0.0025237964000552893, 0.027265744283795357, ..."
3,2013_107,"The RailBAM system, while operational, did not...",1,"[0.006221923511475325, 0.025432679802179337, -..."
4,2013_107,The lack of a dedicated RailBAM analyst positi...,1,"[-0.004680005367845297, 0.013756909407675266, ..."
...,...,...,...,...
180,2017_101,KiwiRail did not have a mature fatigue risk ma...,1,"[-0.006654317956417799, 0.029867829754948616, ..."
181,2017_101,The eProtect KMC module on board the locomotiv...,1,"[-0.003919209353625774, 0.022458476945757866, ..."
182,2017_104,Transdev had no policies or procedures in plac...,1,"[0.013986819423735142, 0.01571197435259819, -0..."
183,2020_104,Implementation of an administrative control me...,1,"[-0.013463953509926796, -0.007039009593427181,..."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_202,There is limited data to quantify the extent o...,2,"[-0.0015265028923749924, 0.013446947559714317,..."
1,2019_201,the operator's planned maintenance programme d...,2,"[0.03530280664563179, 0.027329862117767334, 0...."
2,2019_201,the operator's hazard identification system ha...,2,"[0.009704935364425182, 0.02645685337483883, 0...."
3,2019_204,The operator had not included predefined weath...,2,"[0.038087889552116394, 0.000508625409565866, 0..."
4,2019_204,The operator of the Henerata had not assessed ...,2,"[0.018176013603806496, 0.026440272107720375, 0..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:
results = [runBERTopic(df, 'si', 'si_embedding', openai_base_representation_model, umap_model) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Operational Procedures Issues
42, 1_Aircraft Maintenance and Safety Issues
37, 2_Challenges and Safety Issues in Robinson Helicopter Operations
51, 3_Aviation Safety and Regulatory Oversight in New Zealand
30, 4_Aviation Safety and Emergency Response
Cluster names: 
49, 0_KiwiRail Safety and Compliance Issues
28, 1_Rail Safety and Inspection Inefficiencies
42, 2_Rail Safety and Communication Issues
27, 3_Safety and Oversight Concerns in Train Operations
21, 4_Road and Rail Safety at Level Crossings
18, 5_Risk Management and Safety Issues in Wellington Station Train Operations
Cluster names: 
150, 0_Maritime Safety and Crew Management Deficiencies
19, 1_Maritime Safety and Compliance Issues


I will try instead to do with no dimension reduction, or atleast decrease the amount of dimension reduction.

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

results = [runBERTopic(df,
                       'si',
                       'si_embedding',
                       openai_base_representation_model,
                       BaseDimensionalityReduction()
                       ) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Air Traffic Management Issues
47, 1_Aircraft Maintenance and Safety Issues
41, 2_Safety and Training Issues in Robinson Helicopter Operations
53, 3_Aviation Safety and Compliance Issues
19, 4_Safety and Regulatory Oversight in Aviation and Parachuting Operations
Cluster names: 
47, 0_Issues in KiwiRail's Safety and Operational Procedures
62, 1_Rail Safety and Incident Analysis
29, 2_Rail Safety and Signal Management Issues in Wellington Station Approaches
20, 3_Safety Issues at Rail Level Crossings
27, 4_Safety and Risk Management in Rail Operations
Cluster names: 
149, 0_Maritime Safety and Resource Management Issues
20, 1_Maritime Safety Violations and the Sinking of the Easy Rider


This reuslts in just one cluster for each as the curse of dimensionality is prudent here. I will instead try to tune the hyper parameters of OPenAI

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

results = [runBERTopic(df,
                       'si',
                       'si_embedding',
                       None,
                       UMAP(n_neighbors=6, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
                       ) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
52, 0_the_to_of_and
47, 1_the_to_of_and
41, 2_the_of_to_and
33, 3_the_gear_landing_to
17, 4_the_to_for_water
16, 5_zealand_new_of_the
Cluster names: 
44, 0_the_to_work_of
40, 1_the_train_to_and
25, 2_train_of_the_and
20, 3_road_level_crossings_the
20, 4_the_brake_braking_conditions
15, 5_the_in_of_wellington
21, 6_the_rail_of_to
Cluster names: 
120, 0_the_of_and_to
27, 1_the_to_of_easy
11, 2_co2_the_could_be
11, 3_the_fish_crew_of


In [None]:
merged_moode_models.get_topic_info()[['Count', "Name"]]

NameError: name 'merged_moode_models' is not defined


I have had a look at both single run and individual models.

I think the next step is to do some hypter paramter tuning.

 As there are not noticable differences between voyageAI and openAI I will go with openAI embedding model.

# Checking if merged model are the sames as trasnformed and from the merged model

I ahve noticed that counts returned by the `get_topic_info()` is not the same as the ones within the embedding_df dataframe. I will try to manually make a merged model to check if it is something about the hyper parameter serach functions.

In [39]:
embeddings = all_embeddings['voyageai'].copy()

display(embeddings)

mode_groups = embeddings.groupby('mode')
mode_dfs = [mode_groups.get_group(i).reset_index(drop=True) for i in range(3)]

mode_models = [BERTopic() for _ in mode_dfs]

for model, df in zip(mode_models, mode_dfs):
    model.fit_transform(
        df['si'],
        np.array([np.array(x) for x in df['si_embedding'].to_numpy()])
)
    display(model.get_topic_info())

merged_model = BERTopic.merge_models(mode_models, min_similarity=0.9)

display(merged_model.get_topic_info())

embeddings['topic'] = merged_model.transform(embeddings['si'], np.array([np.array(x) for x in embeddings['si_embedding'].to_numpy()]))[0]

embeddings['topic'].value_counts()
merged_model.get_document_info(embeddings['si'], embeddings)

Unnamed: 0,report_id,si,mode,si_embedding
0,2019_106,No procedures were in place to direct train cr...,1,"[0.00557063240557909, 0.008644572459161282, -0..."
1,2013_107,"The high incidence of brake block replacement,...",1,"[0.0043902406468987465, 0.015069461427628994, ..."
2,2013_107,The visual inspection regime for wheel-bearing...,1,"[0.02037428691983223, 0.025948768481612206, -0..."
3,2013_107,"The RailBAM system, while operational, did not...",1,"[0.017969856038689613, 0.0032469567377120256, ..."
4,2013_107,The lack of a dedicated RailBAM analyst positi...,1,"[0.02813386544585228, 0.020084526389837265, -0..."
...,...,...,...,...
555,2011_006,The CAA had had recurring concerns for the man...,0,"[0.0078355073928833, 0.028676746413111687, 0.0..."
556,2017_003,The maintenance inspection programme for the l...,0,"[0.009246028028428555, 0.021894149482250214, -..."
557,2017_104,Transdev had no policies or procedures in plac...,1,"[0.013308736495673656, 0.015287939459085464, -..."
558,2020_104,Implementation of an administrative control me...,1,"[-3.3115891710622236e-05, 0.01492010336369276,..."



[1mThe TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.[0m



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,15,-1_the_to_in_and,"[the, to, in, and, rescue, not, emergency, alt...",[The failure of the emergency locator transmit...
1,0,74,0_the_of_to_and,"[the, of, to, and, for, not, in, that, was, at]","[There are four factors that were not, but sho..."
2,1,62,1_the_of_to_and,"[the, of, to, and, in, landing, that, gear, wa...",[The verification system for checking if the l...
3,2,55,2_the_of_to_and,"[the, of, to, and, for, helicopter, in, helico...",[The New Zealand regulatory system has not pro...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,167,0_the_to_of_and,"[the, to, of, and, train, not, in, was, for, t...",[The training that drivers received for transi...
1,1,18,1_level_road_crossings_the,"[level, road, crossings, the, and, crossing, v...",[The Beach Road/ State Highway 1 intersection ...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,68,-1_the_of_to_and,"[the, of, to, and, not, was, in, that, system,...",[The owner and the master on board the Anatoki...
1,0,34,0_the_of_and_on,"[the, of, and, on, safety, not, management, th...",[The rescue response from the vessel's master ...
2,1,21,1_and_the_of_to,"[and, the, of, to, new, zealand, that, in, mar...",[When an accident or incident occurs that requ...
3,2,21,2_the_bridge_of_to,"[the, bridge, of, to, not, in, and, team, navi...",[The vessel's bridge team and the pilot did no...
4,3,13,3_easy_the_rider_was,"[easy, the, rider, was, not, to, did, persons,...",[Although the Easy Rider had been entered into...
5,4,12,4_co2_the_fire_systems,"[co2, the, fire, systems, fixed, of, in, not, ...",[The maintenance and testing procedures for th...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,357,-1_the_to_in_and,"[the, to, in, and, rescue, not, emergency, alt...",
1,0,74,0_the_of_to_and,"[the, of, to, and, for, not, in, that, was, at]",
2,1,74,1_the_of_to_and,"[the, of, to, and, in, landing, that, gear, wa...",
3,2,55,2_the_of_to_and,"[the, of, to, and, for, helicopter, in, helico...",


Unnamed: 0,report_id,si,mode,si_embedding,topic,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Representative_document
0,2019_106,No procedures were in place to direct train cr...,1,"[0.00557063240557909, 0.008644572459161282, -0...",-1,No procedures were in place to direct train cr...,2,2_the_of_to_and,"[the, of, to, and, for, helicopter, in, helico...",,the - of - to - and - for - helicopter - in - ...,False
1,2013_107,"The high incidence of brake block replacement,...",1,"[0.0043902406468987465, 0.015069461427628994, ...",1,"The high incidence of brake block replacement,...",2,2_the_of_to_and,"[the, of, to, and, for, helicopter, in, helico...",,the - of - to - and - for - helicopter - in - ...,False
2,2013_107,The visual inspection regime for wheel-bearing...,1,"[0.02037428691983223, 0.025948768481612206, -0...",1,The visual inspection regime for wheel-bearing...,2,2_the_of_to_and,"[the, of, to, and, for, helicopter, in, helico...",,the - of - to - and - for - helicopter - in - ...,False
3,2013_107,"The RailBAM system, while operational, did not...",1,"[0.017969856038689613, 0.0032469567377120256, ...",-1,"The RailBAM system, while operational, did not...",-1,-1_the_to_in_and,"[the, to, in, and, rescue, not, emergency, alt...",,the - to - in - and - rescue - not - emergency...,False
4,2013_107,The lack of a dedicated RailBAM analyst positi...,1,"[0.02813386544585228, 0.020084526389837265, -0...",1,The lack of a dedicated RailBAM analyst positi...,1,1_the_of_to_and,"[the, of, to, and, in, landing, that, gear, wa...",,the - of - to - and - in - landing - that - ge...,False
...,...,...,...,...,...,...,...,...,...,...,...,...
555,2011_006,The CAA had had recurring concerns for the man...,0,"[0.0078355073928833, 0.028676746413111687, 0.0...",0,The CAA had had recurring concerns for the man...,-1,-1_the_to_in_and,"[the, to, in, and, rescue, not, emergency, alt...",,the - to - in - and - rescue - not - emergency...,False
556,2017_003,The maintenance inspection programme for the l...,0,"[0.009246028028428555, 0.021894149482250214, -...",1,The maintenance inspection programme for the l...,1,1_the_of_to_and,"[the, of, to, and, in, landing, that, gear, wa...",,the - of - to - and - in - landing - that - ge...,False
557,2017_104,Transdev had no policies or procedures in plac...,1,"[0.013308736495673656, 0.015287939459085464, -...",-1,Transdev had no policies or procedures in plac...,-1,-1_the_to_in_and,"[the, to, in, and, rescue, not, emergency, alt...",,the - to - in - and - rescue - not - emergency...,False
558,2020_104,Implementation of an administrative control me...,1,"[-3.3115891710622236e-05, 0.01492010336369276,...",-1,Implementation of an administrative control me...,-1,-1_the_to_in_and,"[the, to, in, and, rescue, not, emergency, alt...",,the - to - in - and - rescue - not - emergency...,False


# Guided bertopic model

This allows us to give it some nudge in the direction of the type of topics we might see. However this importantly doesnt restrict the models that could be discovered.

In [35]:
# Getting seed topic list

safety_issue_watchlist_2024 = yaml.safe_load(open('watchlist_2024.yaml', 'r'))['watchlist']
safety_issue_watchlist_2024

response = openAICaller.query(
    system = "You are a helpful system who is going to extract seed wrods to be used for the some guided topic modelling with BERTopic.",
    user = f"""
    Here are a list of watchlisted items from 2024. Can you please return a yaml list of seed topic words for each of the watch list items

    please just return the topic lists with out titles.

    {safety_issue_watchlist_2024}
    """,
    temp = 0,
    model = 'gpt-4'
)

print(response)

```yaml
- safety, workers, rail, corridor, accidents, procedures, standards, miscommunication, fatigue
- road-rail, interface, safety, improvements, level crossings, responsibility, rules, standards, traffic, accidents
- recreational, boat, users, knowledge, skills, self-reliance, skipper, responsibility, safety, education, rules, recommendations
- recreational, boat, users, impairment, substance use, drugs, alcohol, cognitive abilities, accidents
- technologies, track, locate, safety, aviation, rail, maritime, advanced technologies, awareness, regulation
- Robinson helicopters, mast bumping, accidents, NZ, low-G flight, turbulence, in-flight accidents, investigation
```


In [39]:
seed_topic_yaml = yaml.safe_load(re.sub('(```yaml\n)|(```)', '', response))

seed_topic_yaml

seed_topic_list = [seed_topic_list.split(', ') for seed_topic_list in seed_topic_yaml]

seed_topic_list

[['safety',
  'workers',
  'rail',
  'corridor',
  'accidents',
  'procedures',
  'standards',
  'miscommunication',
  'fatigue'],
 ['road-rail',
  'interface',
  'safety',
  'improvements',
  'level crossings',
  'responsibility',
  'rules',
  'standards',
  'traffic',
  'accidents'],
 ['recreational',
  'boat',
  'users',
  'knowledge',
  'skills',
  'self-reliance',
  'skipper',
  'responsibility',
  'safety',
  'education',
  'rules',
  'recommendations'],
 ['recreational',
  'boat',
  'users',
  'impairment',
  'substance use',
  'drugs',
  'alcohol',
  'cognitive abilities',
  'accidents'],
 ['technologies',
  'track',
  'locate',
  'safety',
  'aviation',
  'rail',
  'maritime',
  'advanced technologies',
  'awareness',
  'regulation'],
 ['Robinson helicopters',
  'mast bumping',
  'accidents',
  'NZ',
  'low-G flight',
  'turbulence',
  'in-flight accidents',
  'investigation']]

In [37]:

topic_model = BERTopic(
    embedding_model=None, 
    min_topic_size=5,
    seed_topic_list=seed_topic_list,
    representation_model=openai_base_representation_model
)
topics, _ = topic_model.fit_transform(all_embeddings['voyageai']['si'], embeddings_2darrays['voyageai'])

topic_model.get_topic_info()

[['safety',
  'workers',
  'rail',
  'corridor',
  'accidents',
  'procedures',
  'standards',
  'miscommunication',
  'fatigue'],
 ['road-rail',
  'interface',
  'safety',
  'improvements',
  'level crossings',
  'responsibility',
  'rules',
  'standards',
  'traffic',
  'accidents'],
 ['recreational',
  'boat',
  'users',
  'knowledge',
  'skills',
  'self-reliance',
  'skipper',
  'responsibility',
  'safety',
  'education',
  'rules',
  'recommendations'],
 ['recreational',
  'boat',
  'users',
  'impairment',
  'substance use',
  'drugs',
  'alcohol',
  'cognitive abilities',
  'accidents'],
 ['technologies',
  'track',
  'locate',
  'safety',
  'aviation',
  'rail',
  'maritime',
  'advanced technologies',
  'awareness',
  'regulation'],
 ['Robinson helicopters',
  'mast bumping',
  'accidents',
  'NZ',
  'low-G flight',
  'turbulence',
  'in-flight accidents',
  'investigation']]



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,58,-1_Aviation and Rail Safety Issues and Regulat...,[Aviation and Rail Safety Issues and Regulator...,[There was a risk of not knowing an aircraft's...
1,0,148,0_Maritime Safety and Management Issues,[Maritime Safety and Management Issues],[The bilge pumping system on the Jubilee was n...
2,1,71,1_Train Safety and Communication Failures,[Train Safety and Communication Failures],[The passenger train was not electronically vi...
3,2,65,2_Aircraft Maintenance and Landing Gear Issues,[Aircraft Maintenance and Landing Gear Issues],[Had the pilots known that the nose landing ge...
4,3,61,3_KiwiRail Safety and Operational Issues,[KiwiRail Safety and Operational Issues],[A failure by KiwiRail to follow its operating...
5,4,33,4_Robinson Helicopter Safety and Training Issues,[Robinson Helicopter Safety and Training Issues],"[Due to their unique main rotor design, during..."
6,5,30,5_Air Traffic Control and Pilot Communication ...,[Air Traffic Control and Pilot Communication I...,[The procedure for circling below the minimum ...
7,6,20,6_Safety and Risk Management at Road and Rail ...,[Safety and Risk Management at Road and Rail L...,[Level crossing assessments do not require the...
8,7,15,7_Safety Issues in Aviation and Rail Component...,[Safety Issues in Aviation and Rail Component ...,[Repairing and replacing defective components ...
9,8,12,8_Runway Safety and Procedures Outside ATC Hours,[Runway Safety and Procedures Outside ATC Hours],[The practice of using the runway for perimete...


# Zero shot bertopic model

This is going to be done to perform a type of guided unsuprivsed learning. That is becuase we can give it some safety issues that we think are present then see how many mathc these and what other topics can be found.



In [24]:

topic_model = BERTopic(
    embedding_model=None, 
    min_topic_size=5,
    zeroshot_topic_list=safety_issue_watchlist_2024['title'],
    zeroshot_min_similarity=.85,
    representation_model=openai_base_representation_model
)
topics, _ = topic_model.fit_transform(all_embeddings['voyageai']['si'], embeddings_2darrays['voyageai'])

topic_model.get_topic_info()

TypeError: list indices must be integers or slices, not str

In [64]:
make_visualization(topic_model)

TypeError: make_visualization() missing 1 required positional argument: 'df'

# Preparing model for manual inspection and evaluation

In [77]:
pd.DataFrame(updated_rows)

Unnamed: 0,3143,5909,6041,10835,10841,14638
umap_model,UMAP(),UMAP(),UMAP(),UMAP(),UMAP(),UMAP()
hdbscan_model,"KMeans(n_clusters=17, random_state=42)","KMeans(n_clusters=17, random_state=42)","KMeans(n_clusters=17, random_state=42)","KMeans(n_clusters=17, random_state=42)",HDBSCAN(),HDBSCAN()
embedding_type,voyageai,openai,voyageai,voyageai_reccontext,voyageai_reccontext,voyageai_only_exact
model_type,group,group,group,group,group,group
merged_min_similarity,,,,,,
embedding_df,report_id ...,report_id ...,report_id ...,report_id ...,report_id ...,report_id ...
model,"BERTopic(calculate_probabilities=True, ctfidf_...","BERTopic(calculate_probabilities=True, ctfidf_...","BERTopic(calculate_probabilities=True, ctfidf_...","BERTopic(calculate_probabilities=True, ctfidf_...","BERTopic(calculate_probabilities=True, ctfidf_...","BERTopic(calculate_probabilities=True, ctfidf_..."
cluster_config,17,17,17,17,5,10
dimension_reduction_config,"(7, 4)","(5, 5)","(7, 5)","(7, 5)","(9, 5)","(19, 3)"
embedding_2darray,"[[0.00557063240557909, 0.008644572459161282, -...","[[0.017140474170446396, 0.03509647026658058, -...","[[0.00557063240557909, 0.008644572459161282, -...","[[-0.0004589550953824073, 0.03601374477148056,...","[[-0.0004589550953824073, 0.03601374477148056,...","[[0.0030111961532384157, 0.006824926473200321,..."


In [56]:
def get_readme_text(num_topics, embedding_type, model_type, merged_min_similarity, cluster_model_type, cluster_config, dimmension_reduction_type, dimension_reduction_config, **kwargs):
    return f"""

Thank you for taking the time to do this model evaluation.

You are going to do help us validate and come up with a statistic of how accurate our current model is.

The model has read all 560 safety issues from 2010-2020 and come up with {num_topics} "themes".
Each safety issues has assigned to one of these safety themes.

Your job is going to be to read some of the safety issues and either say "Yes it is in the correct theme" or "No its not in the correct theme and here is the correc theme".

To do this you will find in the Topic descriptions tab there are the topics and their description along with the topic id. Then in the data tab you will find the safety issues with an empty column "human_topic" and the topic id in the column "topic".
You should fill in this column with the corresponding topic id.

Below is just some config information that you can ignore but helps us to know the exact parameters for this model

embedding type: {embedding_type}
model type: {model_type}
merged min similarity: {merged_min_similarity}
cluster model type: {cluster_model_type}
cluster config: {cluster_config}
dimmension reduction type: {dimmension_reduction_type}
dimension reduction config: {dimension_reduction_config}

"""

In [111]:
def make_model_validation_folder(model_row, output_name):

    # Get dataframes togather

    topics_df = model_row['model'].get_topic_info()[['Topic', 'Count', 'Representation']]

    topics_df['Representation'] = topics_df['Representation'].apply(lambda x: x[0] if len(x) == 1 else '\n'.join(x))

    df = model_row['embedding_df'].assign(human_topic = None)

    display(df)

    df = df[['report_id', 'si', 'topic', 'human_topic'] + (list(range(0,model_row['num_topics']-1)) if model_row['cluster_model_type'] == 'HDBSCAN' else [])]

    # Printout dataframes into a xlsx file.

    folder_path = f"Theme_generation_model_evalulation_for_{output_name}"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    # Use openpyxl to handle the Excel file
    with pd.ExcelWriter(os.path.join(folder_path, f'{output_name}_model_evaluation.xlsx'), engine='openpyxl') as writer:

        # Create a README sheet
        workbook = writer.book
        readme_sheet = workbook.create_sheet(title='README')

        # Merge all cells in the README sheet to create a large text box
        readme_text = get_readme_text(**model_row)
        max_row = 50  # Define the size of the merged cell
        max_col = 10  # Define the size of the merged cell
        readme_sheet.merge_cells(start_row=1, start_column=1, end_row=max_row, end_column=max_col)
        readme_cell = readme_sheet.cell(row=1, column=1)
        readme_cell.value = readme_text

        # Adjust cell alignment
        readme_cell.alignment = Alignment(horizontal='left', vertical='top', wrap_text=True)

        topics_df.to_excel(writer, sheet_name='Topic descriptions', index=False)
        
        df.to_excel(writer, sheet_name='Data', index=False)

    make_visualization(model_row['model'], model_row['embedding_df'], save = True, name = os.path.join(folder_path, f'{output_name}_model_visual.html'))

for model in [10841, 14638, 6041]:

    model_row = updated_rows[model]

    # display(list(range(0,model_row['num_topics']-1)))

    make_model_validation_folder(model_row, str(model))

Unnamed: 0,report_id,si,quality,mode,recommendations,si_embedding,topic,0,1,2,...,8,9,10,11,12,13,14,15,16,human_topic
0,2010_001,Safety issue: Airways also required controller...,inferred,0,[],"[-0.0004589550953824073, 0.03601374477148056, ...",5,0.009985,0.009674,0.010796,...,0.029192,0.062149,0.012053,0.055320,0.024390,0.030821,0.027764,0.034520,0.033585,
1,2010_001,Safety issue: Airways required all pireps to b...,inferred,0,[],"[0.00291510671377182, 0.025161081925034523, 0....",5,0.012390,0.011963,0.013355,...,0.034829,0.067415,0.015279,0.084865,0.030896,0.038930,0.034947,0.042619,0.039486,
2,2010_001,Safety issue: Limitations in the operator's di...,inferred,0,[],"[0.0031803413294255733, 0.021290481090545654, ...",5,0.012236,0.011809,0.013189,...,0.034486,0.067047,0.015048,0.082033,0.030409,0.038282,0.034404,0.041976,0.039113,
3,2010_001,Safety issue: The first air traffic controller...,inferred,0,[],"[-0.008666416630148888, 0.03651179000735283, 9...",5,0.007892,0.007783,0.008586,...,0.022975,0.048232,0.009775,0.049312,0.020207,0.025903,0.023045,0.028906,0.026592,
4,2010_001,Safety issue: The report of loose objects on o...,inferred,0,[],"[0.013113349676132202, 0.0227280892431736, -0....",8,0.009584,0.010809,0.013501,...,0.057782,0.037299,0.007989,0.013494,0.012039,0.013898,0.013267,0.015052,0.065911,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,2020_103,Safety issue: There was no evidence that the r...,exact,1,[{'recommendation': 'The Commission recommende...,"[0.013070201501250267, 0.010166815482079983, 0...",6,0.009371,0.008191,0.008612,...,0.009852,0.009558,0.008667,0.008388,0.009708,0.009828,0.009782,0.009856,0.009884,
556,2020_104,Safety issue: Implementation of an administrat...,exact,1,[],"[0.006289663724601269, 0.016407432034611702, -...",3,0.020365,0.046137,0.028158,...,0.026124,0.022050,0.011244,0.014572,0.013267,0.014425,0.014080,0.015234,0.024885,
557,2020_104,Safety issue: The use of an administrative con...,exact,1,"[{'recommendation': 'On 8 December 2021, the C...","[0.019125834107398987, 0.012542759999632835, 0...",1,0.017457,0.690329,0.032275,...,0.021090,0.019206,0.011881,0.014318,0.013405,0.014344,0.014011,0.014913,0.020965,
558,2020_201,Safety issue: The deckhand keeping watch on bo...,exact,2,[{'recommendation': 'On 27 May 2021 the Commis...,"[-0.010788842104375362, 0.022805511951446533, ...",0,0.685494,0.019271,0.019368,...,0.022472,0.021025,0.017419,0.018137,0.017707,0.017985,0.018259,0.018528,0.022054,


Unnamed: 0,report_id,si,quality,mode,si_embedding,topic,0,1,2,3,4,human_topic
0,2019_106,No procedures were in place to direct train c...,exact,1,"[0.0030111961532384157, 0.006824926473200321, ...",3,1.008580e-39,1.442792e-39,1.388317e-39,1.000000e+00,1.616389e-39,
1,2011_003,The New Zealand regulatory oversight provide d...,exact,0,"[0.01196909137070179, 0.01995033212006092, 0.0...",2,7.370041e-02,7.275212e-02,4.294002e-01,7.458595e-02,1.584558e-01,
2,2011_003,The format of the Robinson R22 helicopter flig...,exact,0,"[0.0027158926241099834, 0.01725800521671772, 0...",2,6.980876e-02,6.966791e-02,4.823041e-01,7.315153e-02,1.599004e-01,
3,2011_003,The rate of R22 in -flight break -up accidents...,exact,0,"[0.0060677495785057545, 0.013805699534714222, ...",2,1.213255e-39,1.226325e-39,1.000000e+00,1.318220e-39,3.826607e-39,
4,2012_105,The radio communication between the train cont...,exact,1,"[0.00872959103435278, 0.014815738424658775, -0...",3,2.745259e-02,3.799712e-02,3.721213e-02,7.414460e-01,4.245497e-02,
...,...,...,...,...,...,...,...,...,...,...,...,...
183,2011_006,The CAA had had recurring concerns for the man...,exact,0,"[0.010886683128774166, 0.024842770770192146, 0...",2,1.229697e-39,1.248239e-39,1.000000e+00,1.289297e-39,3.687729e-39,
184,2017_003,The maintenance inspection program me for the ...,exact,0,"[0.006436922587454319, 0.02227625995874405, -0...",-1,5.917898e-02,1.829640e-01,5.717386e-02,5.001040e-02,9.269159e-02,
185,2017_104,Transdev had no policies or procedures in pla...,exact,1,"[0.01229847501963377, 0.011668944731354713, -0...",3,9.962910e-40,1.444995e-39,1.361745e-39,1.000000e+00,1.610913e-39,
186,2020_104,Implementation of an administrative control me...,exact,1,"[0.002548386575654149, 0.010533169843256474, 0...",1,3.795251e-02,6.038792e-01,3.844492e-02,4.619829e-02,5.494926e-02,


Unnamed: 0,report_id,si,mode,si_embedding,topic,human_topic
0,2019_106,No procedures were in place to direct train cr...,1,"[0.00557063240557909, 0.008644572459161282, -0...",2,
1,2013_107,"The high incidence of brake block replacement,...",1,"[0.0043902406468987465, 0.015069461427628994, ...",13,
2,2013_107,The visual inspection regime for wheel-bearing...,1,"[0.02037428691983223, 0.025948768481612206, -0...",11,
3,2013_107,"The RailBAM system, while operational, did not...",1,"[0.017969856038689613, 0.0032469567377120256, ...",2,
4,2013_107,The lack of a dedicated RailBAM analyst positi...,1,"[0.02813386544585228, 0.020084526389837265, -0...",13,
...,...,...,...,...,...,...
555,2011_006,The CAA had had recurring concerns for the man...,0,"[0.0078355073928833, 0.028676746413111687, 0.0...",1,
556,2017_003,The maintenance inspection programme for the l...,0,"[0.009246028028428555, 0.021894149482250214, -...",11,
557,2017_104,Transdev had no policies or procedures in plac...,1,"[0.013308736495673656, 0.015287939459085464, -...",2,
558,2020_104,Implementation of an administrative control me...,1,"[-3.3115891710622236e-05, 0.01492010336369276,...",7,


# Visualization of themes and safety issues

Now that we have some models that seem reasonable, it is time to create a user friendly representation.

In [None]:
modes_dfs = [openai_embeddings[openai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

pd.concat(modes_dfs)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:

topic_model = BERTopic.load("demo_merged_model")

all_data = pd.concat(openai_modes_dfs)

make_visualization(topic_model, all_data)




In [None]:
demo_individual_models = [BERTopic.load(f"demo_individual_model_mode_{i}") for i in range(3)]

for model, df, i in zip(demo_individual_models, modes_dfs, range(len(demo_individual_models))):
    array_embeddings = column_to_2darray(df['si_embedding'])

    reduced_array_embeddings = UMAP(n_neighbors=3, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    with open(os.path.join('topic_visuals', f'demo_individual_model_mode_{i}_visual.html'), 'w') as f:
        visualization.write_html(f)

    display(visualization)



In [75]:
topic_model = BERTopic.load("demo_group_model")

all_data = pd.concat(modes_dfs)

array_embeddings = column_to_2darray(all_data['si_embedding'])

reduced_array_embeddings = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

visualization = topic_model.visualize_documents(all_data['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

with open(os.path.join('topic_visuals', 'demo_group_model_visual.html'), 'w') as f:
    visualization.write_html(f)

visualization



NameError: name 'modes_dfs' is not defined