# What

As establised in this [notebook](./safey_themes_from_safety_issues.ipynb). BERTopic seems to be the most promising method for generating safety themes from safety issues.

There are a few problems that need to be address.
- Lots of outliers
- only 3 topics being generated

## Modules

In [74]:
# local

# third parties

import yaml
import pandas as pd
import numpy as np

from plotnine import * 

from dotenv import load_dotenv

import voyageai
import openai

from bertopic import BERTopic
from bertopic.representation import OpenAI
from umap import UMAP


# builtin
import os
from itertools import product

openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Getting safety issue data

In [None]:
safety_issues_df = pd.read_csv('safety_issues.csv')

# Confirm it has the right columns report_id, si and mode

if not safety_issues_df.columns.isin(['report_id', 'si', 'mode']).any():
    print("Safety issues dataset is missing columns")
    del safety_issues_df

# Getting embeddings to be used for clustering

In [2]:
embeddings_files = [file for file in os.listdir() if file.endswith("embeddings.pkl")]

all_embeddings = {os.path.splitext(file)[0].replace("_embeddings", ""): pd.read_pickle(file) for file in embeddings_files}

# BERTopic models

I have two things that I can play with are:
- What embeddings are used
- How the topic representation are generated (keywords, openai prompts etc)

In [3]:
openai_base_representation_model = OpenAI(
    openai_client,
    model="gpt-4-turbo",
    chat=True,
    nr_docs = 50)

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [4]:
column_to_2darray = lambda column: np.array([np.array(x) for x in column.to_numpy()])

def runBERTopic(df, docs_name, embeddings_name, representation_model, umap_model, reduce_outliers=True):

    topic_model = BERTopic(
        representation_model=representation_model,
        umap_model=umap_model,
        calculate_probabilities=True)

    if embeddings_name is not None:
        topics, probs = topic_model.fit_transform(
            df[docs_name],
            column_to_2darray(df[embeddings_name]))
    else:
        topics, probs = topic_model.fit_transform(df[docs_name])
        
    if reduce_outliers:
        topics = topic_model.reduce_outliers(
            documents=df[docs_name].to_list(),
            topics=topics, 
            probabilities=probs,
            strategy="probabilities")

        topic_model.update_topics(
            df[docs_name].to_list(),
            topics=topics,
            representation_model=representation_model)
        
    df['topic'] = topics
    
    df = pd.concat([df, pd.DataFrame(probs)], axis=1)

    return topic_model, df

In [5]:
def perform_umap_parameter_search(df, n_neighbors_range = range(4,5), n_components_range = range(4,5)):
    overall_results = []

    mode_groups = df.groupby('mode')

    modes_dfs = [mode_groups.get_group(x).reset_index(drop=True) for x in mode_groups.groups]

    for n_neighbors, n_components in product(n_neighbors_range, n_components_range):
        print(f" Looking at {n_neighbors}, {n_components}")
        results = [runBERTopic(df,
                        'si',
                        'si_embedding',
                        None,
                        UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42),
                        False
                        ) for df in modes_dfs]

        
        group_clusters = runBERTopic(
            df,
            'si',
            'si_embedding',
            None,
            UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42),
            False)
        
        
        overall_results.append({
            'n_neighbors': n_neighbors,
            'n_components': n_components,
            'individual_models': [result[0] for result in results],
            'individual_df': pd.concat([result[1] for result in results], ignore_index=True),
            'group_model': group_clusters[0],
            'group_df': group_clusters[1]

        })
    
    return pd.DataFrame(overall_results)

## Running it on all safety issues


I want to generate the safety themes from all of the safety issues I have available.

In [None]:
def check_mode_cluster_distribution(df):
    safety_issues_df_topic_mode = df.pivot_table(index='topic', columns='mode', values='report_id', aggfunc='count').fillna(0)
    return safety_issues_df_topic_mode

### Simple minilm embeddings

This seems to of failed. I believe this is mainly due to the fact that each documents are really short.

In [None]:

topic_model, _ = runBERTopic(
    safety_issues_df, 'si', None, openai_base_representation_model, umap_model, reduce_outliers=False)

topic_model.get_topic_info()


There is a bit of a problem where the number of outliers is quite great.

I will try to merge the outliers

In [None]:
topic_model, _ = runBERTopic(
    safety_issues_df, 'si', None, openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,166,0_Rail Safety and Operational Issues in New Ze...,[Rail Safety and Operational Issues in New Zea...,[The training that drivers received for transi...
1,1,64,1_Maritime Safety and Navigation Management Is...,[Maritime Safety and Navigation Management Iss...,[The voyage planning for the time in the Snare...
2,2,36,2_Maritime Safety and Regulations Compliance I...,[Maritime Safety and Regulations Compliance Is...,[The skipper did not have the requisite knowle...
3,3,53,3_Safety and Maintenance Issues in Engineering...,[Safety and Maintenance Issues in Engineering ...,[There was a lack of clear communication and a...
4,4,53,4_Maritime and Aviation Safety Management and ...,[Maritime and Aviation Safety Management and E...,[It could not be established why the chief off...
5,5,50,5_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],[Had the controllers realised that the low clo...
6,6,27,6_Robinson Helicopter Safety and Accident Anal...,[Robinson Helicopter Safety and Accident Analy...,"[Due to their unique main rotor design, during..."
7,7,62,7_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,[The standard of pilot training and the superv...
8,8,26,8_Aircraft Landing Gear and Door Lock Failures,[Aircraft Landing Gear and Door Lock Failures],[Had the pilots known that the nose landing ge...
9,9,23,9_Deficiencies in Safety and Regulatory Compli...,[Deficiencies in Safety and Regulatory Complia...,[There were no established procedures for ente...


The main problem here is that the the distribution is not great. It seems that most of the rail are in the first topic then martime and aviation take up the rest.

### VoyageAI embeddings

In [None]:
topic_model, voyageai_clusters_df = runBERTopic(
    voyageai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()

NameError: name 'voyageai_embeddings' is not defined

In [None]:


check_mode_cluster_distribution(voyageai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,204,8,167
1,2,177,2


This has created two topics wiht one being avaiation and martime and the other being rail.

### OpenAI embeddings

In [None]:
topic_model, openai_clusters_df = runBERTopic(
    openai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()[['Count', 'Name']]



Unnamed: 0,Count,Name
0,203,0_Aviation Safety and Compliance Issues
1,189,1_Rail Safety and Operational Issues in New Ze...
2,142,2_Maritime Safety and Navigation Management Flaws
3,26,3_Maritime Safety and Compliance Issues of the...


In [None]:
check_mode_cluster_distribution(openai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,191.0,4.0,8.0
1,5.0,181.0,3.0
2,4.0,0.0,138.0
3,6.0,0.0,20.0


This has also made a cleanish split between modes of transport. I can eithe try to force it not to do this and/or run the model on each mode then merge the models.

In [None]:
umap_model_tweaked = UMAP(n_neighbors=4, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

topic_model, openai_clusters_tweaked_df = runBERTopic(
    openai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model_tweaked, reduce_outliers=True)

display(topic_model.get_topic_info()[['Count', 'Name']])

check_mode_cluster_distribution(openai_clusters_tweaked_df)



Unnamed: 0,Count,Name
0,167,0_Rail Safety and Operational Failures
1,115,1_Maritime Safety and Resource Management Defi...
2,50,2_Safety and Compliance in Transport and Marit...
3,51,3_Aviation Safety and Regulatory Compliance Is...
4,41,4_Helicopter Safety and Operational Issues
5,52,5_Aviation Safety and Air Traffic Control Issues
6,27,6_Safety Issues in Rail Operations
7,30,7_Aircraft Landing Gear and Maintenance Issues
8,11,8_Aviation Safety Issues Related to Door Locki...
9,16,9_Safety and Maintenance Challenges in Maritim...


mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.0,157.0,7.0
1,7.0,0.0,108.0
2,15.0,2.0,33.0
3,47.0,0.0,4.0
4,40.0,1.0,0.0
5,51.0,0.0,1.0
6,2.0,25.0,0.0
7,30.0,0.0,0.0
8,11.0,0.0,0.0
9,0.0,0.0,16.0


I will try to tune the hyper paramters and see if I can get the right sort of safety themes

In [None]:
topic_model, openai_clusters_tuned_df = runBERTopic(
    openai_embeddings,
    'si',
    'si_embedding',
    openai_base_representation_model,
    UMAP(n_neighbors=4, n_components=5, min_dist=0.0, metric='cosine', random_state=42),
    reduce_outliers=True)

topic_model.get_topic_info()[['Count', "Name","Representative_Docs"]]



Unnamed: 0,Count,Name,Representative_Docs
0,167,0_Rail Safety and Operational Issues in New Ze...,[The training that drivers received for transi...
1,115,1_Maritime Safety and Resource Management Defi...,[The standard of passage planning on board the...
2,50,2_Maritime and Aviation Safety Regulations and...,[The absence of a visual indicator in the whee...
3,51,3_Aviation Safety and Regulatory Compliance Is...,[The operator's system for training its pilots...
4,41,4_Helicopter Safety and Maintenance Issues,"[Due to their unique main rotor design, during..."
5,52,5_Aviation Safety and Operational Procedures a...,[While ATC sequences an IFR aeroplane to land ...
6,27,6_Safety Issues and Management Deficiencies in...,[The train controller made an assumption about...
7,30,7_Aircraft Landing Gear and Maintenance Issues,[Had the pilots known that the nose landing ge...
8,11,8_Aviation Safety and Equipment Malfunction,"[The use of ""threat and error management"" (TEM..."
9,16,9_Maintenance and Risk Management in Marine Sa...,[A clear placard should be placed at the contr...


## Run cluster on just one mode

It would make sense that if the clustering is finding the transport modes then splitting into the modes first might help find the themes within each mode.

In [None]:
def printout_each_modes_topics(results):
    for res in results:
        print("Cluster names: ")
        for i, count in zip(res[0].get_topic_info()['Name'], res[0].get_topic_info()['Count']):
            print(f"{count}, {i}")

### VoyageAI

In [None]:
voyageai_modes_dfs = [voyageai_embeddings[voyageai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

In [None]:
results = [runBERTopic(df, 'si', 'si_embedding', openai_base_representation_model, umap_model) for df in voyageai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
54, 0_Aviation Safety and Operational Procedures
34, 1_Safety Challenges and Risks in Robinson Helicopter Operations
62, 2_Aviation Safety and Regulatory Compliance Issues
25, 3_Aircraft Landing Gear and Door System Failures
31, 4_Aircraft Maintenance and Safety Concerns
Cluster names: 
166, 0_Rail Safety and Management Issues
19, 1_Safety and Regulatory Issues at Rail Level Crossings
Cluster names: 
28, 0_Maritime Safety and Bridge Resource Management Deficiencies
33, 1_Maritime Safety and Management Failures
43, 2_Maritime Safety and Navigation Standards Compliance
30, 3_Maritime Safety and Emergency Response Deficiencies
22, 4_Maritime Safety Violations and Consequences aboard the Easy Rider
13, 5_Propulsion System Failures and Maintenance Issues in Marine Operations


In [None]:
checking = results[2][1]

In [None]:
merged_moode_models = BERTopic.merge_models([result[0] for result in results], min_similarity=0.9)


merged_moode_models.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,52,0_Aviation Safety and Air Traffic Control Proc...,[Aviation Safety and Air Traffic Control Proce...,
1,1,47,1_Helicopter Safety and Accident Analysis,[Helicopter Safety and Accident Analysis],
2,2,41,2_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],
3,3,33,3_Aircraft Safety and Maintenance Issues,[Aircraft Safety and Maintenance Issues],
4,4,17,4_Aviation Safety and Regulatory Compliance in...,[Aviation Safety and Regulatory Compliance in ...,
5,5,16,5_Safety and Regulatory Issues in New Zealand ...,[Safety and Regulatory Issues in New Zealand A...,
6,6,44,0_KiwiRail Safety and Compliance Issues,[KiwiRail Safety and Compliance Issues],
7,7,40,1_Rail Safety and Communication Failures,[Rail Safety and Communication Failures],
8,8,25,2_Safety and Management Issues in Rail Operations,[Safety and Management Issues in Rail Operations],
9,9,20,3_Safety and Regulatory Issues at Road-Rail Le...,[Safety and Regulatory Issues at Road-Rail Lev...,


### OpenAI

In [None]:
openai_modes_dfs = [openai_embeddings[openai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

for df in openai_modes_dfs:
    display(df)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
201,2015_001,Parachute drop pilots were not required to wea...,0,"[0.0253401268273592, -0.02335318550467491, -0...."
202,2011_006,The council had not evaluated the effects of t...,0,"[-0.02767498977482319, 0.01624125801026821, -0..."
203,2011_006,The standard of pilot training and the supervi...,0,"[0.015297695063054562, -0.018917182460427284, ..."
204,2011_006,The CAA had had recurring concerns for the man...,0,"[0.001043604570440948, 0.00177335599437356, 0...."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_106,No procedures were in place to direct train cr...,1,"[0.017140474170446396, 0.03509647026658058, -0..."
1,2013_107,"The high incidence of brake block replacement,...",1,"[-0.0018233972368761897, 0.020808950066566467,..."
2,2013_107,The visual inspection regime for wheel-bearing...,1,"[0.0025237964000552893, 0.027265744283795357, ..."
3,2013_107,"The RailBAM system, while operational, did not...",1,"[0.006221923511475325, 0.025432679802179337, -..."
4,2013_107,The lack of a dedicated RailBAM analyst positi...,1,"[-0.004680005367845297, 0.013756909407675266, ..."
...,...,...,...,...
180,2017_101,KiwiRail did not have a mature fatigue risk ma...,1,"[-0.006654317956417799, 0.029867829754948616, ..."
181,2017_101,The eProtect KMC module on board the locomotiv...,1,"[-0.003919209353625774, 0.022458476945757866, ..."
182,2017_104,Transdev had no policies or procedures in plac...,1,"[0.013986819423735142, 0.01571197435259819, -0..."
183,2020_104,Implementation of an administrative control me...,1,"[-0.013463953509926796, -0.007039009593427181,..."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_202,There is limited data to quantify the extent o...,2,"[-0.0015265028923749924, 0.013446947559714317,..."
1,2019_201,the operator's planned maintenance programme d...,2,"[0.03530280664563179, 0.027329862117767334, 0...."
2,2019_201,the operator's hazard identification system ha...,2,"[0.009704935364425182, 0.02645685337483883, 0...."
3,2019_204,The operator had not included predefined weath...,2,"[0.038087889552116394, 0.000508625409565866, 0..."
4,2019_204,The operator of the Henerata had not assessed ...,2,"[0.018176013603806496, 0.026440272107720375, 0..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:
results = [runBERTopic(df, 'si', 'si_embedding', openai_base_representation_model, umap_model) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Operational Procedures Issues
42, 1_Aircraft Maintenance and Safety Issues
37, 2_Challenges and Safety Issues in Robinson Helicopter Operations
51, 3_Aviation Safety and Regulatory Oversight in New Zealand
30, 4_Aviation Safety and Emergency Response
Cluster names: 
49, 0_KiwiRail Safety and Compliance Issues
28, 1_Rail Safety and Inspection Inefficiencies
42, 2_Rail Safety and Communication Issues
27, 3_Safety and Oversight Concerns in Train Operations
21, 4_Road and Rail Safety at Level Crossings
18, 5_Risk Management and Safety Issues in Wellington Station Train Operations
Cluster names: 
150, 0_Maritime Safety and Crew Management Deficiencies
19, 1_Maritime Safety and Compliance Issues


I will try instead to do with no dimension reduction, or atleast decrease the amount of dimension reduction.

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

results = [runBERTopic(df,
                       'si',
                       'si_embedding',
                       openai_base_representation_model,
                       BaseDimensionalityReduction()
                       ) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Air Traffic Management Issues
47, 1_Aircraft Maintenance and Safety Issues
41, 2_Safety and Training Issues in Robinson Helicopter Operations
53, 3_Aviation Safety and Compliance Issues
19, 4_Safety and Regulatory Oversight in Aviation and Parachuting Operations
Cluster names: 
47, 0_Issues in KiwiRail's Safety and Operational Procedures
62, 1_Rail Safety and Incident Analysis
29, 2_Rail Safety and Signal Management Issues in Wellington Station Approaches
20, 3_Safety Issues at Rail Level Crossings
27, 4_Safety and Risk Management in Rail Operations
Cluster names: 
149, 0_Maritime Safety and Resource Management Issues
20, 1_Maritime Safety Violations and the Sinking of the Easy Rider


This reuslts in just one cluster for each as the curse of dimensionality is prudent here. I will instead try to tune the hyper parameters of OPenAI

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

results = [runBERTopic(df,
                       'si',
                       'si_embedding',
                       None,
                       UMAP(n_neighbors=6, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
                       ) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
52, 0_the_to_of_and
47, 1_the_to_of_and
41, 2_the_of_to_and
33, 3_the_gear_landing_to
17, 4_the_to_for_water
16, 5_zealand_new_of_the
Cluster names: 
44, 0_the_to_work_of
40, 1_the_train_to_and
25, 2_train_of_the_and
20, 3_road_level_crossings_the
20, 4_the_brake_braking_conditions
15, 5_the_in_of_wellington
21, 6_the_rail_of_to
Cluster names: 
120, 0_the_of_and_to
27, 1_the_to_of_easy
11, 2_co2_the_could_be
11, 3_the_fish_crew_of


In [None]:
merged_moode_models.get_topic_info()[['Count', "Name"]]

NameError: name 'merged_moode_models' is not defined

## Hypter parameter tuning

I have had a look at both single run and individual models.

I think the next step is to do some hypter paramter tuning.

 As there are not noticable differences between voyageAI and openAI I will go with openAI embedding model.

### Perform search

In [40]:
testing_search = perform_umap_parameter_search(all_embeddings['gtelarge'], n_neighbors_range = range(3,5), n_components_range = range(3,5))

 Looking at 3, 3
 Looking at 3, 4
 Looking at 4, 3
 Looking at 4, 4


In [42]:
testing_search.loc[0,]

Unnamed: 0,report_id,si,mode,si_embedding,topic,0,1,2,3,4,5
0,2011_003,The New Zealand regulatory system has not prov...,0,"[-0.5367612838745117, 0.34051889181137085, -0....",0,0.977810,2.219005e-02,,,,
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[-0.5823154449462891, 0.7852337956428528, 0.00...",0,1.000000,1.209985e-309,,,,
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[-0.5687302947044373, 0.34693843126296997, -0....",0,0.956347,4.365292e-02,,,,
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[-0.3379861116409302, 0.0036723273806273937, -...",-1,0.031528,3.061438e-02,,,,
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.07826011627912521, -0.31105685234069824, -...",1,0.010312,9.612732e-01,,,,
...,...,...,...,...,...,...,...,...,...,...,...
555,2017_203,Technicians who are authorised to conduct mand...,2,"[-0.8836644887924194, -0.2005808800458908, -0....",0,1.000000,7.035353e-310,,,,
556,2013_201,The firefighting drills held on board the Taok...,2,"[-1.0879993438720703, 0.8911566734313965, -0.3...",-1,0.038975,3.871146e-02,,,,
557,2014_201,crew awareness of the operating limitations of...,2,"[-0.5722806453704834, -0.3311285078525543, 0.0...",0,1.000000,6.951719e-310,,,,
558,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.7541806697845459, 0.762946605682373, 0.395...",0,0.998613,1.387301e-03,,,,


In [6]:
n_neighbors_range = range(3,6)
n_components_range = range(3,25)

saving_folder = 'umap_search_results'

os.makedirs(saving_folder, exist_ok=True)

def check_old_file_is_current(saving_file, n_neighbors_range, n_components_range):
    old_results = pd.read_pickle(saving_file)

    expected_length = len(n_neighbors_range) * len(n_components_range)

    if len(old_results) != expected_length:
        print(f"  {name} old file is not up to date, expected {expected_length} but found {len(old_results)}")
        return None
    
    if all(old_results['n_neighbors'].isin(n_neighbors_range)) and all(old_results['n_components'].isin(n_components_range)):
        print(f"  {name} already exists and is up to date")
        return old_results

results = dict()

for name, embeddings in all_embeddings.items():
    print("Performing UMAP parameter search for " + name)

    saving_file = os.path.join(saving_folder, name + "_umap_parameter_search_results" + ".pkl")

    # Check to see if file already exists and so umap does not need to be redone
    if os.path.exists(saving_file):
        print(f" {name} already exists, checking if current")
        
        old_file = check_old_file_is_current(saving_file, n_neighbors_range, n_components_range)
            
        if old_file is not None:
            results[name] = old_file
            continue
    
    umap_parameter_search = perform_umap_parameter_search(embeddings, n_neighbors_range, n_components_range)

    umap_parameter_search.to_pickle(saving_file)

    results[name] = umap_parameter_search



Performing UMAP parameter search for openai
 openai already exists, checking if current
  openai already exists and is up to date
Performing UMAP parameter search for voyageai
 voyageai already exists, checking if current
  voyageai already exists and is up to date
Performing UMAP parameter search for gtelarge
 gtelarge already exists, checking if current
  gtelarge already exists and is up to date


### Find best results

First off there probably is a better way to do this but I am just going to have to perform a somewhat manual search of the parameter space as my theoretical understanding and intuition is sadly lacking.

The best result is currently defined vaguely using natural language based on the number of topics and the distribution. This definition is something like  
"Find topics in the collection of safety issues that are useful and specific. The topics can be mode specific or multi modal but either way they capture unique aspects of this data that was not known before this."

In [66]:
def assign_topics_and_probabilities(df, model):

    cleaned_df = df[['report_id', 'si', 'mode', 'si_embedding']]

    embeddings = column_to_2darray(cleaned_df['si_embedding'])

    documents = cleaned_df['si'].to_list()

    transform = model.transform(documents, embeddings)

    cleaned_df['topic'] = transform[0]

    return pd.concat([cleaned_df.reset_index(drop=True), pd.DataFrame(transform[1])], axis=1)

def create_merged_models(df, min_similarity_range = range(90,100)):

    converted_min_similarity_range = [x / 100 for x in min_similarity_range]

    df['merged_model'] = df['individual_models'].apply(lambda list_of_models: [(min_similarity, BERTopic.merge_models(list_of_models, min_similarity=min_similarity)) for min_similarity in converted_min_similarity_range])

    df = df.explode('merged_model').reset_index(drop=True)

    df['min_similarity'] = df['merged_model'].apply(lambda x: x[0])
    df['merged_model'] = df['merged_model'].apply(lambda x: x[1])

    df['merged_df'] = df.apply(lambda row: assign_topics_and_probabilities(row['individual_df'], row['merged_model']), axis=1)

    return df

In [8]:
def topic_counts(df):

    df['individual_topic_counts'] = df['individual_models'].apply(lambda list_of_models: [(round(x.get_topic_info()['Count'][0]/sum(x.get_topic_info()['Count']), 2), len(x.get_topic_info()['Name'])) for x in list_of_models])

    df['average_individual_topic_count'] = df['individual_topic_counts'].apply(lambda list_of_models: round(np.mean([x[1] for x in list_of_models]), 2))

    df['average_individual_outliers'] = df['individual_topic_counts'].apply(lambda list_of_models: round(np.mean([x[0] for x in list_of_models]),2))

    df['group_topic_membership_counts'] = df.apply(
        lambda x: 
        [c  for c in x['group_model'].get_topic_info()['Count'].to_list()], axis=1)
    
    df['group_outliers_percent'] = df['group_model'].apply(lambda model: model.get_topic_info()['Count'][0] / model.get_topic_info()['Count'].sum() )

    column_names = ['n_components', 'n_neighbors', 'individual_topic_counts', 'average_individual_topic_count', 'average_individual_outliers', 'group_topic_membership_counts', 'group_outliers_percent']

    df = df[column_names + list(df.columns)]

    return df.loc[:,~df.columns.duplicated()].copy()


In [58]:
combined_results = pd.concat([result.assign(embedding_model=name) for name, result in results.items()], axis = 0, ignore_index=True)

In [69]:
file_name = 'combined_results_with_merged_models.pkl'

if os.path.exists(file_name):
    combined_results_with_merged_models = pd.read_pickle(file_name)

else:
    combined_results_with_merged_models = create_merged_models(combined_results)

    combined_results_with_merged_models.to_pickle(file_name)

combined_results_with_merged_models

In [71]:
combined_topic_counts = topic_counts(combined_results_with_merged_models)

In [73]:
combined_topic_counts

Unnamed: 0,n_components,n_neighbors,individual_topic_counts,average_individual_topic_count,average_individual_outliers,group_topic_membership_counts,group_outliers_percent,individual_models,individual_df,group_model,group_df,embedding_model,merged_model,min_similarity,merged_df
0,3,3,"[(0.24, 9), (0.08, 8), (0.22, 5)]",7.33,0.18,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.90,report_id ...
1,3,3,"[(0.24, 9), (0.08, 8), (0.22, 5)]",7.33,0.18,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.91,report_id ...
2,3,3,"[(0.24, 9), (0.08, 8), (0.22, 5)]",7.33,0.18,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.92,report_id ...
3,3,3,"[(0.24, 9), (0.08, 8), (0.22, 5)]",7.33,0.18,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.93,report_id ...
4,3,3,"[(0.24, 9), (0.08, 8), (0.22, 5)]",7.33,0.18,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.94,report_id ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,24,5,"[(0.27, 6), (0.22, 5), (0.91, 2)]",4.33,0.47,"[3, 209, 184, 152, 12]",0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.95,report_id ...
1976,24,5,"[(0.27, 6), (0.22, 5), (0.91, 2)]",4.33,0.47,"[3, 209, 184, 152, 12]",0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.96,report_id ...
1977,24,5,"[(0.27, 6), (0.22, 5), (0.91, 2)]",4.33,0.47,"[3, 209, 184, 152, 12]",0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.97,report_id ...
1978,24,5,"[(0.27, 6), (0.22, 5), (0.91, 2)]",4.33,0.47,"[3, 209, 184, 152, 12]",0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.98,report_id ...


In [72]:
# Make a graph of the average indivudal topic count and outliers
(ggplot(combined_topic_counts) + 
    aes(x = 'average_individual_topic_count', y = 'average_individual_outliers', color = 'type') +
    geom_point())

TypeError: '<' not supported between instances of 'type' and 'type'

TypeError: '<' not supported between instances of 'type' and 'type'

We can see that there are quite a few different ones that might seem reasonable.

I will have to choose one for the demo purposes.

#### Group model

I will look at the best ones that are group models.

In [None]:
potential_group_model = appropriate_counts_df.loc[[0,23,42],]

print(potential_group_model.columns)

potential_group_model.apply(
    lambda row:
    row['group_model'].update_topics(
        row['group_df']['si'].to_list(),
        representation_model = openai_base_representation_model
    ),
    axis=1
)


Index(['n_components', 'n_neighbors', 'individual_models', 'individual_df',
       'group_model', 'group_df', 'individual_topic_counts',
       'group_topic_membership_counts'],
      dtype='object')


KeyboardInterrupt: 

In [None]:

potential_group_model['model_summary'] = potential_group_model['group_model'].apply(lambda model: model.get_topic_info())

for i, row in potential_group_model.iterrows():

    display(row['model_summary'])

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,70,0_Rail and Road Safety and Standards Compliance,[Rail and Road Safety and Standards Compliance],[The Beach Road/ State Highway 1 intersection ...
1,1,60,1_Aviation and Maritime Safety and Compliance ...,[Aviation and Maritime Safety and Compliance I...,[The procedure for circling below the minimum ...
2,2,63,2_Maritime Safety and Navigation Management De...,[Maritime Safety and Navigation Management Def...,[The standard of bridge resource management on...
3,3,54,3_Safety and Regulatory Oversight in New Zeala...,[Safety and Regulatory Oversight in New Zealan...,[There was a low likelihood of the weather con...
4,4,50,4_Maritime Safety and Risk Management Deficien...,[Maritime Safety and Risk Management Deficienc...,[The plastic sheathing that had been placed ar...
5,5,36,5_Issues in KiwiRail's Work and Safety Managem...,[Issues in KiwiRail's Work and Safety Manageme...,[The New Zealand Rail Operating Rules and Proc...
6,6,51,6_Helicopter Flight Safety and Operating Chall...,[Helicopter Flight Safety and Operating Challe...,"[Due to their unique main rotor design, during..."
7,7,27,7_Challenges and Risks in Train Control Safety...,[Challenges and Risks in Train Control Safety ...,[The train controller made an assumption about...
8,8,30,8_Emergency Preparedness and Response in Trans...,[Emergency Preparedness and Response in Transp...,[There were as few as 4 approved lifejackets o...
9,9,28,9_Train Operation and Communication Safety Issues,[Train Operation and Communication Safety Issues],[Lack of a suitable communication system betwe...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,95,0_Maritime Safety and Navigation Management Is...,[Maritime Safety and Navigation Management Iss...,[The standard of bridge resource management on...
1,1,61,1_Aviation Safety and Compliance Issues in New...,[Aviation Safety and Compliance Issues in New ...,[The operator's system for training its pilots...
2,2,49,2_KiwiRail Operational and Safety Challenges,[KiwiRail Operational and Safety Challenges],[The New Zealand Rail Operating Rules and Proc...
3,3,49,3_Maritime Safety and Emergency Response Regul...,[Maritime Safety and Emergency Response Regula...,[The absence of a visual indicator in the whee...
4,4,47,4_Aviation Safety and Operational Miscommunica...,[Aviation Safety and Operational Miscommunicat...,[While ATC sequences an IFR aeroplane to land ...
5,5,43,5_Aircraft Maintenance and Safety Compliance I...,[Aircraft Maintenance and Safety Compliance Is...,[Had the pilots known that the nose landing ge...
6,6,38,6_Helicopter Safety and Operational Challenges,[Helicopter Safety and Operational Challenges],"[Due to their unique main rotor design, during..."
7,7,22,7_Train Operational Safety and Communication I...,[Train Operational Safety and Communication Is...,[The safety issue arising from this incident w...
8,8,26,8_Safety and Risk Management Issues in Train C...,[Safety and Risk Management Issues in Train Co...,[Poor planning and co-ordination of track infr...
9,9,18,9_Rail System Safety and Performance in Low-Ad...,[Rail System Safety and Performance in Low-Adh...,[A key safety issue was that the National Rail...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,80,0_Maritime Safety and Navigation Management Is...,[Maritime Safety and Navigation Management Iss...,[Neither the owner nor the skipper sought or w...
1,1,54,1_Rail Safety and Training Gaps,[Rail Safety and Training Gaps],[The Matangi braking and wheel-slide protectio...
2,2,57,2_Helicopter Safety and Maintenance Issues,[Helicopter Safety and Maintenance Issues],"[Due to their unique main rotor design, during..."
3,3,46,3_Train Collision Risks and Communication Fail...,[Train Collision Risks and Communication Failu...,[There are a number of reasonable measures tha...
4,4,47,4_Maritime Safety and Compliance Issues,[Maritime Safety and Compliance Issues],[Neither the ship's planned-maintenance system...
5,5,43,5_Emergency Preparedness and Response in Trans...,[Emergency Preparedness and Response in Transp...,[There were as few as 4 approved lifejackets o...
6,6,41,6_KiwiRail Operational and Safety Compliance I...,[KiwiRail Operational and Safety Compliance Is...,[The New Zealand Rail Operating Rules and Proc...
7,7,39,7_Aviation Safety and Air Traffic Management C...,[Aviation Safety and Air Traffic Management Co...,[When an IFR aeroplane is approved to conduct ...
8,8,39,8_Aviation Safety and Operational Compliance I...,[Aviation Safety and Operational Compliance Is...,[The operator's system for training its pilots...
9,9,27,9_Safety Challenges and Risks in Train Control...,[Safety Challenges and Risks in Train Control ...,[The train controller made an assumption about...


I need to choose just one for a demo.

This will be the last one as it looks the most reasonable.

In [None]:
check_mode_cluster_distribution(appropriate_counts_df.loc[1, 'group_df'])

KeyError: 1

In [None]:
demo_group_model = potential_group_model.loc[0,]

demo_group_model['group_model'].save('demo_group_model', serialization='pytorch')



TypeError: cannot pickle '_thread.RLock' object

#### Merged models

There are three sets of indivudal models that has good counts.
I can use this to merge a model and end up with quite a few topics.

In [None]:
potential_individual_models = combined_topic_counts[combined_topic_counts['average_individual_topic_count'] >= 7].sort_values('average_individual_outliers', ascending=True).reset_index(drop=True)

potential_individual_models

Unnamed: 0,n_components,n_neighbors,individual_topic_counts,average_individual_topic_count,average_individual_outliers,group_topic_membership_counts,group_outliers_percent,individual_models,individual_df,group_model,group_df,type
0,23,3,"[(0.11, 7), (0.02, 7), (0.21, 7)]",7.0,0.11,"[39, 189, 131, 84, 44, 19, 19, 18, 17]",0.069643,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI
1,24,4,"[(0.2, 8), (0.11, 8), (0.05, 5)]",7.0,0.12,"[61, 172, 161, 47, 35, 25, 24, 13, 11, 11]",0.108929,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,OpenAI
2,10,4,"[(0.19, 9), (0.09, 8), (0.07, 5)]",7.33,0.12,"[82, 145, 82, 44, 40, 23, 21, 18, 17, 17, 16, ...",0.146429,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI
3,13,3,"[(0.12, 9), (0.09, 7), (0.14, 7)]",7.67,0.12,"[36, 169, 128, 68, 47, 31, 19, 19, 18, 13, 12]",0.064286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI
4,12,4,"[(0.13, 9), (0.02, 7), (0.24, 7)]",7.67,0.13,"[50, 146, 79, 54, 45, 40, 22, 20, 18, 18, 16, ...",0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI
5,20,3,"[(0.21, 10), (0.08, 7), (0.14, 4)]",7.0,0.14,"[76, 61, 48, 46, 39, 37, 34, 31, 27, 26, 25, 2...",0.135714,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,OpenAI
6,19,3,"[(0.18, 9), (0.1, 7), (0.14, 7)]",7.67,0.14,"[54, 128, 64, 61, 47, 35, 34, 19, 19, 18, 17, ...",0.096429,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI
7,8,3,"[(0.16, 7), (0.12, 8), (0.13, 6)]",7.0,0.14,"[52, 130, 72, 71, 49, 48, 19, 18, 15, 15, 13, ...",0.092857,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI
8,11,3,"[(0.19, 10), (0.08, 8), (0.15, 6)]",8.0,0.14,"[64, 127, 70, 62, 43, 37, 30, 27, 24, 18, 18, ...",0.114286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,OpenAI
9,12,3,"[(0.09, 11), (0.15, 8), (0.19, 7)]",8.67,0.14,"[42, 131, 128, 93, 38, 25, 24, 24, 20, 18, 17]",0.075,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI


In [None]:

potential_individual_models = create_merged_models(potential_individual_models, min_similarity= 0.95)

potential_individual_models['merged_outliers'] = potential_individual_models['merged_model'].apply(lambda model: model.get_topic_info()['Count'][0] / model.get_topic_info()['Count'].sum())

potential_individual_models = potential_individual_models[potential_individual_models['merged_outliers'] <= 0.13]

potential_individual_models

Unnamed: 0,n_components,n_neighbors,individual_topic_counts,average_individual_topic_count,average_individual_outliers,group_topic_membership_counts,group_outliers_percent,individual_models,individual_df,group_model,group_df,type,merged_model,merged_df,merged_outliers
0,23,3,"[(0.11, 7), (0.02, 7), (0.21, 7)]",7.0,0.11,"[39, 189, 131, 84, 44, 19, 19, 18, 17]",0.069643,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,0.108929
1,24,4,"[(0.2, 8), (0.11, 8), (0.05, 5)]",7.0,0.12,"[61, 172, 161, 47, 35, 25, 24, 13, 11, 11]",0.108929,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,OpenAI,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,0.125
2,10,4,"[(0.19, 9), (0.09, 8), (0.07, 5)]",7.33,0.12,"[82, 145, 82, 44, 40, 23, 21, 18, 17, 17, 16, ...",0.146429,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,0.119643
3,13,3,"[(0.12, 9), (0.09, 7), (0.14, 7)]",7.67,0.12,"[36, 169, 128, 68, 47, 31, 19, 19, 18, 13, 12]",0.064286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,0.114286
4,12,4,"[(0.13, 9), (0.02, 7), (0.24, 7)]",7.67,0.13,"[50, 146, 79, 54, 45, 40, 22, 20, 18, 18, 16, ...",0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,VoyageAI,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,0.123214


In [None]:
potential_individual_models.apply(
    lambda row: 
    row['merged_model'].update_topics(
        row['merged_df']['si'].tolist(),
        representation_model = openai_base_representation_model
    ), axis = 1
)


0    None
1    None
2    None
3    None
4    None
dtype: object

In [None]:
potential_individual_models['model_summary'] = potential_individual_models['merged_model'].apply(lambda model: model.get_topic_info())

for i, row in potential_individual_models.iterrows():
    display(row['model_summary'])

    display(make_visualization(row['merged_model'], row['merged_df']))

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,61,-1_Transportation Safety and Compliance Issues,[Transportation Safety and Compliance Issues],
1,0,110,0_Aviation and Maritime Safety and Management ...,[Aviation and Maritime Safety and Management I...,
2,1,29,1_Helicopter Safety and Training Concerns,[Helicopter Safety and Training Concerns],
3,2,25,2_Aviation Safety and Oversight Issues,[Aviation Safety and Oversight Issues],
4,3,23,3_Challenges and Risks in Air Traffic Control ...,[Challenges and Risks in Air Traffic Control C...,
5,4,18,4_Airport Operational Safety and Compliance Is...,[Airport Operational Safety and Compliance Iss...,
6,5,17,5_Regulatory Oversight and Safety Risks in New...,[Regulatory Oversight and Safety Risks in New ...,
7,6,75,6_Rail Safety and Operational Risks,[Rail Safety and Operational Risks],
8,7,40,7_KiwiRail Operational and Safety Compliance I...,[KiwiRail Operational and Safety Compliance Is...,
9,8,21,8_Safety and Operational Challenges in Welling...,[Safety and Operational Challenges in Wellingt...,


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,70,-1_Transportation Safety and Compliance Issues,[Transportation Safety and Compliance Issues],
1,0,45,0_Aviation Safety and Air Traffic Management I...,[Aviation Safety and Air Traffic Management Is...,
2,1,29,1_Aircraft Landing Gear Maintenance and System...,[Aircraft Landing Gear Maintenance and System ...,
3,2,26,2_Helicopter Safety and Accident Factors,[Helicopter Safety and Accident Factors],
4,3,25,3_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,
5,4,17,4_Aviation Safety and Regulatory Oversight in ...,[Aviation Safety and Regulatory Oversight in W...,
6,5,12,5_Safety and Regulatory Issues in New Zealand ...,[Safety and Regulatory Issues in New Zealand A...,
7,6,11,6_Aviation Safety and Human Error in Door Lock...,[Aviation Safety and Human Error in Door Locki...,
8,7,41,7_Train Safety and Communication Issues,[Train Safety and Communication Issues],
9,8,38,8_Issues with KiwiRail's Safety and Compliance...,[Issues with KiwiRail's Safety and Compliance ...,


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,67,-1_Transportation Safety and Compliance Issues,[Transportation Safety and Compliance Issues],
1,0,41,0_Safety and Risks in Robinson Helicopter Oper...,[Safety and Risks in Robinson Helicopter Opera...,
2,1,143,1_Maritime Safety and Management Compliance Is...,[Maritime Safety and Management Compliance Iss...,
3,2,22,2_Air Traffic Control and Flight Crew Coordina...,[Air Traffic Control and Flight Crew Coordinat...,
4,3,17,3_Aviation Safety and Compliance Issues at Aer...,[Aviation Safety and Compliance Issues at Aero...,
5,4,16,4_Aircraft Maintenance Safety Concerns,[Aircraft Maintenance Safety Concerns],
6,5,27,5_Transportation Safety and Inspection Failures,[Transportation Safety and Inspection Failures],
7,6,13,6_Challenges in Landing Gear Operation and Pil...,[Challenges in Landing Gear Operation and Pilo...,
8,7,12,7_Aviation Safety and Collision Risk Managemen...,[Aviation Safety and Collision Risk Management...,
9,8,61,8_Train Safety and Control Issues,[Train Safety and Control Issues],


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,64,-1_Transportation and Safety Compliance Issues,[Transportation and Safety Compliance Issues],
1,0,34,0_Aircraft Maintenance and Safety Issues,[Aircraft Maintenance and Safety Issues],
2,1,26,1_Aircraft Landing Gear and Door Locking Issues,[Aircraft Landing Gear and Door Locking Issues],
3,2,36,2_Safety and Compliance Issues in Aviation and...,[Safety and Compliance Issues in Aviation and ...,
4,3,24,3_Aviation Safety and Air Traffic Control Issues,[Aviation Safety and Air Traffic Control Issues],
5,4,23,4_Challenges and Risks in Robinson Helicopter ...,[Challenges and Risks in Robinson Helicopter O...,
6,5,64,5_Aviation and Maritime Safety Management and ...,[Aviation and Maritime Safety Management and C...,
7,6,14,6_Airport Safety and Operational Procedure Com...,[Airport Safety and Operational Procedure Comp...,
8,7,13,7_Challenges and Risks in Helicopter Flight Dy...,[Challenges and Risks in Helicopter Flight Dyn...,
9,8,54,8_Rail Safety and Communication Failures,[Rail Safety and Communication Failures],


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,69,-1_Transportation Safety and Compliance Issues,[Transportation Safety and Compliance Issues],
1,0,36,0_Challenges and Risks in Robinson Helicopter ...,[Challenges and Risks in Robinson Helicopter O...,
2,1,29,1_Safety and Oversight Issues in Aircraft and ...,[Safety and Oversight Issues in Aircraft and P...,
3,2,25,2_Aircraft Landing Gear and Door Locking Issues,[Aircraft Landing Gear and Door Locking Issues],
4,3,61,3_Aviation and Maritime Safety Management Prac...,[Aviation and Maritime Safety Management Pract...,
5,4,22,4_Air Traffic Control and Pilot Communication ...,[Air Traffic Control and Pilot Communication I...,
6,5,17,5_Aerodrome Safety and Compliance Issues,[Aerodrome Safety and Compliance Issues],
7,6,16,6_Aircraft Maintenance and Safety Risks,[Aircraft Maintenance and Safety Risks],
8,7,11,7_Aviation Safety and Collision Risks,[Aviation Safety and Collision Risks],
9,8,72,8_Train Safety and Control Issues,[Train Safety and Control Issues],


I have decided to go with the fourth set of hyper paremters as these seem to give the best results.


**Merged model**

In [None]:
demo_merged_model = potential_individual_models.loc[3,]

demo_merged_model['merged_model'].save("demo_merged_model", serialization="pytorch")

**Individual model**

In [None]:


for model, df in zip(
    potential_individual_models.loc[3,'individual_models'],
    potential_individual_models.loc[3,'individual_df']):

    model.update_topics(
        df.dropna(subset=['si'])['si'].tolist(),
        representation_model=openai_base_representation_model)



In [None]:
for i, model in enumerate(potential_individual_models.loc[3,'individual_models']):
    display(model.get_topic_info())

    model.save(f"demo_individual_model_mode_{i}", serialization="pytorch")

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,46,0_Aviation Safety and Communication Issues,[Aviation Safety and Communication Issues],[While ATC sequences an IFR aeroplane to land ...
1,1,49,1_Helicopter Safety and Operational Challenges,[Helicopter Safety and Operational Challenges],"[Due to their unique main rotor design, during..."
2,2,36,2_Aviation Safety and Regulatory Compliance Co...,[Aviation Safety and Regulatory Compliance Con...,[The operator's system for training its pilots...
3,3,20,3_Aircraft Safety and Maintenance Issues,[Aircraft Safety and Maintenance Issues],[There was a lack of clear communication and a...
4,4,18,4_Safety and Regulatory Oversight in Parachuti...,[Safety and Regulatory Oversight in Parachutin...,[The risk to people involved in helicopter ope...
5,5,24,5_Aircraft Landing Gear and Maintenance Issues,[Aircraft Landing Gear and Maintenance Issues],[Had the pilots known that the nose landing ge...
6,6,13,6_Safety and Regulatory Issues in New Zealand ...,[Safety and Regulatory Issues in New Zealand A...,[The regulatory oversight of commercial balloo...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,42,0_Issues in KiwiRail's Operational Procedures ...,[Issues in KiwiRail's Operational Procedures a...,[The New Zealand Rail Operating Rules and Proc...
1,1,26,1_Safety and Risk Management in Rail Operations,[Safety and Risk Management in Rail Operations],[Poor planning and co-ordination of track infr...
2,2,27,2_Rail Safety and Signal Management Issues,[Rail Safety and Signal Management Issues],[The lever in the signal box that was used to ...
3,3,23,3_Train Safety and Communication Failures,[Train Safety and Communication Failures],[Nor could the system rely on visually sightin...
4,4,21,4_Train Collision Risks at Wellington Station,[Train Collision Risks at Wellington Station],[There is a heightened risk of trains collidin...
5,5,18,5_Rail Safety and Standards Compliance Concerns,[Rail Safety and Standards Compliance Concerns],[A key safety issue was that the National Rail...
6,6,15,6_Rail System Failures and Inspection Limitations,[Rail System Failures and Inspection Limitations],[The rail fracture examination revealed that t...
7,7,13,7_Safety and Compatibility Issues at Road and ...,[Safety and Compatibility Issues at Road and R...,[Level crossing assessments do not require the...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,83,0_Maritime Safety and Resource Management Prac...,[Maritime Safety and Resource Management Pract...,[The voyage planning for the time in the Snare...
1,1,46,1_Maritime Safety and Risk Management Deficien...,[Maritime Safety and Risk Management Deficienc...,[The plastic sheathing that had been placed ar...
2,2,26,2_Maritime Safety and Emergency Response Failures,[Maritime Safety and Emergency Response Failures],[The owner of the Easy Rider was not meeting i...
3,3,14,3_Maintenance and Regulation Issues in Maritim...,[Maintenance and Regulation Issues in Maritime...,"[The CO2 system's pilot cylinder leaked, but t..."


# Visualization of themes and safety issues

Now that we have some models that seem reasonable, it is time to create a user friendly representation.

In [None]:
modes_dfs = [openai_embeddings[openai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

pd.concat(modes_dfs)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:
def make_visualization(model, df, save = False, name = 'topic model visual'):

    array_embeddings = column_to_2darray(df['si_embedding'])

    reduced_array_embeddings = UMAP(n_neighbors=7, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    if save:

        with open(os.path.join('topic_visuals', name), 'w') as f:
            visualization.write_html(f)

    return visualization

topic_model = BERTopic.load("demo_merged_model")

all_data = pd.concat(openai_modes_dfs)

make_visualization(topic_model, all_data)






In [None]:
demo_individual_models = [BERTopic.load(f"demo_individual_model_mode_{i}") for i in range(3)]

for model, df, i in zip(demo_individual_models, modes_dfs, range(len(demo_individual_models))):
    array_embeddings = column_to_2darray(df['si_embedding'])

    reduced_array_embeddings = UMAP(n_neighbors=3, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    with open(os.path.join('topic_visuals', f'demo_individual_model_mode_{i}_visual.html'), 'w') as f:
        visualization.write_html(f)

    display(visualization)



In [None]:
topic_model = BERTopic.load("demo_group_model")

all_data = pd.concat(modes_dfs)

array_embeddings = column_to_2darray(all_data['si_embedding'])

reduced_array_embeddings = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

visualization = topic_model.visualize_documents(all_data['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

with open(os.path.join('topic_visuals', 'demo_group_model_visual.html'), 'w') as f:
    visualization.write_html(f)

visualization

