# What

As established in this [notebook](./safey_themes_from_safety_issues.ipynb), BERTopic seems to be the most promising method for generating safety themes from safety issues.

There are a few problems that need to be addressed:
- Lots of outliers
- Only 3 topics being generated

## Modules

In [1]:
# local

# third parties

import yaml
import pandas as pd
import numpy as np

import plotly.express as px

from dotenv import load_dotenv

import voyageai
import openai

from bertopic import BERTopic
from bertopic.representation import OpenAI
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP

from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.cluster import KMeans

import swifter

# builtin
import os
from itertools import product
from multiprocessing import Pool, cpu_count


# Load variables from a local .env file (if one exists) before reading the key.
# `load_dotenv` is imported above but was never called, so a key kept only in
# .env was never picked up. Variables already set in the environment are not
# overridden by default.
load_dotenv()

# Shared OpenAI client; it is handed to the BERTopic OpenAI representation model below.
openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))



# Getting safety issue data

In [37]:
safety_issues_df = pd.read_csv('safety_issues.csv')

# Confirm it has all of the required columns: report_id, si and mode.
# The previous check (`columns.isin(...).any()`) only complained when *none*
# of the required columns were present, so a file missing one or two of them
# slipped through unnoticed.
required_columns = ['report_id', 'si', 'mode']
missing_columns = [column for column in required_columns if column not in safety_issues_df.columns]

if missing_columns:
    print("Safety issues dataset is missing columns")
    # Drop the incomplete frame so later cells fail immediately instead of using it.
    del safety_issues_df

# Getting embeddings to be used for clustering

In [3]:
# Collect every pickled embeddings file sitting in the current working directory.
embeddings_files = []
for file_name in os.listdir():
    if file_name.endswith("embeddings.pkl"):
        embeddings_files.append(file_name)

# Map each embedding name (the file stem with "_embeddings" removed) to the
# object unpickled from that file.
all_embeddings = {}
for file_name in embeddings_files:
    stem, _extension = os.path.splitext(file_name)
    all_embeddings[stem.replace("_embeddings", "")] = pd.read_pickle(file_name)

# BERTopic models

I played around a bit manually, trying to find the best ones. However, the search space is just too large.

I have found out what I can tweak, along with ranges of reasonable values, and I am going to let it automatically go through and search them.

The list of things to tweak is:

- UMAP and the number of components and neighbors. This is the dimension reduction step
- HDBSCAN and the min_cluster_size. This is the clustering algorithm
- Whether it is merged from individual models or trained on all embeddings at once.
- The embeddings that it is trained on.

## Needed functions

In [4]:
# Representation model that asks OpenAI's chat API ("gpt-4-turbo") to label topics.
# NOTE(review): nr_docs=50 presumably caps the documents sent per topic — confirm against the BERTopic docs.
representation_settings = {
    "model": "gpt-4-turbo",
    "chat": True,
    "nr_docs": 50,
}
openai_base_representation_model = OpenAI(openai_client, **representation_settings)

# Baseline UMAP configuration used by the manual runs further down.
umap_settings = {
    "n_neighbors": 15,
    "n_components": 5,
    "min_dist": 0.0,
    "metric": 'cosine',
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

In [5]:
def column_to_2darray(column):
    """Stack a column of per-row sequences into a single NumPy array.

    Args:
        column: A pandas Series (anything exposing `to_numpy()`) whose cells
            are array-likes, e.g. one embedding vector per row.

    Returns:
        A NumPy array built from the per-row arrays, one row per cell.
    """
    per_row_arrays = [np.array(cell) for cell in column.to_numpy()]
    return np.array(per_row_arrays)

def runBERTopic(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model=None, reduce_outliers=True):
    """Fit a BERTopic model on `df` and return it with the per-document results.

    Args:
        df: DataFrame holding the documents and, optionally, their embeddings.
        docs_name: Name of the column containing the document text.
        embeddings_name: Name of the column containing one precomputed
            embedding per row, or None to let BERTopic embed the documents.
        representation_model: Passed straight through to BERTopic (may be None).
        umap_model: Dimensionality-reduction model passed to BERTopic.
        hdbscan_model: Clustering model passed to BERTopic. Defaults to None,
            which is forwarded as-is so BERTopic uses its own default.
        reduce_outliers: When True, reassign outliers with the
            "probabilities" strategy and refresh the topic representations.

    Returns:
        A `(topic_model, result_df)` tuple. `result_df` is a freshly indexed
        copy of `df` with a `topic` column plus one column per column of the
        probabilities returned by BERTopic.

    Note:
        For backward compatibility the `topic` column is also written onto the
        `df` that was passed in, as the previous implementation did.
    """
    # Hand BERTopic a plain list everywhere: the reduce/update calls already
    # used lists, and a Series with a non-default index is easy to misalign.
    documents = df[docs_name].to_list()

    topic_model = BERTopic(
        representation_model=representation_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=True)

    if embeddings_name is not None:
        topics, probs = topic_model.fit_transform(
            documents,
            column_to_2darray(df[embeddings_name]))
    else:
        topics, probs = topic_model.fit_transform(documents)

    if reduce_outliers:
        topics = topic_model.reduce_outliers(
            documents=documents,
            topics=topics,
            probabilities=probs,
            strategy="probabilities")

        topic_model.update_topics(
            documents,
            topics=topics,
            representation_model=representation_model)

    df['topic'] = topics

    # pd.DataFrame(probs) carries a 0..n-1 index, so the left-hand frame must
    # too; otherwise concat(axis=1) aligns on labels and scatters the rows.
    result_df = pd.concat([df.reset_index(drop=True), pd.DataFrame(probs)], axis=1)

    return topic_model, result_df

In [24]:
def assign_topics_and_probabilities(df, model):
    """Label every row of `df` with the topic predicted by an already fitted model.

    Args:
        df: DataFrame with `report_id`, `si`, `mode` and `si_embedding` columns.
        model: Fitted topic model exposing `transform(documents, embeddings)`
            that returns a `(topics, probabilities)` pair.

    Returns:
        A new DataFrame with the four input columns, a `topic` column and one
        column per column of the returned probabilities. `df` is not modified.
    """
    # reset_index returns a new frame, so adding `topic` below neither touches
    # the caller's data nor triggers SettingWithCopyWarning, and the rows line
    # up positionally with the probabilities frame.
    cleaned_df = df[['report_id', 'si', 'mode', 'si_embedding']].reset_index(drop=True)

    embeddings = column_to_2darray(cleaned_df['si_embedding'])

    documents = cleaned_df['si'].to_list()

    topics, probabilities = model.transform(documents, embeddings)

    cleaned_df['topic'] = topics

    return pd.concat([cleaned_df, pd.DataFrame(probabilities)], axis=1)

def run_merged_model(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, min_similarity):
    """Fit one BERTopic model per `mode`, merge them and label all rows with the result.

    Args:
        df: DataFrame with a `mode` column plus the document/embedding columns.
        docs_name: Name of the document text column.
        embeddings_name: Name of the precomputed embeddings column.
        representation_model: Forwarded to each per-mode model.
        umap_model: Forwarded to each per-mode model.
        hdbscan_model: Forwarded to each per-mode model.
        min_similarity: Forwarded to `BERTopic.merge_models`.

    Returns:
        A `(merged_model, merged_df)` tuple, where `merged_df` comes from
        `assign_topics_and_probabilities` applied to the full `df`.
    """
    grouped_by_mode = df.groupby('mode')

    per_mode_models = []
    for mode_key in grouped_by_mode.groups:
        mode_df = grouped_by_mode.get_group(mode_key).reset_index(drop=True)
        # Outlier reduction is switched off for the per-mode models.
        fitted_model, _ = runBERTopic(
            mode_df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, False)
        per_mode_models.append(fitted_model)

    merged_model = BERTopic.merge_models(per_mode_models, min_similarity=min_similarity)

    return merged_model, assign_topics_and_probabilities(df, merged_model)


def perform_hyper_parameter_search(embeddings_dfs, UMAP_models, HDBSCAN_models, merged_ranges, sample_size=100, random_state=None):
    """Fit BERTopic models over a (sampled) grid of hyper-parameter combinations.

    The grid is the product of the UMAP models, the clustering models, the
    embedding sets and the model type ('merged' or 'group'). 'merged' rows are
    further expanded over `merged_ranges['min_similarity']`.

    Args:
        embeddings_dfs: Dict mapping an embedding name to its DataFrame, which
            must have `si` and `si_embedding` columns (and `mode` for merged models).
        UMAP_models: Candidate dimensionality-reduction models.
        HDBSCAN_models: Candidate clustering models.
        merged_ranges: Dict with a 'min_similarity' list used for merged models.
        sample_size: Maximum number of combinations to run, chosen at random.
            None runs the whole grid. Defaults to 100, the previously
            hard-coded value.
        random_state: Seed forwarded to `DataFrame.sample` for a reproducible subset.

    Returns:
        DataFrame with one row per fitted combination; `model` holds the fitted
        model and `embedding_df` the dataframe labelled by that model.

    Side effects:
        Writes the intermediate results to 'bertopic_models.pkl'.
    """
    print("Performing hyper parameter search of BERTopic models...")

    model_types = ['merged', 'group']

    df = pd.DataFrame(
        list(product(UMAP_models, HDBSCAN_models, embeddings_dfs, model_types)),
        columns=['umap_model', 'hdbscan_model', 'embedding_type', 'model_type']
    )

    df['merged_min_similarity'] = df['model_type'].apply(lambda x: merged_ranges['min_similarity'] if x == 'merged' else None)
    # ignore_index keeps the row labels unique after the expansion.
    df = df.explode('merged_min_similarity', ignore_index=True)

    df['embedding_df'] = df['embedding_type'].apply(lambda x: embeddings_dfs[x])

    print(f"There are {len(df)} models to run.")

    # Only sample when the grid is larger than the budget: DataFrame.sample
    # raises if asked for more rows than exist.
    if sample_size is not None and len(df) > sample_size:
        df = df.sample(sample_size, random_state=random_state)
        print(f"Randomly sampled {len(df)} of them to run.")

    def run_model(row):
        """Fit the model described by one grid row and return (model, labelled df)."""
        if row['model_type'] == 'group':
            return runBERTopic(
                row['embedding_df'],
                'si',
                'si_embedding',
                None,
                row['umap_model'],
                row['hdbscan_model'],
                False)

        return run_merged_model(
            row['embedding_df'],
            'si',
            'si_embedding',
            None,
            row['umap_model'],
            row['hdbscan_model'],
            row['merged_min_similarity']
        )

    df['model'] = df.progress_apply(run_model, axis=1)

    # Checkpoint before unpacking so a later failure does not lose the fitted models.
    df.to_pickle('bertopic_models.pkl')

    # Each `model` cell is a (model, labelled_df) tuple; split it into two columns.
    df['embedding_df'] = df['model'].apply(lambda x: x[1])

    df['model'] = df['model'].apply(lambda x: x[0])

    return df

## Performing search

The search will be done using the `perform_hyper_parameter_search` function with the results saved in a pickle file.

In [21]:

# Candidate UMAP settings: n_neighbors in 3..5 crossed with n_components in 5, 7, ..., 49.
n_neighbors_options = range(3, 6)
n_components_options = range(5, 50, 2)

UMAP_models = [
    UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42)
    for n_neighbors, n_components in product(n_neighbors_options, n_components_options)
]

# Candidate clustering models: HDBSCAN with min_cluster_size 5, 10, 15 ...
HDBSCAN_models = [
    HDBSCAN(min_cluster_size=min_cluster_size, metric="euclidean", cluster_selection_method='eom', prediction_data=True)
    for min_cluster_size in range(5, 20, 5)
]

# ... and KMeans with 5, 9, 13, 17 clusters.
KMEANS_models = [
    KMeans(n_clusters=n_clusters, random_state=42)
    for n_clusters in range(5, 20, 4)
]

# Merge thresholds tried for merged models: 0.90, 0.92, ..., 0.98.
min_similarity_options = [percent / 100 for percent in range(90, 100, 2)]

results = perform_hyper_parameter_search(
    all_embeddings,
    UMAP_models,
    HDBSCAN_models + KMEANS_models,
    merged_ranges={'min_similarity': min_similarity_options},
)

results.to_pickle('hyper_parameter_search_results.pkl')

Performing hyper parameter search of BERTopic models...
There are 8694 models to run.


  0%|          | 0/8694 [00:00<?, ?it/s]

TypeError: 'NoneType' object is not subscriptable

## Parsing results

In [None]:
def get_stats(df):
    """Add summary statistics about each fitted model's topics to `df`.

    Args:
        df: DataFrame with a `model` column whose values expose
            `get_topic_info()`, returning a table with `Topic` and `Count` columns.

    Returns:
        The same `df` (modified in place) with these extra columns:
        `topic_membership_counts` (list of per-topic document counts),
        `topic_membership_counts_std` (standard deviation of those counts),
        `num_topics` (number of rows in the topic table, outlier row included)
        and `outlier_percent` (fraction of documents in topic -1).
    """
    def outlier_share(topic_info):
        # Outliers are the documents in topic -1. Look that row up explicitly:
        # a model without outliers has no -1 row, and the first row would then
        # be a real topic rather than the outlier bucket.
        total = topic_info['Count'].sum()
        if total == 0:
            return 0.0
        outliers = topic_info.loc[topic_info['Topic'] == -1, 'Count'].sum()
        return outliers / total

    # Ask each model for its topic table once and reuse it for every statistic.
    topic_infos = [model.get_topic_info() for model in df['model']]

    df['topic_membership_counts'] = pd.Series(
        [info['Count'].to_list() for info in topic_infos], index=df.index, dtype=object)
    df['topic_membership_counts_std'] = df['topic_membership_counts'].apply(lambda counts: np.std(counts))
    df['num_topics'] = df['topic_membership_counts'].apply(len)
    df['outlier_percent'] = pd.Series(
        [outlier_share(info) for info in topic_infos], index=df.index, dtype=float)

    return df

def topic_counts(df):
    """Summarise topic sizes and outlier shares for the individual, merged and group models.

    Args:
        df: DataFrame with `n_components`, `n_neighbors`, `individual_models`
            (a list of models per row), `merged_model` and `group_model`
            columns. Every model must expose `get_topic_info()` returning a
            table with `Topic` and `Count` columns.

    Returns:
        A copy of `df` with the summary columns placed first, followed by the
        remaining original columns. The summary columns are also added to the
        `df` that was passed in.
    """
    def summarise(model):
        # One get_topic_info() call per model; outliers are the topic -1 row,
        # which is absent when the model produced no outliers.
        info = model.get_topic_info()
        total = info['Count'].sum()
        outliers = info.loc[info['Topic'] == -1, 'Count'].sum()
        share = outliers / total if total else 0.0
        return info['Count'].to_list(), share

    individual = [[summarise(model) for model in models] for models in df['individual_models']]
    merged = [summarise(model) for model in df['merged_model']]
    group = [summarise(model) for model in df['group_model']]

    def as_column(values):
        # Build an object column aligned with df, one (possibly nested) value per row.
        return pd.Series(values, index=df.index, dtype=object)

    df['average_individual_topic_count'] = as_column(
        [round(np.mean([len(counts) for counts, _ in summaries]), 2) for summaries in individual]).astype(float)

    df['individual_topic_membership_counts'] = as_column(
        [[counts for counts, _ in summaries] for summaries in individual])

    # Each model's share is rounded to 2 d.p. before averaging, as before.
    df['average_individual_outliers'] = as_column(
        [round(np.mean([round(share, 2) for _, share in summaries]), 2) for summaries in individual]).astype(float)

    df['merged_topic_membership_counts'] = as_column([counts for counts, _ in merged])
    df['merged_outliers_percent'] = as_column([share for _, share in merged]).astype(float)

    df['group_topic_membership_counts'] = as_column([counts for counts, _ in group])
    df['group_outliers_percent'] = as_column([share for _, share in group]).astype(float)

    df['merged_topic_count'] = df['merged_topic_membership_counts'].apply(len)
    df['group_topic_count'] = df['group_topic_membership_counts'].apply(len)

    column_names = ['n_components', 'n_neighbors', 'individual_topic_membership_counts', 'average_individual_topic_count', 'average_individual_outliers', 'merged_topic_membership_counts', 'merged_topic_count', 'merged_outliers_percent', 'group_topic_membership_counts', 'group_topic_count', 'group_outliers_percent']

    # Summary columns first, then everything else in its existing order.
    remaining_columns = [column for column in df.columns if column not in column_names]

    return df[column_names + remaining_columns].copy()


In [None]:
# Add the per-model statistics columns from get_stats and display the result.
# NOTE(review): `results_with_topic_counts` is not defined in the visible cells —
# presumably it is the output of topic_counts; confirm before re-running.
results = get_stats(results_with_topic_counts)
results

In [None]:
results_with_topic_counts

Unnamed: 0,n_components,n_neighbors,individual_topic_membership_counts,average_individual_topic_count,average_individual_outliers,merged_topic_membership_counts,merged_topic_count,merged_outliers_percent,group_topic_membership_counts,group_topic_count,group_outliers_percent,individual_models,individual_df,group_model,group_df,embedding_model,merged_model,min_similarity,merged_df
0,3,3,"[[50, 34, 33, 18, 17, 17, 13, 12, 12], [15, 40...",7.33,0.18,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20,0.183929,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14,0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.90,report_id ...
1,3,3,"[[50, 34, 33, 18, 17, 17, 13, 12, 12], [15, 40...",7.33,0.18,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20,0.183929,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14,0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.91,report_id ...
2,3,3,"[[50, 34, 33, 18, 17, 17, 13, 12, 12], [15, 40...",7.33,0.18,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20,0.183929,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14,0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.92,report_id ...
3,3,3,"[[50, 34, 33, 18, 17, 17, 13, 12, 12], [15, 40...",7.33,0.18,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20,0.183929,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14,0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.93,report_id ...
4,3,3,"[[50, 34, 33, 18, 17, 17, 13, 12, 12], [15, 40...",7.33,0.18,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20,0.183929,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14,0.089286,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,openai,"BERTopic(calculate_probabilities=True, ctfidf_...",0.94,report_id ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,24,5,"[[55, 48, 42, 29, 19, 13], [41, 72, 44, 17, 11...",4.33,0.47,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12,0.171429,"[3, 209, 184, 152, 12]",5,0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.95,report_id ...
1976,24,5,"[[55, 48, 42, 29, 19, 13], [41, 72, 44, 17, 11...",4.33,0.47,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12,0.171429,"[3, 209, 184, 152, 12]",5,0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.96,report_id ...
1977,24,5,"[[55, 48, 42, 29, 19, 13], [41, 72, 44, 17, 11...",4.33,0.47,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12,0.171429,"[3, 209, 184, 152, 12]",5,0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.97,report_id ...
1978,24,5,"[[55, 48, 42, 29, 19, 13], [41, 72, 44, 17, 11...",4.33,0.47,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12,0.171429,"[3, 209, 184, 152, 12]",5,0.005357,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,gtelarge,"BERTopic(calculate_probabilities=True, ctfidf_...",0.98,report_id ...


Visualise all of the models by printing them out.

I will need to make the dataset longer by merging the common measures into singular columns.

There can be an outlier measure, a topic count measure and a third column for the model type: individual, merged or group.

In [None]:
# Get this longer dataset

def lengthen_counts(df):
    """Reshape the wide per-model columns into one row per model type.

    Every input row becomes three output rows — 'individual', 'merged' and
    'group' — with the matching model, dataframe, topic membership counts,
    topic count and outlier share stored in shared columns.

    Args:
        df: DataFrame with the individual/merged/group model, dataframe and
            statistics columns, plus a `min_similarity` column. The helper
            list columns are added to this frame in place.

    Returns:
        The lengthened DataFrame with `model_type` and a running `id` column;
        the wide source columns are dropped.
    """
    # Pair each model with its type label so the label survives the explode.
    df['model'] = pd.Series(
        [
            [('individual', individual), ('merged', merged), ('group', group)]
            for individual, merged, group in zip(
                df['individual_models'], df['merged_model'], df['group_model'])
        ],
        index=df.index,
        dtype=object,
    )

    # new column -> its sources, always in individual / merged / group order
    stacked_sources = {
        'df': ['individual_df', 'merged_df', 'group_df'],
        'topic_membership_counts': [
            'individual_topic_membership_counts',
            'merged_topic_membership_counts',
            'group_topic_membership_counts',
        ],
        'topic_counts': [
            'average_individual_topic_count',
            'merged_topic_count',
            'group_topic_count',
        ],
        'outliers_percent': [
            'average_individual_outliers',
            'merged_outliers_percent',
            'group_outliers_percent',
        ],
    }
    for target, sources in stacked_sources.items():
        per_row_values = [list(values) for values in zip(*(df[source] for source in sources))]
        df[target] = pd.Series(per_row_values, index=df.index, dtype=object)

    # One output row per entry of the three-element lists built above.
    list_columns = ['topic_counts', 'outliers_percent', 'model', 'df', 'topic_membership_counts']
    long_df = df.explode(list_columns, ignore_index=True)

    # The wide columns are now redundant.
    wide_columns = ['individual_topic_membership_counts', 'merged_topic_membership_counts', 'group_topic_membership_counts', 'average_individual_topic_count', 'merged_topic_count', 'group_topic_count', 'average_individual_outliers', 'merged_outliers_percent', 'group_outliers_percent', 'individual_models', 'merged_model', 'group_model', 'individual_df', 'merged_df', 'group_df']
    long_df = long_df.drop(columns=wide_columns)

    # Split the (label, model) pairs back into two columns.
    labelled_models = long_df['model']
    long_df['model_type'] = labelled_models.map(lambda pair: pair[0])
    long_df['model'] = labelled_models.map(lambda pair: pair[1])

    # Make all new value columns floats
    long_df = long_df.astype({'topic_counts': 'float', 'outliers_percent': 'float', 'min_similarity': 'float'})

    long_df['id'] = range(len(long_df))

    return long_df


# Build the long-format table (one row per model type) and display it.
# NOTE(review): `combined_topic_counts` is not defined in the visible cells —
# presumably it is the output of topic_counts; confirm before re-running.
lengthened_combined_topic_counts = lengthen_counts(combined_topic_counts)

lengthened_combined_topic_counts

Unnamed: 0,n_components,n_neighbors,embedding_model,min_similarity,model,df,topic_membership_counts,topic_counts,outliers_percent,model_type,id
0,3,3,openai,0.90,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"[[50, 34, 33, 18, 17, 17, 13, 12, 12], [15, 40...",7.33,0.180000,individual,0
1,3,3,openai,0.90,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20.00,0.183929,merged,1
2,3,3,openai,0.90,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14.00,0.089286,group,2
3,3,3,openai,0.91,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"[[50, 34, 33, 18, 17, 17, 13, 12, 12], [15, 40...",7.33,0.180000,individual,3
4,3,3,openai,0.91,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20.00,0.183929,merged,4
...,...,...,...,...,...,...,...,...,...,...,...
5935,24,5,gtelarge,0.98,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12.00,0.171429,merged,5935
5936,24,5,gtelarge,0.98,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[3, 209, 184, 152, 12]",5.00,0.005357,group,5936
5937,24,5,gtelarge,0.99,"[BERTopic(calculate_probabilities=True, ctfidf...",report_id ...,"[[55, 48, 42, 29, 19, 13], [41, 72, 44, 17, 11...",4.33,0.470000,individual,5937
5938,24,5,gtelarge,0.99,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12.00,0.171429,merged,5938


#### Interpret dataset

I have access to a few datasets at this point.

The most useful would be `lengthened_combined_topic_counts`.

In [None]:
def make_visualization(model, df, save = False, name = 'topic model visual'):
    """Plot the documents of a fitted topic model in two dimensions.

    Args:
        model: Fitted topic model exposing `visualize_documents`.
        df: DataFrame with `si` (document text) and `si_embedding` columns.
        save: When True, also write the figure as HTML into 'topic_visuals'.
        name: File name used inside 'topic_visuals' when saving. It is used
            as given, so include the '.html' extension if one is wanted.

    Returns:
        The figure returned by `model.visualize_documents`.
    """
    array_embeddings = column_to_2darray(df['si_embedding'])

    # Reduce to 2 components purely for plotting.
    reduced_array_embeddings = UMAP(n_neighbors=7, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    if save:
        # Create the output folder on first use; open() would otherwise fail.
        os.makedirs('topic_visuals', exist_ok=True)

        # Explicit UTF-8 so non-ASCII document text does not depend on the
        # platform's default encoding.
        with open(os.path.join('topic_visuals', name), 'w', encoding='utf-8') as f:
            visualization.write_html(f)

    return visualization


In [None]:
# Keep only the models trained on the whole dataset (merged and group).
# .copy() makes this an independent frame, so adding a column below neither
# triggers SettingWithCopyWarning nor risks writing into the source table.
full_models = lengthened_combined_topic_counts.loc[lengthened_combined_topic_counts['model_type'] != "individual"].copy()

# Spread of topic sizes: standard deviation of the per-topic document counts.
# (A coefficient-of-variation variant, std / mean, was tried here previously.)
full_models['topic_membership_std'] = full_models['topic_membership_counts'].apply(np.std)

full_models


Unnamed: 0,n_components,n_neighbors,embedding_model,min_similarity,model,df,topic_membership_counts,topic_counts,outliers_percent,model_type,id,topic_membership_std
1,3,3,openai,0.90,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20.0,0.183929,merged,1,21.904337
2,3,3,openai,0.90,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14.0,0.089286,group,2,29.529646
4,3,3,openai,0.91,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20.0,0.183929,merged,4,21.904337
5,3,3,openai,0.91,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[50, 130, 62, 55, 49, 40, 36, 27, 27, 26, 21, ...",14.0,0.089286,group,5,29.529646
7,3,3,openai,0.92,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[103, 34, 33, 18, 17, 17, 13, 12, 12, 40, 38, ...",20.0,0.183929,merged,7,21.904337
...,...,...,...,...,...,...,...,...,...,...,...,...
5933,24,5,gtelarge,0.97,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[3, 209, 184, 152, 12]",5.0,0.005357,group,5933,87.262821
5935,24,5,gtelarge,0.98,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12.0,0.171429,merged,5935,40.534622
5936,24,5,gtelarge,0.98,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[3, 209, 184, 152, 12]",5.0,0.005357,group,5936,87.262821
5938,24,5,gtelarge,0.99,"BERTopic(calculate_probabilities=True, ctfidf_...",report_id ...,"[96, 48, 42, 29, 19, 13, 72, 44, 17, 11, 153, 16]",12.0,0.171429,merged,5938,40.534622


In [None]:
# Scatter of the merged/group models with more than 10 topics: spread of topic
# sizes (x) against outlier share (y), coloured by model type and sized by
# topic count.
models_to_plot = full_models.query('topic_counts > 10 ')
hover_columns = ['id', 'n_components', 'n_neighbors', 'topic_membership_counts', 'min_similarity']

fig = px.scatter(
    models_to_plot,
    x='topic_membership_std',
    y='outliers_percent',
    color='model_type',
    size='topic_counts',
    hover_data=hover_columns,
)

# Fix the y-axis to 0–0.7 and hover on the nearest point.
fig.update_layout(yaxis={'range': [0, 0.7]}, hovermode='closest')

fig.show()

There are many that are good. It all depends on what constitutes a good topic model. I feel that one that has a small number of outliers as well as a decent number of topics is quite suitable.

Here are some that seem interesting:

In [None]:
# Pull out the candidate models with ids 419, 2512 and 2518, relabel their
# topics with the OpenAI representation model, then show (and save) the
# document map and the topic table for each.
rows = lengthened_combined_topic_counts.query('id in [419, 2512, 2518]')

for i, row in rows.iterrows():
    model = row['model']
    labelled_df = row['df']

    model.update_topics(
        labelled_df['si'].to_list(),
        representation_model=openai_base_representation_model,
    )

    figure = make_visualization(model, labelled_df, save=True, name=f"{i}.html")
    display(figure)
    display(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,36,-1_Aviation and Maritime Safety Compliance and...,[Aviation and Maritime Safety Compliance and C...,[The protection person did not follow the amen...
1,0,118,0_Maritime Safety and Resource Management Issues,[Maritime Safety and Resource Management Issues],[The safety management system on board the Cap...
2,1,70,1_Rail and Road Safety Issues with Level Cross...,[Rail and Road Safety Issues with Level Crossi...,[Level crossing assessments do not require the...
3,2,49,2_Helicopter Safety and Operational Challenges,[Helicopter Safety and Operational Challenges],"[Due to their unique main rotor design, during..."
4,3,48,3_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,[There was a low likelihood of the weather con...
5,4,46,4_Safety and Communication Failures in Train O...,[Safety and Communication Failures in Train Op...,[There are a number of reasonable measures tha...
6,5,37,5_KiwiRail Operational and Safety Compliance I...,[KiwiRail Operational and Safety Compliance Is...,[The New Zealand Rail Operating Rules and Proc...
7,6,30,6_Safety and Regulatory Oversight in Aviation ...,[Safety and Regulatory Oversight in Aviation a...,[The Easy Rider's life-raft was in current sur...
8,7,27,7_Rail Safety and Train Controller Challenges,[Rail Safety and Train Controller Challenges],[The train controller made an assumption about...
9,8,20,8_Aircraft Landing Gear and Hydraulic System F...,[Aircraft Landing Gear and Hydraulic System Fa...,[Had the pilots known that the nose landing ge...


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,43,-1_Transportation and Communication Safety Issues,[Transportation and Communication Safety Issues],
1,0,44,0_Helicopter Safety and Training Issues in New...,[Helicopter Safety and Training Issues in New ...,
2,1,41,1_Aviation Safety and Crew Management Issues,[Aviation Safety and Crew Management Issues],
3,2,27,2_Aviation Safety and Pilot Training Deficiencies,[Aviation Safety and Pilot Training Deficiencies],
4,3,25,3_Aviation Safety and Regulation Compliance Co...,[Aviation Safety and Regulation Compliance Con...,
5,4,23,4_Aircraft Maintenance and Safety Issues,[Aircraft Maintenance and Safety Issues],
6,5,19,5_Aviation Safety and Air Traffic Control Issues,[Aviation Safety and Air Traffic Control Issues],
7,6,17,6_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],
8,7,92,7_Rail Safety and Incident Management Concerns,[Rail Safety and Incident Management Concerns],
9,8,41,8_KiwiRail Safety and Operational Compliance I...,[KiwiRail Safety and Operational Compliance Is...,


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,43,-1_Transportation Safety and Regulatory Compli...,[Transportation Safety and Regulatory Complian...,
1,0,44,0_Helicopter Safety and Training Issues in New...,[Helicopter Safety and Training Issues in New ...,
2,1,30,1_Aircraft Landing Gear and Door Lock Failures,[Aircraft Landing Gear and Door Lock Failures],
3,2,27,2_Aviation Safety and Operational Standards Co...,[Aviation Safety and Operational Standards Com...,
4,3,25,3_Aviation Safety and Regulatory Oversight Con...,[Aviation Safety and Regulatory Oversight Conc...,
5,4,23,4_Aircraft Maintenance and Safety Issues,[Aircraft Maintenance and Safety Issues],
6,5,19,5_Aviation Safety and Air Traffic Control Issues,[Aviation Safety and Air Traffic Control Issues],
7,6,17,6_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,
8,7,92,7_Rail Safety and Incident Management,[Rail Safety and Incident Management],
9,8,41,8_KiwiRail Safety and Compliance Issues,[KiwiRail Safety and Compliance Issues],


I will have a look at 2512.

In [None]:
# Take one candidate row as a plain dict and inspect its model and labelled dataframe.
# NOTE(review): the variable name and the text above say 2512, but the query
# selects id 2518 (the table displayed below matches the 2518 model) — confirm
# which model was intended.
row_2512 = rows.query('id == 2518').to_dict(orient='records')[0]
model = row_2512['model']
df = row_2512['df']
display(model.get_topic_info())
# Get counts of each mode in each topic
# NOTE(review): check_mode_cluster_distribution is defined in a later cell, so
# that cell must be run first.
check_mode_cluster_distribution(df)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,43,-1_Transportation Safety and Regulatory Compli...,[Transportation Safety and Regulatory Complian...,
1,0,44,0_Helicopter Safety and Training Issues in New...,[Helicopter Safety and Training Issues in New ...,
2,1,30,1_Aircraft Landing Gear and Door Lock Failures,[Aircraft Landing Gear and Door Lock Failures],
3,2,27,2_Aviation Safety and Operational Standards Co...,[Aviation Safety and Operational Standards Com...,
4,3,25,3_Aviation Safety and Regulatory Oversight Con...,[Aviation Safety and Regulatory Oversight Conc...,
5,4,23,4_Aircraft Maintenance and Safety Issues,[Aircraft Maintenance and Safety Issues],
6,5,19,5_Aviation Safety and Air Traffic Control Issues,[Aviation Safety and Air Traffic Control Issues],
7,6,17,6_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,
8,7,92,7_Rail Safety and Incident Management,[Rail Safety and Incident Management],
9,8,41,8_KiwiRail Safety and Compliance Issues,[KiwiRail Safety and Compliance Issues],


mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,21.0,0.0,1.0
0,47.0,0.0,0.0
1,27.0,0.0,0.0
2,23.0,1.0,3.0
3,22.0,6.0,5.0
4,22.0,1.0,8.0
5,20.0,0.0,2.0
6,17.0,1.0,1.0
7,1.0,78.0,0.0
8,1.0,42.0,3.0


In [None]:
# Pull out the rows assigned to topic 4 for a closer look.
temp =df.query('topic == 4')
temp


Unnamed: 0,report_id,si,mode,si_embedding,topic,0,1,2,3,4,...,7,8,9,10,11,12,13,14,15,16
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.016823502257466316, 0.022679857909679413, ...",4,0.893405,0.91762,0.953968,0.91177,0.904218,...,0.897179,0.849041,0.833138,0.79291,0.866021,0.802648,0.874608,0.857946,0.842798,0.897435
5,2010_010,The damage to the seals within the extend/retr...,0,"[-0.004329497925937176, 0.01708492636680603, -...",4,0.903573,0.928427,0.958669,0.923614,0.924247,...,0.911834,0.868872,0.854043,0.814679,0.878737,0.821475,0.896696,0.877361,0.86548,0.916941
17,2013_011,The isolated nature of the component failure i...,0,"[0.001065625692717731, 0.008955495432019234, -...",4,0.941686,0.939444,0.964757,0.960327,0.950934,...,0.950651,0.913426,0.903133,0.854536,0.89153,0.868345,0.931704,0.91986,0.90594,0.963674
54,2015_003,Aircraft design organisations did not have to ...,0,"[0.007535106036812067, 0.007739271968603134, -...",4,0.917882,0.918329,0.933496,0.931414,0.931537,...,0.92837,0.870578,0.872986,0.835904,0.863864,0.836515,0.896232,0.888865,0.870901,0.918892
55,2013_002,The imported second-hand engine and its access...,0,"[-0.010757106356322765, 0.012414596974849701, ...",4,0.917506,0.908612,0.935704,0.93024,0.933529,...,0.930633,0.88506,0.885987,0.841761,0.877124,0.848257,0.908003,0.899193,0.884992,0.921444
58,2011_004,Improper repair and maintenance practices span...,0,"[0.003563880454748869, 0.019006341695785522, -...",4,0.909454,0.923402,0.95522,0.928902,0.935269,...,0.924511,0.884304,0.876499,0.828313,0.881772,0.834114,0.902099,0.882343,0.871363,0.934347
60,2011_004,Although much aircraft maintenance is done und...,0,"[0.007655046414583921, 0.011774522252380848, -...",4,0.89709,0.917003,0.947426,0.91271,0.921752,...,0.910624,0.865013,0.855885,0.803103,0.864783,0.809806,0.88433,0.86343,0.84835,0.914376
70,2016_004,The failed spark plug allowed hot combustion g...,0,"[-0.0030344142578542233, 0.017432235181331635,...",4,0.88139,0.920122,0.923509,0.911268,0.927392,...,0.892593,0.851179,0.837496,0.799548,0.851132,0.813332,0.888391,0.86439,0.847265,0.900185
71,2016_004,"The DENSO W24EMR-C spark plug, originally desi...",0,"[-0.00027844039141200483, 0.02180514670908451,...",4,0.874641,0.897013,0.915964,0.897551,0.918934,...,0.888283,0.850322,0.838308,0.792942,0.853308,0.804197,0.871598,0.851485,0.83578,0.897682
73,2016_004,There was a lack of clear communication and ag...,0,"[-0.004059033002704382, 0.017587538808584213, ...",4,0.878526,0.906731,0.922247,0.903231,0.92295,...,0.892195,0.847482,0.836082,0.786262,0.851466,0.796712,0.876447,0.857925,0.840816,0.901799


## Interpreting results

In [43]:
def check_mode_cluster_distribution(df):
    """Cross-tabulate topics against transport modes.

    Counts the non-null ``report_id`` values for every (topic, mode) pair
    found in ``df``.

    Args:
        df: DataFrame with ``topic``, ``mode`` and ``report_id`` columns.

    Returns:
        DataFrame indexed by ``topic`` with one column per ``mode``; pairs
        that never occur together hold 0 instead of NaN.
    """
    counts_by_topic_and_mode = df.pivot_table(
        values='report_id',
        index='topic',
        columns='mode',
        aggfunc='count',
    )
    # Combinations absent from the data come back as NaN; report them as 0.
    return counts_by_topic_and_mode.fillna(0)

## Manually looking at models

### Running it on all safety issues


I want to generate the safety themes from all of the safety issues I have available.

#### Simple minilm embeddings

This seems to have failed. I believe this is mainly because each document is really short.

In [None]:

# Baseline run on the full safety-issue table with outlier reduction switched
# off, so the size of the outlier group stays visible.
# NOTE(review): the third argument is None here, whereas the other runs pass
# 'si_embedding' — presumably this makes the model embed the text itself.
topic_model, _ = runBERTopic(
    safety_issues_df,
    'si',
    None,
    openai_base_representation_model,
    umap_model,
    reduce_outliers=False,
)

topic_model.get_topic_info()


There is a bit of a problem: the number of outliers is quite large.

I will try to merge the outliers into the topics.

In [None]:
# Same baseline run as before, but with outlier reduction switched on.
topic_model, _ = runBERTopic(
    safety_issues_df,
    'si',
    None,
    openai_base_representation_model,
    umap_model,
    reduce_outliers=True,
)

topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,166,0_Rail Safety and Operational Issues in New Ze...,[Rail Safety and Operational Issues in New Zea...,[The training that drivers received for transi...
1,1,64,1_Maritime Safety and Navigation Management Is...,[Maritime Safety and Navigation Management Iss...,[The voyage planning for the time in the Snare...
2,2,36,2_Maritime Safety and Regulations Compliance I...,[Maritime Safety and Regulations Compliance Is...,[The skipper did not have the requisite knowle...
3,3,53,3_Safety and Maintenance Issues in Engineering...,[Safety and Maintenance Issues in Engineering ...,[There was a lack of clear communication and a...
4,4,53,4_Maritime and Aviation Safety Management and ...,[Maritime and Aviation Safety Management and E...,[It could not be established why the chief off...
5,5,50,5_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],[Had the controllers realised that the low clo...
6,6,27,6_Robinson Helicopter Safety and Accident Anal...,[Robinson Helicopter Safety and Accident Analy...,"[Due to their unique main rotor design, during..."
7,7,62,7_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,[The standard of pilot training and the superv...
8,8,26,8_Aircraft Landing Gear and Door Lock Failures,[Aircraft Landing Gear and Door Lock Failures],[Had the pilots known that the nose landing ge...
9,9,23,9_Deficiencies in Safety and Regulatory Compli...,[Deficiencies in Safety and Regulatory Complia...,[There were no established procedures for ente...


The main problem here is that the distribution is not great. It seems that most of the rail issues are in the first topic, and then maritime and aviation take up the rest.

#### VoyageAI embeddings

In [None]:
# Cluster the VoyageAI frame, passing 'si' and 'si_embedding' as the text and
# embedding columns, and keep the second return value for the mode cross-tab.
topic_model, voyageai_clusters_df = runBERTopic(
    voyageai_embeddings,
    'si',
    'si_embedding',
    openai_base_representation_model,
    umap_model,
    reduce_outliers=True,
)

topic_model.get_topic_info()

NameError: name 'voyageai_embeddings' is not defined

In [None]:


# Count how many safety issues of each transport mode landed in each VoyageAI topic.
check_mode_cluster_distribution(voyageai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,204,8,167
1,2,177,2


This has created two topics, with one being aviation and maritime and the other being rail.

#### OpenAI embeddings

In [None]:
# Same configuration as the VoyageAI run, but on the OpenAI embeddings frame.
topic_model, openai_clusters_df = runBERTopic(
    openai_embeddings,
    'si',
    'si_embedding',
    openai_base_representation_model,
    umap_model,
    reduce_outliers=True,
)

# Only the topic sizes and labels are needed for a quick comparison.
topic_model.get_topic_info()[['Count', 'Name']]



Unnamed: 0,Count,Name
0,203,0_Aviation Safety and Compliance Issues
1,189,1_Rail Safety and Operational Issues in New Ze...
2,142,2_Maritime Safety and Navigation Management Flaws
3,26,3_Maritime Safety and Compliance Issues of the...


In [None]:
# Count how many safety issues of each transport mode landed in each OpenAI topic.
check_mode_cluster_distribution(openai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,191.0,4.0,8.0
1,5.0,181.0,3.0
2,4.0,0.0,138.0
3,6.0,0.0,20.0


This has also made a cleanish split between modes of transport. I can either try to force it not to do this and/or run the model on each mode and then merge the models.

In [None]:
# UMAP reducer with 4 neighbours and 5 output components; the seed is fixed
# so repeated runs are comparable.
umap_model_tweaked = UMAP(
    n_neighbors=4,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

topic_model, openai_clusters_tweaked_df = runBERTopic(
    openai_embeddings,
    'si',
    'si_embedding',
    openai_base_representation_model,
    umap_model_tweaked,
    reduce_outliers=True,
)

# Topic sizes and labels first, then the topic-by-mode breakdown as cell output.
display(topic_model.get_topic_info()[['Count', 'Name']])

check_mode_cluster_distribution(openai_clusters_tweaked_df)



Unnamed: 0,Count,Name
0,167,0_Rail Safety and Operational Failures
1,115,1_Maritime Safety and Resource Management Defi...
2,50,2_Safety and Compliance in Transport and Marit...
3,51,3_Aviation Safety and Regulatory Compliance Is...
4,41,4_Helicopter Safety and Operational Issues
5,52,5_Aviation Safety and Air Traffic Control Issues
6,27,6_Safety Issues in Rail Operations
7,30,7_Aircraft Landing Gear and Maintenance Issues
8,11,8_Aviation Safety Issues Related to Door Locki...
9,16,9_Safety and Maintenance Challenges in Maritim...


mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.0,157.0,7.0
1,7.0,0.0,108.0
2,15.0,2.0,33.0
3,47.0,0.0,4.0
4,40.0,1.0,0.0
5,51.0,0.0,1.0
6,2.0,25.0,0.0
7,30.0,0.0,0.0
8,11.0,0.0,0.0
9,0.0,0.0,16.0


I will try to tune the hyperparameters and see if I can get the right sort of safety themes.

In [None]:
# NOTE(review): these UMAP settings are identical to umap_model_tweaked
# (n_neighbors=4, n_components=5), so no hyperparameter actually changes here.
topic_model, openai_clusters_tuned_df = runBERTopic(
    openai_embeddings,
    'si',
    'si_embedding',
    openai_base_representation_model,
    UMAP(
        n_neighbors=4,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    ),
    reduce_outliers=True,
)

# Include the representative documents so the labels can be sanity-checked.
topic_model.get_topic_info()[['Count', 'Name', 'Representative_Docs']]



Unnamed: 0,Count,Name,Representative_Docs
0,167,0_Rail Safety and Operational Issues in New Ze...,[The training that drivers received for transi...
1,115,1_Maritime Safety and Resource Management Defi...,[The standard of passage planning on board the...
2,50,2_Maritime and Aviation Safety Regulations and...,[The absence of a visual indicator in the whee...
3,51,3_Aviation Safety and Regulatory Compliance Is...,[The operator's system for training its pilots...
4,41,4_Helicopter Safety and Maintenance Issues,"[Due to their unique main rotor design, during..."
5,52,5_Aviation Safety and Operational Procedures a...,[While ATC sequences an IFR aeroplane to land ...
6,27,6_Safety Issues and Management Deficiencies in...,[The train controller made an assumption about...
7,30,7_Aircraft Landing Gear and Maintenance Issues,[Had the pilots known that the nose landing ge...
8,11,8_Aviation Safety and Equipment Malfunction,"[The use of ""threat and error management"" (TEM..."
9,16,9_Maintenance and Risk Management in Marine Sa...,[A clear placard should be placed at the contr...


### Run cluster on just one mode

It would make sense that if the clustering is finding the transport modes then splitting into the modes first might help find the themes within each mode.

In [None]:
def printout_each_modes_topics(results) -> None:
    """Print the topic sizes and names of every model in ``results``.

    For each entry a ``Cluster names: `` header is printed, followed by one
    ``<count>, <name>`` line per topic.

    Args:
        results: Iterable of sequences whose first element exposes
            ``get_topic_info()`` returning a table with ``Name`` and
            ``Count`` columns (e.g. the tuples returned by ``runBERTopic``).
    """
    for res in results:
        # Build the topic table once per model instead of once per column.
        topic_info = res[0].get_topic_info()
        print("Cluster names: ")
        for name, count in zip(topic_info['Name'], topic_info['Count']):
            print(f"{count}, {name}")

#### VoyageAI

In [None]:
# One frame per transport mode code (0, 1, 2), each re-indexed from zero.
voyageai_modes_dfs = [
    voyageai_embeddings[voyageai_embeddings['mode'] == mode_code].reset_index(drop=True)
    for mode_code in range(3)
]

In [None]:
# Fit an independent topic model on each mode's VoyageAI frame.
results = [
    runBERTopic(
        df,
        'si',
        'si_embedding',
        openai_base_representation_model,
        umap_model,
    )
    for df in voyageai_modes_dfs
]

printout_each_modes_topics(results)



Cluster names: 
54, 0_Aviation Safety and Operational Procedures
34, 1_Safety Challenges and Risks in Robinson Helicopter Operations
62, 2_Aviation Safety and Regulatory Compliance Issues
25, 3_Aircraft Landing Gear and Door System Failures
31, 4_Aircraft Maintenance and Safety Concerns
Cluster names: 
166, 0_Rail Safety and Management Issues
19, 1_Safety and Regulatory Issues at Rail Level Crossings
Cluster names: 
28, 0_Maritime Safety and Bridge Resource Management Deficiencies
33, 1_Maritime Safety and Management Failures
43, 2_Maritime Safety and Navigation Standards Compliance
30, 3_Maritime Safety and Emergency Response Deficiencies
22, 4_Maritime Safety Violations and Consequences aboard the Easy Rider
13, 5_Propulsion System Failures and Maintenance Issues in Marine Operations


In [None]:
# Keep the second element of the third result for ad-hoc inspection.
# NOTE(review): presumably this is the clustered DataFrame for mode 2 —
# confirm against what runBERTopic returns.
checking = results[2][1]

In [None]:
# Combine the per-mode models into a single model, using a similarity
# threshold of 0.9 for the merge.
merged_moode_models = BERTopic.merge_models(
    [model_and_df[0] for model_and_df in results],
    min_similarity=0.9,
)

merged_moode_models.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,52,0_Aviation Safety and Air Traffic Control Proc...,[Aviation Safety and Air Traffic Control Proce...,
1,1,47,1_Helicopter Safety and Accident Analysis,[Helicopter Safety and Accident Analysis],
2,2,41,2_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],
3,3,33,3_Aircraft Safety and Maintenance Issues,[Aircraft Safety and Maintenance Issues],
4,4,17,4_Aviation Safety and Regulatory Compliance in...,[Aviation Safety and Regulatory Compliance in ...,
5,5,16,5_Safety and Regulatory Issues in New Zealand ...,[Safety and Regulatory Issues in New Zealand A...,
6,6,44,0_KiwiRail Safety and Compliance Issues,[KiwiRail Safety and Compliance Issues],
7,7,40,1_Rail Safety and Communication Failures,[Rail Safety and Communication Failures],
8,8,25,2_Safety and Management Issues in Rail Operations,[Safety and Management Issues in Rail Operations],
9,9,20,3_Safety and Regulatory Issues at Road-Rail Le...,[Safety and Regulatory Issues at Road-Rail Lev...,


#### OpenAI

In [None]:
# One frame per transport mode code (0, 1, 2), each re-indexed from zero.
openai_modes_dfs = [
    openai_embeddings[openai_embeddings['mode'] == mode_code].reset_index(drop=True)
    for mode_code in range(3)
]

# Render every per-mode frame as a quick visual check of the split.
for df in openai_modes_dfs:
    display(df)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
201,2015_001,Parachute drop pilots were not required to wea...,0,"[0.0253401268273592, -0.02335318550467491, -0...."
202,2011_006,The council had not evaluated the effects of t...,0,"[-0.02767498977482319, 0.01624125801026821, -0..."
203,2011_006,The standard of pilot training and the supervi...,0,"[0.015297695063054562, -0.018917182460427284, ..."
204,2011_006,The CAA had had recurring concerns for the man...,0,"[0.001043604570440948, 0.00177335599437356, 0...."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_106,No procedures were in place to direct train cr...,1,"[0.017140474170446396, 0.03509647026658058, -0..."
1,2013_107,"The high incidence of brake block replacement,...",1,"[-0.0018233972368761897, 0.020808950066566467,..."
2,2013_107,The visual inspection regime for wheel-bearing...,1,"[0.0025237964000552893, 0.027265744283795357, ..."
3,2013_107,"The RailBAM system, while operational, did not...",1,"[0.006221923511475325, 0.025432679802179337, -..."
4,2013_107,The lack of a dedicated RailBAM analyst positi...,1,"[-0.004680005367845297, 0.013756909407675266, ..."
...,...,...,...,...
180,2017_101,KiwiRail did not have a mature fatigue risk ma...,1,"[-0.006654317956417799, 0.029867829754948616, ..."
181,2017_101,The eProtect KMC module on board the locomotiv...,1,"[-0.003919209353625774, 0.022458476945757866, ..."
182,2017_104,Transdev had no policies or procedures in plac...,1,"[0.013986819423735142, 0.01571197435259819, -0..."
183,2020_104,Implementation of an administrative control me...,1,"[-0.013463953509926796, -0.007039009593427181,..."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_202,There is limited data to quantify the extent o...,2,"[-0.0015265028923749924, 0.013446947559714317,..."
1,2019_201,the operator's planned maintenance programme d...,2,"[0.03530280664563179, 0.027329862117767334, 0...."
2,2019_201,the operator's hazard identification system ha...,2,"[0.009704935364425182, 0.02645685337483883, 0...."
3,2019_204,The operator had not included predefined weath...,2,"[0.038087889552116394, 0.000508625409565866, 0..."
4,2019_204,The operator of the Henerata had not assessed ...,2,"[0.018176013603806496, 0.026440272107720375, 0..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:
# Fit an independent topic model on each mode's OpenAI frame.
results = [
    runBERTopic(
        df,
        'si',
        'si_embedding',
        openai_base_representation_model,
        umap_model,
    )
    for df in openai_modes_dfs
]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Operational Procedures Issues
42, 1_Aircraft Maintenance and Safety Issues
37, 2_Challenges and Safety Issues in Robinson Helicopter Operations
51, 3_Aviation Safety and Regulatory Oversight in New Zealand
30, 4_Aviation Safety and Emergency Response
Cluster names: 
49, 0_KiwiRail Safety and Compliance Issues
28, 1_Rail Safety and Inspection Inefficiencies
42, 2_Rail Safety and Communication Issues
27, 3_Safety and Oversight Concerns in Train Operations
21, 4_Road and Rail Safety at Level Crossings
18, 5_Risk Management and Safety Issues in Wellington Station Train Operations
Cluster names: 
150, 0_Maritime Safety and Crew Management Deficiencies
19, 1_Maritime Safety and Compliance Issues


Instead, I will try using no dimension reduction, or at least decreasing the amount of dimension reduction.

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

# BaseDimensionalityReduction takes the place of the UMAP model here, so the
# per-mode models cluster without a UMAP reduction step.
results = [
    runBERTopic(
        df,
        'si',
        'si_embedding',
        openai_base_representation_model,
        BaseDimensionalityReduction(),
    )
    for df in openai_modes_dfs
]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Air Traffic Management Issues
47, 1_Aircraft Maintenance and Safety Issues
41, 2_Safety and Training Issues in Robinson Helicopter Operations
53, 3_Aviation Safety and Compliance Issues
19, 4_Safety and Regulatory Oversight in Aviation and Parachuting Operations
Cluster names: 
47, 0_Issues in KiwiRail's Safety and Operational Procedures
62, 1_Rail Safety and Incident Analysis
29, 2_Rail Safety and Signal Management Issues in Wellington Station Approaches
20, 3_Safety Issues at Rail Level Crossings
27, 4_Safety and Risk Management in Rail Operations
Cluster names: 
149, 0_Maritime Safety and Resource Management Issues
20, 1_Maritime Safety Violations and the Sinking of the Easy Rider


This results in just one cluster for each, as the curse of dimensionality is prevalent here. I will instead try to tune the hyperparameters for OpenAI.

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

# The representation model is None for this run, and the recorded output
# shows raw keyword names (e.g. "0_the_to_of_and") rather than
# LLM-generated labels.
results = [
    runBERTopic(
        df,
        'si',
        'si_embedding',
        None,
        UMAP(
            n_neighbors=6,
            n_components=5,
            min_dist=0.0,
            metric='cosine',
            random_state=42,
        ),
    )
    for df in openai_modes_dfs
]

printout_each_modes_topics(results)



Cluster names: 
52, 0_the_to_of_and
47, 1_the_to_of_and
41, 2_the_of_to_and
33, 3_the_gear_landing_to
17, 4_the_to_for_water
16, 5_zealand_new_of_the
Cluster names: 
44, 0_the_to_work_of
40, 1_the_train_to_and
25, 2_train_of_the_and
20, 3_road_level_crossings_the
20, 4_the_brake_braking_conditions
15, 5_the_in_of_wellington
21, 6_the_rail_of_to
Cluster names: 
120, 0_the_of_and_to
27, 1_the_to_of_easy
11, 2_co2_the_could_be
11, 3_the_fish_crew_of


In [None]:
# Show only the size and label of each topic in the merged model.
# NOTE(review): the recorded output is a NameError, so the cell that builds
# merged_moode_models has to be run first in the same session.
merged_moode_models.get_topic_info()[['Count', "Name"]]

NameError: name 'merged_moode_models' is not defined


I have had a look at both single run and individual models.

I think the next step is to do some hyperparameter tuning.

As there are no noticeable differences between VoyageAI and OpenAI, I will go with the OpenAI embedding model.

# Visualization of themes and safety issues

Now that we have some models that seem reasonable, it is time to create a user friendly representation.

In [None]:
# Per-mode OpenAI frames (mode codes 0, 1, 2), each re-indexed from zero.
modes_dfs = [
    openai_embeddings[openai_embeddings['mode'] == mode_code].reset_index(drop=True)
    for mode_code in range(3)
]

# Stacked preview; each frame keeps its own 0-based index, so labels repeat.
pd.concat(modes_dfs)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:

# Stack the per-mode OpenAI frames into the single table to be plotted.
all_data = pd.concat(openai_modes_dfs)

# Saved model that the visualisation is drawn from.
topic_model = BERTopic.load("demo_merged_model")

make_visualization(topic_model, all_data)




In [None]:
# Load the saved per-mode models; the index in the path is the mode code.
demo_individual_models = [BERTopic.load(f"demo_individual_model_mode_{i}") for i in range(3)]

# Make sure the output folder exists so open() below cannot fail on a fresh checkout.
os.makedirs('topic_visuals', exist_ok=True)

for i, (model, df) in enumerate(zip(demo_individual_models, modes_dfs)):
    array_embeddings = column_to_2darray(df['si_embedding'])

    # Separate 2-component projection used only for plotting.
    reduced_array_embeddings = UMAP(n_neighbors=3, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    # Plotly emits UTF-8 HTML, so open the handle as UTF-8 instead of relying
    # on the platform default encoding.
    with open(os.path.join('topic_visuals', f'demo_individual_model_mode_{i}_visual.html'), 'w', encoding='utf-8') as f:
        visualization.write_html(f)

    display(visualization)



In [None]:
# Load the saved model trained on all modes together.
topic_model = BERTopic.load("demo_group_model")

all_data = pd.concat(modes_dfs)

array_embeddings = column_to_2darray(all_data['si_embedding'])

# Separate 2-component projection used only for plotting.
reduced_array_embeddings = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

visualization = topic_model.visualize_documents(all_data['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

# Make sure the output folder exists so open() below cannot fail on a fresh checkout.
os.makedirs('topic_visuals', exist_ok=True)

# Plotly emits UTF-8 HTML, so open the handle as UTF-8 instead of relying on
# the platform default encoding.
with open(os.path.join('topic_visuals', 'demo_group_model_visual.html'), 'w', encoding='utf-8') as f:
    visualization.write_html(f)

visualization

