# What

As establised in this [notebook](./safey_themes_from_safety_issues.ipynb). BERTopic seems to be the most promising method for generating safety themes from safety issues.

There are a few problems that need to be address.
- Lots of outliers
- only 3 topics being generated

## Modules

In [1]:
# local

# third parties

import yaml
import pandas as pd
import numpy as np

import plotly.express as px

from dotenv import load_dotenv

import voyageai
import openai

from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.representation import OpenAI
from cuml.cluster import HDBSCAN
# from cuml.metrics.cluster import silhouette_score
from cuml.manifold import UMAP

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from tqdm.auto import tqdm
tqdm.pandas()

import swifter

# builtin
import os
from itertools import product
import multiprocessing
from collections import namedtuple

openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))



# Getting safety issue data

In [2]:
safety_issues_df = pd.read_csv('safety_issues.csv')

# Confirm it has the right columns report_id, si and mode

if not safety_issues_df.columns.isin(['report_id', 'si', 'mode']).any():
    print("Safety issues dataset is missing columns")
    del safety_issues_df

# Getting embeddings to be used for clustering

In [3]:
def column_to_2darray(column):
    return np.array([np.array(x) for x in column.to_numpy()])


embeddings_files = [file for file in os.listdir() if file.endswith("embeddings.pkl")]

all_embeddings = {os.path.splitext(file)[0].replace("_embeddings", ""): pd.read_pickle(file) for file in embeddings_files}

embeddings_2darrays = {k: column_to_2darray(v['si_embedding']) for k, v in all_embeddings.items()}

In [4]:
all_embeddings = {k: v for k, v in all_embeddings.items() if k in ['openai', 'voyageai', 'voyageai_reccontext', 'voyageai_only_exact', 'gtelarge']}

embeddings_2darrays = {k: embeddings_2darrays[k] for k in all_embeddings.keys()}

# BERTopic models

I played around abit manually trying to find the best ones. However the search space is just too large.

I have found out what I can tweak but in ranges of reasonble values and going to let it automatically go through and searh for them.

The list of thigns to tweak is:

- UMAP and the number of components and neighbors. This is the dimension reduction step
- HDBSCAN and the min_cluster_size. This is the clusterting algorithm
- Whether it is merged from individual models or trained on all embeddings at once.
- The embeddings that it is trained on.

## Needed functions

In [5]:
prompt = """
I have the following safety issues in a topic: [DOCUMENTS] 

Can you please generate a short topic label that will be the title of this Safety issues.
Note these two definitions:
Safety issue - A safety factor that:
    • can reasonably be regarded as having the
    potential to adversely affect the safety of future
    operations, and
    • is characteristic of an organization, a system, or an
    operational environment at a specific point in time.
    Safety Issues are derived from safety factors classified
    either as Risk Controls or Organizational Influences.

Safety theme - Indication of recurring circumstances or causes, either across transport modes or over time. A safety theme may
    cover a single safety issue, or two or more related safety
    issues. 

Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""

openai_base_representation_model = OpenAI(
    openai_client,
    prompt = prompt,
    model="gpt-4o",
    chat=True,
    nr_docs = 50)

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [6]:

def runBERTopic(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, reduce_outliers=True, embeddings_array = None):

    topic_model = BERTopic(
        representation_model = representation_model,
        umap_model = umap_model,
        hdbscan_model = hdbscan_model,
        calculate_probabilities=True)

    if embeddings_name is not None:
        topics, probs = topic_model.fit_transform(
            df[docs_name],
            embeddings_array if not embeddings_array is None else column_to_2darray(df[embeddings_name]))
    else:
        topics, probs = topic_model.fit_transform(df[docs_name])
        
    if reduce_outliers:
        topics = topic_model.reduce_outliers(
            documents=df[docs_name].to_list(),
            topics=topics, 
            probabilities=probs,
            strategy="probabilities")

        topic_model.update_topics(
            df[docs_name].to_list(),
            topics=topics,
            representation_model=representation_model)
        
    df['topic'] = topics
    
    df = pd.concat([df, pd.DataFrame(probs)], axis=1)

    return topic_model, df

def assign_topics_and_probabilities(df, model, embeddings):

    cleaned_df = df[['report_id', 'si', 'mode', 'si_embedding']]

    documents = cleaned_df['si'].to_list()

    transform = model.transform(documents, embeddings)

    cleaned_df['topic'] = transform[0]

    return pd.concat([cleaned_df.reset_index(drop=True), pd.DataFrame(transform[1])], axis=1)

def run_merged_model(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, min_similarity, embeddings_array):
    mode_groups = df.groupby('mode')

    modes_dfs = [mode_groups.get_group(x).reset_index(drop=True) for x in mode_groups.groups]

    mode_arrays = [embeddings_array[rows] for i, rows in mode_groups.groups.items()]

    try:
        umap_model.n_components = min(umap_model.n_components, min([arr.shape[0] for arr in mode_arrays])-1)
    except:
        print("base model")


    models = [runBERTopic(df, docs_name, embeddings_name, representation_model, umap_model, hdbscan_model, False, array)[0] for df, array in zip(modes_dfs, mode_arrays)]

    merged_model = BERTopic.merge_models(models, min_similarity=min_similarity)

    merged_df = assign_topics_and_probabilities(df, merged_model, embeddings_array)

    return merged_model, merged_df

def add_config_columns(df):

    def get_cluster_config(x):
        if isinstance(x, HDBSCAN):
            return (x.min_cluster_size)
        elif isinstance(x, KMeans):
            return (x.n_clusters)

    df['cluster_config'] = df['hdbscan_model'].apply(get_cluster_config)

    def get_dimension_reduction_config(x):
        if isinstance(x, UMAP):    
            return (x.n_components, x.n_neighbors)
        else:
            return (None)

    df['dimension_reduction_config'] = df['umap_model'].apply(get_dimension_reduction_config)   

    return df

def hyper_parameter_search(embeddings_arrays, embeddings_dfs, UMAP_models, HDBSCAN_models, merged_ranges, current_df = None):
    print("Performing hyper parameter search of BERTopic models...")

    ###
    ### Get model arguements ready into a dataframe
    ###

    model_types = ['merged', 'group']

    df = pd.DataFrame(
        list(product(UMAP_models, HDBSCAN_models, embeddings_dfs, model_types)),
        columns=['umap_model', 'hdbscan_model', 'embedding_type', 'model_type']
    )

    df = add_config_columns(df)

    df['merged_min_similarity'] = df['model_type'].apply(lambda x: merged_ranges['min_similarity'] if x == 'merged' else None)
    df = df.explode('merged_min_similarity', ignore_index=True)
    
    df['embedding_2darray'] = df['embedding_type'].apply(lambda x: embeddings_arrays[x])
    df['embedding_df'] = df['embedding_type'].apply(lambda x: embeddings_dfs[x])

    print(f"There are {df.shape[0]} models to run with given aruements")

    ###
    ### Compare arguments dataframe with existing results df and see what rows have already been calculated.
    ###

    # Find rows that are in df but not in current_df these are the new_rows that need to be computed

    new_rows = df.merge(current_df,
                         on=['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity'],
                         how='left', indicator=True, suffixes=(None, "_to_delete")).query('_merge == "left_only"').drop(columns=['_merge'])
    # Delete all columns that are full of NaN
    new_rows = new_rows.dropna(axis=1, how='all')

    if new_rows.shape[0] == 0:
        print("No new models to run")
        return current_df

    ### 
    ### Run model ###
    ###

    print(f"Only {new_rows.shape[0]} new models to run")

    def run_model(row): 
        if row['model_type'] == 'group':
            return runBERTopic(
                row['embedding_df'],
                'si',
                'si_embedding',
                None,
                row['umap_model'],
                row['hdbscan_model'],
                False,
                row['embedding_2darray']) 
        else:
            return run_merged_model(
                row['embedding_df'],
                'si',
                'si_embedding',
                None,
                row['umap_model'],
                row['hdbscan_model'],
                row['merged_min_similarity'],
                row['embedding_2darray']
            )

    new_rows['model'] = new_rows.progress_apply(run_model, axis=1)

    new_rows.to_pickle('bertopic_models_temp.pkl')
    
    new_rows['embedding_df'] = new_rows['model'].apply(lambda x: x[1])

    new_rows['model'] = new_rows['model'].apply(lambda x: x[0])

    df = pd.concat([current_df, new_rows], ignore_index=True)    

    return df

## Performing search

The search will be done using the `perform_hyper_parameter_search` function with the results saved ina pickle file.

In [7]:
calculated_results = pd.read_pickle('hyper_parameter_search_results.pkl')

In [8]:
calculated_results = add_config_columns(calculated_results)
calculated_results

Unnamed: 0,umap_model,hdbscan_model,embedding_type,model_type,merged_min_similarity,embedding_df,model,cluster_config,dimension_reduction_config,embedding_2darray
0,UMAP(),HDBSCAN(),openai,merged,0.9,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -..."
1,UMAP(),HDBSCAN(),openai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -..."
2,UMAP(),HDBSCAN(),openai,merged,0.94,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -..."
3,UMAP(),HDBSCAN(),openai,merged,0.96,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -..."
4,UMAP(),HDBSCAN(),openai,merged,0.98,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -..."
...,...,...,...,...,...,...,...,...,...,...
15031,<bertopic.dimensionality._base.BaseDimensional...,HDBSCAN(),voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",15,,"[[0.0030111961532384157, 0.006824926473200321,..."
15032,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=5, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,,"[[0.0030111961532384157, 0.006824926473200321,..."
15033,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=9, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",9,,"[[0.0030111961532384157, 0.006824926473200321,..."
15034,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,,"[[0.0030111961532384157, 0.006824926473200321,..."


In [9]:
UMAP_models = [
    UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42)
    for n_neighbors, n_components in
    product(range(3,6), range(5,47, 2))
]+ [BaseDimensionalityReduction()]

HDBSCAN_models = [
    HDBSCAN(min_cluster_size=min_cluster_size, metric="euclidean", cluster_selection_method='eom', prediction_data=True)
    for min_cluster_size in
    range(5,20,5)
]

KMEANS_models = [
    KMeans(n_clusters=n_clusters, random_state=42)
    for n_clusters in
    range(5,20,4)
]


results = hyper_parameter_search(
    embeddings_2darrays,
    all_embeddings,
    UMAP_models,
    HDBSCAN_models + KMEANS_models,
    merged_ranges = {'min_similarity': [e / 100 for e in list(range(90,100, 2))]},
    current_df = calculated_results
)

results.to_pickle('hyper_parameter_search_results.pkl')

results

del calculated_results



Performing hyper parameter search of BERTopic models...
There are 13440 models to run with given aruements
No new models to run


## Parsing results

All of the models dont have any metrics to them. But topic models in general do have ways of measuring their quality in various ways. Using something like https://github.com/MIND-Lab/OCTIS would help use the various evalulation tools.

There are certain metrics I could look at and these are: https://github.com/MIND-Lab/OCTIS#available-metrics

Here are the metrics I will look at.

| metric | description |
| ------ | ----------- |
| outliers_percent | This measures how many outliers are found when doing the clustering. It is only relevant when working with HDBSCAN as other clutsering methods don't identify outliers. |
| num_topics | It is important that there are a reasonable amount of topics created. This reasonable number is somewhere around 15. |
| topic_membership_counts_std | There is a problem of having a few themes that have most of the issues and then some reall small ones. This standard deviation combined with num_topics can help have a uniformly distributed amount of issues. | 

### DIY

These are my initial attempts at my own metrics

In [10]:
def get_stats(df):

    df['topic_membership_counts'] = df['model'].apply(lambda x: x.get_topic_info()['Count'].to_list())
    df['topic_membership_counts_std'] = df['topic_membership_counts'].apply(np.std)
    df['num_topics'] = df['topic_membership_counts'].apply(len)
    # Count percent of issues that have topic as -1 in embedding_df
    def temp(x):
        try:
            return (x['topic'] == -1).mean() * 100
        except:
            display(x)
    df['outlier_percent'] = df['embedding_df'].apply(temp)

    df['cluster_model_type'] = df['hdbscan_model'].apply(lambda x: "HDBSCAN" if isinstance(x, HDBSCAN) else "Kmeans")
    df['dimmension_reduction_type'] = df['umap_model'].apply(lambda x: "UMAP" if isinstance(x, UMAP) else "Base")

    df['id'] = df.index

    return df

In [12]:
results = get_stats(results)
results

Unnamed: 0,umap_model,hdbscan_model,embedding_type,model_type,merged_min_similarity,embedding_df,model,cluster_config,dimension_reduction_config,embedding_2darray,topic_membership_counts,topic_membership_counts_std,num_topics,outlier_percent,cluster_model_type,dimmension_reduction_type,id
0,UMAP(),HDBSCAN(),openai,merged,0.9,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[61, 59, 20, 17, 13, 12, 11, 9, 8, 7, 7, 7, 6,...",26.438944,26,3.214286,HDBSCAN,UMAP,0
1,UMAP(),HDBSCAN(),openai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[36, 97, 93, 11, 39, 25, 21, 16, 14, 12, 12, 1...",36.469924,17,1.607143,HDBSCAN,UMAP,1
2,UMAP(),HDBSCAN(),openai,merged,0.94,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[61, 59, 20, 17, 13, 12, 11, 9, 8, 7, 7, 7, 6,...",26.438944,26,3.214286,HDBSCAN,UMAP,2
3,UMAP(),HDBSCAN(),openai,merged,0.96,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[55, 189, 10, 6, 39, 25, 21, 16, 14, 12, 12, 1...",40.730654,19,0.357143,HDBSCAN,UMAP,3
4,UMAP(),HDBSCAN(),openai,merged,0.98,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[66, 32, 28, 27, 19, 14, 11, 9, 9, 8, 7, 7, 39...",26.026141,25,5.000000,HDBSCAN,UMAP,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15031,<bertopic.dimensionality._base.BaseDimensional...,HDBSCAN(),voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",15,,"[[0.0030111961532384157, 0.006824926473200321,...","[117, 50, 21]",40.202266,3,62.234043,HDBSCAN,Base,15031
15032,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=5, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,,"[[0.0030111961532384157, 0.006824926473200321,...","[56, 42, 39, 27, 24]",11.464729,5,0.000000,Kmeans,Base,15032
15033,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=9, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",9,,"[[0.0030111961532384157, 0.006824926473200321,...","[31, 27, 26, 22, 20, 17, 17, 15, 13]",5.724110,9,0.000000,Kmeans,Base,15033
15034,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,,"[[0.0030111961532384157, 0.006824926473200321,...","[29, 22, 17, 16, 16, 14, 13, 12, 11, 11, 11, 9...",5.610641,13,0.000000,Kmeans,Base,15034


### Topic Cohernece

This is the measure of much each issue is like the other issues within a topic.

As per recommendation from the creater of BERTopic I will use NPMI.

Given that OCTIS is not being actively maintained I will have to ahve a look at gensim

#### Solihouette score

This is the measure of much safety issues are like the cluster compared to other clusters. The maker of BERTopic advises against this but provides a simple implementation here  https://github.com/MaartenGr/BERTopic/issues/428#issuecomment-1027647827

Becuase it is taking so long for these too be computed I am going to filer out the undesirable ones.

In [21]:
silhouette_filtered = results.query('num_topics > 5 & num_topics < 25 & outlier_percent <= 10')

silhouette_filtered

Unnamed: 0,umap_model,hdbscan_model,embedding_type,model_type,merged_min_similarity,embedding_df,model,cluster_config,dimension_reduction_config,embedding_2darray,topic_membership_counts,topic_membership_counts_std,num_topics,outlier_percent,cluster_model_type,dimmension_reduction_type,id
1,UMAP(),HDBSCAN(),openai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[36, 97, 93, 11, 39, 25, 21, 16, 14, 12, 12, 1...",36.469924,17,1.607143,HDBSCAN,UMAP,1
3,UMAP(),HDBSCAN(),openai,merged,0.96,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[55, 189, 10, 6, 39, 25, 21, 16, 14, 12, 12, 1...",40.730654,19,0.357143,HDBSCAN,UMAP,3
6,UMAP(),HDBSCAN(),voyageai,merged,0.9,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 203, 22, 18, 16, 21, 10, 9, 76, 7, 2...",55.305392,13,4.464286,HDBSCAN,UMAP,6
7,UMAP(),HDBSCAN(),voyageai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 185, 22, 18, 16, 21, 10, 9, 76, 7, 6...",48.268779,16,4.464286,HDBSCAN,UMAP,7
8,UMAP(),HDBSCAN(),voyageai,merged,0.94,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 46, 22, 18, 16, 21, 10, 9, 37, 7, 6,...",28.783814,21,3.392857,HDBSCAN,UMAP,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15027,UMAP(),"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,"(45, 5)","[[0.0030111961532384157, 0.006824926473200321,...","[45, 40, 31, 29, 22, 14, 1, 1, 1, 1, 1, 1, 1]",16.160804,13,0.000000,Kmeans,UMAP,15027
15028,UMAP(),"KMeans(n_clusters=17, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",17,"(45, 5)","[[0.0030111961532384157, 0.006824926473200321,...","[30, 29, 20, 19, 18, 16, 14, 13, 12, 7, 4, 1, ...",9.649569,17,0.000000,Kmeans,UMAP,15028
15033,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=9, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",9,,"[[0.0030111961532384157, 0.006824926473200321,...","[31, 27, 26, 22, 20, 17, 17, 15, 13]",5.724110,9,0.000000,Kmeans,Base,15033
15034,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,,"[[0.0030111961532384157, 0.006824926473200321,...","[29, 22, 17, 16, 16, 14, 13, 12, 11, 11, 11, 9...",5.610641,13,0.000000,Kmeans,Base,15034


In [None]:
## This was here as there was problems with the IDS no matching.
def update_ids(df, df_to_be_updated):
    subset_columns = ['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity', 'cluster_model_type', 'dimmension_reduction_type']  # Specify the columns you want to use for matching

    df = df[subset_columns+["id"]]
    df_to_be_updated = df_to_be_updated[subset_columns+["id"]]

    # Merge the DataFrames based on the subset of columns
    merged_results = pd.merge(df, df_to_be_updated, on=subset_columns, how='inner')
    merged_results

    # Find ones where the id mismtaches?
    mismatched_ids = merged_results[merged_results['id_x'] != merged_results['id_y']]

    display(mismatched_ids.query('id_x == 4969 | id_y == 4969'))

    id_mapping = dict(zip(df.apply(lambda row: tuple(row[subset_columns]), axis=1), df['id']))
    
    df_to_be_updated['id'] = df_to_be_updated.apply(lambda row: id_mapping.get(tuple(row[subset_columns]), row['id']), axis=1)

    return df_to_be_updated['id']
    
display(silhouette_filtered.query('id == 4969'))
display(loaded_results.query('id in [4969, 10835]'))

loaded_results['id'] = update_ids(silhouette_filtered, loaded_results)

loaded_results.query('id in [4969, 10835]')



In [22]:
def get_accepted_indicies(df):
    df['accepted_indicies'] = df.progress_apply(lambda row: [index for index, topic in enumerate(row['embedding_df']['topic'].tolist()) if topic != -1], axis = 1)

    return df

def silhouette_score_row(row):
    if len(row['accepted_indicies']) < 5:
        return "No accepted_indicies"
    
    X = row['reduced_embeddings']

    labels = row['embedding_df']['topic'][row['accepted_indicies']]

    if len(labels.unique()) < 2:
        return "Too few topics"

    print(row)

    return silhouette_score(X, labels)

def add_silhouette_score(df, preivously_calculated):
    if preivously_calculated is None:
        new_rows = df
    else:
        new_rows = df.merge(preivously_calculated,
                            on=['cluster_config', 'dimension_reduction_config', 'embedding_type', 'model_type', 'merged_min_similarity', 'cluster_model_type', 'dimmension_reduction_type'],
                            how='left', indicator=True, suffixes=(None, "_to_delete")).query('_merge == "left_only"').drop(columns=['_merge'])
        # Delete all columns that are full of NaN
        new_rows = new_rows.dropna(axis=1, how='all')

    print(f"Total {df.shape[0]} rows needing silhouette scores")
    if new_rows.shape[0] == 0:
        print("No new silheuttes to run")
        return preivously_calculated
    else:
        print(f"Only {new_rows.shape[0]} new silhuettes to run")

    new_rows = get_accepted_indicies(new_rows)

    new_rows['reduced_embeddings'] = new_rows.progress_apply(
        lambda row: 
        row['embedding_2darray'] if not isinstance(row['umap_model'], UMAP) else row['umap_model'].fit_transform(row['embedding_2darray'])[row['accepted_indicies']]
        if len(row['accepted_indicies']) >= 5 else
        "Too few accepted"
        ,axis = 1
    )

    
    new_rows['silhouette_score'] = new_rows.progress_apply(silhouette_score_row, axis = 1)

    if preivously_calculated is None:
        return new_rows
    else:
        return pd.concat([preivously_calculated, new_rows], ignore_index=True)


try:
    loaded_results = pd.read_pickle('silhouette_scores.pkl')
except FileNotFoundError as e:
    print(e)
    loaded_results = None


calculated_results = add_silhouette_score(silhouette_filtered, loaded_results)

calculated_results.to_pickle('silhouette_scores.pkl')

calculated_results

Total 6729 rows needing silhouette scores
Only 1242 new silhuettes to run


  0%|          | 0/1242 [00:00<?, ?it/s]

  0%|          | 0/1242 [00:00<?, ?it/s]

  0%|          | 0/1242 [00:00<?, ?it/s]

umap_model                                                                UMAP()
hdbscan_model                                                          HDBSCAN()
embedding_type                                               voyageai_only_exact
model_type                                                                merged
merged_min_similarity                                                       0.96
embedding_df                       report_id                                 ...
model                          BERTopic(calculate_probabilities=True, ctfidf_...
cluster_config                                                                 5
dimension_reduction_config                                                (5, 3)
embedding_2darray              [[0.0030111961532384157, 0.006824926473200321,...
topic_membership_counts                             [21, 50, 12, 39, 24, 11, 31]
topic_membership_counts_std                                            13.173876
num_topics                  

Unnamed: 0,umap_model,hdbscan_model,embedding_type,model_type,merged_min_similarity,embedding_df,model,cluster_config,dimension_reduction_config,embedding_2darray,topic_membership_counts,topic_membership_counts_std,num_topics,outlier_percent,id,accepted_indicies,reduced_embeddings,silhouette_score,cluster_model_type,dimmension_reduction_type
0,UMAP(),HDBSCAN(),openai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[36, 97, 93, 11, 39, 25, 21, 16, 14, 12, 12, 1...",36.469924,17,1.607143,1,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14...","[[-2.5475276, -2.985186, -0.879056, -0.0050687...",0.161992,HDBSCAN,UMAP
1,UMAP(),HDBSCAN(),openai,merged,0.96,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[55, 189, 10, 6, 39, 25, 21, 16, 14, 12, 12, 1...",40.730654,19,0.357143,3,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-2.449398, -3.0897288, -0.9104028, 0.1285584...",0.077534,HDBSCAN,UMAP
2,UMAP(),HDBSCAN(),voyageai,merged,0.9,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 203, 22, 18, 16, 21, 10, 9, 76, 7, 2...",55.305392,13,4.464286,6,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-3.759121, 3.431325, 1.8554363, -1.8523803, ...",-0.028394,HDBSCAN,UMAP
3,UMAP(),HDBSCAN(),voyageai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 185, 22, 18, 16, 21, 10, 9, 76, 7, 6...",48.268779,16,4.464286,7,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-3.9462743, 3.2983723, 2.3500576, -1.100832,...",-0.012372,HDBSCAN,UMAP
4,UMAP(),HDBSCAN(),voyageai,merged,0.94,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 46, 22, 18, 16, 21, 10, 9, 37, 7, 6,...",28.783814,21,3.392857,8,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-4.651314, 3.0769386, 1.6765718, -1.3375773,...",0.030043,HDBSCAN,UMAP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6724,UMAP(),"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,"(45, 5)","[[0.0030111961532384157, 0.006824926473200321,...","[45, 40, 31, 29, 22, 14, 1, 1, 1, 1, 1, 1, 1]",16.160804,13,0.000000,15027,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-0.07859349, -0.45278025, 0.13109803, 0.3208...",-0.003320,Kmeans,UMAP
6725,UMAP(),"KMeans(n_clusters=17, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",17,"(45, 5)","[[0.0030111961532384157, 0.006824926473200321,...","[30, 29, 20, 19, 18, 16, 14, 13, 12, 7, 4, 1, ...",9.649569,17,0.000000,15028,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-0.024328232, -0.24872625, -0.1145494, -0.19...",-0.042915,Kmeans,UMAP
6726,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=9, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",9,,"[[0.0030111961532384157, 0.006824926473200321,...","[31, 27, 26, 22, 20, 17, 17, 15, 13]",5.724110,9,0.000000,15033,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0.0030111961532384157, 0.006824926473200321,...",0.088080,Kmeans,Base
6727,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,,"[[0.0030111961532384157, 0.006824926473200321,...","[29, 22, 17, 16, 16, 14, 13, 12, 11, 11, 11, 9...",5.610641,13,0.000000,15034,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0.0030111961532384157, 0.006824926473200321,...",0.069965,Kmeans,Base


## Interpreting results

The goal here is going to be deciding what out of the thousands of models are the bests ones.

There are going to be a few ways that I come at the notion of "Useful accurate theme generation" and these were generated in the [Previous section](#parsing-results)

In [13]:
def check_mode_cluster_distribution(df):
    safety_issues_df_topic_mode = df.pivot_table(index='topic', columns='mode', values='report_id', aggfunc='count').fillna(0)
    return safety_issues_df_topic_mode

In [14]:
def make_visualization(model, df, save = False, name = 'topic model visual'):

    array_embeddings = column_to_2darray(df['si_embedding'])

    reduced_array_embeddings = UMAP(n_neighbors=7, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    if save:

        with open(os.path.join('topic_visuals', name), 'w') as f:
            visualization.write_html(f)

    return visualization


This is where I will provide infomration on decisions on what I have done:

- Kmeans seems to struggle to make meaning full clusters atleast from the topic representation. I looked at these models 6480, 6116, 6747 but have removed them from the running.
- 

In [24]:
calculated_results

Unnamed: 0,umap_model,hdbscan_model,embedding_type,model_type,merged_min_similarity,embedding_df,model,cluster_config,dimension_reduction_config,embedding_2darray,topic_membership_counts,topic_membership_counts_std,num_topics,outlier_percent,id,accepted_indicies,reduced_embeddings,silhouette_score,cluster_model_type,dimmension_reduction_type
0,UMAP(),HDBSCAN(),openai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[36, 97, 93, 11, 39, 25, 21, 16, 14, 12, 12, 1...",36.469924,17,1.607143,1,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14...","[[-2.5475276, -2.985186, -0.879056, -0.0050687...",0.161992,HDBSCAN,UMAP
1,UMAP(),HDBSCAN(),openai,merged,0.96,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.017140474170446396, 0.03509647026658058, -...","[55, 189, 10, 6, 39, 25, 21, 16, 14, 12, 12, 1...",40.730654,19,0.357143,3,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-2.449398, -3.0897288, -0.9104028, 0.1285584...",0.077534,HDBSCAN,UMAP
2,UMAP(),HDBSCAN(),voyageai,merged,0.9,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 203, 22, 18, 16, 21, 10, 9, 76, 7, 2...",55.305392,13,4.464286,6,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-3.759121, 3.431325, 1.8554363, -1.8523803, ...",-0.028394,HDBSCAN,UMAP
3,UMAP(),HDBSCAN(),voyageai,merged,0.92,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 185, 22, 18, 16, 21, 10, 9, 76, 7, 6...",48.268779,16,4.464286,7,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-3.9462743, 3.2983723, 2.3500576, -1.100832,...",-0.012372,HDBSCAN,UMAP
4,UMAP(),HDBSCAN(),voyageai,merged,0.94,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",5,"(5, 3)","[[0.00557063240557909, 0.008644572459161282, -...","[116, 34, 46, 22, 18, 16, 21, 10, 9, 37, 7, 6,...",28.783814,21,3.392857,8,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-4.651314, 3.0769386, 1.6765718, -1.3375773,...",0.030043,HDBSCAN,UMAP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6724,UMAP(),"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,"(45, 5)","[[0.0030111961532384157, 0.006824926473200321,...","[45, 40, 31, 29, 22, 14, 1, 1, 1, 1, 1, 1, 1]",16.160804,13,0.000000,15027,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-0.07859349, -0.45278025, 0.13109803, 0.3208...",-0.003320,Kmeans,UMAP
6725,UMAP(),"KMeans(n_clusters=17, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",17,"(45, 5)","[[0.0030111961532384157, 0.006824926473200321,...","[30, 29, 20, 19, 18, 16, 14, 13, 12, 7, 4, 1, ...",9.649569,17,0.000000,15028,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[-0.024328232, -0.24872625, -0.1145494, -0.19...",-0.042915,Kmeans,UMAP
6726,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=9, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",9,,"[[0.0030111961532384157, 0.006824926473200321,...","[31, 27, 26, 22, 20, 17, 17, 15, 13]",5.724110,9,0.000000,15033,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0.0030111961532384157, 0.006824926473200321,...",0.088080,Kmeans,Base
6727,<bertopic.dimensionality._base.BaseDimensional...,"KMeans(n_clusters=13, random_state=42)",voyageai_only_exact,group,,report_id ...,"BERTopic(calculate_probabilities=True, ctfidf_...",13,,"[[0.0030111961532384157, 0.006824926473200321,...","[29, 22, 17, 16, 16, 14, 13, 12, 11, 11, 11, 9...",5.610641,13,0.000000,15034,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[0.0030111961532384157, 0.006824926473200321,...",0.069965,Kmeans,Base


In [26]:
# Assuming your 'lengthened_combined_topic_counts' DataFrame is available

import plotly.colors as pc

# Define a colorblind-friendly palette
colorblind_palette = pc.qualitative.Safe

fig = px.scatter(calculated_results,
                 x='silhouette_score', y='topic_membership_counts_std',
                 color='embedding_type', size='num_topics',
                 hover_data=['id', 'topic_membership_counts'],
                 color_discrete_sequence=colorblind_palette)

fig.update_layout(
    hovermode='closest'
)

fig.show()

There are many that are good. It all depends on what consititues a good topic model. I feel that one that has a small amount of outliers as well as a decent amount of topics is quite suitable.

Here are some that seem intersting

In [17]:
# 136, 135, 5807, 1

rows = results.query('id in [5909, 3143, 6041, 10835, 10841]').to_dict('index')
# replace keys with value in id
rows = {row['id']: row for i, row in rows.items()}


def inspect_model(row):

    model = row['model']
    model.update_topics(
        row['embedding_df']['si'].to_list(),
        representation_model = openai_base_representation_model
    )
    print(f"Looking at model: {row['id']}. Facts:\nmodel type - {row['model_type']}\nembedding - {row['embedding_type']}\ndimension reduction - {row['dimmension_reduction_type']}:{row['dimension_reduction_config']}  \nclustering - {row['cluster_model_type']}:{row['cluster_config']}")
    display(model.get_topic_info())

    return row

rows

{3143: {'umap_model': UMAP(),
  'hdbscan_model': KMeans(n_clusters=17, random_state=42),
  'embedding_type': 'voyageai',
  'model_type': 'group',
  'merged_min_similarity': None,
  'embedding_df':     report_id                                                 si  mode  \
  0    2019_106  No procedures were in place to direct train cr...     1   
  1    2013_107  The high incidence of brake block replacement,...     1   
  2    2013_107  The visual inspection regime for wheel-bearing...     1   
  3    2013_107  The RailBAM system, while operational, did not...     1   
  4    2013_107  The lack of a dedicated RailBAM analyst positi...     1   
  ..        ...                                                ...   ...   
  555  2011_006  The CAA had had recurring concerns for the man...     0   
  556  2017_003  The maintenance inspection programme for the l...     0   
  557  2017_104  Transdev had no policies or procedures in plac...     1   
  558  2020_104  Implementation of an adminis

In [18]:
updated_rows = {i: inspect_model(row) for i, row in rows.items()}

updated_rows

Looking at model: 3143. Facts:
model type - group
embedding - voyageai
dimension reduction - UMAP:(7, 4)  
clustering - Kmeans:17


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,60,0_Rail Safety Issues and Organizational Vulner...,[Rail Safety Issues and Organizational Vulnera...,[The safety issue arising from this incident w...
1,1,53,1_Safety Issues in Helicopter and Parachuting ...,[Safety Issues in Helicopter and Parachuting O...,[The New Zealand regulatory system has not pro...
2,2,45,2_Comprehensive Review of Safety Deficiencies ...,[Comprehensive Review of Safety Deficiencies i...,"[Aerodrome managers, in particular those at un..."
3,3,39,3_Comprehensive Safety Deficiencies in KiwiRai...,[Comprehensive Safety Deficiencies in KiwiRail...,[KiwiRail's system for training and assessment...
4,4,38,4_Operational and Safety Management Deficienci...,[Operational and Safety Management Deficiencie...,[Had the pilots known that the nose landing ge...
5,5,37,5_Critical Safety Concerns in Maritime Operati...,[Critical Safety Concerns in Maritime Operatio...,[The safety management system on board the Cap...
6,6,36,6_Rail Safety and Risk Management Challenges i...,[Rail Safety and Risk Management Challenges in...,[The profile of the Beach Road level crossing ...
7,7,35,7_Comprehensive Maintenance and Operational Sa...,[Comprehensive Maintenance and Operational Saf...,[Despite the general acceptance of final inspe...
8,8,32,8_Safety Management and Compliance Issues in N...,[Safety Management and Compliance Issues in Ne...,[Eight of the 12 vessels managed by CIEL had h...
9,9,31,9_Topic: Maritime Vessel Safety Management and...,[Topic: Maritime Vessel Safety Management and ...,[The bilge pumping system on the Jubilee was n...


Looking at model: 5909. Facts:
model type - group
embedding - openai
dimension reduction - UMAP:(5, 5)  
clustering - Kmeans:17


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,51,0_Helicopter Safety and Operational Risks,[Helicopter Safety and Operational Risks],"[Due to their unique main rotor design, during..."
1,1,48,"1_Topic: Inadequate Training, Communication, a...","[Topic: Inadequate Training, Communication, an...",[The amended communication plan increased the ...
2,2,47,2_Rail System Operational and Communication Sa...,[Rail System Operational and Communication Saf...,[The safety issue arising from this incident w...
3,3,40,3_Deficiencies in Shipboard Safety Management ...,[Deficiencies in Shipboard Safety Management a...,[The standards of navigation and bridge resour...
4,4,39,4_Maritime Navigation and Operational Safety I...,[Maritime Navigation and Operational Safety Is...,[The standard of passage planning on board the...
5,5,38,5_Aviation Safety Oversight and Operational Co...,[Aviation Safety Oversight and Operational Com...,[The CAA had had recurring concerns for the ma...
6,6,37,6_Aircraft Maintenance and Operational Safety ...,[Aircraft Maintenance and Operational Safety C...,[Had the pilots known that the nose landing ge...
7,7,34,7_Topic: Aviation Operational Risk and Communi...,[Topic: Aviation Operational Risk and Communic...,"[There are four factors that were not, but sho..."
8,8,32,8_Rail and Road Crossing Safety and Risk Mitig...,[Rail and Road Crossing Safety and Risk Mitiga...,[The profile of the Beach Road level crossing ...
9,9,32,"9_Inadequate Load-Securing, Braking System Def...","[Inadequate Load-Securing, Braking System Defi...",[The Matangi braking and wheel-slide protectio...


Looking at model: 6041. Facts:
model type - group
embedding - voyageai
dimension reduction - UMAP:(7, 5)  
clustering - Kmeans:17


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,68,0_Safety Management and Maintenance Practices ...,[Safety Management and Maintenance Practices i...,[Had the pilots known that the nose landing ge...
1,1,61,1_Systemic Oversights and Operational Non-Comp...,[Systemic Oversights and Operational Non-Compl...,[The operator's system for training its pilots...
2,2,57,2_Railway Safety Issues,[Railway Safety Issues],[The train driver did not challenge the train ...
3,3,45,3_Helicopter Operational Safety and Training D...,[Helicopter Operational Safety and Training De...,[The New Zealand regulatory system has not pro...
4,4,41,4_Maritime Safety Management and Equipment Ins...,[Maritime Safety Management and Equipment Insp...,[Encasing steel wire in plastic sheathing when...
5,5,40,5_Comprehensive Safety and Compliance Issues i...,[Comprehensive Safety and Compliance Issues in...,[KiwiRail's system for training and assessment...
6,6,32,6_Maritime Safety Violations and Operational N...,[Maritime Safety Violations and Operational No...,[The skipper did not have the requisite knowle...
7,7,30,7_Safety Concerns in Rail Operations and Human...,[Safety Concerns in Rail Operations and Human ...,[Work within the rail corridor was undertaken ...
8,8,29,8_Topic: Bridge Resource Management and Naviga...,[Topic: Bridge Resource Management and Navigat...,[The situational awareness of the bridge team ...
9,9,27,9_Aviation Operations and Communication Safety...,[Aviation Operations and Communication Safety ...,[The controllers and the pilots of the Pacific...


Looking at model: 10835. Facts:
model type - group
embedding - voyageai_reccontext
dimension reduction - UMAP:(7, 5)  
clustering - Kmeans:17


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,60,0_Topic: Maritime Safety Issues and Recommenda...,[Topic: Maritime Safety Issues and Recommendat...,[Safety issue: The owner and the master on boa...
1,1,55,1_Topic: Aviation Safety Issues and Recommenda...,[Topic: Aviation Safety Issues and Recommendat...,[Safety issue: With the nose landing gear stuc...
2,2,53,2_Comprehensive Review of Helicopter Operation...,[Comprehensive Review of Helicopter Operationa...,"[Safety issue: A mast-bump occurred, which led..."
3,3,48,3_Maritime Safety Management and Operational P...,[Maritime Safety Management and Operational Pr...,[Safety issue: The bridge operations on board ...
4,4,48,4_Safety Issues and Recommendations in Maritim...,[Safety Issues and Recommendations in Maritime...,[Safety issue: The guidelines for internal ins...
5,5,36,5_Air Traffic Control and Pilot Communication ...,[Air Traffic Control and Pilot Communication a...,[Safety issue: The visual circuit procedure pu...
6,6,34,6_Rail Transportation Safety Issues and Organi...,[Rail Transportation Safety Issues and Organiz...,[Safety issue: The safety issue arising from t...
7,7,32,7_Topic: Comprehensive Safety Reforms in New Z...,[Topic: Comprehensive Safety Reforms in New Ze...,"[Safety issue: The train controller, who was p..."
8,8,31,8_Safety Issues and Recommendations in KiwiRai...,[Safety Issues and Recommendations in KiwiRail...,[Safety issue: There was no requirement for ge...
9,9,30,9_Comprehensive Evaluation of Rail Safety Mana...,[Comprehensive Evaluation of Rail Safety Manag...,[Safety issue: The training that drivers recei...


Looking at model: 10841. Facts:
model type - group
embedding - voyageai_reccontext
dimension reduction - UMAP:(9, 5)  
clustering - HDBSCAN:5


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,51,-1_Aircraft and Helicopter Safety Concerns and...,[Aircraft and Helicopter Safety Concerns and R...,[Safety issue: It is likely that the level of ...
1,0,156,0_Maritime Safety Management and Operational D...,[Maritime Safety Management and Operational De...,[Safety issue: The operation of L'Austral's EC...
2,1,97,1_Comprehensive Rail Safety Issues and Improve...,[Comprehensive Rail Safety Issues and Improvem...,[Safety issue: A key safety issue was that the...
3,2,61,2_Comprehensive Review of Railway Operational ...,[Comprehensive Review of Railway Operational a...,[Safety issue: The safety issue arising from t...
4,3,23,3_Addressing Operational and Manufacturing Saf...,[Addressing Operational and Manufacturing Safe...,[Safety issue: The maintenance inspection prog...
5,4,23,4_Systemic Safety Issues in Operational and Ma...,[Systemic Safety Issues in Operational and Mai...,[Safety issue: The closed-circuit television r...
6,5,22,5_Topic: Air Traffic Management and Pilot-Cont...,[Topic: Air Traffic Management and Pilot-Contr...,[Safety issue: Civil Aviation Rule 91.223 can ...
7,6,21,6_Safety and Risk Management at Railway Level ...,[Safety and Risk Management at Railway Level C...,[Safety issue: Sighting distances for road use...
8,7,19,7_Enhancing Aviation Safety through Effective ...,[Enhancing Aviation Safety through Effective O...,[Safety issue: The standard of team resource m...
9,8,14,8_Aircraft Maintenance and Operational Safety ...,[Aircraft Maintenance and Operational Safety I...,[Safety issue: With the nose landing gear stuc...


{3143: {'umap_model': UMAP(),
  'hdbscan_model': KMeans(n_clusters=17, random_state=42),
  'embedding_type': 'voyageai',
  'model_type': 'group',
  'merged_min_similarity': None,
  'embedding_df':     report_id                                                 si  mode  \
  0    2019_106  No procedures were in place to direct train cr...     1   
  1    2013_107  The high incidence of brake block replacement,...     1   
  2    2013_107  The visual inspection regime for wheel-bearing...     1   
  3    2013_107  The RailBAM system, while operational, did not...     1   
  4    2013_107  The lack of a dedicated RailBAM analyst positi...     1   
  ..        ...                                                ...   ...   
  555  2011_006  The CAA had had recurring concerns for the man...     0   
  556  2017_003  The maintenance inspection programme for the l...     0   
  557  2017_104  Transdev had no policies or procedures in plac...     1   
  558  2020_104  Implementation of an adminis

In [19]:
def inspect_topic_assignments(model, df, num = 5):
    # Get 10 rows for each topic
    grouped = df.groupby('topic')
    group_dfs = [group.sample(min(len(group), num), random_state=42) for _, group in grouped]

    topics = model.get_topic_info()['Representation']

    counts = model.get_topic_info()['Count']

    for samples, topic, count  in zip(group_dfs, topics, counts):
        print(f"Examples from topic {topic} containing {count} safety issues\n\n")
        examples = ["report_id: {}: {}".format(report_id, si) for report_id, si in zip(samples['report_id'], samples['si'])]
        print("\n\n".join(examples))
        print("\n" + "-"*50 + "\n")  # Separator for clarity


inspect_topic_assignments(updated_rows[10841]['model'], updated_rows[10841]['embedding_df'])

Examples from topic ['Aircraft and Helicopter Safety Concerns and Recommendations'] containing 51 safety issues


report_id: 2019_005: Safety issue: The operator's procedures for single-pilot VFR operations into the Southern Ocean were not complete or adequate at the time of this accident to safely manage the flight.




report_id: 2019_002: Safety issue: When an IFR aeroplane is approved to conduct a visual approach to land, current ATC procedures can allow the flight crew to change their radio to the tower frequency dependent on their other clearance actions. This can create a situation where the approach controller is unable to contact that flight crew when the controller is still responsible for monitoring that flight crew's compliance with an instruction.




report_id: 2020_002: Safety issue: The Taupo Gliding Club did not ensure GNZ instructor training procedures were fully implemented and GNZ audits did not detect this discrepancy. This increased the risk that instructors were 

In [27]:
make_visualization(updated_rows[10841]['model'], updated_rows[10841]['embedding_df'])

## Maually looking at models

### Running it on all safety issues


I want to generate the safety themes from all of the safety issues I have available.

#### Simple minilm embeddings

This seems to of failed. I believe this is mainly due to the fact that each documents are really short.

In [None]:

topic_model, _ = runBERTopic(
    safety_issues_df, 'si', None, openai_base_representation_model, umap_model, reduce_outliers=False)

topic_model.get_topic_info()


There is a bit of a problem where the number of outliers is quite great.

I will try to merge the outliers

In [None]:
topic_model, _ = runBERTopic(
    safety_issues_df, 'si', None, openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,166,0_Rail Safety and Operational Issues in New Ze...,[Rail Safety and Operational Issues in New Zea...,[The training that drivers received for transi...
1,1,64,1_Maritime Safety and Navigation Management Is...,[Maritime Safety and Navigation Management Iss...,[The voyage planning for the time in the Snare...
2,2,36,2_Maritime Safety and Regulations Compliance I...,[Maritime Safety and Regulations Compliance Is...,[The skipper did not have the requisite knowle...
3,3,53,3_Safety and Maintenance Issues in Engineering...,[Safety and Maintenance Issues in Engineering ...,[There was a lack of clear communication and a...
4,4,53,4_Maritime and Aviation Safety Management and ...,[Maritime and Aviation Safety Management and E...,[It could not be established why the chief off...
5,5,50,5_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],[Had the controllers realised that the low clo...
6,6,27,6_Robinson Helicopter Safety and Accident Anal...,[Robinson Helicopter Safety and Accident Analy...,"[Due to their unique main rotor design, during..."
7,7,62,7_Aviation Safety and Regulatory Compliance Is...,[Aviation Safety and Regulatory Compliance Iss...,[The standard of pilot training and the superv...
8,8,26,8_Aircraft Landing Gear and Door Lock Failures,[Aircraft Landing Gear and Door Lock Failures],[Had the pilots known that the nose landing ge...
9,9,23,9_Deficiencies in Safety and Regulatory Compli...,[Deficiencies in Safety and Regulatory Complia...,[There were no established procedures for ente...


The main problem here is that the the distribution is not great. It seems that most of the rail are in the first topic then martime and aviation take up the rest.

#### VoyageAI embeddings

In [None]:
topic_model, voyageai_clusters_df = runBERTopic(
    voyageai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()

NameError: name 'voyageai_embeddings' is not defined

In [None]:


check_mode_cluster_distribution(voyageai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,204,8,167
1,2,177,2


This has created two topics wiht one being avaiation and martime and the other being rail.

#### OpenAI embeddings

In [None]:
topic_model, openai_clusters_df = runBERTopic(
    openai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model, reduce_outliers=True)

topic_model.get_topic_info()[['Count', 'Name']]



Unnamed: 0,Count,Name
0,203,0_Aviation Safety and Compliance Issues
1,189,1_Rail Safety and Operational Issues in New Ze...
2,142,2_Maritime Safety and Navigation Management Flaws
3,26,3_Maritime Safety and Compliance Issues of the...


In [None]:
check_mode_cluster_distribution(openai_clusters_df)

mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,191.0,4.0,8.0
1,5.0,181.0,3.0
2,4.0,0.0,138.0
3,6.0,0.0,20.0


This has also made a cleanish split between modes of transport. I can eithe try to force it not to do this and/or run the model on each mode then merge the models.

In [None]:
umap_model_tweaked = UMAP(n_neighbors=4, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

topic_model, openai_clusters_tweaked_df = runBERTopic(
    openai_embeddings, 'si', 'si_embedding', openai_base_representation_model, umap_model_tweaked, reduce_outliers=True)

display(topic_model.get_topic_info()[['Count', 'Name']])

check_mode_cluster_distribution(openai_clusters_tweaked_df)



Unnamed: 0,Count,Name
0,167,0_Rail Safety and Operational Failures
1,115,1_Maritime Safety and Resource Management Defi...
2,50,2_Safety and Compliance in Transport and Marit...
3,51,3_Aviation Safety and Regulatory Compliance Is...
4,41,4_Helicopter Safety and Operational Issues
5,52,5_Aviation Safety and Air Traffic Control Issues
6,27,6_Safety Issues in Rail Operations
7,30,7_Aircraft Landing Gear and Maintenance Issues
8,11,8_Aviation Safety Issues Related to Door Locki...
9,16,9_Safety and Maintenance Challenges in Maritim...


mode,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.0,157.0,7.0
1,7.0,0.0,108.0
2,15.0,2.0,33.0
3,47.0,0.0,4.0
4,40.0,1.0,0.0
5,51.0,0.0,1.0
6,2.0,25.0,0.0
7,30.0,0.0,0.0
8,11.0,0.0,0.0
9,0.0,0.0,16.0


I will try to tune the hyper paramters and see if I can get the right sort of safety themes

In [None]:
topic_model, openai_clusters_tuned_df = runBERTopic(
    openai_embeddings,
    'si',
    'si_embedding',
    openai_base_representation_model,
    UMAP(n_neighbors=4, n_components=5, min_dist=0.0, metric='cosine', random_state=42),
    reduce_outliers=True)

topic_model.get_topic_info()[['Count', "Name","Representative_Docs"]]



Unnamed: 0,Count,Name,Representative_Docs
0,167,0_Rail Safety and Operational Issues in New Ze...,[The training that drivers received for transi...
1,115,1_Maritime Safety and Resource Management Defi...,[The standard of passage planning on board the...
2,50,2_Maritime and Aviation Safety Regulations and...,[The absence of a visual indicator in the whee...
3,51,3_Aviation Safety and Regulatory Compliance Is...,[The operator's system for training its pilots...
4,41,4_Helicopter Safety and Maintenance Issues,"[Due to their unique main rotor design, during..."
5,52,5_Aviation Safety and Operational Procedures a...,[While ATC sequences an IFR aeroplane to land ...
6,27,6_Safety Issues and Management Deficiencies in...,[The train controller made an assumption about...
7,30,7_Aircraft Landing Gear and Maintenance Issues,[Had the pilots known that the nose landing ge...
8,11,8_Aviation Safety and Equipment Malfunction,"[The use of ""threat and error management"" (TEM..."
9,16,9_Maintenance and Risk Management in Marine Sa...,[A clear placard should be placed at the contr...


### Run cluster on just one mode

It would make sense that if the clustering is finding the transport modes then splitting into the modes first might help find the themes within each mode.

In [None]:
def printout_each_modes_topics(results):
    for res in results:
        print("Cluster names: ")
        for i, count in zip(res[0].get_topic_info()['Name'], res[0].get_topic_info()['Count']):
            print(f"{count}, {i}")

#### VoyageAI

In [None]:
voyageai_modes_dfs = [voyageai_embeddings[voyageai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

In [None]:
results = [runBERTopic(df, 'si', 'si_embedding', openai_base_representation_model, umap_model) for df in voyageai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
54, 0_Aviation Safety and Operational Procedures
34, 1_Safety Challenges and Risks in Robinson Helicopter Operations
62, 2_Aviation Safety and Regulatory Compliance Issues
25, 3_Aircraft Landing Gear and Door System Failures
31, 4_Aircraft Maintenance and Safety Concerns
Cluster names: 
166, 0_Rail Safety and Management Issues
19, 1_Safety and Regulatory Issues at Rail Level Crossings
Cluster names: 
28, 0_Maritime Safety and Bridge Resource Management Deficiencies
33, 1_Maritime Safety and Management Failures
43, 2_Maritime Safety and Navigation Standards Compliance
30, 3_Maritime Safety and Emergency Response Deficiencies
22, 4_Maritime Safety Violations and Consequences aboard the Easy Rider
13, 5_Propulsion System Failures and Maintenance Issues in Marine Operations


In [None]:
checking = results[2][1]

In [None]:
merged_moode_models = BERTopic.merge_models([result[0] for result in results], min_similarity=0.9)


merged_moode_models.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,52,0_Aviation Safety and Air Traffic Control Proc...,[Aviation Safety and Air Traffic Control Proce...,
1,1,47,1_Helicopter Safety and Accident Analysis,[Helicopter Safety and Accident Analysis],
2,2,41,2_Aviation Safety and Compliance Issues,[Aviation Safety and Compliance Issues],
3,3,33,3_Aircraft Safety and Maintenance Issues,[Aircraft Safety and Maintenance Issues],
4,4,17,4_Aviation Safety and Regulatory Compliance in...,[Aviation Safety and Regulatory Compliance in ...,
5,5,16,5_Safety and Regulatory Issues in New Zealand ...,[Safety and Regulatory Issues in New Zealand A...,
6,6,44,0_KiwiRail Safety and Compliance Issues,[KiwiRail Safety and Compliance Issues],
7,7,40,1_Rail Safety and Communication Failures,[Rail Safety and Communication Failures],
8,8,25,2_Safety and Management Issues in Rail Operations,[Safety and Management Issues in Rail Operations],
9,9,20,3_Safety and Regulatory Issues at Road-Rail Le...,[Safety and Regulatory Issues at Road-Rail Lev...,


#### OpenAI

In [None]:
openai_modes_dfs = [openai_embeddings[openai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

for df in openai_modes_dfs:
    display(df)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
201,2015_001,Parachute drop pilots were not required to wea...,0,"[0.0253401268273592, -0.02335318550467491, -0...."
202,2011_006,The council had not evaluated the effects of t...,0,"[-0.02767498977482319, 0.01624125801026821, -0..."
203,2011_006,The standard of pilot training and the supervi...,0,"[0.015297695063054562, -0.018917182460427284, ..."
204,2011_006,The CAA had had recurring concerns for the man...,0,"[0.001043604570440948, 0.00177335599437356, 0...."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_106,No procedures were in place to direct train cr...,1,"[0.017140474170446396, 0.03509647026658058, -0..."
1,2013_107,"The high incidence of brake block replacement,...",1,"[-0.0018233972368761897, 0.020808950066566467,..."
2,2013_107,The visual inspection regime for wheel-bearing...,1,"[0.0025237964000552893, 0.027265744283795357, ..."
3,2013_107,"The RailBAM system, while operational, did not...",1,"[0.006221923511475325, 0.025432679802179337, -..."
4,2013_107,The lack of a dedicated RailBAM analyst positi...,1,"[-0.004680005367845297, 0.013756909407675266, ..."
...,...,...,...,...
180,2017_101,KiwiRail did not have a mature fatigue risk ma...,1,"[-0.006654317956417799, 0.029867829754948616, ..."
181,2017_101,The eProtect KMC module on board the locomotiv...,1,"[-0.003919209353625774, 0.022458476945757866, ..."
182,2017_104,Transdev had no policies or procedures in plac...,1,"[0.013986819423735142, 0.01571197435259819, -0..."
183,2020_104,Implementation of an administrative control me...,1,"[-0.013463953509926796, -0.007039009593427181,..."


Unnamed: 0,report_id,si,mode,si_embedding
0,2019_202,There is limited data to quantify the extent o...,2,"[-0.0015265028923749924, 0.013446947559714317,..."
1,2019_201,the operator's planned maintenance programme d...,2,"[0.03530280664563179, 0.027329862117767334, 0...."
2,2019_201,the operator's hazard identification system ha...,2,"[0.009704935364425182, 0.02645685337483883, 0...."
3,2019_204,The operator had not included predefined weath...,2,"[0.038087889552116394, 0.000508625409565866, 0..."
4,2019_204,The operator of the Henerata had not assessed ...,2,"[0.018176013603806496, 0.026440272107720375, 0..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:
results = [runBERTopic(df, 'si', 'si_embedding', openai_base_representation_model, umap_model) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Operational Procedures Issues
42, 1_Aircraft Maintenance and Safety Issues
37, 2_Challenges and Safety Issues in Robinson Helicopter Operations
51, 3_Aviation Safety and Regulatory Oversight in New Zealand
30, 4_Aviation Safety and Emergency Response
Cluster names: 
49, 0_KiwiRail Safety and Compliance Issues
28, 1_Rail Safety and Inspection Inefficiencies
42, 2_Rail Safety and Communication Issues
27, 3_Safety and Oversight Concerns in Train Operations
21, 4_Road and Rail Safety at Level Crossings
18, 5_Risk Management and Safety Issues in Wellington Station Train Operations
Cluster names: 
150, 0_Maritime Safety and Crew Management Deficiencies
19, 1_Maritime Safety and Compliance Issues


I will try instead to do with no dimension reduction, or atleast decrease the amount of dimension reduction.

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

results = [runBERTopic(df,
                       'si',
                       'si_embedding',
                       openai_base_representation_model,
                       BaseDimensionalityReduction()
                       ) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
46, 0_Aviation Safety and Air Traffic Management Issues
47, 1_Aircraft Maintenance and Safety Issues
41, 2_Safety and Training Issues in Robinson Helicopter Operations
53, 3_Aviation Safety and Compliance Issues
19, 4_Safety and Regulatory Oversight in Aviation and Parachuting Operations
Cluster names: 
47, 0_Issues in KiwiRail's Safety and Operational Procedures
62, 1_Rail Safety and Incident Analysis
29, 2_Rail Safety and Signal Management Issues in Wellington Station Approaches
20, 3_Safety Issues at Rail Level Crossings
27, 4_Safety and Risk Management in Rail Operations
Cluster names: 
149, 0_Maritime Safety and Resource Management Issues
20, 1_Maritime Safety Violations and the Sinking of the Easy Rider


This reuslts in just one cluster for each as the curse of dimensionality is prudent here. I will instead try to tune the hyper parameters of OPenAI

In [None]:
from bertopic.dimensionality import BaseDimensionalityReduction

results = [runBERTopic(df,
                       'si',
                       'si_embedding',
                       None,
                       UMAP(n_neighbors=6, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
                       ) for df in openai_modes_dfs]

printout_each_modes_topics(results)



Cluster names: 
52, 0_the_to_of_and
47, 1_the_to_of_and
41, 2_the_of_to_and
33, 3_the_gear_landing_to
17, 4_the_to_for_water
16, 5_zealand_new_of_the
Cluster names: 
44, 0_the_to_work_of
40, 1_the_train_to_and
25, 2_train_of_the_and
20, 3_road_level_crossings_the
20, 4_the_brake_braking_conditions
15, 5_the_in_of_wellington
21, 6_the_rail_of_to
Cluster names: 
120, 0_the_of_and_to
27, 1_the_to_of_easy
11, 2_co2_the_could_be
11, 3_the_fish_crew_of


In [None]:
merged_moode_models.get_topic_info()[['Count', "Name"]]

NameError: name 'merged_moode_models' is not defined


I have had a look at both single run and individual models.

I think the next step is to do some hypter paramter tuning.

 As there are not noticable differences between voyageAI and openAI I will go with openAI embedding model.

# Visualization of themes and safety issues

Now that we have some models that seem reasonable, it is time to create a user friendly representation.

In [None]:
modes_dfs = [openai_embeddings[openai_embeddings['mode'] == i].reset_index(drop=True) for i in range(3)]

pd.concat(modes_dfs)

Unnamed: 0,report_id,si,mode,si_embedding
0,2011_003,The New Zealand regulatory system has not prov...,0,"[0.0187440924346447, -0.000433413457358256, -0..."
1,2011_003,The format of the Robinson R22 helicopter flig...,0,"[0.01013844646513462, -0.03145159035921097, -0..."
2,2011_003,The rate of R22 in-flight break-up accidents i...,0,"[0.005347656551748514, -0.022685393691062927, ..."
3,2011_003,"The crashworthiness of the ELT, which was desi...",0,"[0.014976576901972294, 0.015324870124459267, -..."
4,2010_010,The failure of the nose landing gear to extend...,0,"[-0.0042054359801113605, 0.04125332459807396, ..."
...,...,...,...,...
164,2017_203,Technicians who are authorised to conduct mand...,2,"[0.002318679355084896, 0.015887508168816566, -..."
165,2013_201,The firefighting drills held on board the Taok...,2,"[0.006056208163499832, 0.01051066443324089, -0..."
166,2014_201,crew awareness of the operating limitations of...,2,"[-0.029451534152030945, 0.026009364053606987, ..."
167,2014_201,crew operating knowledge of on-board emergency...,2,"[-0.021512825042009354, 0.029569942504167557, ..."


In [None]:

topic_model = BERTopic.load("demo_merged_model")

all_data = pd.concat(openai_modes_dfs)

make_visualization(topic_model, all_data)




In [None]:
demo_individual_models = [BERTopic.load(f"demo_individual_model_mode_{i}") for i in range(3)]

for model, df, i in zip(demo_individual_models, modes_dfs, range(len(demo_individual_models))):
    array_embeddings = column_to_2darray(df['si_embedding'])

    reduced_array_embeddings = UMAP(n_neighbors=3, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

    visualization = model.visualize_documents(df['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

    with open(os.path.join('topic_visuals', f'demo_individual_model_mode_{i}_visual.html'), 'w') as f:
        visualization.write_html(f)

    display(visualization)



In [None]:
topic_model = BERTopic.load("demo_group_model")

all_data = pd.concat(modes_dfs)

array_embeddings = column_to_2darray(all_data['si_embedding'])

reduced_array_embeddings = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(array_embeddings)

visualization = topic_model.visualize_documents(all_data['si'].to_list(), embeddings=array_embeddings, reduced_embeddings=reduced_array_embeddings)

with open(os.path.join('topic_visuals', 'demo_group_model_visual.html'), 'w') as f:
    visualization.write_html(f)

visualization

