In [1]:
import numpy as np
import os
import pandas as pd

import umap.umap_ as umap
import matplotlib.pyplot as plt

import pickle 
import plotly.graph_objects as go



In [2]:
emb_path = r"F:\Workspace\bioshock-audio-clustering\all_voicelines_emb"


In [3]:
# Load embeddings and index map
embeddings = np.load("./all_embeddings.npy")
with open('./index_map.pkl', 'rb') as f:
    index_map = pickle.load(f)

In [28]:
# UMAP

audio_files = [os.path.join(r"F:\Workspace\bioshock-audio-clustering\all_voicelines_wav", index_map[i] + ".wav") for i in range(len(index_map.keys()))]

reducer = umap.UMAP(n_neighbors=20, min_dist=0.1, metric="euclidean")
embedding_2d = reducer.fit_transform(np.array(embeddings))

In [34]:
# TSNE

import os
from sklearn.manifold import TSNE
import numpy as np

audio_files = [os.path.join(r"F:\Workspace\bioshock-audio-clustering\all_voicelines_wav", index_map[i] + ".wav") for i in range(len(index_map.keys()))]

reducer = TSNE(n_components=2,  # Reduce to 2D
               perplexity=60,    # Typical range: 5-50 (adjust based on dataset size)
               learning_rate=200,
               metric="euclidean",
               random_state=42,
               n_iter=1000)      # Increase for better convergence

embedding_2d = reducer.fit_transform(np.array(embeddings))


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



In [None]:
# # clustering with DBSCAN
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(embedding_2d)

In [36]:
df = pd.DataFrame({
    "UMAP1": embedding_2d[:, 0],
    "UMAP2": embedding_2d[:, 1],
    "audio_files": audio_files
})

In [16]:
import librosa

def get_audio_duration(filepath):
    duration = librosa.get_duration(filename=filepath)
    return duration

# Add duration column
df['duration'] = df['audio_files'].apply(get_audio_duration)


get_duration() keyword argument 'filename' has been renamed to 'path' in version 0.10.0.
	This alias will be removed in version 1.0.



In [19]:
df_all = df

In [23]:
df_filtered = df_all[df_all['duration'] > 3]

df_filtered

Unnamed: 0,UMAP1,UMAP2,audio_files,duration
11,1.155244,-2.936456,F:\Workspace\bioshock-audio-clustering\all_voi...,6.228000
14,-0.303304,3.889644,F:\Workspace\bioshock-audio-clustering\all_voi...,5.908000
15,4.950957,-5.076347,F:\Workspace\bioshock-audio-clustering\all_voi...,14.572000
16,0.221881,4.870303,F:\Workspace\bioshock-audio-clustering\all_voi...,9.062667
17,2.791785,-0.538825,F:\Workspace\bioshock-audio-clustering\all_voi...,3.382667
...,...,...,...,...
9560,5.425685,-0.086589,F:\Workspace\bioshock-audio-clustering\all_voi...,3.322667
9561,3.705809,-3.727015,F:\Workspace\bioshock-audio-clustering\all_voi...,3.153333
9566,0.084313,4.257648,F:\Workspace\bioshock-audio-clustering\all_voi...,6.889333
9570,2.109828,-2.178634,F:\Workspace\bioshock-audio-clustering\all_voi...,3.276000


In [37]:
import pandas as pd
import plotly.graph_objects as go
from ipywidgets import Output
from IPython.display import display, Audio
import os



# Create FigureWidget
fig = go.FigureWidget()

fig.add_trace(go.Scatter(
    x=df["UMAP1"],
    y=df["UMAP2"],
    mode="markers",
    marker=dict(size=10, color='blue'),
    text=df_filtered["audio_files"],
    hoverinfo="text",
    customdata=df["audio_files"]
))

fig.update_layout(
    title="UMAP Clustering (Click to play audio)",
    xaxis_title="UMAP Dimension 1",
    yaxis_title="UMAP Dimension 2",
    width=1200,  # Wider
    height=800,  # Taller
    margin=dict(l=20, r=20, b=20, t=40),  # Minimize margins
)

# Create an output area for the audio
audio_out = Output()

def handle_click(trace, points, selector):
    with audio_out:
        audio_out.clear_output()
        if points.point_inds:
            index = points.point_inds[0]
            audio_file = trace.customdata[index]
            display(Audio(filename=audio_file))

fig.data[0].on_click(handle_click)

# Display the figure and output area
display(fig)
display(audio_out)

FigureWidget({
    'data': [{'customdata': array(['F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-1.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-10.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-100.wav',
                                   ...,
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-997.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-998.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-999.wav'],
                                  dtype=object),
              'hoverinfo': 'text',
              'marker': {'color': 'blue', 'size': 10},
              'mode': 'markers',
              'text': array(['F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-1008.wa

Output()