In [8]:
import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt

import pickle 
import plotly.graph_objects as go

import librosa

In [10]:
# Run this block once to generate all required files for visualization (~10min)

embeddings = []

emb_path = r"F:\Workspace\bioshock-audio-clustering\all_voicelines_emb"
emb_files = [os.path.join(emb_path, file) for file in os.listdir(emb_path)]

index = 0
index_map = {}

# Create index map and combined embeddings file
for file in emb_files:
    emb = np.load(file)
    emb = emb.flatten()
    
    if emb.shape[0] == 192:
        embeddings.append(emb)
        index_map[index] = file.split("\\")[-1].replace(".npy", "")
        index += 1

audio_files = [os.path.join(r"F:\Workspace\bioshock-audio-clustering\all_voicelines_wav", index_map[i] + ".wav") for i in range(len(index_map.keys()))]
audio_durations = []

# Generate audio file durations
for audio_file in audio_files:
    audio_durations.append(librosa.get_duration(filename=audio_file))


# Save audio file durations
with open("./tmp/audio_durations.pkl", "wb") as f:
    pickle.dump(audio_durations, f)

# Save index map
with open('./tmp/index_map.pkl', 'wb') as f:
    pickle.dump(index_map, f)

# Save combined embeddings file
np.save("./tmp/all_embeddings.npy", np.array(embeddings))

	This alias will be removed in version 1.0.
  audio_durations.append(librosa.get_duration(filename=audio_file))


In [11]:
# Load embeddings
embeddings = np.load("./tmp/all_embeddings.npy")

# Load index map
with open('./tmp/index_map.pkl', 'rb') as f:
    index_map = pickle.load(f)

# Load audio file durations
with open("./tmp/audio_durations.pkl", "rb") as f:
    loaded_audio_durations = pickle.load(f)

audio_files = [os.path.join(r"F:\Workspace\bioshock-audio-clustering\all_voicelines_wav", index_map[i] + ".wav") for i in range(len(index_map.keys()))]

In [37]:
# Dimensionality reduction using UMAP

import umap.umap_ as umap

reducer = umap.UMAP(n_neighbors=10, min_dist=0.2, metric="euclidean")
embedding_2d = reducer.fit_transform(np.array(embeddings))

In [41]:
# Dimensionality reduction using TSNE

from sklearn.manifold import TSNE

reducer = TSNE(n_components=2,  # Reduce to 2D
               perplexity=60,    # Typical range: 5-50 (adjust based on dataset size)
               learning_rate=200,
               metric="euclidean",
               random_state=42,
               n_iter=1000)      # Increase for better convergence

embedding_2d = reducer.fit_transform(np.array(embeddings))


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



In [12]:
# Dimensionality reduction using PCA

from sklearn.decomposition import PCA

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
embedding_2d = pca.fit_transform(np.array(embeddings))

In [42]:
df = pd.DataFrame({
    "DIM1": embedding_2d[:, 0],
    "DIM2": embedding_2d[:, 1],
    "audio_files": audio_files,
    'duration': loaded_audio_durations
})
df

Unnamed: 0,UMAP1,UMAP2,audio_files,duration
0,7.167749,16.546165,F:\Workspace\bioshock-audio-clustering\all_voi...,1.846667
1,32.350285,11.677607,F:\Workspace\bioshock-audio-clustering\all_voi...,1.032000
2,27.550278,-14.555147,F:\Workspace\bioshock-audio-clustering\all_voi...,1.169333
3,20.292585,-13.060826,F:\Workspace\bioshock-audio-clustering\all_voi...,1.246667
4,2.100838,6.595560,F:\Workspace\bioshock-audio-clustering\all_voi...,1.206667
...,...,...,...,...
9578,-21.402248,-2.233195,F:\Workspace\bioshock-audio-clustering\all_voi...,0.561333
9579,23.497978,22.990147,F:\Workspace\bioshock-audio-clustering\all_voi...,1.022667
9580,-9.161514,-21.381226,F:\Workspace\bioshock-audio-clustering\all_voi...,4.086667
9581,17.681168,4.821425,F:\Workspace\bioshock-audio-clustering\all_voi...,1.592000


In [43]:
df = df[df['duration'] > 2]
df

Unnamed: 0,UMAP1,UMAP2,audio_files,duration
5,11.746811,1.186889,F:\Workspace\bioshock-audio-clustering\all_voi...,2.057333
9,2.252609,-26.047966,F:\Workspace\bioshock-audio-clustering\all_voi...,2.169333
10,-20.526806,4.615812,F:\Workspace\bioshock-audio-clustering\all_voi...,2.594667
11,-6.025005,7.682512,F:\Workspace\bioshock-audio-clustering\all_voi...,6.228000
14,19.533211,19.169405,F:\Workspace\bioshock-audio-clustering\all_voi...,5.908000
...,...,...,...,...
9560,-34.545242,21.715080,F:\Workspace\bioshock-audio-clustering\all_voi...,3.322667
9561,-23.425821,3.279910,F:\Workspace\bioshock-audio-clustering\all_voi...,3.153333
9566,18.208263,19.524591,F:\Workspace\bioshock-audio-clustering\all_voi...,6.889333
9570,-10.408412,4.968479,F:\Workspace\bioshock-audio-clustering\all_voi...,3.276000


In [44]:
import pandas as pd
import plotly.graph_objects as go
from ipywidgets import Output
from IPython.display import display, Audio
import os



# Create FigureWidget
fig = go.FigureWidget()

fig.add_trace(go.Scatter(
    x=df["DIM1"],
    y=df["DIM2"],
    mode="markers",
    marker=dict(size=10, color='blue'),
    text=df["audio_files"],
    hoverinfo="text",
    customdata=df["audio_files"]
))

fig.update_layout(
    title="UMAP Clustering (Click to play audio)",
    xaxis_title="Dimension 1",
    yaxis_title="Dimension 2",
    width=1200,  # Wider
    height=800,  # Taller
    margin=dict(l=20, r=20, b=20, t=40),  # Minimize margins
)

# Create an output area for the audio
audio_out = Output()

def handle_click(trace, points, selector):
    with audio_out:
        audio_out.clear_output()
        if points.point_inds:
            index = points.point_inds[0]
            audio_file = trace.customdata[index]
            display(Audio(filename=audio_file))

fig.data[0].on_click(handle_click)

# Display the figure and output area
display(fig)
display(audio_out)

FigureWidget({
    'data': [{'customdata': array(['F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-1002.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-1006.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-1007.wav',
                                   ...,
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-984.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-988.wav',
                                   'F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-997.wav'],
                                  dtype=object),
              'hoverinfo': 'text',
              'marker': {'color': 'blue', 'size': 10},
              'mode': 'markers',
              'text': array(['F:\\Workspace\\bioshock-audio-clustering\\all_voicelines_wav\\pck-1

Output()