In [52]:
import ast
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler
from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans

In [53]:
# --- Notebook configuration ---------------------------------------------
# Grid size of the self-organising map. It must equal the grid size of the
# pickled model at `model_path`: later cells use it to translate between
# flat neuron indices and grid coordinates.
som_neurons = (38, 38)
model_path = '../models/AgglomerativeClustering_training_lr_0.001_sigma_7.5.p'

# Clustering algorithm applied to the SOM weight vectors.
ClusterClass = AgglomerativeClustering
# ClusterClass = MiniBatchKMeans

# Colour normalisation class for the heatmaps; each heatmap cell calls it to
# build a fresh instance. Use LogNorm when no clear clustering shows up.
HeatmapColorNormClass = LogNorm

# Seed the generator so that row shuffling is repeatable between runs
# (the undersampler further down is seeded as well).
random_seed = 1
rng = np.random.default_rng(random_seed)

## Data
Select the tracks that belong to a chosen set of genres, then undersample every genre
to the size of the smallest one.

In [None]:
# Genres kept for the analysis; the number of genres is also used as the
# number of clusters further down.
genres = [
    "rock",
    "classical",
    "latin",
    "pop",
    "jazz",
    "soul",
    "classic bollywood",
    "rap",
    "folk",
    "funk",
    "opera",
]

# The CSV stores each track's genres as the string form of a Python list.
# Parse it, emit one row per (track, genre) pair and rename the column to
# the singular 'genre'.
tracks = (
    pd.read_csv('../data/tracks_with_genres.csv')
    .assign(genres=lambda frame: frame['genres'].apply(ast.literal_eval))
    .explode('genres')
    .rename(columns={'genres': 'genre'})
)
tracks_subset = tracks.loc[tracks['genre'].isin(genres)]

# Balance the classes by randomly dropping rows of the larger genres.
rus = RandomUnderSampler(random_state=1)
X, y = rus.fit_resample(tracks_subset, tracks_subset['genre'])

tracks_genres = y.astype('category')

print(f'In total, there are {y.shape[0]} tracks in the dataset used.')

Convert the pandas DataFrame to a matrix of numerical values, normalize the values column-wise, and shuffle the rows.

In [None]:
features = ['acousticness', 'instrumentalness', 'loudness', 'energy', 'danceability', 'valence']

# Standardise every feature column to zero mean and unit standard deviation.
tracks_values = X[features].values
tracks_values = (tracks_values - tracks_values.mean(axis=0)) / tracks_values.std(axis=0)

# Shuffle the rows with an explicit permutation and apply the SAME permutation
# to the genre labels. Shuffling only the feature matrix in place would break
# the row-to-genre pairing that the later `labels_map` call relies on.
# Both arrays are rebuilt from `X`, so re-running this cell stays consistent.
shuffle_order = rng.permutation(len(tracks_values))
tracks_values = tracks_values[shuffle_order]
tracks_genres = X['genre'].iloc[shuffle_order].astype('category')

In [None]:
# Restore the trained SOM from the pickle file configured in `model_path`.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
with open(model_path, mode='rb') as pickled_som:
    som = pickle.load(pickled_som)

In [None]:
# Per-neuron activation response of the SOM for the prepared tracks
# (presumably how often each neuron is the best match, as the colour-bar
# label suggests).
neuron_activations = som.activation_response(tracks_values)

sns.set_style('white')
colorbar_options = {'label': 'Number of activations'}
heatmap = sns.heatmap(
    neuron_activations,
    cbar_kws=colorbar_options,
    norm=HeatmapColorNormClass(),
)
plt.show()

# Save through the Axes' own Figure object so the file contains this plot.
activations_figure = heatmap.get_figure()
activations_figure.savefig('../figures/neurons_heatmap.png')

Display the distance map.

In [None]:
# Draw the SOM's distance map as a heatmap and store the figure on disk.
sns.set_style('white')
distance_map = som.distance_map()
heatmap = sns.heatmap(
    distance_map,
    cbar_kws={'label': 'Distance'},
    norm=HeatmapColorNormClass(),
)
plt.show()

# Save through the Axes' own Figure object so the file contains this plot.
distance_figure = heatmap.get_figure()
distance_figure.savefig('../figures/neurons_distance_map.png')

In [None]:
# Cluster the SOM neurons by their weight vectors, using one cluster per genre.
grid_rows, grid_cols = som_neurons
som_weights = som.get_weights()

# Flatten the neuron grid to one weight vector per row for the clusterer.
flat_weights = som_weights.reshape(grid_rows * grid_cols, len(features))

cluster = ClusterClass(n_clusters=len(genres))
labels = cluster.fit_predict(flat_weights)

# Fold the per-neuron cluster ids back onto the grid and show them.
labels_matrix = labels.reshape(grid_rows, grid_cols)
sns.heatmap(labels_matrix)
plt.show()

In [None]:
# For every cluster, tally how many tracks of each genre landed on the
# neurons that belong to that cluster.
labels_map = som.labels_map(tracks_values, tracks_genres)
labels_classified_as_genre = [Counter() for _ in range(len(genres))]

for neuron, cluster_label in enumerate(labels):
    # Flat neuron index -> (row, col) on the grid, in row-major order.
    # Named row/col rather than y/x so that the resampled label Series `y`
    # created in the data cell is not overwritten.
    row, col = divmod(neuron, som_neurons[1])
    genre_counts = labels_map[(row, col)]

    # Neurons that no track mapped to yield an empty entry; skip those.
    # NOTE(review): this relies on `labels_map` returning an empty value
    # instead of raising for unseen positions -- confirm against the SOM API.
    if genre_counts:
        labels_classified_as_genre[cluster_label] += genre_counts

for genre_counter in labels_classified_as_genre:
    print(genre_counter)

In [None]:
print("Songs per genre and cluster")

# The list of Counters becomes a frame with one row per cluster and one
# column per genre; transposing puts the genres on the x-axis and stacks the
# clusters within each bar.
classifications = pd.DataFrame(labels_classified_as_genre)
genre_by_cluster = classifications.T
genre_by_cluster.plot.bar(stacked=True)
plt.legend(
    title='Cluster',
    loc='upper center',
    ncol=len(genres) // 2,
    bbox_to_anchor=(0.5, -0.15),
)
plt.show()