In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [3]:
# Load the cleaned track table (path is relative to the notebook's working directory).
data = pd.read_csv('./data/cleaned_data.csv')

# Columns used as inputs to the clustering model.
features = ['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode',
            'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

X = data[features]

# Standardise every feature so that no single column dominates the
# Euclidean distances K-means relies on.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

n_clusters = 10

# K-means starts from randomly chosen centroids. Without a fixed seed the
# cluster numbering (and possibly membership) changes on every run, which
# makes any downstream cell that refers to "cluster 3" meaningless after a
# kernel restart. Pin the seed, and pin n_init as well because its default
# differs between scikit-learn releases.
RANDOM_STATE = 42
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(X_scaled)

# Attach the label of each track to the table (re-running simply overwrites the column).
data['cluster'] = clusters

# Share of each genre inside every cluster: one row per cluster, one column
# per genre; genre/cluster pairs that never occur are filled with 0.
genre_distribution = (
    data.groupby('cluster')['track_genre']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)

print(genre_distribution)


track_genre  acoustic  afrobeat  alt-rock  alternative   ambient     anime  \
cluster                                                                      
0            0.004088  0.000681  0.000341     0.000000  0.103049  0.023846   
1            0.016329  0.002674  0.019003     0.004584  0.004870  0.016329   
2            0.003735  0.036652  0.010155     0.004903  0.000233  0.008404   
3            0.004021  0.003197  0.012477     0.002990  0.000825  0.010002   
4            0.041980  0.006092  0.003171     0.001586  0.013437  0.005008   
5            0.002559  0.009053  0.011218     0.001968  0.001771  0.005904   
6            0.004967  0.002247  0.017857     0.006031  0.001892  0.023061   
7            0.000129  0.019874  0.002452     0.000258  0.010195  0.012131   
8            0.000000  0.003009  0.000000     0.000000  0.001003  0.000000   
9            0.009302  0.022532  0.009943     0.002967  0.000401  0.006655   

track_genre  black-metal  bluegrass     blues    brazil  ...   

In [11]:
# Keep one principal component per scaled feature column so the cumulative
# curve printed below ends at 1.0. The count is read from X_scaled rather than
# hard-coded, so it stays correct if the feature list is edited.
pca = PCA(n_components=X_scaled.shape[1])
X_pca = pca.fit_transform(X_scaled)

# Cumulative explained variance: entry i is the fraction of total variance
# captured by the first i + 1 components.
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
print(cumulative_explained_variance)

# Disabled 2-D scatter of the clusters on the first two principal components.
# Left commented out as in the original; it relies on `clusters` and
# `n_clusters` from the clustering cell.
# plt.figure(figsize=(12, 8))
# for cluster in range(n_clusters):
#     plt.scatter(X_pca[clusters == cluster, 0], X_pca[clusters == cluster, 1], label=f'Cluster {cluster}')
# plt.title('K-means Clustering of Music Tracks')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.legend()
# plt.show()


[0.21318209 0.32387567 0.41810782 0.50173762 0.57724378 0.64537093
 0.70872498 0.77134352 0.83072444 0.8856497  0.93577035 0.96672016
 0.99003208 1.        ]


In [20]:
# How many of the highest-share genres to report for each cluster.
n_top = 5

# Iterate over the rows that genre_distribution actually contains instead of
# range(n_clusters): this cell then cannot raise KeyError for a cluster with
# no rows, and cannot silently skip rows if n_clusters was changed after the
# table was built.
for cluster, genre_shares in genre_distribution.iterrows():
    top_genres = genre_shares.nlargest(n_top)
    # (genre, share) pairs, with the share rounded to three decimals for display.
    top_genres_list = [(genre_name, round(ratio, 3)) for genre_name, ratio in top_genres.items()]
    print(f"Cluster {cluster}: Top {n_top} genres: {top_genres_list}")

Cluster 0: Top 5 genres: [('cantopop', 0.046), ('tango', 0.045), ('acoustic', 0.042), ('honky-tonk', 0.039), ('romance', 0.036)]
Cluster 1: Top 5 genres: [('kids', 0.036), ('salsa', 0.032), ('children', 0.03), ('forro', 0.028), ('party', 0.024)]
Cluster 2: Top 5 genres: [('heavy-metal', 0.036), ('death-metal', 0.033), ('j-idol', 0.031), ('grunge', 0.03), ('power-pop', 0.03)]
Cluster 3: Top 5 genres: [('hardstyle', 0.048), ('heavy-metal', 0.04), ('metalcore', 0.039), ('drum-and-bass', 0.035), ('happy', 0.033)]
Cluster 4: Top 5 genres: [('dancehall', 0.033), ('turkish', 0.027), ('salsa', 0.027), ('hip-hop', 0.026), ('afrobeat', 0.026)]
Cluster 5: Top 5 genres: [('pagode', 0.086), ('sertanejo', 0.061), ('samba', 0.049), ('mpb', 0.04), ('brazil', 0.032)]
Cluster 6: Top 5 genres: [('new-age', 0.128), ('ambient', 0.103), ('sleep', 0.097), ('classical', 0.09), ('piano', 0.072)]
Cluster 7: Top 5 genres: [('romance', 0.062), ('sleep', 0.057), ('opera', 0.054), ('honky-tonk', 0.039), ('tango', 0

In [6]:
# Genre labels, in alphabetical order and grouped here by initial letter.
# NOTE(review): presumably the reference vocabulary for `track_genre`; verify
# against the dataset before relying on it.
genre = [
    # a
    "acoustic", "afrobeat", "alt-rock", "alternative", "ambient", "anime",
    # b
    "black-metal", "bluegrass", "blues", "bossanova", "brazil", "breakbeat", "british",
    # c
    "cantopop", "chicago-house", "children", "chill", "classical", "club", "comedy", "country",
    # d
    "dance", "dancehall", "death-metal", "deep-house", "detroit-techno",
    "disco", "disney", "drum-and-bass", "dub", "dubstep",
    # e
    "edm", "electro", "electronic", "emo",
    # f
    "folk", "forro", "french", "funk",
    # g
    "garage", "german", "gospel", "goth", "grindcore", "groove", "grunge", "guitar",
    # h
    "happy", "hard-rock", "hardcore", "hardstyle", "heavy-metal",
    "hip-hop", "holidays", "honky-tonk", "house",
    # i
    "idm", "indian", "indie", "indie-pop", "industrial", "iranian",
    # j
    "j-dance", "j-idol", "j-pop", "j-rock", "jazz",
    # k
    "k-pop", "kids",
    # l
    "latin", "latino",
    # m
    "malay", "mandopop", "metal", "metal-misc", "metalcore", "minimal-techno", "movies", "mpb",
    # n
    "new-age", "new-release",
    # o
    "opera",
    # p
    "pagode", "party", "philippines-opm", "piano", "pop", "pop-film",
    "post-dubstep", "power-pop", "progressive-house", "psych-rock", "punk", "punk-rock",
    # r
    "r-n-b", "rainy-day", "reggae", "reggaeton", "road-trip",
    "rock", "rock-n-roll", "rockabilly", "romance",
    # s
    "sad", "salsa", "samba", "sertanejo", "show-tunes", "singer-songwriter",
    "ska", "sleep", "songwriter", "soul", "soundtracks", "spanish",
    "study", "summer", "swedish", "synth-pop",
    # t
    "tango", "techno", "trance", "trip-hop", "turkish",
    # w
    "work-out", "world-music",
]

# Report how many genre labels the list holds.
print(len(genre))

126
