In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Reduce the number of feature columns in the dataset using PCA (13 -> 2)
def pca(df, n_components=2, random_state=42):
    """Project the non-``track_uri`` columns of ``df`` onto principal components.

    Args:
        df: DataFrame holding a ``track_uri`` column plus the feature columns
            to reduce. It is not modified.
        n_components: Forwarded to ``PCA``; defaults to 2.
        random_state: Seed forwarded to ``PCA`` so repeated runs give the same
            projection when scikit-learn uses a randomized SVD solver.

    Returns:
        A ``(result_df, pca_init)`` tuple. ``result_df`` has one ``PC<i>``
        column per component followed by ``track_uri`` (same row order as
        ``df``); ``pca_init`` is the fitted ``PCA`` object.
    """
    # Keep the identifiers aside: they are labels, not features.
    track_uris = df['track_uri'].to_numpy()
    df_numeric = df.drop(columns='track_uri')

    pca_init = PCA(n_components=n_components, random_state=random_state)
    pca_result = pca_init.fit_transform(df_numeric)

    # Name the columns from the actual output width so any component count works.
    component_names = [f'PC{i + 1}' for i in range(pca_result.shape[1])]
    result_df = pd.DataFrame(pca_result, columns=component_names)
    # Positional assignment: row i of the projection belongs to row i of df.
    result_df['track_uri'] = track_uris

    print(result_df)
    return result_df, pca_init

In [None]:
# Use the elbow method to see how many clusters work best
def kmeans_cluster_tuning(data, possible_clusters=None, sample_frac=0.03):
    """Pick a cluster count for KMeans from an inertia ("elbow") sweep.

    A KMeans model is fitted on a random sample of ``data`` for every
    candidate cluster count, the inertia curve is plotted, and the candidate
    whose inertia change to the next candidate is closest to the median
    change is returned.

    Args:
        data: DataFrame of features. A ``track_uri`` column, if present, is
            dropped before fitting.
        possible_clusters: Candidate cluster counts, tried in the given
            order. Defaults to 10, 20, ..., 100.
        sample_frac: Fraction of rows sampled for the sweep (default 0.03).

    Returns:
        The selected element of ``possible_clusters``.

    Raises:
        ValueError: If fewer than two candidates are given, or the sample has
            fewer rows than the largest candidate.
    """
    if possible_clusters is None:
        candidates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    else:
        candidates = list(possible_clusters)
    if len(candidates) < 2:
        raise ValueError(
            "Need at least two candidate cluster counts to compare inertia changes."
        )

    if 'track_uri' in data.columns:
        data = data.drop('track_uri', axis=1)

    # The sweep runs on a random sample because the full dataset is too large.
    random_sample = data.sample(frac=sample_frac, replace=False, random_state=1)

    # Fail before any fitting: KMeans needs at least as many rows as clusters,
    # so a too-small sample would otherwise only fail midway through the sweep.
    sample_size = len(random_sample)
    if max(candidates) > sample_size:
        raise ValueError(
            f"The sample has {sample_size} rows but the largest candidate is "
            f"{max(candidates)} clusters; pass more data, a larger sample_frac, "
            f"or smaller possible_clusters."
        )

    # Elbow sweep: one inertia value per candidate.
    inertia = []
    for num_clusters in candidates:
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(random_sample)
        inertia.append(kmeans.inertia_)
    print(inertia)

    # differences[i] is the inertia change going from candidates[i] to candidates[i + 1].
    differences = [inertia[i + 1] - inertia[i] for i in range(len(inertia) - 1)]
    median_difference = np.median(differences)
    print(differences)
    print(median_difference)

    # Closest-to-median instead of an exact lookup: with an even number of
    # differences the median is the mean of the two middle values and is not
    # an element of the list. Ties resolve to the earliest candidate.
    median_index = int(np.argmin(np.abs(np.asarray(differences) - median_difference)))

    # Plot inertia against the number of clusters.
    plt.plot(candidates, inertia, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia (Within-Cluster Sum of Squares)')
    plt.title('Elbow Method for Optimal Number of Clusters')
    plt.show()
    return candidates[median_index]

In [None]:
# Visualize the clustering results
def visualizing_results(data, kmeans):
    """Draw a scatter plot of the first two columns of ``data``, colored by cluster.

    Args:
        data: Array indexed as ``data[:, 0]`` (x axis) and ``data[:, 1]`` (y axis).
        kmeans: Object whose ``labels_`` attribute supplies the per-point colors.
    """
    x_values, y_values = data[:, 0], data[:, 1]

    # One color per cluster label; semi-transparent so overlapping points stay visible.
    scatter_style = {
        'c': kmeans.labels_,
        'cmap': 'viridis',
        'marker': 'o',
        'edgecolor': 'k',
        'alpha': 0.5,
        'label': 'Predicted Clusters',
    }
    plt.scatter(x_values, y_values, **scatter_style)

    plt.title('Song Attribute Clusters')
    plt.xlabel('PCA_1')
    plt.ylabel('PCA_2')
    plt.show()

In [None]:
def main(csv_path='spotify_song_output.csv'):
    """Run the pipeline: load songs, reduce with PCA, tune k, cluster, plot.

    Args:
        csv_path: CSV file to read; it must contain a ``track_uri`` column.
            Defaults to ``'spotify_song_output.csv'``.

    Returns:
        DataFrame with one row per song and the columns ``track_uri`` and
        ``cluster_label``.
    """
    print("\n\n1. Reading Songs DataSet\n")
    df_raw = pd.read_csv(csv_path)
    row, col = df_raw.shape
    print(f'There are {row} rows and {col} columns')

    print("\n\n2. Reducing via PCA\n")
    pca_frame, _ = pca(df_raw)
    # Select only the component columns: converting the whole frame would drag
    # the string track_uri column along and yield an object-dtype array.
    pca_result = pca_frame[['PC1', 'PC2']].to_numpy()

    print("\n\n3. HyperTuning the Parameter for KMeans\n")
    optimum_num_clusters = kmeans_cluster_tuning(df_raw)
    print("optimum num of clusters =", optimum_num_clusters)

    print("\n\n4. K-means clustering")
    track_uris = df_raw['track_uri'].copy()
    song_features = df_raw.drop('track_uri', axis=1)

    # Clustering runs on the original feature columns; the PCA projection is
    # only used as plotting coordinates.
    kmeans = KMeans(n_clusters=optimum_num_clusters, random_state=42)
    kmeans.fit(song_features)

    cluster_df = pd.DataFrame({'track_uri': track_uris, 'cluster_label': kmeans.labels_})

    print("\n\n5. Visualizing the data")
    visualizing_results(pca_result, kmeans)

    print('\n cluster_df')
    print(cluster_df)
    return cluster_df

In [None]:
# Run the whole pipeline; main() returns the track_uri / cluster_label frame.
main()