In [2]:
import pandas as pd
from sklearn import datasets # sklearn comes with some toy datasets to practise
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


Now it's time to cluster the songs of the hot_songs and not_hot_songs databases according to the song's audio features. You will need to consider the following:

Are you going to use all the audio features? If not, which ones do you think to make more sense? What is the optimal number of clusters (for methods that need to know this beforehand)? What is the best distance to use? What clustering method provides better results? Does the clustering method need a transformer? Be aware that this process is extremely time-consuming!!! Therefore, when testing different options, save the models into your disk in order to be able to use the best model later. You don't want to retrain the best model again when you know what are the optimal parameters for each.

Add to the hot_songs and not_hot_songs databases a new column for each clustering method with the cluster membership of each song for each method.

In [3]:
pd.set_option('display.max_columns', None)

## Importing the datasets and concatenating


In [4]:
billboard100_final = pd.read_csv('billboard100_final.csv')

In [5]:
billboard100_final.head()

Unnamed: 0,song_id,title,artist,dataset,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
0,4LRPiXqCikLlN15c3yImP7,As It Was,Harry Styles,H,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,audio_features,spotify:track:4LRPiXqCikLlN15c3yImP7,https://api.spotify.com/v1/tracks/4LRPiXqCikLl...,https://api.spotify.com/v1/audio-analysis/4LRP...,167303,4
1,1rDQ4oMwGJI7B4tovsBOxc,First Class,Jack Harlow,H,0.905,0.563,8,-6.135,1,0.102,0.0254,1e-05,0.113,0.324,106.998,audio_features,spotify:track:1rDQ4oMwGJI7B4tovsBOxc,https://api.spotify.com/v1/tracks/1rDQ4oMwGJI7...,https://api.spotify.com/v1/audio-analysis/1rDQ...,173948,4
2,1PckUlxKqWQs3RlWXVBLw3,About Damn Time,Lizzo,H,0.836,0.743,10,-6.305,0,0.0656,0.0995,0.0,0.335,0.722,108.966,audio_features,spotify:track:1PckUlxKqWQs3RlWXVBLw3,https://api.spotify.com/v1/tracks/1PckUlxKqWQs...,https://api.spotify.com/v1/audio-analysis/1Pck...,191822,4
3,3USxtqRwSYz57Ewm6wWRMp,Heat Waves,Glass Animals,H,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87,audio_features,spotify:track:3USxtqRwSYz57Ewm6wWRMp,https://api.spotify.com/v1/tracks/3USxtqRwSYz5...,https://api.spotify.com/v1/audio-analysis/3USx...,238805,4
4,6Zu3aw7FfjAF9WA0fA81Oq,Big Energy,Latto,H,0.935,0.807,11,-3.838,0,0.114,0.0514,0.0,0.349,0.813,106.017,audio_features,spotify:track:6Zu3aw7FfjAF9WA0fA81Oq,https://api.spotify.com/v1/tracks/6Zu3aw7FfjAF...,https://api.spotify.com/v1/audio-analysis/6Zu3...,173182,4


In [6]:
billboard100_final.shape


(75, 21)

In [8]:
nothotsongs_final = pd.read_csv('nothotsongs_final.csv')

In [15]:
nothotsongs_final = nothotsongs_final.iloc[:, :21]

In [17]:
X = pd.concat([billboard100_final, nothotsongs_final], axis=0)

In [18]:
X.head()

Unnamed: 0,song_id,title,artist,dataset,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
0,4LRPiXqCikLlN15c3yImP7,As It Was,Harry Styles,H,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,audio_features,spotify:track:4LRPiXqCikLlN15c3yImP7,https://api.spotify.com/v1/tracks/4LRPiXqCikLl...,https://api.spotify.com/v1/audio-analysis/4LRP...,167303,4
1,1rDQ4oMwGJI7B4tovsBOxc,First Class,Jack Harlow,H,0.905,0.563,8,-6.135,1,0.102,0.0254,1e-05,0.113,0.324,106.998,audio_features,spotify:track:1rDQ4oMwGJI7B4tovsBOxc,https://api.spotify.com/v1/tracks/1rDQ4oMwGJI7...,https://api.spotify.com/v1/audio-analysis/1rDQ...,173948,4
2,1PckUlxKqWQs3RlWXVBLw3,About Damn Time,Lizzo,H,0.836,0.743,10,-6.305,0,0.0656,0.0995,0.0,0.335,0.722,108.966,audio_features,spotify:track:1PckUlxKqWQs3RlWXVBLw3,https://api.spotify.com/v1/tracks/1PckUlxKqWQs...,https://api.spotify.com/v1/audio-analysis/1Pck...,191822,4
3,3USxtqRwSYz57Ewm6wWRMp,Heat Waves,Glass Animals,H,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87,audio_features,spotify:track:3USxtqRwSYz57Ewm6wWRMp,https://api.spotify.com/v1/tracks/3USxtqRwSYz5...,https://api.spotify.com/v1/audio-analysis/3USx...,238805,4
4,6Zu3aw7FfjAF9WA0fA81Oq,Big Energy,Latto,H,0.935,0.807,11,-3.838,0,0.114,0.0514,0.0,0.349,0.813,106.017,audio_features,spotify:track:6Zu3aw7FfjAF9WA0fA81Oq,https://api.spotify.com/v1/tracks/6Zu3aw7FfjAF...,https://api.spotify.com/v1/audio-analysis/6Zu3...,173182,4


In [19]:
X.shape

(1949, 21)

## Dropping unnecessary columns

In [25]:
X.drop(columns=['song_id','title', 'artist', 'dataset','type', 'uri', 'track_href', 'analysis_url', 'time_signature'], axis=1, inplace=True)

## Scaling features

In [33]:
import pickle

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
filename = "C:/Users/31612/Documents/IRONHACK/6Week_six/lab-web-scraping-single-page" # Path with filename

with open(filename, "wb") as file:
        pickle.dump(scaler,file)

X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns)
display(X.head())
print()
display(X_scaled_df.head())

PermissionError: [Errno 13] Permission denied: 'C:/Users/31612/Documents/IRONHACK/6Week_six/lab-web-scraping-single-page'

In [None]:
X_scaled_df.describe()

## Clustering the wines with K-Means

In [None]:
kmeans = KMeans(n_clusters=8, random_state=1234)
kmeans.fit(X_scaled_df)

In [None]:
clusters = kmeans.predict(X_scaled_df)

pd.Series(clusters).value_counts().sort_index()

In [None]:
X["cluster"] = clusters
X.head()

In [None]:
#X[(X['dataset'] == "N") & (X['cluster'] == user_cluster)].sample() dataset== H or NH - cluster==assigned cluster

## Chosing K

In [None]:
K = range(2, 21)

inertia = []
silhouette = []

for k in K:
    print("Training a K-Means model with {} clusters! ".format(k))
    print()
    kmeans = KMeans(n_clusters=k,
                    random_state=1234,
                    verbose=1)
    kmeans.fit(X_scaled_df)
    
    filename = "/content/drive/MyDrive/kmeans_" + str(k) + ".pickle" # Path with filename
    with open(filename, "wb") as file:
        pickle.dump(kmeans,file)

    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(X_scaled_df, kmeans.predict(X_scaled_df)))


import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1,2,figsize=(16,8))
ax[0].plot(K, inertia, 'bx-')
ax[0].set_xlabel('k')
ax[0].set_ylabel('inertia')
ax[0].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[0].set_title('Elbow Method showing the optimal k')
ax[1].plot(K, silhouette, 'bx-')
ax[1].set_xlabel('k')
ax[1].set_ylabel('silhouette score')
ax[1].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[1].set_title('Silhouette Method showing the optimal k')

## Silouhette

In [None]:
def load(filename = "filename.pickle"): 
    try: 
        with open(filename, "rb") as file: 
            return pickle.load(file) 
    except FileNotFoundError: 
        print("File not found!") 

## Loading the scaler and the best model

In [None]:
scaler2 = load("/content/drive/MyDrive/scaler.pickle")
scaler2

In [None]:
best_model = load("/content/drive/MyDrive/kmeans_9.pickle")