<a href="https://colab.research.google.com/github/CamCranda11/MLFA25Project/blob/main/MLProjectModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the raw song table from a CSV located next to the notebook
# (relative path, so the working directory must contain the file).
# NOTE(review): the preview printed below shows an 'Unnamed: 0' column, which
# looks like a row index that was saved into the CSV — confirm whether it
# should be dropped or consumed with index_col=0.
song_data = pd.read_csv('spotify_data.csv')

# Plain-text preview of the first rows as a quick sanity check of the load.
print("--- Original Data ---")
print(song_data.head())
print("\n")

--- Original Data ---
   Unnamed: 0    artist_name        track_name                track_id  \
0           0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6   
1           1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218   
2           2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F   
3           3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz   
4           4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8   

   popularity  year     genre  danceability  energy  key  loudness  mode  \
0          68  2012  acoustic         0.483   0.303    4   -10.058     1   
1          50  2012  acoustic         0.572   0.454    3   -10.286     1   
2          57  2012  acoustic         0.409   0.234    3   -13.711     1   
3          58  2012  acoustic         0.392   0.251   10    -9.845     1   
4          54  2012  acoustic         0.430   0.791    6    -5.419     0   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  

In [32]:
# Columns of song_data that feed the clustering step.
features_to_cluster = [
    'danceability', 'energy', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'valence',
]
# Work on an independent copy so song_data itself is never modified.
features_df = song_data.loc[:, features_to_cluster].copy()

# Fill missing 'mode' entries with the most frequent value of that column.
# Only 'mode' is imputed here; the other feature columns are passed through
# unchanged.
mode_value = features_df['mode'].mode()[0]
features_df = features_df.assign(mode=features_df['mode'].fillna(mode_value))

# Standardise every feature column with the fitted scaler. Despite the "_df"
# suffix, the printed result below is array-formatted, not a DataFrame.
scaler = StandardScaler()
scaled_features_df = scaler.fit(features_df).transform(features_df)

print("--- Scaled Features (First 5 Rows) ---")
print(scaled_features_df[:5])
print("\n")

--- Scaled Features (First 5 Rows) ---
[[-0.29509342 -1.24461718  0.75872495 -0.39352278  1.04922991 -0.69122871
  -1.17892497]
 [ 0.18734904 -0.68639325  0.75872495 -0.52833738  0.43794003 -0.69119118
   0.22134908]
 [-0.69622536 -1.49969964  0.75872495 -0.47709206  0.04637646 -0.69109175
  -1.15658017]
 [-0.78837729 -1.43685324  0.75872495 -0.44555648  1.36755137 -0.69122871
   0.19528015]
 [-0.58239062  0.55944426 -1.31800069 -0.49364824 -0.70125643 -0.63836256
  -0.88844259]]




In [33]:
# Number of clusters to form.
# NOTE(review): no selection procedure (elbow plot, silhouette score, ...) for
# this value is visible in the notebook — confirm how 50 was chosen.
optimal_k = 50

# Fixed random_state keeps the cluster assignment reproducible across runs;
# fit() returns the fitted estimator, so it can be chained on construction.
kmeans_model = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=10,
    random_state=42,
).fit(scaled_features_df)

# One integer cluster label per row of the scaled feature matrix.
cluster_labels = kmeans_model.labels_

# New frame = original song table plus its cluster label; song_data is left
# untouched because assign() returns a copy.
song_cluster_df = song_data.assign(cluster_id=cluster_labels)

print("--- Data with Cluster IDs ---")
display(song_cluster_df.head())
print("\n")

--- Data with Cluster IDs ---


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster_id
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,...,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3,17
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,...,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4,27
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,...,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4,17
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,...,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4,1
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,...,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4,40






In [34]:
def recommend_songs(input_song_name, input_artist_name, data_df, num_recs=3, random_state=None):
    """
    Recommend songs that share both the cluster and the genre of an input song.

    The input song is located by an exact, case-sensitive match on
    ``track_name`` and ``artist_name``. If several rows match, the first one
    supplies the reference cluster and genre. Every row matching the input
    song's name/artist pair is excluded from the candidates.

    Args:
        input_song_name: Track name to look up in ``data_df['track_name']``.
        input_artist_name: Artist name to look up in ``data_df['artist_name']``.
        data_df: DataFrame with at least the columns ``track_name``,
            ``artist_name``, ``genre`` and ``cluster_id``.
        num_recs: Maximum number of recommendations to return.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            random pick can be reproduced. ``None`` (default) keeps the
            original unseeded behaviour.

    Returns:
        A DataFrame holding up to ``num_recs`` randomly sampled candidate rows
        (all columns of ``data_df`` are kept, so ``track_id`` is available for
        building Spotify links), or a ``str`` message when the input song is
        not in the dataset or no candidate shares its cluster and genre.
    """
    is_input_song = (
        (data_df['track_name'] == input_song_name) &
        (data_df['artist_name'] == input_artist_name)
    )

    # Explicit emptiness check instead of wrapping the whole body in
    # try/except IndexError: an unrelated IndexError raised further down is no
    # longer misreported as "song not found".
    matches = data_df[is_input_song]
    if matches.empty:
        return f"Song '{input_song_name}' by {input_artist_name} not found in the dataset."

    song_row = matches.iloc[0]

    recommendations = data_df[
        (data_df['cluster_id'] == song_row['cluster_id']) &
        (data_df['genre'] == song_row['genre']) &
        ~is_input_song
    ]

    if recommendations.empty:
        return "No similar songs found in the same genre."

    # Never request more rows than exist; sample() would raise otherwise.
    sample_size = min(num_recs, len(recommendations))
    return recommendations.sample(sample_size, random_state=random_state)

In [118]:
def show_recommendations(song_name, artist, data_df, num_recs=5):
    """
    Print an input song's feature profile followed by its recommendations.

    Replaces the lookup/print/display block that was previously copy-pasted
    once per demo song.

    Args:
        song_name: Track name matched exactly against ``data_df['track_name']``.
        artist: Artist name matched exactly against ``data_df['artist_name']``.
        data_df: Clustered song table (must contain ``cluster_id``).
        num_recs: Maximum number of recommendations requested from
            ``recommend_songs``.

    Returns:
        ``(song_row, recs)`` where ``song_row`` is the first matching row as a
        Series (``None`` when the song is not in ``data_df``) and ``recs`` is
        whatever ``recommend_songs`` returned (a DataFrame or a message string).
    """
    profile_cols = ['genre', 'danceability', 'energy', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'cluster_id']
    rec_cols = ['track_name', 'artist_name', 'track_id'] + profile_cols

    recs = recommend_songs(song_name, artist, data_df, num_recs=num_recs)

    matches = data_df[
        (data_df['track_name'] == song_name) &
        (data_df['artist_name'] == artist)
    ]
    if matches.empty:
        print(f"Song '{song_name}' by {artist} not found in the dataset.")
        return None, recs

    song_row = matches.iloc[0]
    track_id = song_row['track_id']
    print(f"--- Input Song: '{song_name}' by {artist} https://open.spotify.com/track/{track_id} ---")
    display(song_row[profile_cols])

    print(f"\n--- Recommendations based on '{song_name}' by {artist} ---")
    # recommend_songs returns a message string when it has nothing to offer.
    if isinstance(recs, pd.DataFrame):
        display(recs[rec_cols])
    else:
        print(recs)

    return song_row, recs


my_song = 'Catapult'
my_artist = 'Arctic Monkeys'
input_song_row, recommendations = show_recommendations(my_song, my_artist, song_cluster_df, num_recs=5)

print("\n")

my_song_2 = 'Trouble Weighs A Ton'
my_artist_2 = 'Dan Auerbach'
input_song_2_row, recommendations_2 = show_recommendations(my_song_2, my_artist_2, song_cluster_df, num_recs=5)

--- Input Song: 'Catapult' by Arctic Monkeys https://open.spotify.com/track/7f1ljYFKKfNKaec4IS6M7I ---


Unnamed: 0,1040293
genre,garage
danceability,0.483
energy,0.937
mode,0
speechiness,0.0759
acousticness,0.000212
instrumentalness,0.0
valence,0.673
cluster_id,20



--- Recommendations based on 'Catapult' by Arctic Monkeys ---


Unnamed: 0,track_name,artist_name,track_id,genre,danceability,energy,mode,speechiness,acousticness,instrumentalness,valence,cluster_id
640272,Amerika First - 1977 Outtakes,The Gizmos,3b1Wo4rdsEEOAFyR7e4pDS,garage,0.306,0.904,0,0.126,0.0296,0.0566,0.556,20
499730,Better In Blak - triple j Like A Version,Beddy Rays,28xmH0wx39DUfTlNZA2pM3,garage,0.312,0.725,0,0.101,0.000134,0.0404,0.406,20
684591,West Texas Sound,The Deadly Snakes,3wUJqJuVSShVJWmNHAu00z,garage,0.206,0.938,0,0.0498,0.000147,0.316,0.64,20
236274,Runaway,Vanilla Gorilla,4l1X2rTgLUhyLNd2750QLP,garage,0.37,0.9,0,0.0426,0.0993,2.3e-05,0.704,20
276730,Comeback Kid,Kasabian,5FYRDy9dLVi2uTa8httrVk,garage,0.486,0.926,0,0.0932,0.00301,0.0,0.551,20




--- Input Song: 'Trouble Weighs A Ton' by Dan Auerbach https://open.spotify.com/track/6CF9VxGntf2m5BltF2Rrim ---


Unnamed: 0,1037585
genre,folk
danceability,0.515
energy,0.0848
mode,1
speechiness,0.0494
acousticness,0.888
instrumentalness,0.0
valence,0.389
cluster_id,1



--- Recommendations based on 'Trouble Weighs A Ton' by Dan Auerbach ---


Unnamed: 0,track_name,artist_name,track_id,genre,danceability,energy,mode,speechiness,acousticness,instrumentalness,valence,cluster_id
496322,Gold,Jake Isaac,71u5Z6SNQR4d5SEm1gr9Md,folk,0.558,0.237,1,0.0367,0.949,0.00401,0.334,1
329708,Overland,I'm With Her,4k5R36pR5HA3SHlabSQX56,folk,0.586,0.102,1,0.0395,0.936,0.0,0.278,1
897293,"Don't Think Twice, It's All Right",Joan Baez,5YqdrbmXUOYUjL7GYaSVSo,folk,0.529,0.117,1,0.0322,0.896,0.00101,0.474,1
273762,"Mother, Mother",Rayland Baxter,6X0hjAJlVW9PiqQIaGsVzd,folk,0.372,0.105,1,0.0298,0.899,0.000459,0.565,1
808990,Melo,Seth Faergolzia,0Qn9stIHY2iUwN4UzScxT1,folk,0.624,0.0514,1,0.0446,0.919,0.174,0.369,1


In [116]:
import pandas as pd

# Artist whose full catalogue (as present in song_data) should be listed.
artist_name = "Dan Auerbach"

# Exact, case-sensitive match on the artist column.
artist_songs = song_data[song_data['artist_name'] == artist_name]

if len(artist_songs) > 0:
    print(f"--- Songs by {artist_name} ---")
    # Lift the row limit only for this one display. option_context restores
    # the previous setting on exit — even if display() raises — whereas the
    # set_option/reset_option pair would leave the limit lifted on error and
    # would reset to the library default rather than the prior value.
    with pd.option_context('display.max_rows', None):
        display(artist_songs)
else:
    print(f"No songs found for artist: {artist_name}")

--- Songs by Dan Auerbach ---


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
257686,257686,Dan Auerbach,Never in My Wildest Dreams,3MJov5mT64K42o1Rffhblq,61,2017,blues,0.559,0.466,5,-6.769,1,0.0362,0.315,0.0039,0.0933,0.623,117.21,176027,4
257700,257700,Dan Auerbach,Undertow,2Zv95uoSB1H4puVfAYbovF,49,2017,blues,0.434,0.726,7,-5.482,0,0.033,0.0351,0.288,0.476,0.7,92.751,203560,4
257703,257703,Dan Auerbach,King of a One Horse Town,4QF8w44G6fU4VNxQJK1OJn,50,2017,blues,0.571,0.698,9,-5.056,0,0.0353,0.0824,0.0037,0.279,0.61,91.647,226200,4
257706,257706,Dan Auerbach,Malibu Man,5fp7TUlIjOMdhNMkJJIvgx,47,2017,blues,0.615,0.782,9,-4.683,1,0.0346,0.127,0.00165,0.078,0.902,105.736,216107,4
257709,257709,Dan Auerbach,Run That Race,2NVjII4zi5d4c9F7ivNYsh,49,2017,blues,0.527,0.956,11,-3.581,0,0.0777,0.0217,0.000836,0.154,0.816,137.063,164293,4
257736,257736,Dan Auerbach,Shine on Me,3SFWCLORNGjFCwBfv3ysgy,43,2017,blues,0.429,0.893,2,-4.661,1,0.034,0.0471,0.0,0.141,0.952,152.816,197680,4
257749,257749,Dan Auerbach,Stand by My Girl,6LEFWv6jPUy0uoTY9hN6wD,38,2017,blues,0.592,0.845,2,-5.589,1,0.0297,0.0328,0.0,0.555,0.96,117.231,233507,4
257761,257761,Dan Auerbach,Cherrybomb,7ejVezMnmqGT84etM4IEmH,38,2017,blues,0.733,0.783,9,-4.93,1,0.0516,0.0465,0.0761,0.148,0.778,114.616,225093,4
257767,257767,Dan Auerbach,Waiting on a Song,1NoWKTDd0FnhUiIevfCU7u,39,2017,blues,0.614,0.915,4,-3.667,1,0.0286,0.19,1e-06,0.0837,0.964,127.203,169853,4
257783,257783,Dan Auerbach,We Gotta Get out of This Place,2344mSPPaLbvT8FLjlu1Za,36,2017,blues,0.599,0.71,5,-7.052,1,0.0265,0.188,7e-06,0.192,0.897,124.638,202154,4


In [None]:
# Write the clustered song table to a CSV in the working directory;
# index=False keeps the DataFrame's row index out of the file.
# NOTE(review): song_cluster_df is a copy of song_data, whose preview shows an
# 'Unnamed: 0' column — that column is written out as well; confirm it is wanted.
song_cluster_df.to_csv('song_data_with_clusters.csv', index=False)