<a href="https://colab.research.google.com/github/CamCranda11/MLFA25Project/blob/main/MLProjectModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Read the track table from the CSV that sits next to the notebook.
song_data = pd.read_csv('spotify_data.csv')

# Preview the first rows. One print call emits the banner, the preview and
# the trailing blank lines, separated by newlines.
print("--- Original Data ---", song_data.head(), "\n", sep="\n")

--- Original Data ---
   Unnamed: 0    artist_name        track_name                track_id  \
0           0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6   
1           1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218   
2           2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F   
3           3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz   
4           4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8   

   popularity  year     genre  danceability  energy  key  loudness  mode  \
0          68  2012  acoustic         0.483   0.303    4   -10.058     1   
1          50  2012  acoustic         0.572   0.454    3   -10.286     1   
2          57  2012  acoustic         0.409   0.234    3   -13.711     1   
3          58  2012  acoustic         0.392   0.251   10    -9.845     1   
4          54  2012  acoustic         0.430   0.791    6    -5.419     0   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  

In [24]:
# Columns fed to the clustering step.
features_to_cluster = [
    'danceability',
    'energy',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
]
features_df = song_data.loc[:, features_to_cluster].copy()

# Fill gaps in 'mode' with its most frequent value; the other columns are
# passed through untouched.
mode_value = features_df['mode'].mode()[0]
features_df = features_df.fillna({'mode': mode_value})

# Standardize every feature column. Note that despite its name,
# scaled_features_df is the array returned by the scaler, not a DataFrame.
scaler = StandardScaler()
scaler.fit(features_df)
scaled_features_df = scaler.transform(features_df)

print("--- Scaled Features (First 5 Rows) ---", scaled_features_df[:5], "\n", sep="\n")

--- Scaled Features (First 5 Rows) ---
[[-0.29509342 -1.24461718  0.75872495 -0.39352278  1.04922991 -0.69122871
  -1.17892497]
 [ 0.18734904 -0.68639325  0.75872495 -0.52833738  0.43794003 -0.69119118
   0.22134908]
 [-0.69622536 -1.49969964  0.75872495 -0.47709206  0.04637646 -0.69109175
  -1.15658017]
 [-0.78837729 -1.43685324  0.75872495 -0.44555648  1.36755137 -0.69122871
   0.19528015]
 [-0.58239062  0.55944426 -1.31800069 -0.49364824 -0.70125643 -0.63836256
  -0.88844259]]




In [25]:
# Number of clusters requested from KMeans.
optimal_k = 50

kmeans_model = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=10,
    random_state=42,
)

# fit() returns the fitted estimator, so the labels can be read off directly.
cluster_labels = kmeans_model.fit(scaled_features_df).labels_

# New frame: every original column plus the cluster assignment of each row.
song_cluster_df = song_data.assign(cluster_id=cluster_labels)

print("--- Data with Cluster IDs ---")
display(song_cluster_df.head())
print("\n")

--- Data with Cluster IDs ---


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster_id
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,...,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3,17
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,...,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4,27
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,...,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4,17
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,...,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4,1
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,...,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4,40






In [26]:
def recommend_songs(input_song_name, input_artist_name, data_df, num_recs=3, random_state=None):
    """
    Recommend songs from the same cluster as the given song and artist.

    Parameters
    ----------
    input_song_name : str
        Value compared for exact equality against the `track_name` column.
    input_artist_name : str
        Value compared for exact equality against the `artist_name` column.
    data_df : pandas.DataFrame
        Must contain `track_name`, `artist_name` and `cluster_id` columns.
        Every column (e.g. `track_id`, useful for building Spotify links)
        is carried through to the result.
    num_recs : int, default 3
        Maximum number of rows to return; capped at the number of
        candidates available in the cluster.
    random_state : optional
        Forwarded to `DataFrame.sample`. Pass a fixed value to get the same
        recommendations on every call; the default `None` keeps the
        original unseeded sampling.

    Returns
    -------
    pandas.DataFrame or str
        A random sample of rows from the input song's cluster, excluding
        rows that match both the input name and artist. A message string
        is returned instead when the song is not in `data_df` or when its
        cluster holds no other songs.
    """
    is_input_song = (
        (data_df['track_name'] == input_song_name) &
        (data_df['artist_name'] == input_artist_name)
    )

    # Explicit "not found" check instead of a try/except around the whole
    # body, so unrelated IndexErrors are never reported as a missing song.
    if not is_input_song.any():
        return f"Song '{input_song_name}' by {input_artist_name} not found in the dataset."

    # When the song appears more than once, the first matching row decides
    # which cluster is used.
    song_cluster = data_df.loc[is_input_song, 'cluster_id'].iloc[0]

    recommendations = data_df[(data_df['cluster_id'] == song_cluster) & ~is_input_song]

    if recommendations.empty:
        return "No similar songs found."

    return recommendations.sample(
        min(num_recs, len(recommendations)),
        random_state=random_state,
    )

In [27]:
# Columns shown for the input song, and for each recommended row.
INPUT_SONG_COLUMNS = ['genre', 'danceability', 'energy', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'cluster_id']
RECOMMENDATION_COLUMNS = ['track_name', 'artist_name', 'track_id'] + INPUT_SONG_COLUMNS


def show_recommendations(song_name, artist_name, data_df, num_recs=5):
    """
    Print the input song's details and its recommendations.

    Replaces the stanza that was previously copy-pasted once per song.

    Parameters
    ----------
    song_name, artist_name : str
        Compared for exact equality against `track_name` / `artist_name`.
    data_df : pandas.DataFrame
        Frame passed to `recommend_songs`; must also hold `track_id` and
        the columns listed in INPUT_SONG_COLUMNS.
    num_recs : int, default 5
        Forwarded to `recommend_songs`.

    Returns
    -------
    tuple
        `(input_row, recommendations)`. `input_row` is the first matching
        row as a Series, or None when the song is not in `data_df`.
        `recommendations` is whatever `recommend_songs` returned
        (a DataFrame or a message string).
    """
    song_recs = recommend_songs(song_name, artist_name, data_df, num_recs=num_recs)

    matches = data_df[
        (data_df['track_name'] == song_name) &
        (data_df['artist_name'] == artist_name)
    ]
    if matches.empty:
        print(f"Song '{song_name}' by {artist_name} not found in the dataset.")
        return None, song_recs

    input_row = matches.iloc[0]
    print(f"--- Input Song: '{song_name}' by {artist_name} https://open.spotify.com/track/{input_row['track_id']} ---")
    display(input_row[INPUT_SONG_COLUMNS])

    print(f"\n--- Recommendations based on '{song_name}' by {artist_name} ---")
    # recommend_songs returns a message string when it has nothing to show.
    if isinstance(song_recs, pd.DataFrame):
        display(song_recs[RECOMMENDATION_COLUMNS])
    else:
        print(song_recs)

    return input_row, song_recs


my_song = 'Do I Wanna Know?'
my_artist = 'Arctic Monkeys'
input_song_row, recommendations = show_recommendations(my_song, my_artist, song_cluster_df, num_recs=5)

print("\n")

my_song_2 = 'Some Nights'
my_artist_2 = 'fun.'
input_song_2_row, recommendations_2 = show_recommendations(my_song_2, my_artist_2, song_cluster_df, num_recs=5)

--- Input Song: 'Do I Wanna Know?' by Arctic Monkeys https://open.spotify.com/track/5FVd6KXrgO9B3JPmC8OPst ---


Unnamed: 0,77593
genre,garage
danceability,0.548
energy,0.532
mode,1
speechiness,0.0323
acousticness,0.186
instrumentalness,0.000263
valence,0.405
cluster_id,10



--- Recommendations based on 'Do I Wanna Know?' by Arctic Monkeys ---


Unnamed: 0,track_name,artist_name,track_id,genre,danceability,energy,mode,speechiness,acousticness,instrumentalness,valence,cluster_id
532241,Christ Our King - Live From Camp,Passion,36JWXzz773ljAGmwwz4ISZ,alt-rock,0.301,0.47,1,0.0314,0.195,0.0,0.0724,10
778848,친구의 친구를 사랑했네,Lee Seung Chul,7fl6Cs2uXWpF5tYw3mOol2,k-pop,0.546,0.662,1,0.0378,0.104,0.00228,0.226,10
613864,Letter to the North Star - Live - Set 2,Hot Tuna,3PuScVxsvDkyb4LORrzeBL,psych-rock,0.463,0.456,1,0.0345,0.104,0.022,0.307,10
1065974,Stars,Skye,2GN64qVdQhLrYKOMWg23lR,trip-hop,0.534,0.325,1,0.033,0.0523,8.2e-05,0.222,10
629413,The Grocer's Daughter,Television Personalities,3UPrhHsvYavtFjPIoHCoKQ,club,0.39,0.615,1,0.0296,0.256,0.0,0.536,10




--- Input Song: 'Some Nights' by fun. https://open.spotify.com/track/6t6oULCRS6hnI7rm0h5gwl ---


Unnamed: 0,40353
genre,pop
danceability,0.672
energy,0.738
mode,1
speechiness,0.0506
acousticness,0.0178
instrumentalness,0.000068
valence,0.392
cluster_id,15



--- Recommendations based on 'Some Nights' by fun. ---


Unnamed: 0,track_name,artist_name,track_id,genre,danceability,energy,mode,speechiness,acousticness,instrumentalness,valence,cluster_id
844830,Pop Tailgate. . . Wooooooooooooo,Chingo Bling,62A0KBd0VabDfr8aGc7IYB,comedy,0.78,0.87,1,0.0301,0.00248,0.0,0.42,15
506971,Inside Outside,Mac Miller,2EFqMCOdTTkcFYHoJH21Jr,hip-hop,0.689,0.746,1,0.0549,0.0771,0.0,0.376,15
378022,Anti-Everything,Lost Kings,5d1fRO6RYAtbPPgbpSHnlA,dance,0.734,0.663,1,0.185,0.28,0.0,0.256,15
270580,Show & Tell,Said The Sky,5F3E5Ta8IvhQJiEweHL2Ec,dub,0.667,0.86,1,0.0527,0.235,0.0,0.461,15
365949,Ocean,Parachute,1nFJAKHt0jgzVogbNJT3Zz,acoustic,0.662,0.711,1,0.0279,0.0686,1.2e-05,0.352,15
