<a href="https://colab.research.google.com/github/CamCranda11/MLFA25Project/blob/main/MLProjectModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# --- Step 1: Load the song feature data ---
# Read the Spotify track table from the Colab sample_data folder.
# Point the path at a different CSV to cluster another dataset.
song_data = pd.read_csv('/content/sample_data/spotify_data.csv')

# Emit the banner, the first five rows, and the trailing blank gap in one call.
print("--- Original Data ---", song_data.head(), "\n", sep="\n")

--- Original Data ---
   Unnamed: 0    artist_name        track_name                track_id  \
0           0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6   
1           1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218   
2           2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F   
3           3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz   
4           4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8   

   popularity  year     genre  danceability  energy  key  loudness  mode  \
0          68  2012  acoustic         0.483   0.303    4   -10.058     1   
1          50  2012  acoustic         0.572   0.454    3   -10.286     1   
2          57  2012  acoustic         0.409   0.234    3   -13.711     1   
3          58  2012  acoustic         0.392   0.251   10    -9.845     1   
4          54  2012  acoustic         0.430   0.791    6    -5.419     0   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  

In [11]:
# --- Step 2: Prepare and Scale Your Data ---

# Columns handed to the clustering step
features_to_cluster = ['danceability', 'energy', 'mode']
# Take an independent copy so the fill below never writes into song_data
# (and never triggers SettingWithCopyWarning).
features_df = song_data.loc[:, features_to_cluster].copy()

# Fill gaps in 'mode' with that column's most frequent value
mode_value = features_df['mode'].mode().iloc[0]
filled_mode = features_df['mode'].fillna(mode_value)
features_df['mode'] = filled_mode

# Standardise every column to zero mean / unit variance.
# K-Means is distance based, so unscaled columns would dominate the result.
# NOTE: the result is a NumPy array, despite the `_df` suffix in its name.
scaler = StandardScaler().fit(features_df)
scaled_features_df = scaler.transform(features_df)

# Banner, first five scaled rows, and trailing blank gap in one call.
print("--- Scaled Features (First 5 Rows) ---", scaled_features_df[:5], "\n", sep="\n")

--- Scaled Features (First 5 Rows) ---
[[-0.29509342 -1.24461718  0.75872495]
 [ 0.18734904 -0.68639325  0.75872495]
 [-0.69622536 -1.49969964  0.75872495]
 [-0.78837729 -1.43685324  0.75872495]
 [-0.58239062  0.55944426 -1.31800069]]




In [None]:
# --- Step 3: Find the Optimal 'k' (The Elbow Method) ---

# Candidate cluster counts: 1 through 19 inclusive
possible_k_values = range(1, 20)
# Within-cluster sum of squares recorded for each candidate k
inertia_values = []

for k in possible_k_values:
    # .fit() returns the estimator itself, so construction and fitting chain.
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=10,
        random_state=42,
    ).fit(scaled_features_df)
    inertia_values.append(kmeans.inertia_)

# Draw inertia against k; the bend in the curve suggests a reasonable k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(possible_k_values, inertia_values, 'bx-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia (WSS)')
ax.set_title('Elbow Method For Finding Optimal k')
plt.show()

print(
    "--- Elbow Plot Generated ---",
    "Look at the plot to find the 'elbow'. It might be less distinct with large datasets.\n",
    sep="\n",
)

In [12]:
# Cluster count used for the final model
optimal_k = 4

# Build and fit the final model in one expression (.fit() returns the estimator).
kmeans_model = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=10,
    random_state=42,
).fit(scaled_features_df)

# One cluster label per row of the scaled feature matrix
cluster_labels = kmeans_model.labels_

# New frame = original song table plus a 'cluster_id' column; song_data itself
# is left untouched because assign() works on a copy.
song_cluster_df = song_data.assign(cluster_id=cluster_labels)

print("--- Data with Cluster IDs ---")
display(song_cluster_df.head())
print("\n")

--- Data with Cluster IDs ---


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster_id
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,...,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3,2
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,...,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4,3
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,...,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4,2
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,...,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4,2
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,...,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4,1






In [13]:
# --- Step 6: Build the Recommendation Logic ---

def recommend_songs(input_song_name, data_df, num_recs=3, random_state=None):
    """
    Recommend songs that share a cluster with the input song.

    Parameters
    ----------
    input_song_name : str
        Exact value to look up in the 'track_name' column. When several rows
        carry this name, the first matching row decides the cluster.
    data_df : pandas.DataFrame
        Table containing at least 'track_name' and 'cluster_id' columns.
    num_recs : int, default 3
        Upper bound on the number of rows returned; fewer come back when the
        cluster holds fewer candidates.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. Leave as None for a fresh
        random draw on every call; pass an int for reproducible output.

    Returns
    -------
    pandas.DataFrame or str
        Randomly sampled rows from the same cluster (every row whose
        'track_name' equals the input is excluded), or a message string when
        the song is not present or the cluster has no other tracks. Callers
        should check for the string case before indexing columns.
    """
    # Explicit "not found" guard instead of a blanket try/except IndexError,
    # so unrelated IndexErrors are never misreported as a missing song.
    name_matches = data_df['track_name'] == input_song_name
    if not name_matches.any():
        return f"Song '{input_song_name}' not found in the dataset."

    # 1. Cluster of the (first) matching row
    song_cluster = data_df.loc[name_matches, 'cluster_id'].iloc[0]

    # 2. Same cluster, but never a row bearing the input song's own name
    candidates = data_df[(data_df['cluster_id'] == song_cluster) & ~name_matches]
    if candidates.empty:
        return "No similar songs found."

    # 3. Random pick, capped at the number of available candidates
    sample_size = min(num_recs, len(candidates))
    return candidates.sample(n=sample_size, random_state=random_state)

In [16]:
# --- Example Usage ---
# recommend_songs returns a DataFrame on success but a plain message string
# when the song is missing or its cluster has no other tracks. Column-indexing
# a string raises TypeError, so each result is checked before it is displayed.

# Let's get recommendations for "I Won't Give Up"
my_song = 'I Won\'t Give Up'
recommendations = recommend_songs(my_song, song_cluster_df, num_recs=5)

print(f"--- Recommendations based on '{my_song}' ---")
if isinstance(recommendations, str):
    print(recommendations)
else:
    display(recommendations[['track_name', 'cluster_id']])

print("\n")

# Let's get recommendations for 'Fast Car'
my_song_2 = 'Fast Car'
recommendations_2 = recommend_songs(my_song_2, song_cluster_df, num_recs=5)

print(f"--- Recommendations based on '{my_song_2}' ---")
if isinstance(recommendations_2, str):
    print(recommendations_2)
else:
    display(recommendations_2[['track_name', 'cluster_id']])

--- Recommendations based on 'I Won't Give Up' ---


Unnamed: 0,track_name,cluster_id
250180,Rhythms of Abundance,2
1085029,Star of the County Down,2
1011113,Dearest - Alternate Take,2
526747,Lonely,2
905246,O Fortuna,2




--- Recommendations based on 'Fast Car' ---


Unnamed: 0,track_name,cluster_id
366464,Time After Time,2
424734,While I Can,2
566625,Trees and Rivers,2
566520,Depths of Peace,2
211178,Cada dia te extrano mas,2


--- Rows with missing 'mode' values ---


Unnamed: 0,danceability,energy,mode
62877,0.172,0.03,
