<a href="https://colab.research.google.com/github/CamCranda11/MLFA25Project/blob/main/MLProjectModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === DIET VERSION: This is not the official code used in my web app; it is the notebook foundation I used to build the web app's actual Python code. ===
# This version is still usable (instructions for use are on the GitHub repository page), but it is not my official deliverable.

# Import statements for my necessary tools.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Read the dataset from 'spotify_data.csv' (path is relative to the working directory).
song_data = pd.read_csv(filepath_or_buffer='spotify_data.csv')

In [None]:
# Audio feature columns that drive the clustering.
features_to_cluster = [
    'danceability',
    'energy',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
]

# Independent copy of the dataset restricted to those feature columns.
features_df = song_data.loc[:, features_to_cluster].copy()

# Fill missing values: 'mode' receives its most frequent value first,
# then every remaining gap receives the mean of its column.
mode_value = features_df['mode'].mode()[0]
features_df = features_df.fillna({'mode': mode_value})
features_df = features_df.fillna(features_df.mean())

# Standardize the features so each one contributes equally to distances.
# Fitting and transforming are done as two explicit steps on the same data.
scaler = StandardScaler().fit(features_df)
scaled_features_df = scaler.transform(features_df)

In [None]:
def recommend_songs(input_song_name, input_artist_name, data_df, clustering_approach='global', num_recs=3, features_to_cluster=None, optimal_k=50, random_state=None):
    """Recommend songs that share a genre and a cluster with the input song.

    The input song is located by exact match on 'track_name' and
    'artist_name'; when several rows match, the first one is used.

    Args:
        input_song_name: Track name to look up in ``data_df['track_name']``.
        input_artist_name: Artist name to look up in ``data_df['artist_name']``.
        data_df: DataFrame holding 'track_name', 'artist_name' and 'genre'
            columns, plus 'cluster_id' for the 'global' approach or the
            feature columns for the 'genre' approach.
        clustering_approach: 'global' reuses a precomputed 'cluster_id'
            column; 'genre' runs KMeans on the songs of the input song's
            genre only.
        num_recs: Maximum number of recommendations to sample.
        features_to_cluster: Column names used as clustering features
            (required for the 'genre' approach).
        optimal_k: Number of KMeans clusters (required for the 'genre'
            approach).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            returned sample is reproducible. ``None`` keeps the previous
            non-deterministic sampling.

    Returns:
        A DataFrame with up to ``num_recs`` sampled rows (including a
        'genre_cluster_id' column for the 'genre' approach), or a string
        describing why no recommendation could be produced.
    """
    try:
        # Find the row(s) for the song input by the user.
        is_input_song = (
            (data_df['track_name'] == input_song_name) &
            (data_df['artist_name'] == input_artist_name)
        )
        matching_rows = data_df[is_input_song]

        # To handle a song not in the dataset.
        if matching_rows.empty:
            return f"Song '{input_song_name}' by {input_artist_name} not found in the dataset."

        # Pull the song information from the first match.
        song_row = matching_rows.iloc[0]
        song_genre = song_row['genre']

        # Global First Clustering approach, not the user friendly choice.
        if clustering_approach == 'global':
            # Check if cluster ID is available.
            if 'cluster_id' not in data_df.columns:
                return "Global clustering approach requires 'cluster_id' column in the dataframe."
            song_cluster = song_row['cluster_id']
            # Keep songs in the same cluster and genre, excluding the input song itself.
            recommendations = data_df[
                (data_df['cluster_id'] == song_cluster) &
                (data_df['genre'] == song_genre) &
                ~is_input_song
            ]

        # Genre First Clustering approach, user friendly and faster.
        elif clustering_approach == 'genre':
            # Check for required parameters.
            if features_to_cluster is None or optimal_k is None:
                return "Genre clustering approach requires 'features_to_cluster' and 'optimal_k'."

            # Keep only songs from the same genre as the input (copy so data_df is never mutated).
            genre_df = data_df[data_df['genre'] == song_genre].copy()

            # Confirm there's enough data in the genre chosen.
            if len(genre_df) < optimal_k:
                return f"Not enough songs ({len(genre_df)}) in the genre '{song_genre}' to perform clustering with k={optimal_k}."

            # Report missing feature columns explicitly instead of a generic error.
            feature_columns = list(features_to_cluster)
            missing_features = [col for col in feature_columns if col not in genre_df.columns]
            if missing_features:
                return f"Feature columns missing from the dataframe: {missing_features}."

            # Prepare features for clustering.
            genre_features_df = genre_df[feature_columns].copy()
            # 'mode' is filled with its most frequent value, but only when it
            # is one of the requested features and has at least one known value.
            if 'mode' in genre_features_df.columns:
                mode_candidates = genre_features_df['mode'].mode()
                if not mode_candidates.empty:
                    genre_features_df['mode'] = genre_features_df['mode'].fillna(mode_candidates.iloc[0])
            # Remaining gaps are filled with the column mean.
            genre_features_df = genre_features_df.fillna(genre_features_df.mean())

            # Scale data.
            genre_scaler = StandardScaler()
            scaled_genre_features = genre_scaler.fit_transform(genre_features_df)

            # Perform clustering on all songs in the genre.
            genre_kmeans_model = KMeans(n_clusters=optimal_k, init='k-means++', n_init=10, random_state=42)
            # genre_df is already a private copy, so the labels can be attached directly.
            genre_df['genre_cluster_id'] = genre_kmeans_model.fit_predict(scaled_genre_features)

            # Find the cluster ID of the input song (first match within the genre).
            is_input_in_genre = (
                (genre_df['track_name'] == input_song_name) &
                (genre_df['artist_name'] == input_artist_name)
            )
            input_song_genre_cluster = genre_df[is_input_in_genre].iloc[0]['genre_cluster_id']

            # Keep songs in the same genre-level cluster, excluding the input song itself.
            recommendations = genre_df[
                (genre_df['genre_cluster_id'] == input_song_genre_cluster) &
                ~is_input_in_genre
            ]

        # Handle invalid clustering approach.
        else:
            return "Invalid clustering_approach. Use 'global' or 'genre'."

        # Handle when no similar songs are found.
        if recommendations.empty:
            return f"No similar songs found in the same {'genre and cluster' if clustering_approach == 'genre' else 'cluster and genre'}."

        # Return a sample of similar songs, capped at the number available.
        return recommendations.sample(min(num_recs, len(recommendations)), random_state=random_state)

    # Failures are reported as strings rather than raised, so callers can
    # tell success from failure by checking for a DataFrame.
    except IndexError:
        return f"Song '{input_song_name}' by {input_artist_name} not found in the dataset."
    except Exception as e:
        return f"An error occurred: {e}"

In [None]:
# Test case inputs: 'Roman Empire' by MISSIO, the only song in the dataset with the name Roman Empire.
my_song, my_artist = 'Roman Empire', 'MISSIO'

# Columns shown in the output tables.
display_columns = [
    'track_name', 'artist_name', 'track_id', 'genre',
    'danceability', 'energy', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'valence',
]

# Look up the input song by exact track and artist name.
input_song_info = song_data.loc[
    (song_data['track_name'] == my_song) & (song_data['artist_name'] == my_artist)
]

# Show the input song, or say so when it is absent.
print(f"--- Input Song: '{my_song}' by {my_artist} ---")
if input_song_info.empty:
    print("Input song not found in the dataset.")
else:
    display(input_song_info[display_columns])

# Ask for up to five recommendations using the genre-first approach.
recommendations_genre = recommend_songs(
    input_song_name=my_song,
    input_artist_name=my_artist,
    data_df=song_data,
    clustering_approach='genre',
    num_recs=5,
    features_to_cluster=[
        'danceability', 'energy', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'valence',
    ],
    optimal_k=50,
)

# A DataFrame result is a table of recommendations; anything else is a message to print.
print(f"\n--- Recommendations based on '{my_song}' by {my_artist} ---")
if not isinstance(recommendations_genre, pd.DataFrame):
    print(recommendations_genre)
else:
    # Append the per-genre cluster ID to the displayed columns when it is present.
    rec_display_columns = display_columns
    if 'genre_cluster_id' in recommendations_genre.columns:
        rec_display_columns = display_columns + ['genre_cluster_id']
    display(recommendations_genre[rec_display_columns])

--- Input Song: 'Roman Empire' by MISSIO ---


Unnamed: 0,track_name,artist_name,track_id,genre,danceability,energy,mode,speechiness,acousticness,instrumentalness,valence
424227,Roman Empire,MISSIO,4g6XIr9ZvyOsEBmnXBPRLB,alt-rock,0.586,0.751,0,0.0349,0.0101,0.000155,0.445



--- Recommendations based on 'Roman Empire' by MISSIO ---


Unnamed: 0,track_name,artist_name,track_id,genre,danceability,energy,mode,speechiness,acousticness,instrumentalness,valence,genre_cluster_id
311345,Rule The World,Valley Of Wolves,4TznzeF3VmoslYTy6Ppr2n,alt-rock,0.615,0.849,0,0.0694,0.265,5.3e-05,0.497,26
311190,Livin' In a Dream (feat. Nipsey Hussle),The Neighbourhood,6wbbeA1FcbqXiJs7aIA8x7,alt-rock,0.584,0.779,0,0.0721,0.259,0.00606,0.518,26
424206,Fever (CH. I),Lewis Del Mar,5t23Z092UJGSkVpNaA8NYy,alt-rock,0.594,0.809,0,0.0334,0.0138,0.0176,0.36,26
479168,.AmericanBlood.,Dead Poet Society,6oCypWum4KVSAvEIhEc9R7,alt-rock,0.615,0.708,0,0.031,0.000709,0.00175,0.578,26
752502,I Climb,Thousand Foot Krutch,13rs8zN46Bi5YVzS4T0mJe,alt-rock,0.57,0.914,0,0.0403,0.00514,0.0,0.523,26


In [None]:
# Simple search cell I created to test and analyze what was in the dataset for a specific artist.
import pandas as pd

# Define artist name.
artist_name = "MISSIO"

# Filter dataframe to only show songs from the specified artist (exact match).
artist_songs = song_data[song_data['artist_name'] == artist_name]

# Output the list of songs from the specified artist.
if not artist_songs.empty:
    print(f"--- Songs by {artist_name} ---")
    # Lift the row-truncation limit only for this display call. The context
    # manager restores the previous 'display.max_rows' value afterwards, even
    # if display() raises, so other displays are not affected.
    with pd.option_context('display.max_rows', None):
        display(artist_songs)
else:
    print(f"No songs found for artist: {artist_name}")

--- Songs by MISSIO ---


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
214936,214936,MISSIO,West Coast,6zvbty1Vv0uR5Zajlx43gk,43,2016,alt-rock,0.341,0.612,9,-8.035,0,0.0421,0.0954,0.00114,0.146,0.159,119.971,251873,4
254824,254824,MISSIO,Twisted,4TQcARE7Fd58akNhr3N7AE,64,2017,alt-rock,0.577,0.789,2,-4.661,1,0.0427,0.00532,0.0105,0.214,0.149,97.52,222707,3
254841,254841,MISSIO,Everybody Gets High,3nH3Gtm9GQVfiaYDr4AXji,59,2017,alt-rock,0.432,0.558,11,-6.843,1,0.286,0.0234,0.000358,0.0924,0.46,58.748,212093,4
254875,254875,MISSIO,Bottom Of The Deep Blue Sea,6AvslIXIi9iaGvukefyVVK,56,2017,alt-rock,0.573,0.754,4,-4.788,1,0.0329,0.0248,0.00019,0.104,0.241,132.964,231587,4
254937,254937,MISSIO,I Don't Even Care About You,1WqufDaYuPfploeAnwZuGI,51,2017,alt-rock,0.539,0.704,11,-6.861,0,0.045,0.000708,0.00368,0.0782,0.185,145.056,215413,4
254970,254970,MISSIO,I Don't Give A... (feat. Zeale),67ejyJ02NkUc7eZzab3imz,48,2017,alt-rock,0.504,0.622,4,-5.649,0,0.0557,0.00197,0.000467,0.114,0.108,141.205,211213,4
254994,254994,MISSIO,Middle Fingers,1RgvTl4vQN4qECtnO1guEp,45,2017,alt-rock,0.517,0.553,1,-5.135,1,0.0329,0.000104,4e-06,0.133,0.0775,139.26,215200,4
255137,255137,MISSIO,Can I Exist,1PfstIZqhC77Xpctr80Urv,45,2017,alt-rock,0.428,0.487,5,-7.747,1,0.0331,0.497,0.332,0.111,0.137,119.925,273803,4
255147,255147,MISSIO,Kamikazee,5HDU3hPQinaefJtLzg3Lpm,43,2017,alt-rock,0.45,0.859,5,-4.754,0,0.068,0.00146,0.00139,0.324,0.216,82.504,221387,4
255315,255315,MISSIO,Animal,4OBgiRQZyzztNzR8fpiRbF,40,2017,alt-rock,0.437,0.708,11,-5.811,0,0.0315,0.0152,0.000577,0.13,0.198,140.125,210187,4
