In [88]:
# Import Dependencies
import os
import numpy as np
import pandas as pd
import utils as utils
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
import streamlit as st

from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [89]:
# Import and read the data
df_raw = pd.read_csv("data/top_10000_1960-now.csv")
df_raw.head()

Unnamed: 0,Track URI,Track Name,Artist URI(s),Artist Name(s),Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,...,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,Justified & Ancient - Stand by the Jams,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,spotify:album:4MC0ZjNtVP1nDD5lsLxFjc,Songs Collection,spotify:artist:6dYrdRlNZSKaVxYg5IrvCH,The KLF,1992-08-03,https://i.scdn.co/image/ab67616d0000b27355346b...,...,0.048,0.0158,0.112,0.408,0.504,111.458,4.0,,Jams Communications,"C 1992 Copyright Control, P 1992 Jams Communic..."
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,I Know You Want Me (Calle Ocho),spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,spotify:album:5xLAcbvbSAlRtPXnKkggXA,Pitbull Starring In Rebelution,spotify:artist:0TnOYISbd1XYRBk9myaseg,Pitbull,2009-10-23,https://i.scdn.co/image/ab67616d0000b27326d73a...,...,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0,,Mr.305/Polo Grounds Music/J Records,"P (P) 2009 RCA/JIVE Label Group, a unit of Son..."
2,spotify:track:70XtWbcVZcpaOddJftMcVi,From the Bottom of My Broken Heart,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:album:3WNxdumkSMGMJRhEgK80qx,...Baby One More Time (Digital Deluxe Version),spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,1999-01-12,https://i.scdn.co/image/ab67616d0000b2738e4986...,...,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0,,Jive,P (P) 1999 Zomba Recording LLC
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,Apeman - 2014 Remastered Version,spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,spotify:album:6lL6HugNEN4Vlc8sj0Zcse,"Lola vs. Powerman and the Moneygoround, Pt. On...",spotify:artist:1SQRv42e4PjEYfPhS0Tk9E,The Kinks,2014-10-20,https://i.scdn.co/image/ab67616d0000b2731e7c53...,...,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0,,Sanctuary Records,"C © 2014 Sanctuary Records Group Ltd., a BMG C..."
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,You Can't Always Get What You Want,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,spotify:album:0c78nsgqX6VfniSNWIxwoD,Let It Bleed,spotify:artist:22bE4uQ6baNwSHPVcDxLCe,The Rolling Stones,1969-12-05,https://i.scdn.co/image/ab67616d0000b27373d927...,...,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0,,Universal Music Group,"C © 2002 ABKCO Music & Records Inc., P ℗ 2002 ..."


In [90]:
# list columns for features and target
df_raw.columns

Index(['Track URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)',
       'Album URI', 'Album Name', 'Album Artist URI(s)',
       'Album Artist Name(s)', 'Album Release Date', 'Album Image URL',
       'Disc Number', 'Track Number', 'Track Duration (ms)',
       'Track Preview URL', 'Explicit', 'Popularity', 'ISRC', 'Added By',
       'Added At', 'Artist Genres', 'Danceability', 'Energy', 'Key',
       'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Album Genres',
       'Label', 'Copyrights'],
      dtype='object')

In [91]:
# Drop unnecessary columns
# all columns listed, columns to keep are commented out.
df_data = df_raw.drop([#'Track URI',
                       'Track Name',
                       'Artist URI(s)',
                       'Artist Name(s)',
                       'Album URI',
                       'Album Name',
                       'Album Artist URI(s)',
                       'Album Artist Name(s)',
                       'Album Release Date',
                       'Album Image URL',
                       'Disc Number',
                       'Track Number',
                       'Track Duration (ms)',
                       'Track Preview URL',
                       #'Explicit',
                       'Popularity',
                       'ISRC',
                       'Added By',
                       'Added At',
                       #'Artist Genres',
                       #'Danceability',
                       #'Energy',
                       #'Key',
                       'Loudness',
                       'Mode',
                       #'Speechiness',
                       #'Acousticness',
                       #'Instrumentalness',
                       #'Liveness',
                       #'Valence',
                       #'Tempo',
                       #'Time Signature',
                       'Album Genres',
                       'Label',
                       'Copyrights'],
                       axis=1)

In [92]:
# Review remaining column names
df_data.columns 

Index(['Track URI', 'Explicit', 'Artist Genres', 'Danceability', 'Energy',
       'Key', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature'],
      dtype='object')

In [93]:
# Columns renamed to follow convention
df_data = df_data.rename(columns={
                   'Track URI': 'track_uri',
                   'Album Image URL': 'image',
                   'Explicit': 'explicit',
                   'Popularity': 'popularity',
                   'Artist Genres': 'artist_genres',
                   'Danceability': 'danceability',
                   'Energy': 'energy',
                   'Key': 'key',
                   'Speechiness': 'speechiness',
                   'Acousticness': 'acousticness',
                   'Instrumentalness': 'instrumentalness',
                   'Liveness': 'liveness',
                   'Valence': 'valence',
                   'Tempo': 'tempo',
                   'Time Signature': 'time_signature'
       })


In [94]:
# Verify Update
df_data.columns

Index(['track_uri', 'explicit', 'artist_genres', 'danceability', 'energy',
       'key', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature'],
      dtype='object')

In [95]:
# Dropping null columns
df_data = df_data.dropna(how="any")

In [96]:
# Reset index on dataframe
df_data = df_data.reset_index(drop=True)


In [97]:
# utils.plot_correlation_heatmap(df_data)

In [98]:
# utils.plot_numeric_distributions(df_data)

#### Cleaning and encoding the ['Artist Genres'] column

In [99]:
## Cleaning and encoding the 'artist genres' column
# Explore the values
df_data['artist_genres'].value_counts()

artist_genres
dance pop,pop                                                       254
australian rock                                                     243
pop                                                                 229
australian pop,australian talent show                                86
australian pop                                                       73
                                                                   ... 
australian dance,australian pop,nyc rap                               1
edm,house,indietronica,uk dance,art pop,metropopolis,nz pop,pop       1
uk contemporary r&b,uk pop,new jersey underground rap,trap queen      1
classic country pop,country,country rock,soft rock                    1
disco house,vocal house,dance pop,europop,new wave pop                1
Name: count, Length: 2815, dtype: int64

In [100]:
## Cleaning and encoding the 'artist genres' column
# how many unique genre combos are there?
# Explore the values
df_data['artist_genres'].nunique()

2815

In [101]:
## Cleaning and encoding the 'artist genres' column
# Add a space after any commas if one is not already present
df_data['artist_genres'] = df_data['artist_genres'].str.replace(
    r',(?=\S)', ', ', regex=True
    )


In [102]:
## Cleaning and encoding the 'artist genres' column
# Verify spaces added
df_data['artist_genres'].value_counts()

artist_genres
dance pop, pop                                                            254
australian rock                                                           243
pop                                                                       229
australian pop, australian talent show                                     86
australian pop                                                             73
                                                                         ... 
australian dance, australian pop, nyc rap                                   1
edm, house, indietronica, uk dance, art pop, metropopolis, nz pop, pop      1
uk contemporary r&b, uk pop, new jersey underground rap, trap queen         1
classic country pop, country, country rock, soft rock                       1
disco house, vocal house, dance pop, europop, new wave pop                  1
Name: count, Length: 2815, dtype: int64

In [103]:
## Cleaning and encoding the 'artist genres' column
# replace spaces with and underscore where a letter character is on either side
df_data['artist_genres'] = df_data['artist_genres'].str.replace(
    r'(?<=[a-zA-Z]) (?=[a-zA-Z])', '_', regex=True
    )

In [104]:
## Cleaning and encoding the 'artist genres' column
# Verify underscores inserted
df_data['artist_genres'].value_counts()

artist_genres
dance_pop, pop                                                            254
australian_rock                                                           243
pop                                                                       229
australian_pop, australian_talent_show                                     86
australian_pop                                                             73
                                                                         ... 
australian_dance, australian_pop, nyc_rap                                   1
edm, house, indietronica, uk_dance, art_pop, metropopolis, nz_pop, pop      1
uk_contemporary_r&b, uk_pop, new_jersey_underground_rap, trap_queen         1
classic_country_pop, country, country_rock, soft_rock                       1
disco_house, vocal_house, dance_pop, europop, new_wave_pop                  1
Name: count, Length: 2815, dtype: int64

In [105]:
## Cleaning and encoding the 'artist genres' column
# Split the ['artist_genres'] stings into lists
df_data['artist_genres'] = df_data['artist_genres'].str.split(', ')

In [106]:
df_data.head()

Unnamed: 0,track_uri,explicit,artist_genres,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,False,"[acid_house, ambient_house, big_beat, hip_house]",0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458,4.0
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,False,"[dance_pop, miami_hip_hop, pop]",0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0
2,spotify:track:70XtWbcVZcpaOddJftMcVi,False,"[dance_pop, pop]",0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,False,"[album_rock, art_rock, british_invasion, class...",0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,False,"[album_rock, british_invasion, classic_rock, r...",0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0


In [107]:
## Cleaning and encoding the 'artist genres' column
# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [108]:
## Cleaning and encoding the 'artist genres' column
# fit and transform 'Artist Genres' column
encoded_genres = mlb.fit_transform(df_data['artist_genres'])
df_encoded_genres = pd.DataFrame(encoded_genres)
df_encoded_genres.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,898,899,900,901,902,903,904,905,906,907
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
## Cleaning and encoding the 'artist genres' column
# concatenate back into the original DataFrame
df_encoded = pd.concat([df_data.drop(columns=['artist_genres']), df_encoded_genres], axis=1)

# Handle missing values (if any)
#df_encoded.fillna(0, inplace=True)


In [110]:
# Encode the ['explicit'] column
df_encoded['explicit'] = df_encoded['explicit'].map({True: 1, False: 0})
df_encoded.head()

Unnamed: 0,track_uri,explicit,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,...,898,899,900,901,902,903,904,905,906,907
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,0,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,...,0,0,0,0,0,0,0,0,0,0
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,0,0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,...,0,0,0,0,0,0,0,0,0,0
2,spotify:track:70XtWbcVZcpaOddJftMcVi,0,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,...,0,0,0,0,0,0,0,0,0,0
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,0,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,...,0,0,0,0,0,0,0,0,0,0
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,0,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,...,0,0,0,0,0,0,0,0,0,0


In [111]:
df_encoded.dtypes

track_uri        object
explicit          int64
danceability    float64
energy          float64
key             float64
                 ...   
903               int64
904               int64
905               int64
906               int64
907               int64
Length: 920, dtype: object

In [112]:
# Create features dataframe
# Set column names as strings
df_x = df_encoded.drop(columns='track_uri')
df_x.columns = df_x.columns.astype(str)

In [113]:
df_x.head()


Unnamed: 0,explicit,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,898,899,900,901,902,903,904,905,906,907
0,0,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458,...,0,0,0,0,0,0,0,0,0,0
1,0,0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045,...,0,0,0,0,0,0,0,0,0,0
2,0,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981,...,0,0,0,0,0,0,0,0,0,0
3,0,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311,...,0,0,0,0,0,0,0,0,0,0
4,0,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818,...,0,0,0,0,0,0,0,0,0,0


In [114]:
df_data.head()

Unnamed: 0,track_uri,explicit,artist_genres,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,False,"[acid_house, ambient_house, big_beat, hip_house]",0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458,4.0
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,False,"[dance_pop, miami_hip_hop, pop]",0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0
2,spotify:track:70XtWbcVZcpaOddJftMcVi,False,"[dance_pop, pop]",0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,False,"[album_rock, art_rock, british_invasion, class...",0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,False,"[album_rock, british_invasion, classic_rock, r...",0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0


In [115]:
# Running pca without genres column
# # Scale data with Standard Scaler
scaler = StandardScaler()

df_test = df_data.drop(columns=['track_uri', 'artist_genres'])

#scaled_data = scaler.fit_transform(df_test)

# call PCA
pca = PCA(n_components=2)

# fit and apply
genres_pca = pca.fit_transform(df_test)

# Create DataFrame with PCA results
genres_pca_df = pd.DataFrame(
    genres_pca,
    columns=['genre_pca_1',
            'genre_pca_2'
            ])



In [116]:
pca.explained_variance_ratio_

array([0.98133885, 0.01820221])

In [117]:
# create pca dataframe
pca_test_df = pd.DataFrame()

In [118]:
# determine k value


In [119]:
# elbow

In [120]:
df_data.drop(columns=['artist_genres','time_signature'])

Unnamed: 0,track_uri,explicit,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,spotify:track:1XAZlnVtthcDZt2NI1Dtxo,False,0.617,0.872,8.0,0.0480,0.015800,0.112000,0.4080,0.504,111.458
1,spotify:track:6a8GbQIlV8HBUW3c6Uk9PH,False,0.825,0.743,2.0,0.1490,0.014200,0.000021,0.2370,0.800,127.045
2,spotify:track:70XtWbcVZcpaOddJftMcVi,False,0.677,0.665,7.0,0.0305,0.560000,0.000001,0.3380,0.706,74.981
3,spotify:track:1NXUWyPJk5kO6DQJ5t7bDu,False,0.683,0.728,9.0,0.2590,0.568000,0.000051,0.0384,0.833,75.311
4,spotify:track:72WZtWs6V7uu3aMgMmEkYe,False,0.319,0.627,0.0,0.0687,0.675000,0.000073,0.2890,0.497,85.818
...,...,...,...,...,...,...,...,...,...,...,...
9444,spotify:track:3kcKlOkQQEPVwxwljbGJ5p,False,0.623,0.727,11.0,0.0562,0.184000,0.000020,0.3090,0.400,125.975
9445,spotify:track:5k9QrzJFDAp5cXVdzAi02f,False,0.720,0.841,9.0,0.0340,0.000354,0.011200,0.3380,0.767,130.978
9446,spotify:track:5ydeCNaWDmFbu4zl0roPAH,False,0.719,0.806,9.0,0.0389,0.000132,0.088900,0.3610,0.626,123.037
9447,spotify:track:0zKbDrEXKpnExhGQRe9dxt,False,0.534,0.855,1.0,0.1830,0.060700,0.000263,0.3460,0.420,122.060


In [121]:
# ChattGPT reference code#####################################

#KNN Model 

import pandas as pd
import numpy as np
import streamlit as st
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors



# Extract Features and Scale Data
features = ["danceability", "energy", "speechiness", "instrumentalness", "valence"]
X = df_data[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Train KNN Model
knn = NearestNeighbors(n_neighbors=1, metric="euclidean")  
knn.fit(X_scaled)

#Take User Input
def get_user_input(features):
    st.title("Spotify Song Recommendation")
    st.write("Enter song characteristics to get recommended tracks!")
    danceability = st.slider("Danceability (Low Danceability <-> High Danceability)", 0.0, 1.0, 0.5)
    energy = st.slider("Energy (Low Energy <-> High Energy)", 0.0, 1.0, 0.5)
    speechiness = st.slider("Speechiness (Low Presence of Words <-> High Presence of Words)", 0.0, 1.0, 0.1)
    instrumentalness = st.slider("Instrumentalness (Low Presence of Instruments <-> High Presence of Instruments)", 0.0, 1.0, 0.0)
    valence = st.slider("Mood (Sad <-> Happy)", 0.0, 1.0, 0.5)

    user_features = np.array([[danceability, energy, speechiness, instrumentalness, valence]])
    user_features_scaled = scaler.transform(user_features)  
    return user_features_scaled

# :dart: Step 4: Find the Best Matching Song
def find_best_match(user_input):
    _, index = knn.kneighbors(user_input) #enter in slider data here 
    best_match_index = index[0][0]
    return df_data.iloc[best_match_index]["track_uri"]


user_input_scaled = get_user_input(features)
best_match = find_best_match(user_input_scaled)
print(f"\n:notes: Best match for your input: **{best_match}** :musical_note:")

2025-02-13 19:03:31.621 
  command:

    streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]



:notes: Best match for your input: **spotify:track:6S4Z214kph4Pnx1Xkr9Obs** :musical_note:


In [122]:
features = np.array([[danceability, energy, speechiness, instrumentalness, valence]])
    danceability = float(input("Danceability (0-1): "))
    energy = float(input("Energy (0-1): "))
    tempo = float(input("Tempo (BPM): "))
    user_features = np.array([[danceability, energy, tempo]])
    user_features_scaled = scaler.transform(user_features)  # Scale input
    return user_features_scaled

# :dart: Step 4: Find the Best Matching Song
def find_best_match(user_input):
    _, index = knn.kneighbors(user_input) #enter in slider data here 
    best_match_index = index[0][0]
    return df.iloc[best_match_index]["track_name"]

# :dart: Step 5: Run the Matcher
user_input_scaled = get_user_input()
best_match = find_best_match(user_input_scaled)
print(f"\n:notes: Best match for your input: **{best_match}** :musical_note:")

IndentationError: unexpected indent (415869406.py, line 2)

In [4]:
#K-Means

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Feed in our Dataframe

df = pd.DataFrame(data)

#Step 1: Preprocess Features
features = ["danceability", "energy", "tempo"]
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Step 2: Train K-Means Model
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)  # Adjust clusters as needed
df["cluster"] = kmeans.fit_predict(X_scaled)

# Step 3: Streamlit UI
st.title(":musical_note: K-Means Song Recommendation System")
# User input sliders
danceability = st.slider("Danceability (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
energy = st.slider("Energy (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
tempo = st.slider("Tempo (BPM)", min_value=50, max_value=200, step=1, value=120)
# Function to find closest cluster
def find_best_cluster(user_input):
    user_scaled = scaler.transform(user_input)  # Scale input
    cluster_label = kmeans.predict(user_scaled)[0]  # Find closest cluster
    return cluster_label
# Function to recommend a song from the closest cluster
def recommend_song(cluster_label):
    cluster_songs = df[df["cluster"] == cluster_label]
    return cluster_songs.sample(1)["track_name"].values[0]  # Random song from cluster
# :dart: Step 4: Predict & Recommend
if st.button("Find My Song :headphones:"):
    user_features = np.array([[danceability, energy, tempo]])
    best_cluster = find_best_cluster(user_features)
    best_match = recommend_song(best_cluster)
    st.success(f":notes: Best match from Cluster {best_cluster}: **{best_match}** :musical_note:")

NameError: name 'K' is not defined

In [6]:
#Gaussian Mixture Model

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

# 🎵 Example Dataset (Replace with actual dataset)
data = {
    "track_name": ["Song A", "Song B", "Song C", "Song D", "Song E", "Song F", "Song G"],
    "danceability": [0.8, 0.6, 0.9, 0.4, 0.7, 0.5, 0.3],
    "energy": [0.7, 0.5, 0.9, 0.3, 0.6, 0.4, 0.2],
    "tempo": [120, 130, 110, 100, 125, 140, 90]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# 🎯 Step 1: Preprocess Features
features = ["danceability", "energy", "tempo"]
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 🎯 Step 2: Train Gaussian Mixture Model (GMM)
gmm = GaussianMixture(n_components=3, random_state=42, covariance_type='full')
df["gmm_cluster"] = gmm.fit_predict(X_scaled)

# 🎯 Step 3: Streamlit UI
st.title("🎵 Gaussian Mixture Model Song Recommendation")

# User input sliders
danceability = st.slider("Danceability (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
energy = st.slider("Energy (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
tempo = st.slider("Tempo (BPM)", min_value=50, max_value=200, step=1, value=120)

# Function to find closest cluster
def find_best_cluster(user_input):
    user_scaled = scaler.transform(user_input)  # Scale input
    cluster_label = gmm.predict(user_scaled)[0]  # Predict closest cluster
    return cluster_label

# Function to recommend a song from the closest cluster
def recommend_song(cluster_label):
    cluster_songs = df[df["gmm_cluster"] == cluster_label]
    if cluster_songs.empty:
        return "No match found, try adjusting your input."
    return cluster_songs.sample(1)["track_name"].values[0]  # Random song from cluster

# 🎯 Step 4: Predict & Recommend
if st.button("Find My Song 🎧"):
    user_features = np.array([[danceability, energy, tempo]])
    best_cluster = find_best_cluster(user_features)
    best_match = recommend_song(best_cluster)
    
    st.success(f"🎶 Best match from GMM: **{best_match}** 🎵")

In [None]:
#DBSAN  

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# 🎵 Example Dataset (Replace with actual dataset)
data = {
    "track_name": ["Song A", "Song B", "Song C", "Song D", "Song E", "Song F", "Song G"],
    "danceability": [0.8, 0.6, 0.9, 0.4, 0.7, 0.5, 0.3],
    "energy": [0.7, 0.5, 0.9, 0.3, 0.6, 0.4, 0.2],
    "tempo": [120, 130, 110, 100, 125, 140, 90]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# 🎯 Step 1: Preprocess Features
features = ["danceability", "energy", "tempo"]
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 🎯 Step 2: Train DBSCAN Model
dbscan = DBSCAN(eps=1.0, min_samples=2)  # Adjust parameters based on data
df["dbscan_cluster"] = dbscan.fit_predict(X_scaled)

# 🎯 Step 3: Streamlit UI
st.title("🎵 DBSCAN Song Recommendation System")

# User input sliders
danceability = st.slider("Danceability (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
energy = st.slider("Energy (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
tempo = st.slider("Tempo (BPM)", min_value=50, max_value=200, step=1, value=120)

# Function to find closest cluster
def find_best_cluster(user_input):
    user_scaled = scaler.transform(user_input)  # Scale input
    cluster_label = dbscan.fit_predict(user_scaled)[0]  # DBSCAN does not predict well
    return cluster_label

# Function to recommend a song from the closest cluster
def recommend_song(cluster_label):
    cluster_songs = df[df["dbscan_cluster"] == cluster_label]
    if cluster_songs.empty:
        return "No match found, try adjusting your input."
    return cluster_songs.sample(1)["track_name"].values[0]  # Random song from cluster

# 🎯 Step 4: Predict & Recommend
if st.button("Find My Song 🎧"):
    user_features = np.array([[danceability, energy, tempo]])
    best_cluster = find_best_cluster(user_features)
    best_match = recommend_song(best_cluster)
    
    st.success(f"🎶 Best match from DBSCAN: **{best_match}** 🎵")

In [7]:
#Agglomerative Clustering

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

# 🎵 Example Dataset (Replace with actual dataset)
data = {
    "track_name": ["Song A", "Song B", "Song C", "Song D", "Song E", "Song F", "Song G"],
    "danceability": [0.8, 0.6, 0.9, 0.4, 0.7, 0.5, 0.3],
    "energy": [0.7, 0.5, 0.9, 0.3, 0.6, 0.4, 0.2],
    "tempo": [120, 130, 110, 100, 125, 140, 90]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# 🎯 Step 1: Preprocess Features
features = ["danceability", "energy", "tempo"]
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 🎯 Step 2: Train Agglomerative Clustering Model
agg_clustering = AgglomerativeClustering(n_clusters=3)
df["agg_cluster"] = agg_clustering.fit_predict(X_scaled)

# 🎯 Step 3: Streamlit UI
st.title("🎵 Agglomerative Clustering Song Recommendation")

# User input sliders
danceability = st.slider("Danceability (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
energy = st.slider("Energy (0-1)", min_value=0.0, max_value=1.0, step=0.01, value=0.5)
tempo = st.slider("Tempo (BPM)", min_value=50, max_value=200, step=1, value=120)

# Function to find closest cluster
def find_best_cluster(user_input):
    user_scaled = scaler.transform(user_input)  # Scale input
    distances = cdist(user_scaled, X_scaled)  # Compute distances to existing points
    closest_point = np.argmin(distances)  # Find closest song in dataset
    cluster_label = df.iloc[closest_point]["agg_cluster"]
    return cluster_label

# Function to recommend a song from the closest cluster
def recommend_song(cluster_label):
    cluster_songs = df[df["agg_cluster"] == cluster_label]
    if cluster_songs.empty:
        return "No match found, try adjusting your input."
    return cluster_songs.sample(1)["track_name"].values[0]  # Random song from cluster

# 🎯 Step 4: Predict & Recommend
if st.button("Find My Song 🎧"):
    user_features = np.array([[danceability, energy, tempo]])
    best_cluster = find_best_cluster(user_features)
    best_match = recommend_song(best_cluster)
    
    st.success(f"🎶 Best match from Agglomerative Clustering: **{best_match}** 🎵")

In [29]:
df_test.head()

Unnamed: 0,explicit,danceability,energy,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,False,0.617,0.872,8.0,0.048,0.0158,0.112,0.408,0.504,111.458,4.0
1,False,0.825,0.743,2.0,0.149,0.0142,2.1e-05,0.237,0.8,127.045,4.0
2,False,0.677,0.665,7.0,0.0305,0.56,1e-06,0.338,0.706,74.981,4.0
3,False,0.683,0.728,9.0,0.259,0.568,5.1e-05,0.0384,0.833,75.311,4.0
4,False,0.319,0.627,0.0,0.0687,0.675,7.3e-05,0.289,0.497,85.818,4.0


In [30]:
# Scale data with Standard Scaler
scaler = StandardScaler()

scaled_data = scaler.fit_transform(df_x)

# call PCA
pca = PCA(n_components=1)

# fit and apply
genres_pca = pca.fit_transform(scaled_data)

# Create DataFrame with PCA results
genres_pca_df = pd.DataFrame(
    genres_pca,
    columns=['genre_pca_1',
            #  'genre_pca_2',
            #  'genre_pca_3',
            #  'genre_pca_4',
            #  'genre_pca_5',
            #  'genre_pca_6',
            #  'genre_pca_7',
            #  'genre_pca_8',
            #  'genre_pca_9',
            #  'genre_pca_10',
            #  'genre_pca_11',
            #  'genre_pca_12',
            #  'genre_pca_13',
            #  'genre_pca_14',
            #  'genre_pca_15',
            #  'genre_pca_16',
            #  'genre_pca_17',
            #  'genre_pca_18',
            #  'genre_pca_19',
            #  'genre_pca_20'
             ])

genres_pca_df

Unnamed: 0,genre_pca_1
0,-0.392433
1,-1.658729
2,-0.626919
3,6.196573
4,2.855155
...,...
9444,-2.052801
9445,-0.687546
9446,-0.892368
9447,-5.820127


In [31]:
pca.explained_variance_ratio_

array([0.00759061])

In [32]:
# Sum the explained variance ratios
total_explained_variance = pca.explained_variance_ratio_.sum()
# Print the total explained variance
print(f"Total Explained Variance: {total_explained_variance}")

Total Explained Variance: 0.007590606205265214


In [33]:
# # Select only numeric columns for modeling
# numeric_features = df_cleaned.select_dtypes(include=['float64', 'int64']).columns

# # Create feature matrix X
# X = df_cleaned[numeric_features]

# # Optional: Create new features
# # Example: Combining features or creating ratios
# X['energy_valence_ratio'] = X['energy'] / X['valence']

In [34]:
# # Remove the problematic energy_valence_ratio column if it exists
# if 'energy_valence_ratio' in X.columns:
#     X = X.drop('energy_valence_ratio', axis=1)

# # Create the ratio feature with handling for zero values
# X['energy_valence_ratio'] = X['energy'] / X['valence'].replace(0, 0.0001)  # Replace zeros with small value

# # Now scale the features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [35]:
# # PCA for dimensionality reduction
# pca = PCA(n_components=0.95)  # Keep 95% of variance
# X_pca = pca.fit_transform(X_scaled)

# # Or t-SNE for non-linear dimensionality reduction
# tsne = TSNE(n_components=2, random_state=42)
# X_tsne = tsne.fit_transform(X_scaled)

In [36]:
# Using IQR method to detect outliers
def remove_outliers(df, columns):
    df_clean = df.copy()
    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
    return df_clean

# Apply outlier removal
X_no_outliers = remove_outliers(X_scaled, X_scaled.columns)

NameError: name 'X_scaled' is not defined

In [None]:
#unique_genres = set(df_cleaned['Artist Genres'].str.split(',').explode().value_counts())
unique_genres = df_cleaned['artist__genres'].str.split(',').explode().value_counts()
print(len(unique_genres))
print(unique_genres.head(20))

In [None]:
# Looking at the error message and available columns, we see that 'Album Genres' doesn't exist
# Let's use 'artist__genres' instead since we already have that data

# Get genre counts from the already exploded artist__genres
genre_counts = df_cleaned['artist__genres'].value_counts()

# Select top N genres (e.g., top 20)
top_n_genres = 20
top_genres = genre_counts.head(top_n_genres).index

# Create dummies only for top genres
genre_dummies = pd.get_dummies(
    df_cleaned['artist__genres'].where(df_cleaned['artist__genres'].isin(top_genres), 'other'),
    prefix='genre'
)

# Group by index and join with original dataframe
genre_dummies = genre_dummies.groupby(df_cleaned.index).sum()
df_with_top_genres = pd.concat([df_cleaned, genre_dummies], axis=1)

# No need to drop 'Album Genres' since it doesn't exist
# df_with_top_genres = df_with_top_genres.drop('Album Genres', axis=1)

print("\nShape with top genres only:", df_with_top_genres.shape)
print("\nTop genre columns:", genre_dummies.columns.tolist())

In [None]:
df_with_top_genres.info()

In [None]:
df_with_top_genres.head()

In [None]:
# Test the models
# Unsupervised models K-means, Gaussian 

In [None]:
# visualize model accuracy
# the elbow thing
# mushroom pizza
# 