In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from scipy.sparse import hstack
import numpy as np

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/DrakeData/MSDS495-Capstone/main/SPOTIFY_FROM_KAGGLE.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## EDA

In [3]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,114000.0,56999.5,32909.109681,0.0,28499.75,56999.5,85499.25,113999.0
popularity,114000.0,33.238535,22.305078,0.0,17.0,35.0,50.0,100.0
duration_ms,114000.0,228029.153114,107297.712645,0.0,174066.0,212906.0,261506.0,5237295.0
danceability,114000.0,0.5668,0.173542,0.0,0.456,0.58,0.695,0.985
energy,114000.0,0.641383,0.251529,0.0,0.472,0.685,0.854,1.0
key,114000.0,5.30914,3.559987,0.0,2.0,5.0,8.0,11.0
loudness,114000.0,-8.25896,5.029337,-49.531,-10.013,-7.004,-5.003,4.532
mode,114000.0,0.637553,0.480709,0.0,0.0,1.0,1.0,1.0
speechiness,114000.0,0.084652,0.105732,0.0,0.0359,0.0489,0.0845,0.965
acousticness,114000.0,0.31491,0.332523,0.0,0.0169,0.169,0.598,0.996


In [4]:
# Check for Nulls
pd.isnull(df).sum()

Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

## Extract relevant features

In [5]:
# Select relevant features
numerical_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo']
categorical_feature = ['track_genre']

## Build model

In [6]:
# Normalize the numerical feature values
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [7]:
# Encode the categorical feature using one-hot encoding
encoder = OneHotEncoder(sparse=False)
genre_encoded = encoder.fit_transform(df[categorical_feature])



In [10]:
# Combine the numerical and categorical features
feature_matrix = np.concatenate((genre_encoded, df[numerical_features]), axis=1)

# Randomly select a subset of the data
sample_size = 40000  # Adjust the sample size as per your requirements
random_indices = np.random.choice(df.shape[0], size=sample_size, replace=False)
df_sample = df.iloc[random_indices].reset_index(drop=True)  # Subset of the original dataframe

# Compute cosine similarity matrix
cosine_similarities = cosine_similarity(feature_matrix[random_indices])

In [11]:
# Function to get track recommendations
def get_recommendations(track_name, cosine_similarities, df, top_n=5):
    # Get the index of the track with the given name
    idx = df[df['track_name'] == track_name].index[0]
    
    # Get the pairwise similarity scores
    sim_scores = list(enumerate(cosine_similarities[idx]))
    
    # Sort the tracks based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top N similar tracks
    top_tracks = [(df.iloc[score[0]]['track_name'], df.iloc[score[0]]['artists']) for score in sim_scores[1:top_n+1]]
    
    return top_tracks

In [12]:
artisits_check = df_sample[['track_name', 'artists']]

artisits_check[artisits_check['artists']=='My Chemical Romance']

Unnamed: 0,track_name,artists
292,The Foundations of Decay,My Chemical Romance
597,Cancer,My Chemical Romance
1172,Teenagers,My Chemical Romance
1892,House of Wolves,My Chemical Romance
2585,Dead!,My Chemical Romance
2698,The Foundations of Decay,My Chemical Romance
6663,Thank You for the Venom,My Chemical Romance
7816,Disenchanted,My Chemical Romance
8696,Teenagers,My Chemical Romance
10558,Disenchanted,My Chemical Romance


In [15]:
# Example usage
input_track = "Disenchanted"
recommendations = get_recommendations(input_track, cosine_similarities, df_sample)

print(f"Recommendations for '{input_track}':")
for i, (track, artist) in enumerate(recommendations):
    print(f"{i+1}. {track} by {artist}")

Recommendations for 'Disenchanted':
1. Stolen by Dashboard Confessional
2. Awful Things by Lil Peep;Lil Tracy
3. Jeda by For Revenge
4. All I Wanted by Paramore
5. All I Wanted by Paramore
