# MUSIC RECOMMENDATION SYSTEM

## Importing Packages

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [41]:
df = pd.read_csv('/content/drive/MyDrive/spotify_dataset.csv')
df = df.drop(columns=['Unnamed: 0'], axis=1, errors='ignore')

In [42]:
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [43]:
df.isna().sum()

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [44]:
df = df.dropna()

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          113999 non-null  object 
 1   artists           113999 non-null  object 
 2   album_name        113999 non-null  object 
 3   track_name        113999 non-null  object 
 4   popularity        113999 non-null  int64  
 5   duration_ms       113999 non-null  int64  
 6   explicit          113999 non-null  bool   
 7   danceability      113999 non-null  float64
 8   energy            113999 non-null  float64
 9   key               113999 non-null  int64  
 10  loudness          113999 non-null  float64
 11  mode              113999 non-null  int64  
 12  speechiness       113999 non-null  float64
 13  acousticness      113999 non-null  float64
 14  instrumentalness  113999 non-null  float64
 15  liveness          113999 non-null  float64
 16  valence           113999 

In [46]:
df['explicit'] = df['explicit'].astype(int)

In [47]:
df['track_id'].duplicated().sum()

24259

In [48]:
df = df.drop_duplicates(subset=['track_id'])

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89740 entries, 0 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          89740 non-null  object 
 1   artists           89740 non-null  object 
 2   album_name        89740 non-null  object 
 3   track_name        89740 non-null  object 
 4   popularity        89740 non-null  int64  
 5   duration_ms       89740 non-null  int64  
 6   explicit          89740 non-null  int64  
 7   danceability      89740 non-null  float64
 8   energy            89740 non-null  float64
 9   key               89740 non-null  int64  
 10  loudness          89740 non-null  float64
 11  mode              89740 non-null  int64  
 12  speechiness       89740 non-null  float64
 13  acousticness      89740 non-null  float64
 14  instrumentalness  89740 non-null  float64
 15  liveness          89740 non-null  float64
 16  valence           89740 non-null  float64
 1

In [50]:
artists_df = df[['artists', 'popularity']]

In [51]:
artists_df = artists_df.groupby('artists')['popularity'].mean().reset_index()

In [52]:
artists_df = artists_df.sort_values('popularity', ascending=False).head(50)

In [53]:
artists_df

Unnamed: 0,artists,popularity
24119,Sam Smith;Kim Petras,100.0
3629,Bizarrap;Quevedo,99.0
17442,Manuel Turizo,98.0
2857,Bad Bunny;Chencho Corleone,97.0
2855,Bad Bunny;Bomba Estéreo,95.0
13565,Joji,94.0
3443,Beyoncé,93.0
11491,Harry Styles,92.0
22845,Rema;Selena Gomez,92.0
7938,Drake;21 Savage,91.0


In [54]:
pickle.dump(artists_df, open('artists_top50.pkl', 'wb'))

In [55]:
albums_df = df[['album_name', 'popularity']]

In [56]:
albums_df = albums_df.groupby('album_name')['popularity'].mean().reset_index().sort_values('popularity', ascending=False).head(50)

In [57]:
albums_df

Unnamed: 0,album_name,popularity
40423,Unholy (feat. Kim Petras),100.0
30144,"Quevedo: Bzrp Music Sessions, Vol. 52",99.0
20596,La Bachata,98.0
18430,Indigo (Extended),96.0
17470,I Ain’t Worried (Music From The Motion Picture...,96.0
30261,RENAISSANCE,93.0
27991,PROVENZA,93.0
6684,Calm Down (with Selena Gomez),92.0
35683,Super Freaky Girl,92.0
16124,Harry's House,92.0


In [58]:
pickle.dump(albums_df, open('albums_top50.pkl', 'wb'))

In [59]:
genres_df = df[['track_genre', 'popularity']].groupby('track_genre')['popularity'].mean().reset_index().head(50)

In [60]:
genres_df

Unnamed: 0,track_genre,popularity
0,acoustic,42.483
1,afrobeat,24.407407
2,alt-rock,33.896897
3,alternative,22.218673
4,ambient,44.208208
5,anime,48.776884
6,black-metal,22.431727
7,bluegrass,25.681363
8,blues,31.179104
9,brazil,44.645678


In [61]:
pickle.dump(genres_df, open('genres_top50.pkl', 'wb'))

In [62]:
tracks_df = df[['track_name', 'popularity']].sort_values('popularity', ascending=False).head(50)

In [63]:
tracks_df

Unnamed: 0,track_name,popularity
20001,Unholy (feat. Kim Petras),100
51664,"Quevedo: Bzrp Music Sessions, Vol. 52",99
20008,I'm Good (Blue),98
67356,La Bachata,98
67358,Me Porto Bonito,97
67359,Tití Me Preguntó,97
67559,Efecto,96
20000,Under The Influence,96
79000,I Ain't Worried,96
81052,As It Was,95


In [64]:
pickle.dump(tracks_df, open('tracks_top50.pkl', 'wb'))

In [65]:
df.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

In [66]:
features_df = df.drop(columns=['track_id', 'artists', 'album_name', 'track_name', 'popularity', 'key', 'mode' ,'time_signature', 'track_genre'], axis=1)

In [67]:
n = 1000

In [68]:
features_df = features_df.head(n)

In [69]:
features_df.index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
      dtype='int64', length=1000)

In [70]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features_df)

In [71]:
similarity_score = cosine_similarity(scaled_features, scaled_features)

In [86]:
def recommend(track_name, similarity_score, df, top_n=5):
    data = []
    track_indices = df[df['track_name'] == track_name].index
    track_idx = track_indices[0] if len(track_indices) > 0 else None
    if track_idx is not None:
        sim_scores = list(enumerate(similarity_score[track_idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:top_n+1]  # Exclude the track itself (similarity score of 1)
        track_indices = [x[0] for x in sim_scores]
        for index in track_indices:
            item = [
                df.loc[index, 'track_name'],
                df.loc[index, 'artists'],
                df.loc[index, 'album_name'],
                df.loc[index, 'track_genre']
            ]
            data.append(item)
    return data

In [73]:
pickle.dump(similarity_score, open('similarity_score.pkl', 'wb'))

In [74]:
df_final = df[df.index.isin(features_df.index)]

In [75]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          1000 non-null   object 
 1   artists           1000 non-null   object 
 2   album_name        1000 non-null   object 
 3   track_name        1000 non-null   object 
 4   popularity        1000 non-null   int64  
 5   duration_ms       1000 non-null   int64  
 6   explicit          1000 non-null   int64  
 7   danceability      1000 non-null   float64
 8   energy            1000 non-null   float64
 9   key               1000 non-null   int64  
 10  loudness          1000 non-null   float64
 11  mode              1000 non-null   int64  
 12  speechiness       1000 non-null   float64
 13  acousticness      1000 non-null   float64
 14  instrumentalness  1000 non-null   float64
 15  liveness          1000 non-null   float64
 16  valence           1000 non-null   float64
 17  t

In [76]:
pickle.dump(df_final, open('df.pkl', 'wb'))

In [77]:
df_final.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [87]:
recommend('Comedy', similarity_score, df_final, 5)

[['Look For The Good (Single Version)',
  'Jason Mraz',
  'Look For The Good',
  'acoustic'],
 ['Pop Virus', 'Gen Hoshino', 'POP VIRUS', 'acoustic'],
 ['Days I Will Remember', 'Tyrone Wells', 'Days I Will Remember', 'acoustic'],
 ['The Lotto', 'Ingrid Michaelson;AJR', 'The Lotto', 'acoustic'],
 ['Outside Villanova', 'Eric Hutchinson', 'Sounds Like This', 'acoustic']]