### 1. Importing libraries

In [273]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

### 2. Loading the song dataset

In [274]:
songs = pd.read_csv('user_spotify_v3.json.tracks1.csv')
print(songs.shape)
songs.head(10)

(109233, 15)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p
7,0.316,0.661,212120,0.715,0.0,5,0.178,-5.651,0,0.119,148.027,4,0.411,,2amzBJRBPOGszBem4FedfE
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ


In [275]:
songs.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0
mean,0.380904,0.552116,269176.0,0.543602,0.148816,5.229436,0.194541,-10.618803,0.664369,0.151028,117.32221,3.850732,0.440351
std,0.363885,0.174904,246220.8,0.286918,0.301031,3.561608,0.167634,7.077973,0.472213,0.245035,30.634027,0.568357,0.252394
min,0.0,0.0,1155.0,0.0,0.0,0.0,0.0,-58.555,0.0,0.0,0.0,0.0,0.0
25%,0.0293,0.443,189493.0,0.303,0.0,2.0,0.0968,-13.548,0.0,0.0354,94.995,4.0,0.236
50%,0.248,0.574,223962.0,0.576,0.000109,5.0,0.125,-8.279,1.0,0.0488,116.538,4.0,0.417
75%,0.753,0.678,272000.0,0.792,0.0496,8.0,0.237,-5.577,1.0,0.103,135.522,4.0,0.632
max,0.996,0.985,5925082.0,1.0,1.0,11.0,0.997,1.974,1.0,0.969,232.69,5.0,0.999


In [276]:
# Removing duplicate rows and rows with null values
print("Original shape: {}".format(songs.shape))
songs.drop_duplicates(inplace=True)
songs.dropna(how='any', inplace=True)
print("Shape of dataset after modifications: {}".format(songs.shape))

Original shape: (109233, 15)
Shape of dataset after modifications: (56452, 15)


In [277]:
# Getting genres (taking the first genre of the list)
genre = []

for s in songs['genres']:
    g = s[:s.find(" ")]
    genre.append(g)
#     print(s)
    
songs['genre'] = genre

In [278]:
songs.head(10)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id,genre
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI,pop
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL,dance
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf,pop
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU,dance
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV,hip
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs,dance
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p,dance
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6,dance
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ,dance
10,0.413,0.827,187250,0.419,0.0,10,0.115,-10.329,0,0.112,119.974,4,0.227,underground hip hop,3al2hpm92xE0pBalqWQHdD,underground


In [279]:
songs = songs.reset_index(drop=True)

### Selecting four features to define similarity: acousticness, danceability, energy and liveness

We need to take a couple of steps:
1. Scale all the data
2. Select a random song for a particular genre 
3. Get the closest X songs on those features (by euclidean distance)




#### 1. Scaling the data

In [280]:
# Getting features
features = songs.iloc[:,:(songs.shape[1]-3)]


# Scaling featues
scaler = MinMaxScaler().fit(features)
data = scaler.transform(features)
data = pd.DataFrame(data, columns= features.columns)
data['genre'] = songs['genre']
data['id'] = songs['id']

In [281]:
data.head(100)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id
0,0.782129,0.595939,0.047149,0.299,0.000000,0.727273,0.123370,0.827932,1.0,0.027198,0.408234,0.6,0.356356,pop,1bhUWB0zJMIKr9yVPrkEuI
1,0.245984,0.647716,0.037332,0.658,0.000004,0.272727,0.092177,0.847222,1.0,0.047156,0.451571,0.8,0.330330,dance,2xmrfQpmS2iJExTlklLoAL
2,0.635542,0.776650,0.041680,0.688,0.000000,0.363636,0.073621,0.861078,1.0,0.086970,0.386837,0.8,0.434434,pop,42CeaId2XNlxugDvyqHfDf
3,0.129518,0.730964,0.035917,0.807,0.000000,1.000000,0.183551,0.879060,0.0,0.044674,0.536963,0.8,0.305305,dance,0tBbt8CrmxbjRP0pueQkyU
4,0.004147,0.662944,0.036795,0.718,0.000000,0.272727,0.053862,0.867232,0.0,0.220269,0.352546,0.8,0.216216,hip,0OI7AFifLSoGzpb8bdBLLV
5,0.083835,0.657868,0.034576,0.608,0.000000,0.727273,0.105316,0.868558,1.0,0.060703,0.542009,0.8,0.488488,dance,7eFmN6wnsb7WowRKAqRFfs
6,0.032129,0.677157,0.031685,0.726,0.000000,0.727273,0.074724,0.886762,1.0,0.055843,0.442653,0.8,0.770771,dance,5Gu0PDLN4YJeW75PpBSg9p
7,0.023394,0.857868,0.034006,0.709,0.000000,0.909091,0.094283,0.879853,0.0,0.073837,0.421428,0.8,0.620621,dance,2z4pcBLQXF2BXKFvd0BuB6
8,0.399598,0.762437,0.036118,0.579,0.000023,0.181818,0.133400,0.889268,1.0,0.033195,0.451377,0.8,0.349349,dance,3whrwq4DtvucphBPUogRuJ
9,0.414659,0.839594,0.033957,0.419,0.000000,0.909091,0.115346,0.773321,0.0,0.115822,0.515596,0.8,0.227227,underground,3al2hpm92xE0pBalqWQHdD


#### 2. Select a random song from the given genre

In [282]:
print("Number of genres available: {}".format(len(data['genre'].unique())))
data['genre'].unique()

Number of genres available: 716


array(['pop', 'dance', 'hip', 'underground', 'bmore', 'dwn', 'latin',
       'rap', 'big', 'brostep', 'edm', 'detroit', 'drill', 'post-teen',
       'dirty', 'deep', 'east', 'ra', 'crunk', 'bass', 'indie',
       'chillwave', 'danish', 'canadian', 'irish', 'blues-rock',
       'alternative', 'alt-indie', 'escape', 'modern', 'emo', 'garage',
       'melodic', 'glam', 'folk-pop', 'contemporary', 'australian', 'lift',
       'christian', 'chicago', 'bachata', 'trap', 'reggaeton', 'cumbia',
       'reggaeto', 'colombian', 'aussietronica', 'house', 'chamber', 'boy',
       'acoustic', 'vapor', 'brooklyn', 'progressive', 'quebecoi',
       'indiecoustic', 'viral', 'channel', 'dreamo', 'folk-po', 'focu',
       'classify', 'compositional', 'new', 'ambient', 'soul', 'nu', 'bow',
       'scorecore', 'focus', 'austindie', 'funk', 'neo', 'folk', 'freak',
       'portland', 'michigan', 'chanson', 'anti-folk', 'vancouver',
       'norwegian', 'seattle', 'electroclash', 'bay', 'adult', 'tropical',
 

In [283]:
selected_genre = 'hip'
N = 10

genre_data = data[data.genre==selected_genre]


ind = data[data.genre==selected_genre].index
r = np.random.choice(ind,1)[0]


In [284]:
seed = data.iloc[r,:]
seed

acousticness                     0.0310241
danceability                      0.804061
duration_ms                      0.0372906
energy                               0.819
instrumentalness                         0
key                               0.454545
liveness                          0.390171
loudness                          0.863805
mode                                     1
speechiness                      0.0896587
tempo                             0.386695
time_signature                         0.8
valence                           0.433433
genre                                latin
id                  209gZgcfLq2aUuu51vOWBl
Name: 310, dtype: object

In [285]:
# Getting feature values for our seed song
acousticness = seed.acousticness
danceability = seed.danceability
energy = seed.energy
liveness = seed.liveness

#### 3. Get the closest N songs on those features (by euclidean distance)

In [286]:
# Calculating euclidean distance for every song with respect to the seed song
distance = []

for i in genre_data.index:
#     print(i)
    d = np.sqrt((genre_data.loc[i,'acousticness']-acousticness)**2 + (genre_data.loc[i,'danceability']-danceability)**2 + (genre_data.loc[i,'energy']-energy)**2 + (genre_data.loc[i,'liveness']-liveness)**2)
    distance.append(d)
    
distance

[0.38205973041772373,
 0.31278655013287865,
 0.34445683986250869,
 0.4182031470724904,
 0.33609912044361712,
 0.39585411019629219,
 0.31135675171649219,
 0.10273232356119896,
 0.40162816958781145,
 0.28726323918816438,
 0.11304665926781311,
 0.25484061587947671,
 0.31614973033491967,
 0.1635348460485348,
 0.19357182217939589,
 0.0,
 0.37998283948606859,
 0.3882219140266549,
 0.42406557253391564,
 0.15158914839174195,
 0.27837844196261985,
 0.35559523165094759,
 0.23145341036195699,
 0.34288766645877339,
 0.27241681097419485,
 0.37665854740665039,
 0.36925976233722674,
 0.37586937398929982,
 0.19874268624054281,
 0.39143039367378091,
 0.31586751127040252,
 0.35027861401201543,
 0.32674994879226454,
 0.15187693537179045,
 0.2419075457246497,
 0.15428628820239312,
 0.33393011625435454,
 0.98914168925880674,
 0.20343667058613177,
 0.37810908918306796,
 0.986748040266199,
 0.53708731193487236,
 1.0079983546893985,
 0.87638477142436666,
 0.24352174481757433,
 0.15993234669234815,
 0.52320644

In [287]:
genre_data = genre_data.reset_index(drop=True)
genre_data['distance'] = distance
genre_data

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
0,0.180723,0.786802,0.035089,0.679,0.000073,0.363636,0.068205,0.871783,0.0,0.139607,0.713430,0.8,0.619620,latin,3Ga6eKrUFf12ouh9Yw3v2D,0.382060
1,0.032932,0.737056,0.031488,0.889,0.000000,0.000000,0.092778,0.896287,1.0,0.044364,0.412669,0.8,0.649650,latin,2hl6q70unbviGo3g1R7uFx,0.312787
2,0.025904,0.757360,0.037331,0.794,0.000000,0.090909,0.049850,0.846983,1.0,0.072182,0.450501,0.8,0.447447,latin,6Zosz8w6CmdGMZM6p4JlbQ,0.344457
3,0.170683,0.739086,0.050810,0.541,0.000000,0.090909,0.118355,0.859272,1.0,0.032885,0.558791,0.8,0.206206,latin,0gCPvo1GkbtPhMqg5Gbx1K,0.418203
4,0.048394,0.691371,0.039954,0.788,0.000000,0.090909,0.075527,0.883980,1.0,0.081179,0.730667,0.8,0.839840,latin,33bnxcjePIkcmNjEFTJX0l,0.336099
5,0.016466,0.543147,0.034356,0.678,0.000013,1.000000,0.128385,0.873238,0.0,0.103206,0.441278,0.8,0.287287,latin,4ipnJyDU3Lq15qBAYNqlqK,0.395854
6,0.024598,0.758376,0.042266,0.864,0.000045,0.727273,0.085557,0.905021,0.0,0.061944,0.395496,0.8,0.754755,latin,2cnKEkpVUSV4wnjQiTWfH6,0.311357
7,0.090562,0.759391,0.035035,0.884,0.000000,0.909091,0.362086,0.898111,1.0,0.051810,0.408466,0.8,0.678679,latin,4z3GJkrtH97Bj6fRta983T,0.102732
8,0.287149,0.716751,0.049253,0.689,0.000000,0.545455,0.123370,0.861686,1.0,0.175801,0.619395,0.8,0.411411,latin,5LVoGJTyG6fLJgoGkY2QAZ,0.401628
9,0.011245,0.712690,0.033286,0.840,0.000000,0.000000,0.119358,0.898277,1.0,0.090900,0.438308,0.8,0.624625,latin,0yyZN5ASdrYu0XYWFzfxUu,0.287263


In [288]:
genre_data = genre_data.sort_values(by=['distance'], ascending=True)

In [289]:
playlist = genre_data.iloc[:N,:]
print(playlist.shape)
playlist

(10, 16)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
15,0.031024,0.804061,0.037291,0.819,0.0,0.454545,0.390171,0.863805,1.0,0.089659,0.386695,0.8,0.433433,latin,209gZgcfLq2aUuu51vOWBl,0.0
93,0.049699,0.811168,0.070135,0.8,1.6e-05,0.090909,0.332999,0.874675,1.0,0.076525,0.39544,0.8,0.414414,latin,5P45f6HakSHDwdFj5dO5K5,0.063473
113,0.056124,0.84467,0.038008,0.772,0.000486,0.909091,0.441324,0.863602,1.0,0.103413,0.412635,0.8,0.704705,latin,7pk3EpFtmsOdj8iUhjmeCM,0.08429
132,0.082932,0.753299,0.035324,0.858,0.0,0.909091,0.365095,0.872501,1.0,0.049948,0.408479,0.8,0.640641,latin,0JcNysfWVWaMS7R6vzGB2k,0.086145
7,0.090562,0.759391,0.035035,0.884,0.0,0.909091,0.362086,0.898111,1.0,0.05181,0.408466,0.8,0.678679,latin,4z3GJkrtH97Bj6fRta983T,0.102732
89,0.011647,0.793909,0.04828,0.864,0.00268,0.454545,0.292879,0.931405,0.0,0.041572,0.412777,0.8,0.866867,latin,0atfY1ew83Ql9nNnJiD2t5,0.109404
10,0.010643,0.793909,0.04828,0.868,0.00252,0.454545,0.290873,0.932031,0.0,0.041365,0.412747,0.8,0.884885,latin,693iqPOQvhI7PobtR8CC8v,0.113047
82,0.090462,0.873096,0.040409,0.817,0.00181,0.545455,0.287864,0.893708,0.0,0.118925,0.412605,0.8,0.723724,latin,2oEe1ELrLZjZ2UkMoUeP4E,0.137002
68,0.121486,0.73401,0.060327,0.848,0.301,0.363636,0.312939,0.839466,0.0,0.040228,0.528669,0.8,0.864865,latin,76qDfugauy5dqRKUdhg9cr,0.141054
19,0.118474,0.711675,0.042166,0.845,8e-06,0.181818,0.311936,0.913183,0.0,0.06908,0.739301,0.8,0.756757,latin,4XCQtUoRSh22Ep18ULmDnT,0.151589
