In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

<h1>Import the data</h1>

In [2]:
tracks_df = pd.read_csv('tracks.csv')
features_df = pd.read_csv('tracks_features.csv')
features_df.rename(columns={'id': 'track_id'}, inplace=True)

featured_tracks = tracks_df.merge(features_df, on='track_id', how='inner')

In [3]:
featured_tracks.head()

Unnamed: 0,track_id,title,artist_id,popularity,duration_ms_x,like,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms_y,time_signature
0,4iK0yULrceBlRGBYww7A1E,Amina,4WnAHZz1pgl8hus8hidIRV,50.0,204626.0,1.0,0.79,0.902,1.0,-5.757,1.0,0.0876,0.129,0.0,0.132,0.785,93.766,204627.0,3.0
1,6UkpaWKK4hYUXC2x4lsuJU,Woju (Remix),6eev6LNl6SEAKqLIo7vd9D,43.0,204210.0,1.0,0.791,0.906,7.0,-2.705,1.0,0.0478,0.302,3e-06,0.0356,0.97,113.955,204211.0,4.0
2,2B3FCVxi308OK2z8suLD7r,Afro Trap Pt. 4 (Fais le mouv),4WnAHZz1pgl8hus8hidIRV,58.0,155706.0,1.0,0.952,0.758,7.0,-4.628,1.0,0.266,0.0181,7e-06,0.0447,0.849,126.981,155707.0,4.0
3,6eB0OjBglqL9KmouIfJ9hn,Maman j’ai mal,4WnAHZz1pgl8hus8hidIRV,59.0,203680.0,1.0,0.723,0.701,0.0,-7.363,0.0,0.136,0.61,0.0,0.188,0.534,116.019,203680.0,4.0
4,66mNivkwO5D16jIajJ5phZ,Afro Trap Pt. 5 (Ngatie Abedi),4WnAHZz1pgl8hus8hidIRV,62.0,154946.0,1.0,0.888,0.828,11.0,-3.535,0.0,0.0787,0.526,0.0,0.107,0.768,127.996,154947.0,4.0


<h1>Data Preparation</h1>
Perform some scaling on the data to have everything well distributed

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

num_col = features_df.describe().columns
scaled_features_df = features_df
scaled_features_df[num_col] = pd.DataFrame(scaler.fit_transform(scaled_features_df[num_col]), columns=num_col)

scaled_features_df.head()

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,4iK0yULrceBlRGBYww7A1E,0.764877,0.858647,0.090909,0.690484,1.0,0.158704,0.141372,0.0,0.129398,0.790755,0.182368,0.185953,0.0
1,6UkpaWKK4hYUXC2x4lsuJU,0.766328,0.864662,0.636364,0.858361,1.0,0.049423,0.331324,3e-06,0.019977,0.988257,0.317665,0.184982,0.5
2,2B3FCVxi308OK2z8suLD7r,1.0,0.642105,0.636364,0.752585,1.0,0.648545,0.019605,8e-06,0.030306,0.85908,0.404959,0.071692,0.5
3,6eB0OjBglqL9KmouIfJ9hn,0.667634,0.556391,0.0,0.602145,0.0,0.291598,0.669505,0.0,0.192963,0.522793,0.331497,0.183741,0.5
4,66mNivkwO5D16jIajJ5phZ,0.907112,0.747368,1.0,0.812706,0.0,0.134267,0.577274,0.0,0.101022,0.772606,0.411761,0.069916,0.5


<h2>Distances between tracks</h2>

In [31]:
from scipy.spatial.distance import squareform, pdist
dist_df = pd.DataFrame(squareform(pdist(scaled_features_df.ix[:, 1:])), columns=scaled_features_df.track_id.unique(),
             index=scaled_features_df.track_id.unique())

liked_tracks_id = tracks_df[tracks_df.like == 1]['track_id']
#On the colums, we keep only the song the user like
dist_df = dist_df[liked_tracks_id]

#On the rows, we drop the song we like
dist_df = dist_df.drop(liked_tracks_id)

#On the rows, we have the new tracks and on the column, we have the song the user like

#dist_df['overall_dist'] = dist_df.quantile(q=0.2, axis=1)
dist_df['overall_dist'] = dist_df.mean(axis=1)

dist_df.sort_values(['overall_dist'], ascending=True, inplace=True)
dist_df.head()
#sns.heatmap(data=dist_df, cmap="YlGnBu")

Unnamed: 0,4iK0yULrceBlRGBYww7A1E,6UkpaWKK4hYUXC2x4lsuJU,2B3FCVxi308OK2z8suLD7r,6eB0OjBglqL9KmouIfJ9hn,66mNivkwO5D16jIajJ5phZ,0deXVyvt3sUw813RtOQy01,0v9yrgq9lNv5RJ3NPkCxmg,410olNFklIex3RgsmUgWuo,6B06Inx97HC77t4y0wHxux,0J5yZ8WhWPyNnDYg1EWN4J,...,0bBXpeBxFLRCLAzbmd7cbM,1FkWnltHMW1mDxRapOsMMd,4C6kfMCoUimaW1dqHVy033,0udOlv0R9AiEft5rucfo03,06B0pbiRWZJAfxJXu6R4wY,0B1X6jlzNv6rUWmdDBOG9S,0GHHH0v6Q6WT2mhAgnUC1v,0xOCWFZiMkOBGIWvgEKfWD,6KjIPqQlgf2jetOWtF6Zs8,overall_dist
2f4cgNrOtiN7VUxu3Dnziw,0.799102,0.486933,0.484471,1.399365,1.227802,0.8522,1.224934,1.025055,0.715387,1.191409,...,0.676538,1.72107,0.823632,0.691971,1.093473,1.003714,1.271938,0.452457,0.83066,0.499811
1vBzd0jlsXdbkcah5NTRGh,0.872705,0.349131,0.604554,1.411104,1.148942,0.816222,1.139999,0.914382,0.672616,1.19377,...,0.708772,1.729439,0.765166,0.686848,1.110628,0.834868,1.274961,0.61244,0.583844,0.560372
247ZfUqi8jh9a4U629uQrX,0.861899,0.383676,0.750848,1.414739,1.173206,0.97031,1.221631,0.984613,0.649197,1.209313,...,0.750644,1.755842,0.830726,0.601354,1.141387,0.909248,1.28226,0.6854,0.62659,0.561802
4F53vz6YY7LLTXlmQQqC9h,0.947251,0.405921,0.76186,1.458066,1.126634,1.041449,1.241226,0.983785,0.754143,1.20069,...,0.804568,1.784769,1.044098,0.660523,1.099762,0.78453,1.279255,0.722862,0.57093,0.572237
6leKqvSviIydDluSDB7gQz,0.834377,0.584583,0.496073,1.386824,1.268837,0.826009,1.225781,0.993233,0.763937,1.221244,...,0.725284,1.674485,0.777265,0.663864,1.033384,1.010062,1.259481,0.482285,0.870097,0.578656


In [34]:
#Get the 10 best tracks
best_tracks_id = dist_df.head(20).index

tracks_df[tracks_df.track_id.isin(best_tracks_id)]

Unnamed: 0,track_id,title,artist_id,popularity,duration_ms,like
58,5czvDLUDDQLIrU9RsyFooq,Napo,6eev6LNl6SEAKqLIo7vd9D,29.0,194455.0,0.0
62,4Cd2DY6EjV8aqI9sjVdJk6,Jambole,3eTpitQsrNQdmkQJHS2v2j,30.0,194063.0,0.0
75,6leKqvSviIydDluSDB7gQz,Kele Kele,1hNaHKp2Za5YdOAG0WnRbc,32.0,222693.0,0.0
80,2v3l2JMdEy88igmezrkzQp,Sytia Loss,5BfpzKNakWiXUNm1RfBgUi,23.0,211987.0,0.0
92,4h3NiCORd1xcXUYxps2Uks,Folarin,1hNaHKp2Za5YdOAG0WnRbc,17.0,251240.0,0.0
118,1vBzd0jlsXdbkcah5NTRGh,Kadondo Style,3eTpitQsrNQdmkQJHS2v2j,36.0,234240.0,0.0
121,4F53vz6YY7LLTXlmQQqC9h,Rouler moutou,1Vgb3eqBF3DLP6FwY7MuB4,22.0,248600.0,0.0
148,6DPIMsoageIVwsUV4VZVlv,Toi et moi,2nded5zgVbSiKxdh70DoP7,11.0,199505.0,0.0
168,43F5evseGamQ5ZXPBjdjv6,Letter To TINA,374sWpAJsbZckf98df2jJJ,28.0,244801.0,0.0
191,2bPaCYSYW8MWmA5xbjJdMA,C'est pas possible (feat. Youness) - Radio Edit,1Cy58GZRk3TYmX0pb6pExg,33.0,179685.0,0.0


We have the reduce the dist matrix. We should only see the distance between the song we like and the others

ValueError: labels ['4iK0yULrceBlRGBYww7A1E' '6UkpaWKK4hYUXC2x4lsuJU' '2B3FCVxi308OK2z8suLD7r'
 '6eB0OjBglqL9KmouIfJ9hn' '66mNivkwO5D16jIajJ5phZ' '0deXVyvt3sUw813RtOQy01'
 '0v9yrgq9lNv5RJ3NPkCxmg' '410olNFklIex3RgsmUgWuo' '6B06Inx97HC77t4y0wHxux'
 '0J5yZ8WhWPyNnDYg1EWN4J' '2XT9mDhAtP0uoxBqJ8HF36' '3q6cDwOPDPZ9ZBvMrrfdwU'
 '3FA1Ktv9lTUtFTwOXr6wMN' '3a1PoUzZS5L7tUN8e1W9A0' '1k8A2fQEwbO7yN5JWp7jQn'
 '29umcFLM8LZvjBS9lTlLQk' '41bp03eAj9ZuYbGNkCkkg3' '0ban5QRXUveaKtQPGuaoQ9'
 '6Nh2UEWq9mI3iIWIR8L8n2' '3ePRRiOYaYuFjbdjbAxeh4' '67VH1BYRIMjGj3fZQBHDS2'
 '5iL9eHXCoetboTwcQGz4Oy' '0h1UKhueXm1YFLpQjzwv0E' '0NtK87XUpxAnrxncVh8xzs'
 '2WLpYiWNYtHeS5izWhrh6Q' '3WjrpeXCi4lQBUjVLSlpmu' '14iHXsW6PMGtNhhhQqd7jH'
 '2MjuGAGJAbv0wjn77g3s4L' '5Mgv97QW3dcwODgUrtmIlN' '1H6BCeydQQQjH8y8Cc9Xbz'
 '1yGnWQboreZCHMfT1fNA6P' '4DUlmxfCl6jrZdYAdfoYD6' '4okdTLfW8dskRvzzlxnG4S'
 '5a39V8NZADQeq7yxRJ8H7l' '4EcXepJeuWXZ8nYI0aiBHC' '4OrnVTYj3Od79lBiyFHg4p'
 '7umarL2ppWxQkjcMhTS0tu' '6E42NBef4akzQdRfSsAfDz' '0fJbAoGJxzwszQdOs3LiTC'
 '5irIgvabRfzByac6xFozD8' '0bBXpeBxFLRCLAzbmd7cbM' '1FkWnltHMW1mDxRapOsMMd'
 '4C6kfMCoUimaW1dqHVy033' '0udOlv0R9AiEft5rucfo03' '06B0pbiRWZJAfxJXu6R4wY'
 '0B1X6jlzNv6rUWmdDBOG9S' '0GHHH0v6Q6WT2mhAgnUC1v' '0xOCWFZiMkOBGIWvgEKfWD'
 '6KjIPqQlgf2jetOWtF6Zs8'] not contained in axis