In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

<h1>Import the data</h1>

In [2]:
# Track metadata (title, artist, popularity, like flag, ...).
tracks_df = pd.read_csv('tracks.csv')

# Audio features; the key column is called 'id' in this file, so rename it
# to 'track_id' to match tracks_df before joining.
features_df = pd.read_csv('tracks_features.csv').rename(columns={'id': 'track_id'})

# Inner join: keep only tracks present in both files.
# Both inputs carry a 'duration_ms' column, so the merged frame exposes it
# twice as 'duration_ms_x' (from tracks) and 'duration_ms_y' (from features).
featured_tracks = pd.merge(tracks_df, features_df, how='inner', on='track_id')

In [3]:
# Preview the first five merged rows (metadata + audio features).
featured_tracks.iloc[:5]

Unnamed: 0,track_id,title,artist_id,popularity,duration_ms_x,like,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms_y,time_signature
0,5mhUPDWQH3s544WCsa0r9w,Human,0C0XlULifJtAgn6ZNCW2eu,63.0,245360.0,1.0,0.547,0.837,10.0,-7.085,1.0,0.0676,0.00112,0.000624,0.0937,0.571,135.47,245360.0,4.0
1,10ViidwjGLCfVtGPfdcszR,Home,7giUHu5pv6YTZgSkxxCcgh,68.0,306320.0,1.0,0.545,0.59,2.0,-6.693,1.0,0.0327,0.32,0.000389,0.124,0.125,111.739,306320.0,4.0
2,1Thv8uCYzyOFC7PME9J936,The Island - Pt. I (Dawn),7MqnCTCAX6SsIYYdJCQj9B,56.0,320173.0,1.0,0.578,0.893,9.0,-3.615,0.0,0.0557,6.2e-05,0.00629,0.147,0.37,126.018,320173.0,4.0
3,0Yo8GhK9HEaeIZetPARrW9,Radioactive,53XhwfbYqKCa1cC15pYq2q,53.0,188120.0,1.0,0.46,0.799,9.0,-3.502,1.0,0.0573,0.124,0.000598,0.288,0.266,136.26,188120.0,4.0
4,3ZH2HcN2Q5jXrsnRznS98E,Storytime - Radio Edit,2NPduAUeLVsfIauhRwuft1,19.0,239173.0,1.0,0.48,0.975,5.0,-3.142,0.0,0.0712,0.00485,0.000604,0.121,0.53,156.185,239173.0,4.0


<h1>Data Preparation</h1>
Rescale every numeric feature to the [0, 1] range so that all features contribute on a comparable scale to the distances computed below.

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling maps each numeric column to [0, 1] so that features with
# large raw ranges (e.g. tempo, duration_ms) do not dominate the distances.
scaler = MinMaxScaler()

# Numeric feature columns only; 'track_id' is a string and is left untouched.
num_col = features_df.select_dtypes(include='number').columns

# Work on a copy: a plain assignment would only alias features_df, so the
# scaling below would silently overwrite the raw features as well.
scaled_features_df = features_df.copy()

# Assign the scaled array directly (positional), rather than through an
# intermediate DataFrame that would be re-aligned on the index labels.
scaled_features_df[num_col] = scaler.fit_transform(scaled_features_df[num_col])

scaled_features_df.head()

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,5mhUPDWQH3s544WCsa0r9w,0.501351,0.832668,0.909091,0.725104,1.0,0.139402,0.00115,0.000658,0.072838,0.579278,0.590024,0.382135,0.5
1,10ViidwjGLCfVtGPfdcszR,0.498649,0.558954,0.181818,0.74496,1.0,0.028326,0.330233,0.00041,0.105877,0.096907,0.423641,0.543116,0.5
2,1Thv8uCYzyOFC7PME9J936,0.543243,0.894725,0.818182,0.900871,0.0,0.101528,5.8e-05,0.006635,0.130956,0.361886,0.523754,0.579698,0.5
3,0Yo8GhK9HEaeIZetPARrW9,0.383784,0.790559,0.818182,0.906595,1.0,0.10662,0.127962,0.000631,0.284702,0.249405,0.595563,0.230979,0.5
4,3ZH2HcN2Q5jXrsnRznS98E,0.410811,0.985594,0.454545,0.92483,0.0,0.150859,0.004999,0.000637,0.102606,0.534934,0.735262,0.365797,0.5


<h2>Distances between tracks</h2>

In [5]:
from scipy.spatial.distance import squareform, pdist

# Labels for both axes of the distance matrix.
# Assumes track_id has no duplicates; otherwise the number of labels will not
# match the matrix size and the DataFrame construction below fails.
track_ids = scaled_features_df.track_id.unique()

# Every column except the first one ('track_id') is a scaled numeric feature.
# `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0.
feature_matrix = scaled_features_df.iloc[:, 1:]

# pdist returns the condensed pairwise (Euclidean by default) distances;
# squareform expands them into a full symmetric track-by-track matrix.
dist_df = pd.DataFrame(squareform(pdist(feature_matrix)),
                       columns=track_ids, index=track_ids)

liked_tracks_id = tracks_df.loc[tracks_df.like == 1, 'track_id']

# Columns: keep only the songs the user likes.
dist_df = dist_df[liked_tracks_id]

# Rows: drop the liked songs, so that only candidate tracks remain.
dist_df = dist_df.drop(liked_tracks_id)

# At this point rows are candidate tracks and columns are liked songs.
# Score each candidate by its mean distance to all liked songs.
# (Alternative considered: a low quantile, e.g. q=0.2, instead of the mean.)
dist_df['overall_dist'] = dist_df.mean(axis=1)

# Closest candidates first.
dist_df = dist_df.sort_values(['overall_dist'], ascending=True)
dist_df.head()

Unnamed: 0,5mhUPDWQH3s544WCsa0r9w,10ViidwjGLCfVtGPfdcszR,1Thv8uCYzyOFC7PME9J936,0Yo8GhK9HEaeIZetPARrW9,3ZH2HcN2Q5jXrsnRznS98E,36482hNESSwELpr9sS3NbE,0qnqsfFYgBo0sPHM2JmfTq,3FKmqPkNa8f1UIZuw3SSfV,6GyFP1nfCDB8lbD2bG0Hq9,5s0dPzEGFSuXlQxTrxIWYa,...,20I8RduZC2PWMWTDCZuuAN,0h1UKhueXm1YFLpQjzwv0E,32IMcJ5pQ71rOFD0bIdneJ,4q7XSM6laraTPDglxDqn28,0q6LuUqGLUiCPP1cbdwFs3,1KQADAb1WscjfKMMF5JMFD,35BjsiYKNkMA3JPZFXjgXt,3rN3NK7KO1NiI1j79L5Bi8,3ChV0OY2OMofstepo4uVXy,overall_dist
5G0dcveKXu0WqaZywtTWPg,1.117449,1.189017,0.504666,1.087378,0.469002,0.612425,1.068343,0.75274,0.571399,1.509676,...,0.605243,0.269633,0.57513,1.174003,1.248005,1.183668,1.583937,1.284489,0.492888,0.965438
2hl8lxRV5RLvXZhiG2yGxC,1.076793,1.24777,0.35371,1.091031,0.390355,0.599233,1.059619,0.920812,0.520823,1.483568,...,0.696777,0.332272,0.468619,1.194398,1.27173,1.217612,1.531986,1.299894,0.545742,0.975736
6nsLzJfvp5OLd4mgqUJkpq,1.129861,1.220161,0.555448,1.139106,0.483184,0.614535,1.155997,0.635049,0.673837,1.575227,...,0.628776,0.410986,0.76054,1.14085,1.250653,1.125305,1.686923,1.246569,0.416035,0.982643
2takcwOaAZWiXQijPHIx7B,1.025832,1.323398,0.27797,1.084573,0.527561,0.658452,1.130817,0.837572,0.32182,1.599992,...,0.683888,0.458215,0.63837,1.17416,1.324684,1.187652,1.590044,1.398525,0.487003,0.983675
7ec55GDkRIBmsI8XKrG1lu,1.048987,1.306546,0.338924,1.145833,0.331288,0.594023,1.097972,0.906955,0.574252,1.547155,...,0.642179,0.465405,0.634631,1.148427,1.271003,1.160525,1.616248,1.261196,0.563622,0.988261


In [6]:
# Number of recommendations to keep.
N_BEST = 20

# dist_df is sorted by ascending overall_dist, so its first N_BEST rows are
# the candidates closest (on average) to the liked songs.
best_tracks_id = dist_df.head(N_BEST).index

# Look up the metadata of the selected tracks.
# Note: isin() keeps tracks_df's own row order, not the distance ranking.
is_best = tracks_df['track_id'].isin(best_tracks_id)
tracks_df[is_best]

Unnamed: 0,track_id,title,artist_id,popularity,duration_ms,like
47,0kYUrLVQOfx21xuXu7OGrT,All These Things That I've Done,0C0XlULifJtAgn6ZNCW2eu,63.0,301573.0,0.0
50,4prEPl61C8qZpeo3IkYSMl,Sleepyhead,7gjAu1qr5C2grXeQFFOGeh,64.0,174760.0,0.0
51,4VbDJMkAX3dWNBdn3KH6Wx,Helena Beat,7gP3bB2nilZXLfPHJhMdvc,63.0,276173.0,0.0
54,2takcwOaAZWiXQijPHIx7B,Time Is Running Out,12Chz98pHFMPJEknJQMWvI,63.0,237039.0,0.0
63,12HB8AmFTovKrFcGG36KbL,Delilah,1moxjboGR7GNWYIMWsRjgG,58.0,293468.0,0.0
70,3HreJkWbRih4QPnIMJQK1i,The Riddler,2NPduAUeLVsfIauhRwuft1,29.0,315960.0,0.0
82,7ec55GDkRIBmsI8XKrG1lu,Disco 2000,36E7oYfz3LLRto6l2WmDcD,56.0,273733.0,0.0
107,2hl8lxRV5RLvXZhiG2yGxC,Sing - Yellow Claw & Cesqeaux Remix,5zYJziKktyqWwmoAWXrShP,41.0,243216.0,0.0
113,2K7j4xrQENCi5r3Hii4cVe,Shatter Me Featuring Lzzy Hale,378dH6EszOLFShpRzAQkVM,57.0,280799.0,0.0
123,7pPoBydlANHqWDYgm2DkSK,Cloud 9,1feoGrmmD8QmNqtK2Gdwy8,55.0,277577.0,0.0


We have to reduce the distance matrix: we should only see the distances between the songs we like and the other songs.