In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")

from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Load the Spotify tracks dataset; the path is relative to the notebook's working directory.
dataset_path = './src/spotify_dataset.csv'
spotify = pd.read_csv(dataset_path)

# Preview the first rows to check that the columns were parsed as expected.
spotify.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,popularity,decade
0,Jealous Kind Of Fella,Garland Green,spotify:track:1dtKN6wwlolkM8XZy2y9C1,0.417,0.62,3,-7.727,1,0.0403,0.49,0.0,0.0779,0.845,185.655,173533,3,32.94975,9,1,60s
1,Initials B.B.,Serge Gainsbourg,spotify:track:5hjsmSnUefdUqzsDogisiX,0.498,0.505,3,-12.475,1,0.0337,0.018,0.107,0.176,0.797,101.801,213613,4,48.8251,10,0,60s
2,Melody Twist,Lord Melody,spotify:track:6uk8tI6pwxxdVTNlNOJeJh,0.657,0.649,5,-13.392,1,0.038,0.846,4e-06,0.119,0.908,115.94,223960,4,37.22663,12,0,60s
3,Mi Bomba Sonó,Celia Cruz,spotify:track:7aNjMJ05FvUXACPWZ7yJmv,0.59,0.545,7,-12.058,0,0.104,0.706,0.0246,0.061,0.967,105.592,157907,4,24.75484,8,0,60s
4,Uravu Solla,P. Susheela,spotify:track:1rQ0clvgkzWr001POOPJWx,0.515,0.765,11,-3.515,0,0.124,0.857,0.000872,0.213,0.906,114.617,245600,4,21.79874,14,0,60s


In [10]:
# Categorical columns: one-hot encoded below instead of being used as raw values.
qualitative_feature = ['key', 'time_signature', 'decade']
# Identifier/text columns and the target column are never used as features.
non_feature_columns = ["track", "artist", "uri", "popularity"]

encoder = OneHotEncoder()

# Remaining features: everything except identifiers, the target and the
# categorical columns. The drop list is derived from `qualitative_feature`
# so the two lists cannot drift apart.
X_numeric = spotify.drop(columns=non_feature_columns + qualitative_feature)

# Encode each categorical column on its own so the generated columns can be
# named "<feature>:<category>".
encoded_frames = []
for feature in qualitative_feature:
    enc = encoder.fit(spotify[[feature]])
    enc_df = pd.DataFrame(
        enc.transform(spotify[[feature]]).toarray(),
        columns=[feature + ":" + str(c) for c in enc.categories_[0]],
        # Reuse the source index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows if `spotify` were ever filtered
        # or re-indexed before this cell.
        index=spotify.index,
    )
    encoded_frames.append(enc_df)

# Single concatenation instead of growing X once per loop iteration.
X = pd.concat([X_numeric] + encoded_frames, axis=1)
X

Unnamed: 0,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,time_signature:1,time_signature:3,time_signature:4,time_signature:5,decade:00s,decade:10s,decade:60s,decade:70s,decade:80s,decade:90s
0,0.417,0.620,-7.727,1,0.0403,0.4900,0.000000,0.0779,0.8450,185.655,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.498,0.505,-12.475,1,0.0337,0.0180,0.107000,0.1760,0.7970,101.801,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.657,0.649,-13.392,1,0.0380,0.8460,0.000004,0.1190,0.9080,115.940,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.590,0.545,-12.058,0,0.1040,0.7060,0.024600,0.0610,0.9670,105.592,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.515,0.765,-3.515,0,0.1240,0.8570,0.000872,0.2130,0.9060,114.617,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41094,0.172,0.358,-14.430,1,0.0342,0.8860,0.966000,0.3140,0.0361,72.272,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
41095,0.910,0.366,-9.954,1,0.0941,0.0996,0.000000,0.2610,0.7400,119.985,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
41096,0.719,0.804,-4.581,1,0.0355,0.0132,0.000003,0.1390,0.6050,119.999,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
41097,0.600,0.177,-16.070,1,0.0561,0.9890,0.868000,0.1490,0.5600,120.030,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
# Target vector: the `popularity` column (0/1 values in the displayed sample),
# one entry per track, kept as a Series that shares the index of `spotify`.
y = spotify.loc[:, "popularity"]
y

0        1
1        0
2        0
3        0
4        0
        ..
41094    0
41095    1
41096    1
41097    0
41098    0
Name: popularity, Length: 41099, dtype: int64

Fonction permettant de voir rapidement les résultats d'un modèle

In [12]:
def results(m, features=None, target=None):
    """Print a quick evaluation summary for a fitted classifier.

    The summary consists of the model's repr, its ``score`` on the given data
    and a cross-tabulation of predicted labels (rows) against true labels
    (columns).

    Parameters
    ----------
    m : fitted estimator
        Must expose ``score(features, target)`` and ``predict(features)``.
    features : optional
        Feature matrix to evaluate on. Defaults to the notebook-level ``X``.
    target : optional
        True labels matching ``features``. Defaults to the notebook-level ``y``.

    Returns
    -------
    None
        The summary is printed, nothing is returned.

    Notes
    -----
    When called with the defaults, the model is evaluated on the global
    ``X``/``y``. If those are the data the model was fitted on, the reported
    score is an in-sample score; pass a held-out set through ``features`` and
    ``target`` to get an out-of-sample estimate.
    """
    # Fall back to the notebook globals so existing `results(model)` calls
    # keep working unchanged.
    if features is None:
        features = X
    if target is None:
        target = y

    print("Méthode :", m)
    print()
    print("Score : " + str(m.score(features, target)))
    print()
    # Rows: predicted labels, columns: true labels.
    print(pd.crosstab(m.predict(features), target))

### $k$-NN

In [13]:
# k-nearest-neighbours classifier with k = 5, fitted on the full feature matrix.
# NOTE(review): the model is fitted and then scored on the same X/y, so the
# reported score is in-sample; the features are also passed in unscaled —
# confirm both are intended.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
results(knn)

Méthode : KNeighborsClassifier()

Score : 0.7237159054964841

popularity      0      1
row_0                   
0           13879   4683
1            6672  15865
