In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")

from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

In [35]:
spotify = pd.read_csv("./src/tracks.csv", low_memory=False)
print(spotify.shape)
spotify.columns

(586672, 20)


Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'id_artists', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature'],
      dtype='object')

In [36]:
spotify.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [37]:
def format_sample_df(df):
    df['artists'] = df['artists'].apply(lambda x: x[2:-2])
    df.rename(columns={'name':'track'}, inplace=True)
    cols1 = ['track','artists','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness']
    cols2 = ['liveness','valence','tempo','duration_ms','popularity','explicit','release_date']
    return df[cols1+cols2]

In [38]:
spotify = format_sample_df(spotify)
spotify.head()

Unnamed: 0,track,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,popularity,explicit,release_date
0,Carve,Uli,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,126903,6,0,1922-02-22
1,Capítulo 2.16 - Banquero Anarquista,Fernando Pessoa,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,98200,0,0,1922-06-01
2,Vivo para Quererte - Remasterizado,Ignacio Corsini,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,181640,0,0,1922-03-21
3,El Prisionero - Remasterizado,Ignacio Corsini,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,176907,0,0,1922-03-21
4,Lady of the Evening,Dick Haymes,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,163080,0,0,1922


Ici, nous enlevons les variables inutilisables telles que le nom du titre et de l'artiste. De plus, nous enlevons la variable qualitative `key`. On va la rajouter en l'encodant en one-hot (avec **OneHotEncoder**).

D'abord, on va transformer la variable `release_date` pour obtenir des tranches d'années qui permettront une meilleure représentation du modèle

In [39]:
def create_decennies_categ(x):
    if x < 1940:
        return '-40s'
    if (x >= 1940) & (x < 1950):
        return '40s'
    if (x >= 1950) & (x < 1960):
        return '50s'
    if (x >= 1960) & (x < 1970):
        return '60s'
    if (x >= 1970) & (x < 1980):
        return '70s'
    if (x >= 1980) & (x < 1990):
        return '80s'
    if (x >= 1990) & (x < 2000):
        return '90s'
    if (x >= 2000) & (x < 2010):
        return '2000s'
    if (x >= 2010) & (x < 2020):
        return '2010s'
    else:
        return '+2020s'

In [40]:
spotify_cp = spotify.copy()
spotify_cp['year'] = spotify_cp['release_date'].apply(lambda x: x[:4])
spotify_cp['year'] = spotify_cp['year'].astype(int)
spotify_cp['decade'] = spotify_cp['year'].apply(create_decennies_categ)
spotify_cp.drop(['year', 'release_date'], axis=1, inplace=True)

In [41]:
encoder = OneHotEncoder()

In [42]:
encoder.fit_transform(spotify[['key']]).todense()

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
encoder.fit(spotify_cp[['key']]).categories_[0]


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [44]:
X = spotify_cp.drop(["track", "artists", "key", "decade"], axis=1)

var_qualitatives = ['key', 'decade']
for v in var_qualitatives:
    # on fit pour que le modèle ajuste l'encodage one hot sur les données d'entrainement
    onehot = encoder.fit(spotify_cp[[v]])
    # on convertit les nouvelles données en onehot avec transform() puis on cree un dataframe
    onehot_df = pd.DataFrame(onehot.transform(spotify_cp[[v]]).toarray(), columns = [v+":"+str(c) for c in onehot.categories_[0]])
    X = pd.concat([X, onehot_df], axis = 1)

X

Unnamed: 0,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,decade:+2020s,decade:-40s,decade:2000s,decade:2010s,decade:40s,decade:50s,decade:60s,decade:70s,decade:80s,decade:90s
0,0.645,0.4450,-13.338,1,0.4510,0.674,0.744000,0.1510,0.1270,104.851,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.695,0.2630,-22.136,1,0.9570,0.797,0.000000,0.1480,0.6550,102.009,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.434,0.1770,-21.180,1,0.0512,0.994,0.021800,0.2120,0.4570,130.418,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.321,0.0946,-27.961,1,0.0504,0.995,0.918000,0.1040,0.3970,169.980,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.402,0.1580,-16.900,0,0.0390,0.989,0.130000,0.3110,0.1960,103.220,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586667,0.560,0.5180,-7.471,0,0.0292,0.785,0.000000,0.0648,0.2110,131.896,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
586668,0.765,0.6630,-5.223,1,0.0652,0.141,0.000297,0.0924,0.6860,150.091,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
586669,0.535,0.3140,-12.823,0,0.0408,0.895,0.000150,0.0874,0.0663,145.095,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
586670,0.696,0.6150,-6.212,1,0.0345,0.206,0.000003,0.3050,0.4380,90.029,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
y = spotify["explicit"]
y

0         0
1         0
2         0
3         0
4         0
         ..
586667    0
586668    0
586669    0
586670    0
586671    0
Name: explicit, Length: 586672, dtype: int64

#### Fonction permettant de voir rapidement les résultats d'un modèle

In [46]:
def results(m):
    print("Méthode :", m)
    print()
    print("Score : " + str(m.score(X,y)))
    print()
    print(pandas.crosstab(m.predict(X), y))
    
#results(m_tree)

#### $k$-NN

In [47]:
m_knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
results(m_knn)

Méthode : KNeighborsClassifier()



#### Arbres de décisions

In [None]:
m_tree = DecisionTreeClassifier().fit(X, y)

In [None]:
pruning = m_tree.cost_complexity_pruning_path(X, y)

fig, ax = plt.subplots()
ax.plot(pruning.ccp_alphas[:-1], pruning.impurities[:-1], marker="o") # , drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.show()

In [None]:
m_tree_elague = DecisionTreeClassifier(ccp_alpha=0.002).fit(X, y)

results(m_tree_elague)

### Réseaux de neurones

In [None]:
multi_layer_perceptron = MLPClassifier(hidden_layer_sizes=(15, 10)).fit(X,y)

results(multi_layer_perceptron)