# Deep learning : cas d’étude

Cette étude consiste à réaliser un outil de classification basé sur un algorithme portant sur les réseaux de neurones artificiels. 

In [68]:
import warnings
warnings.filterwarnings('ignore')

from math import isnan
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import torch 
from torch.utils.data import DataLoader

import custom_functions

## Importation des données

Spotify_songs : fichier csv contenant + de 30 000 chansons

In [69]:
spotify = pd.read_csv(".\data\spotify_songs.csv")
pd.set_option('display.max_columns',500)
spotify

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102000,0.000000,0.0653,0.5180,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.072400,0.004210,0.3570,0.6930,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.079400,0.000023,0.1100,0.6130,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.718,0.930,7,-3.778,1,0.1020,0.028700,0.000009,0.2040,0.2770,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.650,0.833,1,-4.672,1,0.0359,0.080300,0.000000,0.0833,0.7250,123.976,189052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32828,7bxnKAamR3snQ1VGLuVfC1,City Of Lights - Official Radio Edit,Lush & Simon,42,2azRoBBWEEEYhqV6sb7JrT,City Of Lights (Vocal Mix),2014-04-28,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,progressive electro house,0.428,0.922,2,-1.814,1,0.0936,0.076600,0.000000,0.0668,0.2100,128.170,204375
32829,5Aevni09Em4575077nkWHz,Closer - Sultan & Ned Shepard Remix,Tegan and Sara,20,6kD6KLxj7s8eCE3ABvAyf5,Closer Remixed,2013-03-08,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,progressive electro house,0.522,0.786,0,-4.462,1,0.0420,0.001710,0.004270,0.3750,0.4000,128.041,353120
32830,7ImMqPP3Q1yfUHvsdn7wEo,Sweet Surrender - Radio Edit,Starkillers,14,0ltWNSY9JgxoIZO4VzuCa6,Sweet Surrender (Radio Edit),2014-04-21,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,progressive electro house,0.529,0.821,6,-4.899,0,0.0481,0.108000,0.000001,0.1500,0.4360,127.989,210112
32831,2m69mhnfQ1Oq6lGtXuYhgX,Only For You - Maor Levi Remix,Mat Zo,15,1fGrOkHnHJcStl14zNx8Jy,Only For You (Remixes),2014-01-01,♥ EDM LOVE 2020,6jI1gFr6ANFtT8MmTvA2Ux,edm,progressive electro house,0.626,0.888,2,-3.361,1,0.1090,0.007920,0.127000,0.3430,0.3080,128.008,367432


## Statistiques descriptives

In [70]:
#Dimensions de notre jeu de données
spotify.shape #+ de 30 000 lignes pour 23 colonnes

(32833, 23)

In [71]:
#Description des variables et leur type 
spotify.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudne

In [49]:
#Description des variables quantitatives
spotify.describe() #beaucoup de disparités au niveau des unités, il faudra normaliser nos données

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0,32833.0
mean,42.477081,0.65485,0.698619,5.374471,-6.719499,0.565711,0.107068,0.175334,0.084747,0.190176,0.510561,120.881132,225799.811622
std,24.984074,0.145085,0.18091,3.611657,2.988436,0.495671,0.101314,0.219633,0.22423,0.154317,0.233146,26.903624,59834.006182
min,0.0,0.0,0.000175,0.0,-46.448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000.0
25%,24.0,0.563,0.581,2.0,-8.171,0.0,0.041,0.0151,0.0,0.0927,0.331,99.96,187819.0
50%,45.0,0.672,0.721,6.0,-6.166,1.0,0.0625,0.0804,1.6e-05,0.127,0.512,121.984,216000.0
75%,62.0,0.761,0.84,9.0,-4.645,1.0,0.132,0.255,0.00483,0.248,0.693,133.918,253585.0
max,100.0,0.983,1.0,11.0,1.275,1.0,0.918,0.994,0.994,0.996,0.991,239.44,517810.0


In [72]:
#description des variables qualitatives
spotify.describe(include='object') #beaucoup de variables qualitatives différentes par colonne, on ne pourra pas toutes les garder pour notre modèle
                                #mais possibilité d'encoder notre variable à prédire : playlist_genre

Unnamed: 0,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre
count,32833,32828,32828,32833,32828,32833,32833,32833,32833,32833
unique,28356,23449,10692,22545,19743,4530,449,471,6,24
top,7BKLCZ1jbUBVqRi2FVlTVw,Poison,Martin Garrix,5L1xcowSxwzFUSJzvyMp48,Greatest Hits,2020-01-10,Indie Poptimism,4JkkvMpVl4lSioqQjeAL0q,edm,progressive electro house
freq,10,22,161,42,139,270,308,247,6043,1809


In [73]:
#On affiche s'il y a des valeurs NaN
print("Number of NaN :", spotify.isna().sum().sum()) #à priori, 15 NaN dans notre fichier

Number of NaN : 15


In [74]:
#Détails sur quelles colonnes
spotify.isna().sum() #track_name, track_artist et track_album_name contiennent respectivement 5 NaN, peut-être les mêmes individus concernés ?

track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64

In [75]:
#Lignes correspondant au NaN dans track_name
spotify[spotify['track_name'].isna()] #en regardant les 3 colonnes, ce sont bien les mêmes individus qui sont concernés

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
8151,69gRFGOWY9OMpFJgFol1u0,,,0,717UG2du6utFe7CdmpuUe3,,2012-01-05,HIP&HOP,5DyJsJZOpMJh34WvUrQzMV,rap,southern hip hop,0.714,0.821,6,-7.635,1,0.176,0.041,0.0,0.116,0.649,95.999,282707
9282,5cjecvX0CmC9gK0Laf5EMQ,,,0,3luHJEPw434tvNbme3SP8M,,2017-12-01,GANGSTA Rap,5GA8GDo7RQC3JEanT81B3g,rap,gangster rap,0.678,0.659,11,-5.364,0,0.319,0.0534,0.0,0.553,0.191,146.153,202235
9283,5TTzhRSWQS4Yu8xTgAuq6D,,,0,3luHJEPw434tvNbme3SP8M,,2017-12-01,GANGSTA Rap,5GA8GDo7RQC3JEanT81B3g,rap,gangster rap,0.465,0.82,10,-5.907,0,0.307,0.0963,0.0,0.0888,0.505,86.839,206465
19568,3VKFip3OdAvv4OfNTgFWeQ,,,0,717UG2du6utFe7CdmpuUe3,,2012-01-05,Reggaeton viejito🔥,0si5tw70PIgPkY1Eva6V8f,latin,reggaeton,0.675,0.919,11,-6.075,0,0.0366,0.0606,0.00653,0.103,0.726,97.017,252773
19811,69gRFGOWY9OMpFJgFol1u0,,,0,717UG2du6utFe7CdmpuUe3,,2012-01-05,latin hip hop,3nH8aytdqNeRbcRCg3dw9q,latin,latin hip hop,0.714,0.821,6,-7.635,1,0.176,0.041,0.0,0.116,0.649,95.999,282707


In [76]:
#Etant donné que ce sont variabes qualitatives, il est difficile de les remplacer par la moyenne ou la médiane
#ou par le mod puisque chaque ligne représente une chanson différente. On va donc supprimer ces lignes
spotify = spotify.dropna()

In [77]:
#Représentation de notre variable à prédire par rapport à la variable playlist_subgenre
spotify[['playlist_genre', 'playlist_subgenre']] = spotify[['playlist_genre', 'playlist_subgenre']] \
                                                  .apply(lambda x: x.str.capitalize(), axis=1)
fig = px.sunburst(spotify,
                  path=['playlist_genre', 'playlist_subgenre'], 
                  color='track_popularity', 
                  labels={'track_popularity': 'Popularity'})
fig.show()

In [78]:
#Variable à prédire
y = spotify["playlist_genre"]
y_label = spotify["playlist_genre"].unique()
spotify.drop(columns=['playlist_genre'], inplace=True) #on retire la variable cible du dataset
print(y_label) #représente le genre de la musique
y_label.shape #on a 6 modalités, pop, rap, rock, latino, rhythm and blues (r&b) et electronic (edm)

['Pop' 'Rap' 'Rock' 'Latin' 'R&b' 'Edm']


(6,)

In [79]:
#les classes sont-elles équilibrées ? Si non, possibilité d'over- ou under-sampling
nb_y = y.value_counts()
print(nb_y)
freq_y = y.value_counts() / len(y)
print(freq_y) #les classes sont bien équilibrées

playlist_genre
Edm      6043
Rap      5743
Pop      5507
R&b      5431
Latin    5153
Rock     4951
Name: count, dtype: int64
playlist_genre
Edm      0.184081
Rap      0.174942
Pop      0.167753
R&b      0.165438
Latin    0.156970
Rock     0.150816
Name: count, dtype: float64


In [80]:
#On garde seulement l'année de sortie de l'album et on label encoding
spotify['track_album_release_date'] = spotify['track_album_release_date'].str[:4]
year = spotify['track_album_release_date'].unique()
print(year.sort())
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#spotify['track_album_release_date'] = le.fit_transform(spotify['track_album_release_date'])

#Classe d'années


None


In [64]:
#Suppression des variables trop corrélées à a variable cible et des variables qualitatives avec beaucoup de modalités
spotify.drop(columns=['track_id','track_artist','track_album_id', 'track_album_name','track_album_release_date','track_name', 'playlist_id','playlist_subgenre','playlist_name'], axis=1, inplace=True)

In [65]:
spotify.head(5)

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,66,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,67,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,70,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,60,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,69,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [63]:
#modify y by assigning a number to each class
y = y.replace('Pop', 0)
y = y.replace('Rap', 1)
y = y.replace('Rock', 2)
y = y.replace('R&b', 3)
y = y.replace('Latin', 4)
y = y.replace('Edm', 5)
print(y)

0        0
1        0
2        0
3        0
4        0
        ..
32828    5
32829    5
32830    5
32831    5
32832    5
Name: playlist_genre, Length: 32828, dtype: int64


In [66]:
#normalisation des données
scaler = StandardScaler()
spotify = scaler.fit_transform(spotify)

## Test avec d'autres modèles de classification
Aucun travail de classification a été fait avec ce jeu de données, nous allons tester d'autres modèles de machine learning afin de pouvoir comparer nos résultats

In [36]:
train_X, test_X, train_y, test_y = train_test_split(spotify, y, 
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=100,
                                                    stratify=y)

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Logistic Regression
logreg = LogisticRegression()
logreg.fit(train_X, train_y)
y_pred = logreg.predict(test_X)
print('Accuracy score of logistic regression classifier on test set: {:.2f}'.format(accuracy_score(test_y, y_pred)))

#Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(train_X, train_y)
y_pred = clf.predict(test_X)
print('Accuracy score of Decision Tree classifier on test set: {:.2f}'.format(accuracy_score(test_y, y_pred)))

#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(train_X, train_y)
y_pred = knn.predict(test_X)
print('Accuracy score of KNN classifier on test set: {:.2f}'.format(accuracy_score(test_y, y_pred)))

#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(train_X, train_y)
y_pred = rf.predict(test_X)
print('Accuracy score of Random Forest classifier on test set: {:.2f}'.format(accuracy_score(test_y, y_pred)))

Accuracy score of logistic regression classifier on test set: 0.47
Accuracy score of Decision Tree classifier on test set: 0.43
Accuracy score of KNN classifier on test set: 0.46
Accuracy score of Random Forest classifier on test set: 0.55


## Implémentation

In [39]:
#change Y to numpy.ndarray
train_y = train_y.to_numpy()
test_y = test_y.to_numpy()

## Transformer les données en Tenseurs 
X_train = torch.FloatTensor(train_X)
X_test = torch.FloatTensor(test_X)
Y_train = torch.LongTensor(train_y)
Y_test = torch.LongTensor(test_y)
 
print("Dimension de X train",X_train.shape)
print("Dimension de X test",X_test.shape)
print("Dimension de Y", Y_train.shape)

Dimension de X train torch.Size([22979, 13])
Dimension de X test torch.Size([9849, 13])
Dimension de Y torch.Size([22979])


On transforme la variable cible pour obtenir un codage disjonctif complet, autrement appelé *one hot encoding* :

- 0 -> (1,0,0,0,0,0)    pop
- 1 -> (0,1,0,0,0,0)    rap
- 2 -> (0,0,1,0,0,0)    rock
- 3 -> (0,0,0,1,0,0)    r&b
- 4 -> (0,0,0,0,1,0)    latin
- 5 -> (0,0,0,0,0,1)    edm


In [40]:
Y_train_cat = torch.nn.functional.one_hot(Y_train)
Y_test_cat = torch.nn.functional.one_hot(Y_test)

In [41]:
dataset_train = custom_functions.CustomDataset(X_train, Y_train_cat)
batch_size = 1000
dataloader = custom_functions.get_dataloader(dataset_train, batch_size, shuffle=True)

## Perceptron avec une couche cachée

### Description du modèle

In [42]:
# Le modèle : un simple MLP à 1 couche cachée
D_in = 13  # dimension de l'input : 4 
D_hidden = 32 # dimension de la couche cachée : 64
D_hidden2 = 64 # dimension de la couche cachée : 64
D_out = 6 # dimension de l'output : 3
 
# Utilisation d'un constructeur séquentiel pour définir l'architecture du réseau
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, D_hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(D_hidden, D_hidden2),
    torch.nn.ReLU(),
    torch.nn.Linear(D_hidden2, D_out),
)
print(model) 
 
loss_fn = torch.nn.CrossEntropyLoss(reduction='sum') # La fonction de coût cross entropy
learning_rate = 1e-2   # le pas d'apprentissage
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # l'optimiseur
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # l'optimiseur

Sequential(
  (0): Linear(in_features=13, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=6, bias=True)
)


In [43]:
custom_functions.summary(model)

Couche 0: [32, 13] ([34m416[0m paramètres entrainables)
Couche 1: [32] ([34m32[0m paramètres entrainables)
Couche 2: [64, 32] ([34m2048[0m paramètres entrainables)
Couche 3: [64] ([34m64[0m paramètres entrainables)
Couche 4: [6, 64] ([34m384[0m paramètres entrainables)
Couche 5: [6] ([34m6[0m paramètres entrainables)
  = [31m2950[0m paramètres entrainables


### Apprentissage du modèle

In [129]:
Nepochs = 200 # Nombre d'époques (Nombre de passes sur toutes les données) 
Nprint  = Nepochs/10  # fréquence de l'affichage

# mesures que l'on souhaite enregistrer pour monitorer l'apprentissage (log)
log_loss_batch = []
log_loss = []
log_acc_val = []

for epoch in range(Nepochs):
    total=0.
    for batch_idx, (data, target) in enumerate(dataloader): # 1 batch
        loss_batch=0.
        lenB = data.shape[0]
        optimizer.zero_grad()
        prediction = model(data)
        loss = loss_fn(prediction, target.double())
        loss.backward()  # backpropagation à travers le réseau       
        optimizer.step() # mise à jour des paramètres du réseau ( w = w -lr * w.grad) équivalent à une itération du SGD
        loss_batch+=loss # calcul de la loss sur tous les exemples du batch
        total += loss_batch # calcul de la loss sur tous les exemples d'entraînement
        # on enregistre les informations pour le monitoring
        log_loss_batch.append(loss_batch.item())
        pred_val = model(X_test)
        acc_val = custom_functions.get_accuracy_multiclass(pred_val,Y_test_cat,6)
        log_acc_val.append(acc_val)
    # affichage
    if epoch%Nprint==0:
        print(f"epoch {epoch} : loss {total:.4f} val {acc_val:.2%}")
    log_loss.append(total.item())

epoch 0 : loss 34663.1931 val 47.56%
epoch 20 : loss 26276.7308 val 54.67%
epoch 40 : loss 25429.1092 val 54.83%
epoch 60 : loss 25105.6383 val 54.72%
epoch 80 : loss 24684.7096 val 53.85%
epoch 100 : loss 24608.4738 val 54.04%
epoch 120 : loss 24447.5088 val 53.62%
epoch 140 : loss 24366.9685 val 53.80%
epoch 160 : loss 24265.9719 val 53.59%
epoch 180 : loss 24184.8575 val 53.80%


### Prédiction du modèle

In [130]:
predictions_test = model(X_test)
acc_val = custom_functions.get_accuracy_multiclass(predictions_test,Y_test_cat,6)
print(f"accuracy : {acc_val:.2%}")

accuracy : 54.00%
