# **Deep Learning - Rendu n°2 : Cas d'études**

Elyes KHALFALLAH - 5230635

13/12/2024

---


## **1 :** Jeu de données


### **1.2 :** Quelques statistiques descriptives


In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
# Load the training and test CSV files (tab-separated, hence sep="\t")
train_df = pd.read_csv("train.csv", sep="\t")
test_df = pd.read_csv("test.csv", sep="\t")


In [None]:
# Preview the first rows of the training set
train_df.head()


In [None]:
# Print a summary of the training DataFrame: the dtype of each column and its
# non-null count, which also lets us check for missing values
train_df.info()


- "`float64(9)`, `int64(4)`", nous avons 13 colonnes numériques
- "`object(7)`", nous avons 7 colonnes non numériques (texte / catégorielles)


In [None]:
# Descriptive statistics of the numeric columns
train_df.describe()


In [None]:
# Number of tracks per playlist genre
genre_counts = train_df["playlist_genre"].value_counts()
print(genre_counts)


In [None]:
# Number of tracks per playlist sub-genre
subgenre_counts = train_df["playlist_subgenre"].value_counts()
print(subgenre_counts)


In [None]:
# Histograms of the numeric variables

num_cols = [
    "track_popularity",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]

# One subplot per listed column; `column=` restricts the plot to num_cols
train_df.hist(column=num_cols, figsize=(16, 16))
plt.tight_layout()
plt.show()


In [None]:
# Bar chart of the number of tracks per playlist genre

plt.figure(figsize=(10, 6))
ax = genre_counts.plot(kind="bar", color="skyblue", edgecolor="black")
ax.set_title("Distribution des genres de playlist", fontsize=16)
ax.set_xlabel("Genre", fontsize=14)
ax.set_ylabel("Nombre de pistes", fontsize=14)
ax.grid(True, which="both", linestyle="--", linewidth=1, alpha=0.7)

# NOTE: per the original author, the bar-annotation code below was generated with ChatGPT.
# Write each bar's count just above the bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,  # x position: centre of the bar
        height,  # y position: top of the bar
        f"{int(height)}",  # text to display
        ha="center",  # horizontal alignment
        va="bottom",  # vertical alignment
        fontsize=12,
        color="black",
    )


plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the numeric variables, drawn as an annotated heatmap
corr_matrix = train_df[num_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Matrice de corrélation des variables numériques")
plt.show()


In [None]:
# Mean track popularity for each playlist genre
popularity_by_genre = train_df.groupby("playlist_genre")["track_popularity"].mean()
print(popularity_by_genre)


In [None]:
# Scatter plot of energy against danceability, coloured by playlist genre
plt.figure(figsize=(10, 6))
sns.scatterplot(data=train_df, x="danceability", y="energy", hue="playlist_genre")
plt.title("Relation entre la dansabilité et l'énergie par genre")
plt.show()


### **1.3 :** Application d'algorithmes standards


In [None]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Reload the training data (same file and tab separator as in section 1.2)
train_df = pd.read_csv("train.csv", sep="\t")


In [None]:
# Quantitative columns used as model inputs
quantitative_columns = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]

# Feature matrix (X)
X = train_df[quantitative_columns]


In [None]:
# Target variable: the playlist genre of each track
y = train_df["playlist_genre"]

# Encode the genre labels as integers; label_encoder.classes_ keeps the original names
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle
# so the same split is obtained on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)


In [None]:
# Fit the scaler on the training split only, so the scaling parameters are
# not influenced by the evaluation rows
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


#### SVM


In [None]:
# SVM with default hyper-parameters, trained on the standardised features
svm_classifier = SVC().fit(X_train_scaled, y_train)

# Accuracy on the held-out split
y_pred_svm = svm_classifier.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy du SVM : {accuracy_svm:.2f}")


#### Arbre de décision


In [None]:
# Decision tree trained on the unscaled features (X_train, not X_train_scaled).
# random_state is fixed so the reported accuracy is reproducible from one run
# to the next; 42 is the seed already used for train_test_split.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Accuracy on the held-out split
y_pred_dt = dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy de l'Arbre de Décision : {accuracy_dt:.2f}")


#### K-NN


In [None]:
# k-nearest neighbours with default hyper-parameters, trained on the standardised features
knn_classifier = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Accuracy on the held-out split
y_pred_knn = knn_classifier.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy du KNN : {accuracy_knn:.2f}")


#### Random Forest


In [None]:
# Random forest trained on the unscaled features (X_train, not X_train_scaled).
# random_state is fixed so that the accuracy, the classification report, the
# confusion matrix and the feature importances derived from this model are
# reproducible across runs; 42 is the seed already used for train_test_split.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Accuracy on the held-out split
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy du Random Forest : {accuracy_rf:.2f}")


#### Régression Logistique


In [None]:
# Logistic regression on the standardised features; max_iter is raised to 1000
# to give the solver more iterations than the default
lr_classifier = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Accuracy on the held-out split
y_pred_lr = lr_classifier.predict(X_test_scaled)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy de la Régression Logistique : {accuracy_lr:.2f}")


#### Résultats


In [None]:
# Side-by-side recap of the accuracy of the five models, one per line
print("\nRésultats des modèles :\n")
print(
    f"Accuracy du SVM : {accuracy_svm:.2f}",
    f"Accuracy de l'Arbre de Décision : {accuracy_dt:.2f}",
    f"Accuracy du KNN : {accuracy_knn:.2f}",
    f"Accuracy du Random Forest : {accuracy_rf:.2f}",
    f"Accuracy de la Régression Logistique : {accuracy_lr:.2f}",
    sep="\n",
)


##### **INTERESTING TO LOOK AT BECAUSE RANDOM FOREST IS THE BEST ONE**


In [None]:
# Classification report for the random forest predictions on the held-out split;
# target_names maps the encoded labels back to the genre names
print("\nRapport de classification pour le Random Forest :")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))

# Raw confusion matrix for the same predictions
print("\n\n\nMatrice de confusion pour le Random Forest :")
print(confusion_matrix(y_test, y_pred_rf))


In [None]:
# 5-fold cross-validation of the random forest on the full dataset (X, y_encoded),
# i.e. not only on the training split
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_classifier, X, y_encoded, cv=5)
print(f"Scores de validation croisée (Random Forest) : {scores}")
print(f"Score moyen : {scores.mean():.2f}")


La case suivante prend 8 minutes à elle seule pour tourner ; les résultats que j'ai obtenus sont les suivants :

- Meilleurs paramètres : {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}
- Meilleur score : 0.56


In [None]:
# Hyper-parameter grid search for the random forest. It is left commented out
# because it is slow to run (about 8 minutes according to the note above this
# cell); uncomment the lines below to re-run it.

# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 5, 10, 20],
#     'min_samples_split': [2, 5, 10]
# }

# grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# print(f"Meilleurs paramètres : {grid_search.best_params_}")
# print(f"Meilleure score : {grid_search.best_score_:.2f}")


In [None]:
# Rank the input features by the importance the fitted random forest assigns to them
importances = rf_classifier.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first

print("Importance des caractéristiques :")
# Walk the sorted positions directly; `rank` is the 1-based position in the ranking
for rank, feature_idx in enumerate(indices, start=1):
    print(
        f"{rank}. {quantitative_columns[feature_idx]} ({importances[feature_idx]:.4f})"
    )
