In [1]:
import pandas as pd

In [2]:
# Load the raw dataset from the CSV file and report its dimensions.
df = pd.read_csv("./dataset/dataset.csv")
# df.shape is (rows, columns); unpack it straight into the message.
print("Dataset chargé avec {} lignes et {} colonnes.".format(*df.shape))

Dataset chargé avec 954 lignes et 29 colonnes.


In [3]:
# Count missing values per column before any cleaning is applied,
# then display only the columns that have at least one missing entry.
missing_values = df.isna().sum()
print("Valeurs manquantes par colonne avant traitement :\n", missing_values.loc[missing_values.gt(0)])

Valeurs manquantes par colonne avant traitement :
 in_shazam_charts     50
key                  95
artist_mb           631
artist_lastfm       631
country_mb          641
country_lastfm      691
tags_lastfm         677
dtype: int64


In [4]:
# Rename the DataFrame columns to shorter, cleaner names.
# The mapping is assembled in three parts:
#   1. two one-off renames,
#   2. platform columns, which lose their "in_" prefix,
#   3. audio-feature columns, which lose their "_." suffix.
renommer_colonnes = {
    "track_name": "track",
    "artist.s._name": "artist_name",
}
renommer_colonnes.update(
    ("in_" + nom, nom)
    for nom in (
        "spotify_playlists",
        "spotify_charts",
        "apple_playlists",
        "apple_charts",
        "deezer_playlists",
        "deezer_charts",
        "shazam_charts",
    )
)
renommer_colonnes.update(
    (nom + "_.", nom)
    for nom in (
        "danceability",
        "valence",
        "energy",
        "acousticness",
        "instrumentalness",
        "liveness",
        "speechiness",
    )
)

# Keys absent from the frame are silently ignored by rename().
df.rename(columns=renommer_colonnes, inplace=True)
print("Nouveaux noms de colonnes :", df.columns, sep="\n")


Nouveaux noms de colonnes :
Index(['track', 'artist_name', 'artist_count', 'released_year',
       'released_month', 'released_day', 'spotify_playlists', 'spotify_charts',
       'streams', 'apple_playlists', 'apple_charts', 'deezer_playlists',
       'deezer_charts', 'shazam_charts', 'bpm', 'key', 'mode', 'danceability',
       'valence', 'energy', 'acousticness', 'instrumentalness', 'liveness',
       'speechiness', 'artist_mb', 'artist_lastfm', 'country_mb',
       'country_lastfm', 'tags_lastfm'],
      dtype='object')


In [5]:
# Remove the columns that are not needed downstream, then list what remains.
df.drop(['artist_mb', 'artist_lastfm', 'country_lastfm'], axis="columns", inplace=True)
print("Colonnes après suppression :", df.columns, sep="\n")

Colonnes après suppression :
Index(['track', 'artist_name', 'artist_count', 'released_year',
       'released_month', 'released_day', 'spotify_playlists', 'spotify_charts',
       'streams', 'apple_playlists', 'apple_charts', 'deezer_playlists',
       'deezer_charts', 'shazam_charts', 'bpm', 'key', 'mode', 'danceability',
       'valence', 'energy', 'acousticness', 'instrumentalness', 'liveness',
       'speechiness', 'country_mb', 'tags_lastfm'],
      dtype='object')


In [6]:
# Count fully duplicated rows, drop them (keeping the first occurrence),
# then count again so the difference reports how many rows were removed.
duplicates_before = df.duplicated(keep="first").sum()
df = df.drop_duplicates(keep="first")
duplicates_after = df.duplicated(keep="first").sum()
print("{} doublons supprimés.".format(duplicates_before - duplicates_after))

0 doublons supprimés.


In [7]:
# Text normalization: cast the listed columns to text, trim surrounding
# whitespace and lower-case every value.
# Note: numeric columns in this list (e.g. 'artist_count') end up holding text.
colonnes_a_normaliser = ['track', 'artist_name', 'artist_count', 'mode', 'key', 'country_mb', 'tags_lastfm']

for col in colonnes_a_normaliser:
    # Skip names that are not present in the frame instead of raising.
    if col in df.columns:
        # astype(str) turns missing values into the literal text "nan", which
        # would hide them from isnull() and write "nan" into the saved CSV.
        # Re-mask with the original notna() so missing entries stay missing.
        df[col] = df[col].astype(str).str.strip().str.lower().where(df[col].notna())
        print(f"Colonne '{col}' normalisée.")


Colonne 'track' normalisée.
Colonne 'artist_name' normalisée.
Colonne 'artist_count' normalisée.
Colonne 'mode' normalisée.
Colonne 'key' normalisée.
Colonne 'country_mb' normalisée.
Colonne 'tags_lastfm' normalisée.


In [8]:
# Persist the cleaned frame to disk; index=False keeps the row index
# out of the output file.
df.to_csv(path_or_buf="./dataset/dataset_filtered.csv", index=False)
print("Sauvegardé !")


Sauvegardé !
