In [5]:
%load_ext autoreload
%autoreload 2


In [6]:
import sys
from pathlib import Path


# === 1. Définir la racine du projet (répertoire parent du script)

base_dir = Path().resolve().parent
# === 2. Définir les sous-dossiers
src_dir = base_dir / "src"
inputs_dir = base_dir / "data"
outputs_dir = base_dir / "outputs"
notebooks_dir = base_dir / "notebooks"


# === 3. Créer les dossiers s’ils n’existent pas
for folder in [src_dir, inputs_dir, outputs_dir,notebooks_dir]:
    folder.mkdir(parents=True, exist_ok=True)
    sys.path.append(str(folder))  


# Téléchargement

In [None]:
"""import os
import subprocess

#  Créer le dossier 'data'
os.makedirs("data", exist_ok=True)
print("Dossier 'data' prêt.")

# 2️Aller dans 'data'
os.chdir("data")
print(" Position :", os.getcwd())

# URLs à télécharger
urls = [
    "https://os.unil.cloud.switch.ch/fma/fma_metadata.zip",
    "https://os.unil.cloud.switch.ch/fma/fma_small.zip",
    "https://os.unil.cloud.switch.ch/fma/fma_medium.zip",
    "https://os.unil.cloud.switch.ch/fma/fma_large.zip",
    "https://os.unil.cloud.switch.ch/fma/fma_full.zip",
]

# Télécharger chaque fichier si pas déjà présent
for url in urls:
    file = os.path.basename(url)
    if not os.path.exists(file):
        print(f" Téléchargement de {file} ...")
        subprocess.run([
            "powershell",
            "-Command",
            f"Invoke-WebRequest -Uri '{url}' -OutFile '{file}'"
        ], check=True)
    else:
        print(f"{file} existe déjà.")
"""

Dossier 'data' prêt.
 Position : c:\Users\beedi.goua_square-ma\Desktop\Gheb\projet perso\music-recommender-hybrid\notebooks\data
 Téléchargement de fma_metadata.zip ...
 Téléchargement de fma_small.zip ...


In [None]:
"""
# Vérifier les SHA1
checksums = {
    "fma_metadata.zip": "f0df49ffe5f2a6008d7dc83c6915b31835dfe733",
    "fma_small.zip":    "ade154f733639d52e35e32f5593efe5be76c6d70",
    "fma_medium.zip":   "c67b69ea232021025fca9231fc1c7c1a063ab50b",
    "fma_large.zip":    "497109f4dd721066b5ce5e5f250ec604dc78939e",
    "fma_full.zip":     "0f0ace23fbe9ba30ecb7e95f763e435ea802b8ab",
}

for file, expected_hash in checksums.items():
    result = subprocess.run(
        ["powershell", "-Command", f"Get-FileHash -Path '{file}' -Algorithm SHA1 | Select-Object -ExpandProperty Hash"],
        capture_output=True, text=True
    )
    file_hash = result.stdout.strip()
    if file_hash == expected_hash:
        print(f" {file} : checksum OK")
    else:
        print(f" {file} : checksum FAIL")

#  Décompresser chaque fichier s'il n'est pas déjà extrait
for zip_file in checksums.keys():
    folder = os.path.splitext(zip_file)[0]
    if not os.path.exists(folder):
        print(f" Décompression de {zip_file} ...")
        subprocess.run(["powershell", "-Command", f"Expand-Archive -Path '{zip_file}' -DestinationPath '.\\'"], check=True)
    else:
        print(f" {zip_file} déjà décompressé.")


"""

In [None]:
"""#  Remonter au dossier parent
os.chdir("..")
print(" Retour :", os.getcwd())"""

## Parsing playlists

In [7]:
import os
import pandas as pd
import pickle

spotify_file = inputs_dir / "SpotifyFeatures.csv"


# Charger le CSV
df = pd.read_csv(spotify_file)

print(f"CSV chargé : {df.shape}")


CSV chargé : (232725, 18)


In [None]:

# Vérifier colonnes disponibles
print("Colonnes dispo :", df.columns.tolist())


Colonnes dispo : ['genre', 'artist_name', 'track_name', 'track_id', 'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence']


In [9]:
# ➜ Colonnes clés
COLUMNS = ['track_id', 'track_name', 'artist_name', 'genre', 'duration_ms']

# Extraire sous-DataFrame
songs_df = df[COLUMNS].copy()

# Renommer propre
songs_df.rename(columns={
    'track_name': 'title',
    'artist_name': 'artist'
}, inplace=True)


In [None]:
# Convertir durée en secondes
songs_df['duration_sec'] = songs_df['duration_ms'] / 1000
songs_df.drop(columns=['duration_ms'], inplace=True)


In [12]:
# Forcer ID en str
songs_df['track_id'] = songs_df['track_id'].astype(str)
songs_df = songs_df.set_index('track_id')


In [13]:
# Drop NA sans year
songs_df = songs_df.dropna(subset=['title', 'artist', 'genre'])

print(f"songs_df prêt : {songs_df.shape}")
print(songs_df.head(3))

songs_df prêt : (232724, 4)
                                                   title             artist  \
track_id                                                                      
0BRjO6ga9RKCKjfDqeFgWV       C'est beau de faire un Show     Henri Salvador   
0BjC1NfoEOOusryehmNudP  Perdu d'avance (par Gad Elmaleh)  Martin & les fées   
0CoSDzoNIKCRs124s9uTVy    Don't Let Me Be Lonely Tonight    Joseph Williams   

                        genre  duration_sec  
track_id                                     
0BRjO6ga9RKCKjfDqeFgWV  Movie        99.373  
0BjC1NfoEOOusryehmNudP  Movie       137.373  
0CoSDzoNIKCRs124s9uTVy  Movie       170.267  


In [14]:
# Sauver version brute
songs_df.to_csv(os.path.join(outputs_dir, "songs_metadata.csv"))
print("Sauvegarde : songs_metadata.csv")

Sauvegarde : songs_metadata.csv


In [15]:
# Dédupliquer pour embeddings
songs_df_dedup = songs_df.reset_index().drop_duplicates(subset=['title', 'artist']).reset_index(drop=True)
songs_df_dedup['dedup_id'] = songs_df_dedup.index
songs_df_dedup = songs_df_dedup.set_index('dedup_id')


In [16]:
songs_df_dedup.to_csv(os.path.join(outputs_dir, "songs_metadata_dedup.csv"))
print(f"Sauvegarde : songs_metadata_dedup.csv ({songs_df_dedup.shape})")

Sauvegarde : songs_metadata_dedup.csv ((176513, 5))


In [17]:




# Simuler playlists factices par genre
playlists = []
for genre, group in songs_df.groupby('genre'):
    pl = list(group.index)
    if len(pl) > 2:
        playlists.append(pl)

print(f"Playlists simulées : {len(playlists)} exemples")
print(f"Exemple : {playlists[0][:5]}")

with open(os.path.join(outputs_dir, "clean_playlists.pkl"), 'wb') as f:
    pickle.dump(playlists, f)

print("Sauvegarde : clean_playlists.pkl")

Playlists simulées : 27 exemples
Exemple : ['0PuWvFJqZPJAxZNrFgw8xL', '1iXJKuzDH0E6PnTk2lQXAq', '3XxwdU13609bTGaAFRg3PA', '0crWDkAajRTnsGuZ6CAU85', '6TK4I15u1oym5H2eETng4F']
Sauvegarde : clean_playlists.pkl


In [18]:
songs_df_dedup

Unnamed: 0_level_0,track_id,title,artist,genre,duration_sec
dedup_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0BRjO6ga9RKCKjfDqeFgWV,C'est beau de faire un Show,Henri Salvador,Movie,99.373
1,0BjC1NfoEOOusryehmNudP,Perdu d'avance (par Gad Elmaleh),Martin & les fées,Movie,137.373
2,0CoSDzoNIKCRs124s9uTVy,Don't Let Me Be Lonely Tonight,Joseph Williams,Movie,170.267
3,0Gc6TVm52BwZD07Ki6tIvf,Dis-moi Monsieur Gordon Cooper,Henri Salvador,Movie,152.427
4,0IuslXpMROHdEPvSl1fTQK,Ouverture,Fabien Nataf,Movie,82.625
...,...,...,...,...,...
176508,2XoAEpBuM4AtQIQYUEowRy,They Like It Slow,H-Town,Soul,279.510
176509,1U0OMWvR89Cm20vCNar50f,Quickly (feat. Brandy),John Legend,Soul,222.667
176510,2gGqKJWfWbToha2YmDxnnj,P.O.P.,Belly,Soul,201.173
176511,1qWZdkBl4UVPj9lK6HuuFM,Burning Fire,Jr Thomas & The Volcanos,Soul,282.447
