<!--
 Copyright 2025 beedi.goua_square-ma
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     https://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->



In [8]:

# Activer autoreload pour dev local

%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
import pandas as pd
import numpy as np


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:

# Define the project folders.
# base_dir is the parent of the current working directory, so the layout below
# is only correct when the kernel runs from a direct sub-folder of the project
# (presumably notebooks/ — TODO confirm).

base_dir = Path().resolve().parent

src_dir = base_dir / "src"
inputs_dir = base_dir / "data"
outputs_dir = base_dir / "outputs"
notebooks_dir = base_dir / "notebooks"

# Add the folders to the Python import path.
# The membership check keeps the cell idempotent: re-running it no longer
# appends duplicate entries to sys.path.
for folder in [src_dir, inputs_dir, outputs_dir, notebooks_dir]:
    folder_str = str(folder)
    if folder_str not in sys.path:
        sys.path.append(folder_str)



In [10]:

# Load the vocabulary and the song metadata

vocab_path = inputs_dir / "processed/word2vec_vocab.csv"
meta_path = outputs_dir / "songs_metadata_dedup.csv"

vocab = pd.read_csv(vocab_path)
# The first CSV column becomes the index of the metadata frame.
meta = pd.read_csv(meta_path, index_col=0)

# Quick sanity report: shapes of both frames, then a preview of the metadata.
vocab_shape = vocab.shape
meta_shape = meta.shape
print(f"Vocab shape : {vocab_shape}")
print(f"Metadata shape : {meta_shape}")
print("--------------------")
meta_preview = meta.head()
print(meta_preview)


Vocab shape : (158207, 1)
Metadata shape : (158368, 5)
--------------------
                        track_id                             title  \
dedup_id                                                             
0         0BRjO6ga9RKCKjfDqeFgWV       C'est beau de faire un Show   
1         0BjC1NfoEOOusryehmNudP  Perdu d'avance (par Gad Elmaleh)   
2         0CoSDzoNIKCRs124s9uTVy    Don't Let Me Be Lonely Tonight   
3         0Gc6TVm52BwZD07Ki6tIvf    Dis-moi Monsieur Gordon Cooper   
4         0IuslXpMROHdEPvSl1fTQK                         Ouverture   

                     artist  genre  duration_sec  
dedup_id                                          
0            Henri Salvador  Movie        99.373  
1         Martin & les fées  Movie       137.373  
2           Joseph Williams  Movie       170.267  
3            Henri Salvador  Movie       152.427  
4              Fabien Nataf  Movie        82.625  


In [11]:

# Count duplicated rows: first on track_id alone, then on the (title, artist) pair

dup_track_id_count = meta['track_id'].duplicated().sum()
dup_title_artist_count = meta.duplicated(subset=['title', 'artist']).sum()

print(f"Duplicata track_id : {dup_track_id_count}")
print(f"Duplicata (title, artist) : {dup_title_artist_count}")


Duplicata track_id : 161
Duplicata (title, artist) : 0


In [12]:

# Check that every track_id of the vocab is present in the metadata.
# Both sides are cast to str so the comparison does not depend on the dtype
# pandas inferred for each file.

vocab_ids = set(vocab['track_id'].astype(str))
meta_ids = set(meta['track_id'].astype(str))

missing_in_meta = vocab_ids - meta_ids
print(f"IDs du vocab manquants dans metadata : {len(missing_in_meta)}")
if missing_in_meta:
    # Sort before slicing: a set has no stable iteration order, so an unsorted
    # sample would differ from one kernel session to the next.
    print(f"Exemple : {sorted(missing_in_meta)[:5]}")


# Drop rows whose track_id was already seen, keeping the first occurrence.
# NOTE(review): per the duplicate check on `meta`, the rows sharing a track_id
# differ in title and/or artist, so the later variants are discarded here —
# confirm that keeping the first one is the intended rule.

meta_clean = meta.drop_duplicates(subset=['track_id'], keep='first')
print(f"Shape après drop_duplicates : {meta_clean.shape}")
print(f"Duplicata track_id après clean : {meta_clean['track_id'].duplicated().sum()}")


IDs du vocab manquants dans metadata : 0
Shape après drop_duplicates : (158207, 5)
Duplicata track_id après clean : 0


In [13]:

# Re-check consistency: the vocab must still be fully covered after deduplication
meta_ids_clean = set(meta_clean['track_id'].astype(str))
missing_after_clean = vocab_ids.difference(meta_ids_clean)
missing_after_clean_count = len(missing_after_clean)
print(f"IDs du vocab encore manquants après clean : {missing_after_clean_count}")


IDs du vocab encore manquants après clean : 0


In [14]:

# Count missing values per column on the cleaned metadata

nan_counts_per_column = meta_clean.isna().sum()
print(nan_counts_per_column)




track_id        0
title           0
artist          0
genre           0
duration_sec    0
dtype: int64


In [15]:

# Save the cleaned metadata so it is ready for later steps.
# index=False: the frame index is not written to the CSV.

save_path = inputs_dir / "processed/songs_metadata_clean.csv"
meta_clean.to_csv(save_path, index=False)
# Print the location relative to the project root: the absolute path would
# embed the local user name and machine layout in the saved notebook output.
print(f"Sauvegardé : {save_path.relative_to(base_dir)}")


Sauvegardé : C:\Users\beedi.goua_square-ma\Desktop\Gheb\projet perso\music-recommender-hybrid\data\processed\songs_metadata_clean.csv
