### Étape 1 : nettoyage & fusion des colonnes texte

In [1]:
import pandas as pd
import re

# Load the raw album dataset.
df = pd.read_csv("../data/processed/sample_albums.csv")
print(f"{len(df)} lignes brutes importées.")

# Drop rows that share a source URL, then rows missing an album or artist
# name, and finally renumber the index from zero.
df = (
    df.drop_duplicates(subset=["source_url"])
    .dropna(subset=["album_name", "artist_name"])
    .reset_index(drop=True)
)

# Nettoyage de texte
def clean_text(t):
    """Normalise a free-text field before it is merged and embedded.

    Non-string values (``None``, NaN, numbers) yield an empty string. For
    strings, every character that is not a word character, whitespace or one
    of ``, . ! ? ; : ( ) -`` is removed, each run of whitespace is collapsed
    to a single space, and the result is stripped and lower-cased.

    Args:
        t: Value to clean; anything that is not a ``str`` is treated as missing.

    Returns:
        The cleaned, lower-cased string, or ``""`` for non-string input.
    """
    if not isinstance(t, str):
        return ""
    # Remove unwanted characters BEFORE collapsing whitespace: deleting a
    # symbol that sits between two spaces ("rock & roll") would otherwise
    # leave a double space in the result.
    t = re.sub(r"[^\w\s,.!?;:()-]", "", t)
    t = re.sub(r"\s+", " ", t)  # whitespace runs -> single space
    return t.strip().lower()

# Text columns, listed in the order they are concatenated into `text_full`.
text_cols = ["artist_name", "album_name", "styles", "chronique", "informations"]

# Normalise every text column; clean_text turns non-string values into "".
for col in text_cols:
    df[col] = df[col].map(clean_text)

# Merge into one text field. Only non-empty parts are joined, so a missing
# field no longer leaves doubled, leading or trailing spaces in the text
# that is later embedded.
df["text_full"] = [
    " ".join(part for part in row if part)
    for row in df[text_cols].itertuples(index=False, name=None)
]

df.to_csv("../data/processed/sample_albums_clean.csv", index=False, encoding="utf-8")
print(f"✅ Données nettoyées exportées vers sample_albums_clean.csv ({len(df)} lignes)")


14073 lignes brutes importées.
✅ Données nettoyées exportées vers sample_albums_clean.csv (14072 lignes)


### Étape 2 : création des embeddings

In [4]:
# pip install -U sentence-transformers (pour installer la librairie)
from sentence_transformers import SentenceTransformer
import numpy as np

# Instantiate the sentence-transformers model used to embed the album texts.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every merged text in a single call, with a progress bar.
# NOTE(review): long texts may be truncated to the model's maximum sequence
# length — confirm this is acceptable for the `chronique` content.
texts = df["text_full"].to_list()
embeddings = model.encode(texts, show_progress_bar=True)

# Store each embedding as a plain Python list in a new column.
df["embedding"] = embeddings.tolist()

df.to_parquet("../data/processed/sample_albums_embedded.parquet", index=False)
print("✅ Embeddings créés et enregistrés.")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 440/440 [10:12<00:00,  1.39s/it]


✅ Embeddings créés et enregistrés.
