In [None]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path


# Project root: the parent of the current working directory. This assumes the
# kernel was started from a first-level sub-folder of the project (for example
# notebooks/); the value changes if the notebook is launched from elsewhere.
base_dir = Path().resolve().parent

# Project sub-folders.
src_dir = base_dir / "src"
inputs_dir = base_dir / "data"
outputs_dir = base_dir / "outputs"
notebooks_dir = base_dir / "notebooks"

# Register the project folders on sys.path so modules inside them can be
# imported. Nothing is created on disk here. Entries already present are
# skipped so that re-running this cell does not pile up duplicate entries.
for folder in [src_dir, inputs_dir, outputs_dir, notebooks_dir]:
    folder_str = str(folder)
    if folder_str not in sys.path:
        sys.path.append(folder_str)


In [2]:
import pandas as pd 
# Load the deduplicated song metadata; the first CSV column becomes the index.
corpus_csv = outputs_dir / "songs_metadata_dedup.csv"
songs_df = pd.read_csv(corpus_csv, index_col=0)

# Quick look at the corpus: dimensions, first three rows, column names.
corpus_preview = songs_df.head(3)
corpus_columns = songs_df.columns.tolist()
print(f"Shape corpus : {songs_df.shape}")
print(corpus_preview)
print("\nColonnes disponibles :", corpus_columns)

Shape corpus : (158368, 5)
                        track_id                             title  \
dedup_id                                                             
0         0BRjO6ga9RKCKjfDqeFgWV       C'est beau de faire un Show   
1         0BjC1NfoEOOusryehmNudP  Perdu d'avance (par Gad Elmaleh)   
2         0CoSDzoNIKCRs124s9uTVy    Don't Let Me Be Lonely Tonight   

                     artist  genre  duration_sec  
dedup_id                                          
0            Henri Salvador  Movie        99.373  
1         Martin & les fées  Movie       137.373  
2           Joseph Williams  Movie       170.267  

Colonnes disponibles : ['track_id', 'title', 'artist', 'genre', 'duration_sec']


In [3]:
# Build one descriptive sentence per song by joining title, artist and genre.
# Each column is cast to str first so the element-wise concatenation below
# works on string values.
title_part = 'Song title: "' + songs_df['title'].astype(str) + '". '
artist_part = "Artist: " + songs_df['artist'].astype(str) + ". "
genre_part = "Genre: " + songs_df['genre'].astype(str) + "."

# Concatenate the three fragments row by row and convert to a plain list.
texts = (title_part + artist_part + genre_part).tolist()

print(f"Exemple phrase : {texts[0]}")
print(f"Nb total de textes à encoder : {len(texts)}")


Exemple phrase : Song title: "C'est beau de faire un Show". Artist: Henri Salvador. Genre: Movie.
Nb total de textes à encoder : 158368


In [4]:
import torch
# Print the installed PyTorch version string.
print(torch.__version__)


2.2.2+cpu


In [5]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint, kept in a single place so that the model
# actually loaded and the confirmation message below cannot drift apart.
ST_MODEL_NAME = 'all-MiniLM-L6-v2'

# Load the SentenceTransformer model used to embed the song descriptions.
model_st = SentenceTransformer(ST_MODEL_NAME)

print(f"Modèle SentenceTransformer chargé : {ST_MODEL_NAME}")






Modèle SentenceTransformer chargé : all-MiniLM-L6-v2


In [6]:

# Encode every text with the loaded model.
# normalize_embeddings=True matters here: with normalised vectors a plain dot
# product can be used directly as the cosine similarity.
encode_options = dict(
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
)
content_embeddings = model_st.encode(texts, **encode_options)


Batches:   0%|          | 0/2475 [00:00<?, ?it/s]

In [7]:

# Display the dimensions of the embedding array returned by the encoder.
print(f"Shape embeddings : {content_embeddings.shape}")

Shape embeddings : (158368, 384)


In [8]:
# Save the embeddings to disk as a .npy file.
import numpy as np
output_path = inputs_dir / "processed/content_embeddings.npy"

# np.save does not create missing folders: make sure the target directory
# exists first, otherwise a fresh checkout fails with FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)
np.save(output_path, content_embeddings)

print(f"Embeddings sauvegardés sous : {output_path}")


Embeddings sauvegardés sous : C:\Users\beedi.goua_square-ma\Desktop\Gheb\projet perso\music-recommender-hybrid\data\processed\content_embeddings.npy


In [9]:

# Sanity check: cosine similarity between the first two embeddings.

from numpy import dot
from numpy.linalg import norm

vec1 = content_embeddings[0]
vec2 = content_embeddings[1]

# A bare dot product equals the cosine similarity only for unit-length vectors,
# so verify that both vectors really are L2-normalised before relying on it.
assert abs(norm(vec1) - 1.0) < 1e-3 and abs(norm(vec2) - 1.0) < 1e-3, (
    "embeddings are not L2-normalised; the dot product is not a cosine similarity"
)

cos_sim = dot(vec1, vec2)
print(f"Cosine Similarity [0 vs 1] : {cos_sim:.4f}")


Cosine Similarity [0 vs 1] : 0.4839
