<!--
 Copyright 2025 beedi.goua_square-ma
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     https://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->



In [4]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from gensim.models import Word2Vec
import pickle
from pathlib import Path
import sys

base_dir = Path().resolve().parent
src_dir = base_dir / "src"
inputs_dir = base_dir / "data"
outputs_dir = base_dir / "outputs"
notebooks_dir = base_dir / "notebooks"
processed_dir = inputs_dir / "processed"

# Ajouter aux chemins
for folder in [src_dir, inputs_dir, outputs_dir, notebooks_dir, processed_dir]:
    sys.path.append(str(folder))


In [11]:

# Charger playlists filtrées

with open(outputs_dir / "clean_playlists_filtered.pkl", "rb") as f:
    playlists_ids = pickle.load(f)

print(f"Type de l'objet : {type(playlists_ids)}")
print(f"Nb playlists : {len(playlists_ids)}")

print(f"Genres : {list(playlists_ids.keys())[:3]}")


Type de l'objet : <class 'dict'>
Nb playlists : 24
Genres : ['A Capella', 'Alternative', 'Anime']


In [None]:
playlists_ids


{'A Capella': [{'track_id': '0PuWvFJqZPJAxZNrFgw8xL',
   'title': 'For the Longest Time',
   'artist': 'MC6 A Cappella'},
  {'track_id': '1iXJKuzDH0E6PnTk2lQXAq',
   'title': 'Clair',
   'artist': 'The Singers Unlimited'},
  {'track_id': '3XxwdU13609bTGaAFRg3PA',
   'title': 'Come Go With Me',
   'artist': 'MC6 A Cappella'},
  {'track_id': '0crWDkAajRTnsGuZ6CAU85',
   'title': 'The Lion Sleeps Tonight',
   'artist': 'MC6 A Cappella'},
  {'track_id': '6TK4I15u1oym5H2eETng4F',
   'title': 'A Debtor to Mercy Alone',
   'artist': 'Glad'},
  {'track_id': '0PCgsZKSCmsDlHBXRTdejp',
   'title': 'O Wondrous Love',
   'artist': 'Glad'},
  {'track_id': '54eOtB9Vz0nli6U5Ez00ep',
   'title': 'Blue Moon',
   'artist': 'MC6 A Cappella'},
  {'track_id': '6iAU2N1jJqY9qUGvBTHtWa',
   'title': 'Runaround Sue',
   'artist': 'MC6 A Cappella'},
  {'track_id': '5FeU992vNw1J8j8aEw7vcR',
   'title': 'Teenager in Love',
   'artist': 'MC6 A Cappella'},
  {'track_id': '6kYfnFjxQYx2WkBW5jIiiI',
   'title': 'Sherry

In [None]:
# Construire les playlists 
playlists = []
for genre, songs in playlists_ids.items():
    ids = [song["track_id"] for song in songs if "track_id" in song]
    if len(ids) > 1:
        playlists.append(ids)

print(f"Nombre de playlists prêtes : {len(playlists)}")
print(f"Exemple playlist : {playlists[0][:5]}")

Nombre de playlists prêtes : 24
Exemple playlist : ['0PuWvFJqZPJAxZNrFgw8xL', '1iXJKuzDH0E6PnTk2lQXAq', '3XxwdU13609bTGaAFRg3PA', '0crWDkAajRTnsGuZ6CAU85', '6TK4I15u1oym5H2eETng4F']


In [None]:
# Paramètres Word2Vec
w2v_params = {
    "vector_size": 64,
    "window": 20,
    "min_count": 1,
    "workers": 4,
    "negative": 10
}

print(f"Paramètres Word2Vec : {w2v_params}")

Paramètres Word2Vec : {'vector_size': 64, 'window': 20, 'min_count': 1, 'workers': 4, 'negative': 10}


In [None]:
# Entraînement 
model_w2v = Word2Vec(sentences=playlists, **w2v_params)
print("Entraînement Word2Vec terminé.")

Entraînement Word2Vec terminé.


In [None]:
# Sauvegarde 
model_path = processed_dir / "word2vec.model"
model_w2v.save(str(model_path))
print(f"Modèle sauvegardé : {model_path}")

Modèle sauvegardé : C:\Users\beedi.goua_square-ma\Desktop\Gheb\projet perso\music-recommender-hybrid\data\processed\word2vec.model


In [None]:
# Vérification `most_similar` 
example_song_id = playlists[0][0]
print(f"Exemple song ID : {example_song_id}")
similars = model_w2v.wv.most_similar(positive=[example_song_id], topn=5)
print("Suggestions proches :")
for sid, score in similars:
    print(f"{sid} ➜ score : {score:.4f}")

Exemple song ID : 0PuWvFJqZPJAxZNrFgw8xL
Suggestions proches :
5sMUwUR3JI2woCOrNbizUt ➜ score : 0.5632
1RzkioFRFHsTai18FcdupF ➜ score : 0.5457
3vGGwlmCUF0pqnDpOj2Ua7 ➜ score : 0.4989
70PPOFwmuD4luTtLpnjesg ➜ score : 0.4686
22TSP0snNLPUkyhqcm93BM ➜ score : 0.4623


In [None]:
# Vérification vocabulaire 
vocab = model_w2v.wv.index_to_key
print(f"Vocabulaire total : {len(vocab)} titres.")
print(f"Exemple vocabulaire : {vocab[:5]}")

Vocabulaire total : 158207 titres.
Exemple vocabulaire : ['1F1QmI8TMHir9SUFrooq5F', '62YYaU3RQgE7tIa77HSYwu', '4AFGAXKRA8XpLnWJBlDCkC', '32BTFbqhSvYKftE0e8a8d4', '0dSRoWYf0GOzX9L44g53sZ']


In [None]:







import pandas as pd


# Export CSV 
df_vocab = pd.DataFrame(vocab, columns=["track_id"])
df_vocab.to_csv(processed_dir / "word2vec_vocab.csv", index=False)
print(f"Vocabulaire sauvegardé : {processed_dir / 'word2vec_vocab.csv'}")


Vocabulaire sauvegardé : C:\Users\beedi.goua_square-ma\Desktop\Gheb\projet perso\music-recommender-hybrid\data\processed\word2vec_vocab.csv


In [19]:
df_vocab

Unnamed: 0,track_id
0,1F1QmI8TMHir9SUFrooq5F
1,62YYaU3RQgE7tIa77HSYwu
2,4AFGAXKRA8XpLnWJBlDCkC
3,32BTFbqhSvYKftE0e8a8d4
4,0dSRoWYf0GOzX9L44g53sZ
...,...
158202,3uIHmC70ic3BeU8uIQdpO2
158203,465Yv44Dq1kDa6gPruVMxw
158204,79WjeGL4IR8svbAVC2FF0I
158205,4fgQMbN9r8d9V2exSTsLFs
