In [None]:
import json
%load_ext cudf.pandas
import math
import json

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from safetensors.torch import save_file
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
Encoding songs:   0%|          | 0/4757 [00:00<?, ?it/s]

In [None]:
# Pipeline settings

# Input / output locations
DATA_PATH = "/content/spotify_dataset.csv"  # CSV loaded by main()
OUTPUT_FILE = "spotify_dataset.safetensors"  # file written by save_file in main()

# Embedding and chunking
EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # model name given to SentenceTransformer
CONTEXT_SIZE = 11  # number of songs in each fixed-size chunk

# Share of each playlist's chunks per split (floored per playlist in main());
# the test split receives whatever chunks remain after train and validation.
TRAIN_RATIO = 0.6
VAL_RATIO = 0.2

def save_song_embeddings(song_embeddings: dict[str, np.ndarray]) -> bool:
    """Write the song -> embedding mapping to ``song_embeddings.json``.

    Args:
        song_embeddings: Mapping from song string to its embedding vector.
            Values may be NumPy arrays or anything ``np.asarray`` accepts.

    Returns:
        ``True`` once the file has been written.
    """
    # json cannot serialize numpy arrays (json.dump raises TypeError on them),
    # so turn every vector into a plain (nested) list of Python numbers first.
    serializable = {
        song: np.asarray(vector).tolist()
        for song, vector in song_embeddings.items()
    }

    # The context manager guarantees the file is flushed and closed.
    with open("song_embeddings.json", "w", encoding="utf-8") as handle:
        json.dump(serializable, handle)

    print("Saved song embeddings")
    return True

def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a renamed frame whose column labels are trimmed and quote-free."""

    def _tidy(label):
        # Trim surrounding whitespace first, then drop every double quote.
        trimmed = label.strip()
        return trimmed.replace('"', '')

    return df.rename(columns=_tidy)

def preprocess_text(text: str) -> str:
    """Convert a value to a string, trim its ends and lower-case it."""
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Build per-playlist track lists from the raw song rows.

    Every non-missing cell is normalized with ``preprocess_text`` and rows
    containing a missing value are discarded.  Each track name then gets
    `` by <artistname>`` appended, rows are grouped by ``playlistname``, and
    only playlists with more than 40 tracks are kept.

    Returns a frame with a single ``tracklist`` column holding lists.
    """

    def _normalize_cell(value):
        # Missing values pass through untouched so dropna() can remove them.
        if pd.notna(value):
            return preprocess_text(value)
        return value

    cleaned = df.map(_normalize_cell).dropna()

    # Fold the artist into the track label.
    cleaned['trackname'] = cleaned['trackname'] + ' by ' + cleaned['artistname']

    # One row per playlist, with its tracks collected into a list.
    grouped = cleaned.groupby('playlistname', as_index=False)
    playlists = grouped.agg(tracklist=('trackname', list))

    # Keep only playlists strictly longer than 40 tracks.
    long_enough = playlists['tracklist'].map(len) > 40
    playlists = playlists[long_enough].reset_index(drop=True)

    return playlists[['tracklist']]

def main():
    """Build train/validation/test embedding tensors from the playlist CSV.

    Steps:
      1. Read ``DATA_PATH``, clean the column names and group songs per
         playlist.
      2. Encode every distinct song string once with ``EMBEDDING_MODEL`` and
         pass the resulting mapping to ``save_song_embeddings``.
      3. Cut each playlist into consecutive chunks of ``CONTEXT_SIZE`` songs
         (leftover songs at the end are dropped) and assign the chunks to
         train / validation by ``TRAIN_RATIO`` / ``VAL_RATIO``; the remaining
         chunks go to test.
      4. Save the three tensors to ``OUTPUT_FILE`` under the keys ``train``,
         ``validation`` and ``test``.
    """
    # Load and preprocess data
    df = pd.read_csv(DATA_PATH, on_bad_lines='skip')
    df = clean_column_names(df)
    processed_df = preprocess_dataframe(df)

    # Use the GPU when one is present, otherwise fall back to CPU instead of
    # failing at model construction.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    encoder = SentenceTransformer(EMBEDDING_MODEL, device=device)

    # Precompute song embeddings (unique songs only). Sorting gives a stable
    # order so the rows of the embedding matrix line up with the song list.
    unique_songs = sorted(
        {song for tracklist in processed_df['tracklist'] for song in tracklist}
    )
    # Encode the whole list in one call so the model batches the work rather
    # than running one forward pass per song.
    embedding_matrix = encoder.encode(
        unique_songs,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    song_embeddings = dict(zip(unique_songs, embedding_matrix))
    save_song_embeddings(song_embeddings)

    # Split data into chunks and create datasets
    train, val, test = [], [], []

    for tracklist in tqdm(processed_df['tracklist'], desc="Processing playlists"):
        n_chunks = len(tracklist) // CONTEXT_SIZE
        if n_chunks < 1:
            continue

        # Create fixed-size chunks; a trailing partial chunk is discarded.
        chunks = [
            tracklist[i * CONTEXT_SIZE : (i + 1) * CONTEXT_SIZE]
            for i in range(n_chunks)
        ]

        # Split chunks according to ratios (floored, so small playlists may
        # contribute no validation chunk; test takes the remainder).
        train_end = math.floor(n_chunks * TRAIN_RATIO)
        val_end = train_end + math.floor(n_chunks * VAL_RATIO)

        # Store the stacked embeddings of each chunk in its split.
        for bucket, part in (
            (train, chunks[:train_end]),
            (val, chunks[train_end:val_end]),
            (test, chunks[val_end:]),
        ):
            bucket.extend(
                np.array([song_embeddings[song] for song in chunk])
                for chunk in part
            )

    # Convert to tensors
    train_tensor = torch.tensor(np.array(train))
    val_tensor = torch.tensor(np.array(val))
    test_tensor = torch.tensor(np.array(test))

    # Save results
    save_file(
        {"train": train_tensor, "validation": val_tensor, "test": test_tensor},
        OUTPUT_FILE
    )

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()