In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import unicodedata
import re

# Load datasets
print("Loading datasets...")
spotify_df = pd.read_csv("data/spotify_df_cleaned.csv", encoding='utf-8', encoding_errors='ignore')
recommendationInfo_df = pd.read_csv('data/recommendation_info.csv', encoding='utf-8', encoding_errors='ignore')
playlists_df = pd.read_csv(
    "data/spotify_playlists.csv", 
    on_bad_lines='skip',
    quoting=1,
    encoding='utf-8',
    encoding_errors='ignore',
    nrows=500000
)

playlists_df.columns = playlists_df.columns.str.strip().str.replace('"', '')
recommendationInfo_df.columns = recommendationInfo_df.columns.str.strip().str.replace('"', '')

print(f"Loaded spotify_df: {len(spotify_df)} songs")
print(f"Loaded recommendationInfo_df: {len(recommendationInfo_df)} songs")
print(f"Loaded playlists_df: {len(playlists_df)} entries")
print(f"\nColumns in spotify_df: {spotify_df.columns.tolist()}")

Loading datasets...
Loaded spotify_df: 32828 songs
Loaded recommendationInfo_df: 32828 songs
Loaded playlists_df: 500000 entries

Columns in spotify_df: ['track_id', 'track_popularity', 'track_album_release_date', 'playlist_name', 'playlist_genre', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'release_year']
Loaded spotify_df: 32828 songs
Loaded recommendationInfo_df: 32828 songs
Loaded playlists_df: 500000 entries

Columns in spotify_df: ['track_id', 'track_popularity', 'track_album_release_date', 'playlist_name', 'playlist_genre', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'release_year']


In [None]:
def normalize_text(s):
    if pd.isna(s):
        return ''
    s = str(s).lower().strip()
    s = unicodedata.normalize('NFKD', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'\s*\([^)]*\)\s*', ' ', s)
    s = re.sub(r'\s*\[[^]]*\]\s*', ' ', s)
    s = re.sub(r'- remaster(ed)?', '', s)
    s = re.sub(r"'", '', s)
    s = s.strip()
    return s

playlists_df['t_norm'] = playlists_df['trackname'].astype(str).map(normalize_text)
playlists_df['a_norm'] = playlists_df['artistname'].astype(str).map(normalize_text)
recommendationInfo_df['t_norm'] = recommendationInfo_df['track_name'].astype(str).map(normalize_text)
recommendationInfo_df['a_norm'] = recommendationInfo_df['track_artist'].astype(str).map(normalize_text)

# Build lookup mapping
rec_lookup = {(row.t_norm, row.a_norm): idx for idx, row in recommendationInfo_df.reset_index().iterrows()}
print(f'Recommendation lookup size: {len(rec_lookup)}')

Recommendation lookup size: 25942


In [None]:
# Map playlist songs to dataset indices
def map_row_to_idx(row):
    return rec_lookup.get((row['t_norm'], row['a_norm']))

playlists_df['dataset_idx'] = playlists_df.apply(map_row_to_idx, axis=1)

# playlist statistics
stats = playlists_df.groupby('playlistname').agg(
    total_count=('trackname', 'count'),
    matched_count=('dataset_idx', lambda s: s.notnull().sum())
).reset_index()

min_size = 20
max_size = 300
min_matched = 30
candidates = stats[(stats['total_count'].between(min_size, max_size)) & (stats['matched_count'] >= min_matched)].copy()
print(f'Found {len(candidates)} candidate playlists')

# Select playlists for training
chosen = candidates.sort_values('matched_count', ascending=False).head(30)['playlistname'].tolist()
print(f'Selected {len(chosen)} playlists for training')

Found 212 candidate playlists
Selected 30 playlists for training


In [None]:
audio_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'duration_ms'
]

available_features = [f for f in audio_features if f in spotify_df.columns]
print(f'Using {len(available_features)} audio features: {available_features}')

# Extract feature matrix
feature_data = spotify_df[available_features].copy()
nan_count = feature_data.isna().sum().sum()
print(f'NaN values found: {nan_count}')

if nan_count > 0:
    print('Filling NaN values with column means...')
    feature_data = feature_data.fillna(feature_data.mean())

feature_matrix = feature_data.values
print(f'Feature matrix shape: {feature_matrix.shape}')

Using 12 audio features: ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
NaN values found: 0
Feature matrix shape: (32828, 12)


In [None]:
def generate_training_data(playlist_names, n_samples_per_playlist=100):
    """
    Generate training pairs: (seed_features, candidate_features, label)
    - For each playlist, randomly select seed songs
    - Positive samples: other songs from the same playlist
    - Negative samples: random songs NOT from the playlist
    """
    X_train = []
    y_train = []
    
    for playlist_name in playlist_names:
        playlist_songs = playlists_df[playlists_df['playlistname'] == playlist_name].copy()
        matched = playlist_songs.dropna(subset=['dataset_idx'])
        
        if len(matched) < 10:
            continue
        
        song_indices = matched['dataset_idx'].astype(int).tolist()
        
        # Generate positive and negative samples
        for _ in range(n_samples_per_playlist):
            if len(song_indices) < 2:
                break
            
            seed_idx = random.choice(song_indices)
            
            # Positive sample: another song from same playlist
            other_songs = [s for s in song_indices if s != seed_idx]
            if len(other_songs) > 0:
                pos_idx = random.choice(other_songs)
                
                seed_features = feature_matrix[seed_idx]
                pos_features = feature_matrix[pos_idx]
                combined = np.concatenate([seed_features, pos_features])
                
                X_train.append(combined)
                y_train.append(1)
            
            # Negative sample: random song not in playlist
            neg_idx = random.randint(0, len(feature_matrix) - 1)
            while neg_idx in song_indices:
                neg_idx = random.randint(0, len(feature_matrix) - 1)
            
            neg_features = feature_matrix[neg_idx]
            combined_neg = np.concatenate([seed_features, neg_features])
            
            X_train.append(combined_neg)
            y_train.append(0)
    
    return np.array(X_train), np.array(y_train)

print('Generating training data...')
X_train, y_train = generate_training_data(chosen[:20], n_samples_per_playlist=100)
print(f'Generated {len(X_train)} training samples')
print(f'Positive samples: {y_train.sum()}')
print(f'Negative samples: {len(y_train) - y_train.sum()}')
print(f'Feature dimension: {X_train.shape[1]}')

Generating training data...
Generated 4000 training samples
Positive samples: 2000
Negative samples: 2000
Feature dimension: 24


In [10]:
# Split data into train and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f'Training set: {len(X_train_split)} samples')
print(f'Validation set: {len(X_val)} samples')

# Train XGBoost classifier
print('\nTraining XGBoost model...')
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(
    X_train_split, 
    y_train_split,
    eval_set=[(X_val, y_val)],
    verbose=50
)

Training set: 3200 samples
Validation set: 800 samples

Training XGBoost model...
[0]	validation_0-logloss:0.67535
[50]	validation_0-logloss:0.52834
[100]	validation_0-logloss:0.52412
[150]	validation_0-logloss:0.51826
[199]	validation_0-logloss:0.52689


In [11]:
# Evaluate model on validation set
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

y_pred = xgb_model.predict(X_val)
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_pred_proba)

print('Validation Set Performance:')
print(f'Accuracy:  {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall:    {recall:.4f}')
print(f'ROC-AUC:   {auc:.4f}')

Validation Set Performance:
Accuracy:  0.7312
Precision: 0.7176
Recall:    0.7625
ROC-AUC:   0.8190


XGBoost Recommendation Function

In [14]:
# XGBoost Recommendation Function
def recommend_xgboost(seed_song_tuples, n_recs=50, batch_size=1000):

    seed_idxs = []
    for title, artist in seed_song_tuples:
        key = (normalize_text(title), normalize_text(artist))
        idx = rec_lookup.get(key)
        if idx is not None:
            seed_idxs.append(idx)

    seed_features_avg = feature_matrix[seed_idxs].mean(axis=0)
    
    n_songs = len(feature_matrix)
    all_scores = np.zeros(n_songs)
    
    for batch_start in range(0, n_songs, batch_size):
        batch_end = min(batch_start + batch_size, n_songs)
        
        # Create feature pairs for this batch
        batch_features = []
        for i in range(batch_start, batch_end):
            combined = np.concatenate([seed_features_avg, feature_matrix[i]])
            batch_features.append(combined)
        
        batch_features = np.array(batch_features)
        
        # Predict probabilities
        batch_scores = xgb_model.predict_proba(batch_features)[:, 1]
        all_scores[batch_start:batch_end] = batch_scores
    
    # Rank by score
    ranking = np.argsort(-all_scores)
    
    # Filter out seed songs and get top recommendations
    recs = []
    seen_songs = set()
    for idx in ranking:
        if idx not in seed_idxs:
            song_key = (recommendationInfo_df.iloc[idx]['track_name'].lower().strip(),
                        recommendationInfo_df.iloc[idx]['track_artist'].lower().strip())
            if song_key not in seen_songs:
                seen_songs.add(song_key)
                recs.append(idx)
                if len(recs) >= n_recs:
                    break

    return recommendationInfo_df.iloc[recs].reset_index(drop=True)

Testing XGBoost Recommendation

In [15]:
# Test with sample songs
test_songs = [
    ("Don't You (Forget About Me) - Remastered", "Simple Minds"),
    ("Don't You Want Me", "The Human League"),
    ("Take On Me", "a-ha"),
    ("Sweet Dreams (Are Made of This)", "Eurythmics")
]

print("Input Songs:")
for title, artist in test_songs:
    print(f"- {title} — {artist}")

print("\nGenerating XGBoost Recommendations...")
recs = recommend_xgboost(test_songs, n_recs=10)

if isinstance(recs, str):
    print(recs)
else:
    print("\nTop 10 Recommended Songs:")
    for i in range(len(recs)):
        print(f"{i+1}. {recs.iloc[i]['track_name']} — {recs.iloc[i]['track_artist']}")

Input Songs:
- Don't You (Forget About Me) - Remastered — Simple Minds
- Don't You Want Me — The Human League
- Take On Me — a-ha
- Sweet Dreams (Are Made of This) — Eurythmics

Generating XGBoost Recommendations...

Top 10 Recommended Songs:
1. Paradise City — Guns N' Roses
2. I Feel Alive — Steady Rollin
3. The Return Of The Giant Hogweed - Digital Remastered 2008 — Genesis
4. Everybody Talks — Neon Trees
5. Working Man — Rush
6. Barbarism Begins at Home - 2011 Remaster — The Smiths
7. Highway Star — Deep Purple
8. Riders on the Storm — The Doors
9. Heads Will Roll — Yeah Yeah Yeahs
10. Rock Bottom - 2007 Remastered Version — UFO


Evaluate with Playlists

In [None]:
# Evaluation function for XGBoost model
def evaluate_playlist_xgboost(playlist_name, n_seed=10, n_recs=50):
    """
    Evaluate XGBoost recommendations on a playlist
    """
    playlist_songs = playlists_df[playlists_df['playlistname'] == playlist_name].copy()
    
    if len(playlist_songs) == 0:
        return {"error": f"Playlist '{playlist_name}' not found"}
    
    matched_songs = []
    for idx, row in playlist_songs.iterrows():
        if pd.notna(row.get('dataset_idx')):
            matched_songs.append({
                'track': row['trackname'],
                'artist': row['artistname'],
                'dataset_idx': int(row['dataset_idx'])
            })
    
    if len(matched_songs) < n_seed:
        return None
    
    random.seed(42)
    seed_songs = random.sample(matched_songs, n_seed)
    remaining_songs = [s for s in matched_songs if s not in seed_songs]
    
    # Get recommendations using XGBoost
    seed_list = [(s['track'], s['artist']) for s in seed_songs]
    recs_df = recommend_xgboost(seed_list, n_recs=n_recs)
    
    # metrics
    remaining_indices = {s['dataset_idx'] for s in remaining_songs}
    hits = 0
    for i in range(len(recs_df)):
        track = recs_df.iloc[i]['track_name']
        artist = recs_df.iloc[i]['track_artist']
        
        mask = (
            recommendationInfo_df["track_name"].str.lower().str.strip() == track.lower().strip()
        ) & (
            recommendationInfo_df["track_artist"].str.lower().str.strip() == artist.lower().strip()
        )
        
        if mask.sum() > 0:
            rec_idx = recommendationInfo_df[mask].index[0]
            if rec_idx in remaining_indices:
                hits += 1
    
    precision = hits / n_recs if n_recs > 0 else 0
    recall = hits / len(remaining_songs) if len(remaining_songs) > 0 else 0
    hit_rate = 1 if hits > 0 else 0
    
    print(f'{playlist_name}: Precision={precision:.4f}, Recall={recall:.4f}, Hits={hits}')
    
    return {
        'playlist': playlist_name,
        'precision': precision,
        'recall': recall,
        'hit_rate': hit_rate,
        'hits': hits,
        'total_holdout': len(remaining_songs)
    }


Evaluation function defined


In [18]:
# Evaluate on test playlists not used for training
test_playlists = chosen[20:30]

results = []
print('Evaluating XGBoost on test playlists...\n')
for playlist_name in test_playlists:
    res = evaluate_playlist_xgboost(playlist_name, n_seed=10, n_recs=50)
    if res:
        results.append(res)

# Summary statistics
if len(results) > 0:
    results_df = pd.DataFrame(results)
    print('\n' + '='*60)
    print('XGBOOST MODEL EVALUATION SUMMARY')
    print('='*60)
    print(f'Mean Precision: {results_df["precision"].mean():.4f}')
    print(f'Mean Recall:    {results_df["recall"].mean():.4f}')
    print(f'Mean Hit Rate:  {results_df["hit_rate"].mean():.4f}')
    print(f'Total Hits:     {results_df["hits"].sum()}')
else:
    print('No results to summarize')

Evaluating XGBoost on test playlists...

Windows Down Music Up - 2015 Edition: Precision=0.0000, Recall=0.0000, Hits=0
Windows Down Music Up - 2015 Edition: Precision=0.0000, Recall=0.0000, Hits=0
Coldplay: Precision=0.0000, Recall=0.0000, Hits=0
Coldplay: Precision=0.0000, Recall=0.0000, Hits=0
WORK IT: Precision=0.0200, Recall=0.0182, Hits=1
WORK IT: Precision=0.0200, Recall=0.0182, Hits=1
gots to get paid: Precision=0.0000, Recall=0.0000, Hits=0
gots to get paid: Precision=0.0000, Recall=0.0000, Hits=0
yeah: Precision=0.0000, Recall=0.0000, Hits=0
yeah: Precision=0.0000, Recall=0.0000, Hits=0
 adam: Precision=0.0000, Recall=0.0000, Hits=0
 adam: Precision=0.0000, Recall=0.0000, Hits=0
Dance Floor Maniac: Precision=0.0000, Recall=0.0000, Hits=0
Dance Floor Maniac: Precision=0.0000, Recall=0.0000, Hits=0
Favoritas: Precision=0.0000, Recall=0.0000, Hits=0
Favoritas: Precision=0.0000, Recall=0.0000, Hits=0
Positivity Yo: Precision=0.0200, Recall=0.0189, Hits=1
Positivity Yo: Precision=0