# Spotify Ontario Music Data Collection

This notebook handles the collection of music data from Spotify, focusing on playlists related to Ontario. The process is structured in three main steps:

1. Initial Setup and Authentication
2. Playlist Collection
3. Track and Audio Feature Collection

## Configuration and Imports

In [12]:
# Standard library imports
import os
import time
import logging
from datetime import datetime

# Third-party imports
import pandas as pd
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load environment variables and set up directories
load_dotenv()
RAW_DATA_DIR = "../data/raw"
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# Initialize Spotify client with proper authentication
def init_spotify_client():
    """Initialize authenticated Spotify client with necessary permissions."""
    try:
        auth_manager = SpotifyOAuth(
            client_id=os.getenv("SPOTIFY_CLIENT_ID"),
            client_secret=os.getenv("SPOTIFY_CLIENT_SECRET"),
            redirect_uri=os.getenv("SPOTIFY_REDIRECT_URI", "http://127.0.0.1:8080/callback"),
            scope="playlist-read-private playlist-read-collaborative",
            cache_path=".spotify_cache"
        )
        client = spotipy.Spotify(auth_manager=auth_manager)
        
        # Test connection
        if client.search(q="test", limit=1):
            logging.info("✅ Spotify connection successful")
            return client
    except Exception as e:
        logging.error(f"❌ Failed to initialize Spotify client: {e}")
        raise

# Initialize global client
sp = init_spotify_client()

2025-09-18 12:37:44,618 - INFO - ✅ Spotify connection successful


## Playlist Collection Functions

Functions to search and collect playlist data from Spotify.

In [13]:
def search_playlists(keywords, limit=50):
    """
    Search Spotify playlists based on keywords.
    
    Args:
        keywords (list): List of search terms
        limit (int): Maximum playlists per keyword
    
    Returns:
        pd.DataFrame: Processed playlist information
    """
    playlists = []
    
    for keyword in keywords:
        try:
            results = sp.search(q=keyword, type='playlist', limit=limit)
            for playlist in results['playlists']['items']:
                playlists.append({
                    'id': playlist['id'],
                    'name': playlist['name'],
                    'description': playlist['description'],
                    'owner': playlist['owner']['display_name'],
                    'tracks_total': playlist['tracks']['total'],
                    'followers': playlist['followers']['total'] if 'followers' in playlist else 0,
                    'keyword': keyword
                })
            logging.info(f"✅ Found {len(results['playlists']['items'])} playlists for '{keyword}'")
        except Exception as e:
            logging.error(f"❌ Error searching for '{keyword}': {e}")
    
    df = pd.DataFrame(playlists)
    logging.info(f"Total playlists collected: {len(df)}")
    return df

# Define search keywords
ontario_keywords = [
    'Ontario music',
    'Ontario artists',
    'Ontario bands',
    'Toronto music',
    'Canadian indie'
]

# Collect playlists
playlists_df = search_playlists(ontario_keywords, limit=MAX_PLAYLISTS_PER_KEYWORD)

# Save raw data with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
playlists_df.to_csv(f"{RAW_DATA_DIR}/playlists_{timestamp}.csv", index=False)

2025-09-18 12:37:45,430 - ERROR - ❌ Error searching for 'Ontario music': 'NoneType' object is not subscriptable
2025-09-18 12:37:46,051 - ERROR - ❌ Error searching for 'Ontario artists': 'NoneType' object is not subscriptable
2025-09-18 12:37:46,765 - ERROR - ❌ Error searching for 'Ontario bands': 'NoneType' object is not subscriptable
2025-09-18 12:37:47,577 - ERROR - ❌ Error searching for 'Toronto music': 'NoneType' object is not subscriptable
2025-09-18 12:37:48,399 - ERROR - ❌ Error searching for 'Canadian indie': 'NoneType' object is not subscriptable
2025-09-18 12:37:48,402 - INFO - Total playlists collected: 43


## Track Collection and Audio Features

Functions to collect track information and audio features from the playlists.

In [14]:
def get_playlist_tracks(playlist_id):
    """Get all tracks from a playlist with their basic information."""
    tracks = []
    try:
        results = sp.playlist_tracks(playlist_id)
        while results:
            for item in results['items']:
                if not item['track']:
                    continue
                    
                track = item['track']
                tracks.append({
                    'id': track['id'],
                    'name': track['name'],
                    'artist': track['artists'][0]['name'],
                    'artist_id': track['artists'][0]['id'],
                    'album': track['album']['name'],
                    'album_id': track['album']['id'],
                    'popularity': track['popularity'],
                    'duration_ms': track['duration_ms'],
                    'explicit': track['explicit'],
                    'release_date': track['album']['release_date'],
                    'playlist_id': playlist_id
                })
            
            results = sp.next(results) if results['next'] else None
            time.sleep(RATE_LIMIT_DELAY)  # Rate limiting
        
        logging.info(f"✅ Retrieved {len(tracks)} tracks from playlist {playlist_id}")
        return tracks
    except Exception as e:
        logging.error(f"❌ Error fetching tracks from playlist {playlist_id}: {e}")
        return []

def get_audio_features_batch(track_ids):
    """Get audio features for a batch of tracks."""
    try:
        features = sp.audio_features(track_ids)
        return [f for f in features if f]
    except Exception as e:
        logging.error(f"❌ Error fetching audio features: {e}")
        return []

In [15]:
def handle_rate_limit(e=None, base_wait=2):
    """Respect Spotify rate limit using Retry-After if available"""
    wait_time = base_wait
    if e and hasattr(e, "http_headers") and "Retry-After" in e.http_headers:
        wait_time = int(e.http_headers["Retry-After"])
    logging.warning(f"⏳ Rate limited. Waiting {wait_time}s...")
    time.sleep(wait_time)
    return True  # Indicate successful wait

def log_failed_ids(track_ids, reason):
    """Log failed track IDs with timestamp and reason"""
    log_dir = os.path.join("..", "logs")
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "failed_ids.log")
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    try:
        with open(log_file, "a") as f:
            f.write(f"\n[{timestamp}] {reason}\n")
            for tid in track_ids:
                f.write(f"{tid}\n")
        logging.warning(f"⚠️ Logged {len(track_ids)} failed IDs: {reason}")
        return True
    except Exception as e:
        logging.error(f"❌ Failed to log IDs: {e}")
        return False

def retry_auth():
    """Refresh Spotify authentication token"""
    try:
        logging.info("🔄 Refreshing Spotify token...")
        global sp
        sp = init_spotify_client()  # Use our existing OAuth setup
        return True
    except Exception as e:
        logging.error(f"❌ Failed to refresh token: {e}")
        return False

def get_audio_features(track_ids, chunk_size=50, max_retries=3):
    """Process tracks in batches with improved error handling"""
    if not track_ids:
        return []

    track_ids = list(set(track_ids))  # Remove duplicates
    chunks = [track_ids[i:i + chunk_size] for i in range(0, len(track_ids), chunk_size)]
    audio_features, failed_ids = [], set()

    logging.info(f"🎵 Processing {len(track_ids)} tracks in {len(chunks)} chunks")

    for i, chunk in enumerate(chunks, 1):
        retries = 0
        while retries < max_retries:
            try:
                features = sp.audio_features(chunk)
                valid = [f for f in features if f]
                audio_features.extend(valid)
                
                # Track failed IDs
                new_failed = set(chunk) - {f["id"] for f in valid}
                if new_failed:
                    failed_ids.update(new_failed)
                
                logging.info(f"✅ Chunk {i}/{len(chunks)}: {len(valid)}/{len(chunk)} processed")
                break  # Success - exit retry loop
                
            except spotipy.exceptions.SpotifyException as e:
                if e.http_status == 403 and retry_auth():
                    retries += 1
                    continue
                elif handle_rate_limit(e):
                    continue
                else:
                    failed_ids.update(chunk)
                    break
                    
            except Exception as e:
                logging.error(f"❌ Chunk {i} error: {e}")
                retries += 1
                if retries >= max_retries:
                    failed_ids.update(chunk)

    # Log failed tracks if any
    if failed_ids:
        log_failed_ids(failed_ids, "Failed after all retries")

    success_rate = (len(audio_features) / len(track_ids)) * 100
    logging.info(f"📊 Final results: {len(audio_features)}/{len(track_ids)} tracks ({success_rate:.1f}%)")
    
    return audio_features

In [17]:
def collect_tracks(playlists_df, save_interval=50):
    """Extract tracks from playlists with periodic saving"""
    if playlists_df.empty:
        logging.warning("❌ No playlists available")
        return pd.DataFrame(), set()

    all_tracks = []
    track_ids = set()
    save_dir = os.path.join("..", "data", "raw", "incremental")
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    for i, pid in enumerate(playlists_df["id"], 1):  # Cambiado de "playlist_id" a "id"
        try:
            tracks = get_playlist_tracks(pid)
            track_infos = []
            for item in tracks:
                if item.get('track'):
                    track = item['track']
                    track_info = {
                        'track_id': track['id'],
                        'name': track['name'],
                        'artist': track['artists'][0]['name'],
                        'playlist_id': pid
                    }
                    track_infos.append(track_info)
            
            # Update collections
            all_tracks.extend(track_infos)
            new_ids = {ti["track_id"] for ti in track_infos}
            track_ids.update(new_ids)
            
            # Periodic save
            if i % save_interval == 0:
                temp_df = pd.DataFrame(all_tracks)
                temp_df.to_csv(
                    os.path.join(save_dir, f"tracks_temp_{timestamp}_{i}.csv"),
                    index=False
                )
                logging.info(f"💾 Saved checkpoint at playlist {i}/{len(playlists_df)}")
                
        except Exception as e:
            logging.error(f"❌ Error processing playlist {pid}: {e}")
            continue

    final_df = pd.DataFrame(all_tracks)
    logging.info(f"✅ Total: {len(final_df)} tracks ({len(track_ids)} unique)")
    
    return final_df, track_ids

# === Usage ===
tracks_df, track_ids = collect_tracks(playlists_df)

2025-09-18 12:39:51,604 - INFO - ✅ Retrieved 31 tracks from playlist 0H5sLEc8WnoXmrXgZ1QA5R
2025-09-18 12:39:52,212 - INFO - ✅ Retrieved 76 tracks from playlist 2wfNEHp2xeD3LBv4m01rPT
2025-09-18 12:39:52,212 - INFO - ✅ Retrieved 76 tracks from playlist 2wfNEHp2xeD3LBv4m01rPT
2025-09-18 12:39:52,475 - INFO - ✅ Retrieved 16 tracks from playlist 66XDl402XcPVosSaF8LONU
2025-09-18 12:39:52,475 - INFO - ✅ Retrieved 16 tracks from playlist 66XDl402XcPVosSaF8LONU
2025-09-18 12:39:52,775 - INFO - ✅ Retrieved 16 tracks from playlist 6EsrPeJZG00trXEG6MsGMi
2025-09-18 12:39:52,775 - INFO - ✅ Retrieved 16 tracks from playlist 6EsrPeJZG00trXEG6MsGMi
2025-09-18 12:39:58,484 - INFO - ✅ Retrieved 860 tracks from playlist 32uH7sB8TAl03OtVOUuJzy
2025-09-18 12:39:58,484 - INFO - ✅ Retrieved 860 tracks from playlist 32uH7sB8TAl03OtVOUuJzy
2025-09-18 12:39:59,451 - INFO - ✅ Retrieved 103 tracks from playlist 1lQElxu7q2z09LPdFuoL4l
2025-09-18 12:39:59,451 - INFO - ✅ Retrieved 103 tracks from playlist 1lQElxu

In [18]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def collect_audio_features(track_ids):
    """Get audio features for unique IDs and return a clean DataFrame"""
    if not track_ids:
        logging.info("ℹ️ No track IDs to process")
        return pd.DataFrame()

    logging.info(f"🎶 Starting Audio Features Collection for {len(track_ids)} unique tracks")

    features = get_audio_features(track_ids)
    if not features:
        logging.warning("❌ No audio features collected")
        return pd.DataFrame()

    df = pd.DataFrame(features)
    success_rate = (len(df) / len(track_ids)) * 100
    logging.info(f"✅ Collected audio features for {len(df)}/{len(track_ids)} tracks ({success_rate:.1f}%)")

    return df

# === Usage ===
audio_features_df = collect_audio_features(track_ids)

2025-09-18 12:41:00,127 - INFO - ℹ️ No track IDs to process
