# Spotify Artist Data Fetcher

**Purpose:** Incrementally fetch artist data for Spotify tracks over multiple days.

**Usage:**
1. Run this notebook once per day (after the 24-hour rate limit has reset)
2. It will automatically fetch the next batch of ~1,500 tracks
3. Progress is saved to `artist_info_extended.csv`
4. Continue until you've fetched all 28K tracks!

**Current Progress:** Check the status cell below to see how many tracks you've collected.

## 1. Setup & Import Libraries

In [None]:
# Install missing packages (runs only in Jupyter)
%pip install -q spotipy python-dotenv

# Import required libraries
import pandas as pd
import numpy as np
import time
from datetime import datetime
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os

print("✓ Libraries imported successfully")
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 2. Load Spotify API Credentials

In [None]:
# Load environment variables (python-dotenv)
load_dotenv()

# Read the Spotify API credentials from the environment
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# Fail fast with an actionable message: without this check a missing value
# only surfaces later as an unrelated-looking error (for example a TypeError
# from slicing None in the status print below).
missing_vars = [
    name
    for name, value in (
        ('SPOTIFY_CLIENT_ID', client_id),
        ('SPOTIFY_CLIENT_SECRET', client_secret),
    )
    if not value
]
if missing_vars:
    raise ValueError(
        "Missing Spotify credentials: " + ", ".join(missing_vars)
        + ". Set them in the environment or in a .env file before running this cell."
    )

# Initialize Spotify client
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

print("✓ Spotify API client initialized")
print(f"✓ Client ID: {client_id[:10]}...")

## 3. Load Source Dataset

In [None]:
# Read the complete Spotify songs table from disk
df = pd.read_csv('spotify_songs.csv')

# Report the raw row count and the number of distinct track IDs
_rule = "=" * 80
print(_rule, "SOURCE DATASET LOADED", _rule, sep="\n")
_row_count = len(df)
_distinct_ids = df['track_id'].nunique()
print(f"Total tracks in source: {_row_count:,}")
print(f"Unique track IDs: {_distinct_ids:,}")

# Keep only the first row seen for each track_id, then renumber the index
_first_per_track = df.drop_duplicates(subset=['track_id'], keep='first')
df_unique = _first_per_track.reset_index(drop=True)
print(f"\nUnique tracks to potentially fetch: {len(df_unique):,}")

## 4. Check Current Progress

In [None]:
# Look for results saved by an earlier run and summarise them
PROGRESS_FILE = 'artist_info_extended.csv'
_rule = "=" * 80

if not os.path.exists(PROGRESS_FILE):
    # No earlier run: start with an empty set of fetched IDs
    print(_rule)
    print("🆕 STARTING FRESH")
    print(_rule)
    print("No existing progress file found.")
    print(f"Will create new file: {PROGRESS_FILE}")
    print(f"Total tracks to fetch: {len(df_unique):,}")

    already_fetched = set()
    df_existing = None
else:
    df_existing = pd.read_csv(PROGRESS_FILE)
    already_fetched = set(df_existing['track_id'].tolist())

    _done = len(already_fetched)
    _total = len(df_unique)

    print(_rule)
    print("📊 CURRENT PROGRESS")
    print(_rule)
    print(f"✓ Found existing progress file: {PROGRESS_FILE}")
    print(f"✓ Already fetched: {_done:,} tracks")
    print(f"✓ Remaining: {_total - _done:,} tracks")
    print(f"✓ Progress: {_done/_total*100:.1f}%")

    # Summary statistics over what has been collected so far
    _is_multi = df_existing['num_artists'] > 1
    print("\n📈 Dataset Statistics:")
    print(f"   Multi-artist tracks: {_is_multi.sum():,} ({_is_multi.mean()*100:.1f}%)")
    print(f"   Avg artists per track: {df_existing['num_artists'].mean():.2f}")
    print(f"   Tracks with genres: {df_existing['artist_genres'].notna().sum():,}")

## 5. Multi-Artist Fetching Function

In [None]:
# Columns of the records built by fetch_artist_info_multi_artists; used so an
# empty result still exposes the expected schema.
_ARTIST_COLUMNS = [
    'track_id', 'num_artists', 'artist_names', 'primary_artist',
    'avg_artist_popularity', 'max_artist_popularity', 'min_artist_popularity',
    'total_artist_followers', 'avg_artist_followers', 'max_artist_followers',
    'artist_genres',
]

# Upper bound on track IDs per tracks() request (the documented "max 50").
_MAX_TRACKS_PER_CALL = 50


def _is_rate_limit_error(exc):
    """Return True when *exc* looks like an HTTP 429 / rate-limit failure."""
    if getattr(exc, 'http_status', None) == 429:
        return True
    message = str(exc).lower()
    return 'rate limit' in message or '429' in message


def _records_to_frame(records):
    """Build the result DataFrame, keeping the column schema when empty."""
    if records:
        return pd.DataFrame(records)
    return pd.DataFrame(columns=_ARTIST_COLUMNS)


def _fetch_artist_details(artist_ids, sp_client, cache):
    """Return artist objects for *artist_ids*, reusing *cache* across tracks."""
    details = []
    for artist_id in artist_ids:
        if artist_id not in cache:
            try:
                cache[artist_id] = sp_client.artist(artist_id)
            except Exception as e:
                # A rate-limit error must reach the caller so the whole run
                # stops; any other failure only skips this one artist.
                if _is_rate_limit_error(e):
                    raise
                print(f"Warning: Could not fetch artist {artist_id}: {e}")
                continue
            time.sleep(0.05)  # Small delay between artist calls
        details.append(cache[artist_id])
    return details


def _build_track_record(track, sp_client, cache):
    """Aggregate artist data for one track; None if no artist could be fetched."""
    artists = track['artists']
    artist_names = [artist['name'] for artist in artists]
    artist_details = _fetch_artist_details(
        [artist['id'] for artist in artists], sp_client, cache
    )
    if not artist_details:
        return None

    popularities = [a['popularity'] for a in artist_details]
    followers = [a['followers']['total'] for a in artist_details]
    # Sorted so the joined genre string is the same on every run.
    genres = sorted({genre for a in artist_details for genre in a['genres']})

    return {
        'track_id': track['id'],
        'num_artists': len(artist_details),
        'artist_names': '|'.join(artist_names),
        'primary_artist': artist_names[0] if artist_names else None,
        'avg_artist_popularity': np.mean(popularities),
        'max_artist_popularity': np.max(popularities),
        'min_artist_popularity': np.min(popularities),
        'total_artist_followers': np.sum(followers),
        'avg_artist_followers': np.mean(followers),
        'max_artist_followers': np.max(followers),
        'artist_genres': ','.join(genres) if genres else None,
    }


def fetch_artist_info_multi_artists(track_ids, sp_client, batch_size=50, delay=0.5, checkpoint_file=None):
    """
    Fetch artist information for multiple tracks, aggregating data for tracks with multiple artists.

    Each artist is requested at most once per call (results are cached in
    memory), a checkpoint is written after every batch, and fetching stops
    as soon as a rate-limit (HTTP 429) error is seen on either the track or
    the artist request.

    Parameters:
    -----------
    track_ids : list
        List of Spotify track IDs
    sp_client : spotipy.Spotify
        Authenticated Spotify client (must provide ``tracks`` and ``artist``)
    batch_size : int
        Number of tracks to fetch per API call; clamped to the range 1-50
    delay : float
        Delay between batches in seconds
    checkpoint_file : str
        Path to save checkpoint progress. If the file already exists its rows
        are loaded, included in the result, and their track IDs are skipped.

    Returns:
    --------
    pd.DataFrame
        DataFrame with aggregated artist information, one row per track for
        which at least one artist could be fetched. An empty result still
        carries the expected columns.
    """
    # Guard against a zero/negative step in range() and oversized requests.
    batch_size = max(1, min(int(batch_size), _MAX_TRACKS_PER_CALL))

    artist_data = []
    total_tracks = len(track_ids)

    # Check for existing checkpoint
    if checkpoint_file and os.path.exists(checkpoint_file):
        checkpoint_df = pd.read_csv(checkpoint_file)
        already_done = set(checkpoint_df['track_id'].tolist())
        artist_data = checkpoint_df.to_dict('records')
        print(f"📂 Loaded checkpoint: {len(already_done)} tracks already processed")
    else:
        already_done = set()

    # Filter out already processed tracks
    track_ids = [tid for tid in track_ids if tid not in already_done]

    if not track_ids:
        print("✓ All tracks already processed!")
        return _records_to_frame(artist_data)

    print(f"Fetching artist data for {len(track_ids)} tracks...")

    artist_cache = {}  # artist_id -> artist object, shared by all tracks

    try:
        # Process in batches
        for start in range(0, len(track_ids), batch_size):
            batch_ids = track_ids[start:start + batch_size]

            try:
                tracks_response = sp_client.tracks(batch_ids)

                for track in tracks_response['tracks']:
                    if track is None:
                        continue
                    record = _build_track_record(track, sp_client, artist_cache)
                    if record is not None:
                        artist_data.append(record)

            except Exception as e:
                if _is_rate_limit_error(e):
                    print(f"\n⚠️  Rate limit hit at {len(artist_data)} tracks")
                    print("Saving progress and stopping...")
                    break
                print(f"Error processing batch: {e}")
                continue

            # Progress update after every batch; a modulo-100 test would be
            # missed as soon as one track is skipped.
            progress = len(artist_data)
            print(f"Progress: {progress}/{total_tracks} tracks ({progress/total_tracks*100:.1f}%)")

            # Save checkpoint (skipped while empty: a CSV without a header
            # row could not be read back on the next run).
            if checkpoint_file and artist_data:
                pd.DataFrame(artist_data).to_csv(checkpoint_file, index=False)

            # Delay between batches
            time.sleep(delay)

    except KeyboardInterrupt:
        print("\n⚠️  Interrupted by user. Saving progress...")

    # Create DataFrame
    df_result = _records_to_frame(artist_data)

    # Save final checkpoint
    if checkpoint_file and len(df_result) > 0:
        df_result.to_csv(checkpoint_file, index=False)
        print(f"💾 Checkpoint saved: {checkpoint_file}")

    return df_result

print("✓ Fetching function defined")

## 6. Fetch Next Batch (RUN THIS DAILY)

**⚠️ Run this cell once per day to fetch the next batch of tracks.**

This cell will:
- Automatically pick up where you left off
- Fetch ~1,500 tracks (or until rate limit)
- Save progress to `artist_info_extended.csv`
- Show you updated statistics

In [None]:
# Configuration
BATCH_SIZE_PER_DAY = 1500  # Target tracks per day
API_BATCH_SIZE = 50        # Tracks per API call
DELAY = 0.5                # Delay between API calls (seconds)
CHECKPOINT_FILE = 'daily_fetch_checkpoint.csv'

print("="*80)
print(f"🚀 DAILY FETCH - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)

# Get tracks that haven't been fetched yet
remaining_tracks = df_unique[~df_unique['track_id'].isin(already_fetched)]

if len(remaining_tracks) == 0:
    # df_existing is None when no progress file was found, so guard the count.
    total_collected = len(df_existing) if df_existing is not None else 0
    print("\n🎉 ALL TRACKS FETCHED!")
    print(f"Total tracks in dataset: {total_collected:,}")
    print("\nNothing more to fetch. You have the complete dataset!")
else:
    # Get next batch
    next_batch = remaining_tracks.head(BATCH_SIZE_PER_DAY)
    next_batch_ids = next_batch['track_id'].tolist()

    print(f"\n📊 Today's Batch:")
    print(f"   Tracks to fetch: {len(next_batch_ids):,}")
    print(f"   Already have: {len(already_fetched):,}")
    print(f"   Remaining after: {len(remaining_tracks) - len(next_batch_ids):,}")
    print(f"   Estimated time: ~{len(next_batch_ids) * 0.6 / 60:.0f} minutes")

    print(f"\n⏱️  Starting fetch at {datetime.now().strftime('%H:%M:%S')}...\n")

    start_time = time.time()

    # Fetch the batch
    new_artist_data = fetch_artist_info_multi_artists(
        next_batch_ids,
        sp,
        batch_size=API_BATCH_SIZE,
        delay=DELAY,
        checkpoint_file=CHECKPOINT_FILE
    )

    elapsed = time.time() - start_time

    print(f"\n⏱️  Finished at {datetime.now().strftime('%H:%M:%S')}")
    print(f"⏱️  Total time: {elapsed/60:.1f} minutes")

    # Combine with existing data. Empty frames are left out: an empty fetch
    # result may have no columns at all, which would make the 'track_id'
    # de-duplication and the statistics below raise KeyError.
    frames = [
        frame for frame in (df_existing, new_artist_data)
        if frame is not None and len(frame) > 0
    ]

    if not frames:
        print("\n⚠️  No artist data was fetched and there is no earlier progress.")
        print("Nothing was saved. Try again after the rate limit resets.")
    else:
        df_combined = pd.concat(frames, ignore_index=True)

        # Remove any duplicates (just in case)
        df_combined = df_combined.drop_duplicates(subset=['track_id'], keep='last')

        # Save the combined dataset
        df_combined.to_csv(PROGRESS_FILE, index=False)

        # Refresh the in-memory progress so that re-running this cell on its
        # own moves on to the next batch instead of repeating this one.
        df_existing = df_combined
        already_fetched = set(df_combined['track_id'].tolist())

        print("\n" + "="*80)
        print("✓ DAILY FETCH COMPLETE!")
        print("="*80)
        print(f"\n📊 Results:")
        print(f"   Fetched today: {len(new_artist_data):,} tracks")
        print(f"   Total in dataset: {len(df_combined):,} tracks")
        print(f"   Remaining: {len(df_unique) - len(df_combined):,} tracks")
        print(f"   Overall progress: {len(df_combined)/len(df_unique)*100:.1f}%")

        # Estimate days remaining
        if len(new_artist_data) > 0:
            days_remaining = np.ceil((len(df_unique) - len(df_combined)) / len(new_artist_data))
            print(f"\n⏳ Estimated days to complete: {int(days_remaining)} days")

        print(f"\n💾 Saved to: {PROGRESS_FILE}")

        # Show some stats
        print(f"\n📈 Dataset Statistics:")
        print(f"   Multi-artist tracks: {(df_combined['num_artists'] > 1).sum():,} ({(df_combined['num_artists'] > 1).mean()*100:.1f}%)")
        print(f"   Avg artists per track: {df_combined['num_artists'].mean():.2f}")
        print(f"   Max artists on track: {df_combined['num_artists'].max()}")
        print(f"   Tracks with genres: {df_combined['artist_genres'].notna().sum():,}")

        print(f"\n✓ Come back tomorrow to fetch the next batch!")
        print(f"✓ Run this same cell again after 24 hours")

## 7. View Sample of Today's Results

In [None]:
# Preview what this session fetched (only when the fetch cell produced rows)
_has_new_rows = 'new_artist_data' in locals() and len(new_artist_data) > 0

if not _has_new_rows:
    print("No new data fetched in this session. Run the fetch cell above.")
else:
    _rule = "=" * 80
    print(_rule)
    print("SAMPLE OF TODAY'S FETCHED DATA")
    print(_rule)

    # Tracks credited to more than one artist (first ten candidates)
    _multi_mask = new_artist_data['num_artists'] > 1
    multi_artist = new_artist_data[_multi_mask].head(10)

    if len(multi_artist) > 0:
        print("\nTracks with Multiple Artists:")
        _multi_cols = ['artist_names', 'num_artists', 'avg_artist_popularity',
                       'total_artist_followers']
        display(multi_artist[_multi_cols].head())

    # Random rows drawn from everything fetched today (at most ten)
    print("\nRandom Sample:")
    _sample_cols = ['primary_artist', 'num_artists', 'avg_artist_popularity',
                    'max_artist_popularity', 'artist_genres']
    _sample_size = min(10, len(new_artist_data))
    display(new_artist_data[_sample_cols].sample(_sample_size))

## 8. Cleanup Checkpoint File

In [None]:
# Optional housekeeping: delete the per-day checkpoint once the fetch succeeded
if not os.path.exists(CHECKPOINT_FILE):
    print("No checkpoint file to clean up")
else:
    os.remove(CHECKPOINT_FILE)
    print(f"✓ Cleaned up checkpoint file: {CHECKPOINT_FILE}")

---

## 📋 Instructions for Daily Use:

1. **Wait 24 hours** from your last fetch
2. **Open this notebook**
3. **Run cells 1-5** to load everything
4. **Run cell 6** ("Fetch Next Batch") to get today's tracks
5. **Check progress** in the output
6. **Repeat tomorrow!**

The notebook will automatically:
- Track your progress
- Fetch only unfetched tracks
- Save everything to `artist_info_extended.csv`
- Stop gracefully if rate limit is hit

**After ~18-20 days, you'll have all 28K tracks! 🎉**