In [None]:
import os
import pickle

import pandas as pd
from tqdm.notebook import tqdm

class BatchProcessor:
    """Extract features for every track, checkpointing progress to disk.

    Results are accumulated as a list of dicts (one per track) and pickled to
    ``<project_dir>/checkpoints/processed_data.pkl`` so that an interrupted
    run can resume without redoing tracks that were already stored.

    Args:
        loader: Object exposing ``get_path(track_id)``; the returned value is
            handed to ``librosa.load``.
        extractor: Object exposing ``process_track(audio, sr)`` that returns a
            mapping of feature name to value.
        tracks_df: Table whose ``.index`` lists the track ids and which
            supports ``.loc[track_id, (group, field)]`` lookups (presumably a
            pandas DataFrame with two-level columns -- confirm with caller).
        project_dir: Directory under which ``checkpoints/`` is kept.

    Note:
        ``librosa`` and ``tqdm`` must already be available in the enclosing
        namespace; this class does not import them itself.
    """

    def __init__(self, loader, extractor, tracks_df, project_dir):
        self.loader = loader
        self.extractor = extractor
        self.tracks = tracks_df
        self.project_dir = project_dir
        self.checkpoint_dir = f"{project_dir}/checkpoints"

    def _checkpoint_file(self):
        """Return the path of the single pickle file used for checkpoints."""
        return f"{self.checkpoint_dir}/processed_data.pkl"

    def get_track_ids(self):
        """Return every track id in the track table, in index order."""
        return list(self.tracks.index)

    def load_checkpoint(self):
        """Return previously saved records, or ``[]`` when no checkpoint exists."""
        checkpoint_file = self._checkpoint_file()
        if not os.path.exists(checkpoint_file):
            return []

        # NOTE: unpickling can execute arbitrary code; only load checkpoint
        # files that this pipeline wrote itself.
        with open(checkpoint_file, 'rb') as f:
            data = pickle.load(f)
        print(f"Loaded checkpoint: {len(data)} tracks already processed")
        return data

    def save_checkpoint(self, data, batch_num):
        """Pickle ``data`` to the checkpoint file.

        Args:
            data: List of per-track record dicts collected so far.
            batch_num: Batch counter, used only in the progress message.
        """
        # The checkpoint directory may not exist on a fresh project.
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        checkpoint_file = self._checkpoint_file()
        # Write to a sibling temp file and swap it in, so an interruption
        # mid-dump cannot leave a truncated checkpoint behind.
        tmp_file = f"{checkpoint_file}.tmp"
        with open(tmp_file, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_file, checkpoint_file)
        print(f"Checkpoint saved: {len(data)} tracks (Batch {batch_num})")

    def _lookup_metadata(self, track_id):
        """Return ``(genre, title, artist)``; all ``'Unknown'`` if any lookup fails."""
        try:
            genre = self.tracks.loc[track_id, ('track', 'genre_top')]
            title = self.tracks.loc[track_id, ('track', 'title')]
            artist = self.tracks.loc[track_id, ('artist', 'name')]
        except (LookupError, TypeError, ValueError):
            # Missing row/column: keep the track, just without metadata.
            return 'Unknown', 'Unknown', 'Unknown'
        return genre, title, artist

    def _process_track(self, track_id, sample_rate, duration, min_seconds):
        """Build the record for one track, or return ``None`` if it is too short."""
        audio, sr = librosa.load(
            self.loader.get_path(track_id),
            sr=sample_rate,
            duration=duration,
        )
        if audio is None or len(audio) < sr * min_seconds:
            return None

        features = self.extractor.process_track(audio, sr)
        genre, title, artist = self._lookup_metadata(track_id)

        # Features are unpacked last, so a feature key that collides with a
        # metadata key takes precedence (same ordering as before).
        return {
            'track_id': track_id,
            'genre': genre,
            'title': title,
            'artist': artist,
            **features,
        }

    def process_all(self, batch_size=500, *, sample_rate=22050, duration=30,
                    min_seconds=5):
        """Process every not-yet-stored track and return all records.

        Tracks whose audio is shorter than ``min_seconds`` are skipped, and a
        track that raises is reported and skipped. Neither is written to the
        checkpoint, so both are attempted again on the next run.

        Args:
            batch_size: Number of newly stored tracks between checkpoint saves.
            sample_rate: Sample rate passed to ``librosa.load``.
            duration: Maximum number of seconds of audio to load per track.
            min_seconds: Minimum audio length, in seconds, for a track to be kept.

        Returns:
            The list of record dicts, including those loaded from the checkpoint.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Load existing progress
        processed_data = self.load_checkpoint()
        processed_ids = {record['track_id'] for record in processed_data}

        remaining_ids = [
            tid for tid in self.get_track_ids() if tid not in processed_ids
        ]
        if not remaining_ids:
            print("All tracks already processed!")
            return processed_data

        batch_num = len(processed_ids) // batch_size
        # Count stored tracks rather than loop positions: a skipped or failed
        # track landing on a batch boundary must not suppress the checkpoint.
        unsaved = 0

        for track_id in tqdm(remaining_ids, desc="Processing tracks"):
            try:
                record = self._process_track(
                    track_id, sample_rate, duration, min_seconds
                )
                if record is None:
                    continue

                processed_data.append(record)
                unsaved += 1

                if unsaved >= batch_size:
                    # A failed save is reported by the handler below and
                    # retried after the next stored track, because the
                    # counters are only advanced once the save succeeded.
                    self.save_checkpoint(processed_data, batch_num + 1)
                    batch_num += 1
                    unsaved = 0

            except Exception as e:
                print(f"\n Error processing track {track_id}: {e}")
                continue

        # Final save
        self.save_checkpoint(processed_data, batch_num + 1)

        return processed_data

# Build the processor over the full track table; nothing is processed yet.
processor = BatchProcessor(
    loader=loader,
    extractor=extractor,
    tracks_df=tracks,
    project_dir=project_dir,
)
print("Batch processor ready")

In [None]:
import time

# Step 1: rebuild the processor so it works on the cleaned 8k subset.
processor = BatchProcessor(
    loader=loader,
    extractor=extractor,
    tracks_df=tracks_small,
    project_dir=project_dir,
)

print(
    f"STARTING BATCH PROCESSING ON {len(tracks_small)} TRACKS",
    "=" * 60,
    "Monitor the first 1% to ensure no errors appear.",
    "=" * 60,
    sep="\n",
)

start_time = time.time()

# Step 2: run the whole pipeline, checkpointing every 500 tracks.
processed_data = processor.process_all(batch_size=500)

# Step 3: report wall-clock time (in minutes) and the number of stored tracks.
elapsed = (time.time() - start_time) / 60
print(
    "\n" + "=" * 60,
    "PROCESSING COMPLETE",
    f"Time taken: {elapsed:.1f} minutes",
    f"Total tracks processed: {len(processed_data)}",
    sep="\n",
)

In [None]:
import pandas as pd
import pickle
import os

# Project directory that contains the `checkpoints/` folder.
# TODO: replace this placeholder with the real project path before running.
project_dir = 'path'
checkpoint_path = f"{project_dir}/checkpoints/processed_data.pkl"

print(f"Loading raw data from: {checkpoint_path}")

# NOTE: unpickling can execute arbitrary code; only load checkpoint files
# produced by this pipeline.
with open(checkpoint_path, 'rb') as f:
    data = pickle.load(f)

# Convert the list of per-track dictionaries to a pandas DataFrame
df = pd.DataFrame(data)

# Save the "complete" version so it's easier to reload next time
df.to_pickle(f'{project_dir}/fma_features_complete.pkl')

print(f" SUCCESS! Loaded {len(df)} tracks.")
print("   Variable 'df' is ready to use.")