In [1]:
import os
import h5py
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Read the song metadata and audio-analysis fields from a single h5 file
def process_h5_file(file_path):
    """Return a dict of song fields read from one h5 file, or None on error.

    The text fields are read from h5['metadata']['songs'] and decoded as
    UTF-8; the numeric fields are read from h5['analysis']['songs'] and
    converted to float. Only the first entry ([0]) of each field is used.

    Any exception raised while opening or reading the file is printed
    together with the file path, and None is returned instead of a dict.
    """
    text_fields = ('song_id', 'artist_id', 'artist_name', 'title')
    numeric_fields = ('tempo', 'loudness', 'danceability', 'energy', 'duration')
    try:
        with h5py.File(file_path, 'r') as h5:
            record = {}
            # Basic song information (stored as bytes, hence the decode)
            for name in text_fields:
                record[name] = h5['metadata']['songs'][name][0].decode('utf-8')
            # Audio features
            for name in numeric_fields:
                record[name] = float(h5['analysis']['songs'][name][0])
            return record
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [3]:
# Find all h5 files in the directory structure
def find_h5_files(base_dir):
    """Recursively collect the paths of all '.h5' files under base_dir.

    Args:
        base_dir: Directory to search; it is walked recursively with os.walk.

    Returns:
        A sorted list of paths (each directory visited joined with the file
        name) for every file whose name ends with '.h5'. The list is empty
        when nothing matches or when base_dir does not exist, because
        os.walk yields nothing in that case.
    """
    h5_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(base_dir)
        for name in names
        if name.endswith('.h5')
    ]
    # os.walk visits entries in an arbitrary, filesystem-dependent order;
    # sorting keeps the result (and any batching built on it) reproducible.
    h5_files.sort()
    return h5_files

In [4]:
# The dataset is expected in a 'MillionSongSubset' folder inside the
# current working directory; report whether it is there and list its entries.
base_dir = os.path.join(os.getcwd(), 'MillionSongSubset')
if not os.path.exists(base_dir):
    print(f"Error: Dataset directory not found at {base_dir}")
else:
    print(f"Dataset directory found at: {base_dir}")
    print("Contents of the directory:")
    for entry in os.listdir(base_dir):
        print(entry)

Dataset directory found at: C:\Users\USER\Notebooks\DE\Project\MillionSongSubset
Contents of the directory:
A
B


In [5]:
# Walk the dataset directory and collect the path of every h5 file,
# reporting how many were found
print("Finding h5 files...")
h5_files = find_h5_files(base_dir)
print(f"Found {len(h5_files)} h5 files")

Finding h5 files...
Found 10000 h5 files


In [6]:
# Batch setup: all_songs accumulates the parsed song records, and
# batch_size is the number of files handled per batch
all_songs = []
batch_size = 1000

In [7]:
# Process the files batch by batch, writing each batch to its own CSV.
# Start from an empty list here so that re-running this cell does not
# append the same records to all_songs a second time.
all_songs = []

# Total number of batches (0 when there are no files); constant for the loop.
num_batches = (len(h5_files) - 1) // batch_size + 1

for i in range(0, len(h5_files), batch_size):
    batch_number = i // batch_size + 1
    batch_files = h5_files[i:i+batch_size]
    print(f"Processing batch {batch_number}/{num_batches}")

    # Process each file in the batch with a progress bar
    batch_results = []
    for file_path in tqdm(batch_files):
        result = process_h5_file(file_path)
        # process_h5_file returns None for files it could not read; skip those
        if result is not None:
            batch_results.append(result)

    # Convert the batch to a DataFrame and save it as its own CSV
    batch_df = pd.DataFrame(batch_results)
    batch_df.to_csv(f'songs_batch_{batch_number}.csv', index=False)

    # Also keep the records in memory; drop this line if memory is tight,
    # since the per-batch CSV files already hold the same data
    all_songs.extend(batch_results)

Processing batch 1/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 2/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 3/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 4/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 5/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 6/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 7/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 8/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 9/10


  0%|          | 0/1000 [00:00<?, ?it/s]

Processing batch 10/10


  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# Create final DataFrame (either from memory or by combining CSVs)
if all_songs:
    # The records were kept in memory
    songs_df = pd.DataFrame(all_songs)
else:
    # Nothing in memory: rebuild the table from the per-batch CSV files
    import glob
    # glob returns matches in arbitrary order. Sorting by (length, name)
    # puts songs_batch_2.csv before songs_batch_10.csv, so the rows are
    # concatenated in batch order.
    all_files = sorted(glob.glob('songs_batch_*.csv'), key=lambda path: (len(path), path))
    if not all_files:
        # pd.concat would otherwise fail with an opaque "No objects to concatenate"
        raise ValueError(
            "No song records in memory and no songs_batch_*.csv files found to combine"
        )
    songs_df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Write the combined table once, whichever branch produced it
songs_df.to_csv('all_songs.csv', index=False)

In [9]:
# Preview the first five rows of the combined table
songs_df.head(5)

Unnamed: 0,song_id,artist_id,artist_name,title,tempo,loudness,danceability,energy,duration
0,SOMZWCG12A8C13C480,ARD7TVE1187B99BFB1,Casual,I Didn't Mean To,92.198,-11.197,0.0,0.0,218.93179
1,SOCIWDW12A8C13D406,ARMJAGH1187FB546F3,The Box Tops,Soul Deep,121.274,-9.843,0.0,0.0,148.03546
2,SOXVLOJ12AB0189215,ARKRRTF1187B9984DA,Sonora Santanera,Amor De Cabaret,100.07,-9.689,0.0,0.0,177.47546
3,SONHOTT12A8C13493C,AR7G5I41187FB4CE6C,Adam Ant,Something Girls,119.293,-9.013,0.0,0.0,233.40363
4,SOFSOCN12A8C143F5D,ARXR32B1187FB57099,Gob,Face the Ashes,129.738,-4.501,0.0,0.0,209.60608
