In [1]:
import pandas as pd
import glob
import os
import mido
import pretty_midi
import numpy as np
from tqdm.notebook import tqdm # For progress bar

In [2]:
def extract_midi_stats(midi_path, folder_name):
    """
    Extracts a dictionary of statistics from a MIDI file.
    Returns None if the file cannot be processed by pretty_midi.
    """
    stats = {
        'file_path': midi_path,
        'file_name': os.path.basename(midi_path),
        'folder': folder_name,
        'mido_type': None,
        'mido_num_tracks': None,
        'mido_ticks_per_beat': None,
        'pm_duration_sec': None,
        'pm_num_instruments': None,
        'pm_total_notes': None,
        'pm_avg_notes_per_instrument': None,
        'pm_overall_min_pitch': None,
        'pm_overall_max_pitch': None,
        'pm_overall_avg_velocity': None,
        'pm_avg_note_duration_sec': None,
        'pm_initial_tempo': None,
        'pm_num_tempo_changes': None,
        'pm_num_time_signature_changes': None,
        'pm_num_key_signature_changes': None,
        'pm_has_lyrics': None,
        'pm_num_text_annotations': None,
        'parse_error_mido': False,
        'parse_error_pm': False
    }

    # --- Mido Analysis (for basic info) ---
    try:
        mid_mido = mido.MidiFile(midi_path)
        stats['mido_type'] = mid_mido.type
        stats['mido_num_tracks'] = len(mid_mido.tracks)
        stats['mido_ticks_per_beat'] = mid_mido.ticks_per_beat
    except Exception:
        stats['parse_error_mido'] = True
        # Continue, pretty_midi might still work or we want to log this file anyway

    # --- PrettyMIDI Analysis (for more detailed info) ---
    try:
        pm = pretty_midi.PrettyMIDI(midi_path)
        stats['pm_duration_sec'] = pm.get_end_time()
        stats['pm_num_instruments'] = len(pm.instruments)

        all_notes = []
        pitches = []
        velocities = []
        durations = []
        
        if pm.instruments:
            for instrument in pm.instruments:
                all_notes.extend(instrument.notes)
                for note in instrument.notes:
                    pitches.append(note.pitch)
                    velocities.append(note.velocity)
                    durations.append(note.end - note.start)
            
            stats['pm_total_notes'] = len(all_notes)
            stats['pm_avg_notes_per_instrument'] = len(all_notes) / len(pm.instruments) if pm.instruments else 0
            
            if pitches:
                stats['pm_overall_min_pitch'] = min(pitches)
                stats['pm_overall_max_pitch'] = max(pitches)
            if velocities:
                stats['pm_overall_avg_velocity'] = np.mean(velocities) if velocities else None
            if durations:
                stats['pm_avg_note_duration_sec'] = np.mean(durations) if durations else None

        tempo_times, tempo_bpms = pm.get_tempo_changes()
        stats['pm_initial_tempo'] = tempo_bpms[0] if len(tempo_bpms) > 0 else None
        stats['pm_num_tempo_changes'] = len(tempo_times)

        stats['pm_num_time_signature_changes'] = len(pm.time_signature_changes)
        stats['pm_num_key_signature_changes'] = len(pm.key_signature_changes)
        stats['pm_has_lyrics'] = bool(pm.lyrics)
        
        if hasattr(pm, 'text_events'):
            stats['pm_num_text_annotations'] = len(pm.text_events)
        else: # For older pretty_midi versions or if attribute is missing
            stats['pm_num_text_annotations'] = 0 
            
    except Exception:
        stats['parse_error_pm'] = True
        # If pretty_midi fails, we might not have many stats, but mido stats might be there.
        # Returning the partially filled dict is better than None to know which file failed.
    
    return stats

In [3]:
# Adjust base path if your notebook is in a different location relative to the 'data' folder
base_data_path = "../data/nesmdb_midi/" 

folders_to_process = {
    "train": os.path.join(base_data_path, "train"),
    "test": os.path.join(base_data_path, "test"),
    "valid": os.path.join(base_data_path, "valid")
}

all_midi_stats = []
file_count = 0

print("Starting MIDI file processing...")
for folder_name, folder_path in folders_to_process.items():
    print(f"\nProcessing folder: {folder_name} ({folder_path})")
    if not os.path.isdir(folder_path):
        print(f"Warning: Directory not found - {folder_path}")
        continue
        
    midi_files = glob.glob(os.path.join(folder_path, "*.mid"))
    if not midi_files:
        print(f"No MIDI files found in {folder_path}")
        continue
    
    print(f"Found {len(midi_files)} MIDI files in {folder_name} folder.")
    
    for midi_file_path in tqdm(midi_files, desc=f"Processing {folder_name}"):
        stats = extract_midi_stats(midi_file_path, folder_name)
        if stats: # Even if there were parse errors, we add the partial info
            all_midi_stats.append(stats)
        file_count +=1

print(f"\nProcessed a total of {file_count} MIDI files.")
print(f"Collected stats for {len(all_midi_stats)} files (some might have partial data due to parsing errors).")

# Create Pandas DataFrame
midi_stats_df = pd.DataFrame(all_midi_stats)

print("\nDataFrame created successfully.")

Starting MIDI file processing...

Processing folder: train (../data/nesmdb_midi/train)
Found 4502 MIDI files in train folder.


Processing train:   0%|          | 0/4502 [00:00<?, ?it/s]


Processing folder: test (../data/nesmdb_midi/test)
Found 373 MIDI files in test folder.


Processing test:   0%|          | 0/373 [00:00<?, ?it/s]


Processing folder: valid (../data/nesmdb_midi/valid)
Found 403 MIDI files in valid folder.


Processing valid:   0%|          | 0/403 [00:00<?, ?it/s]


Processed a total of 5278 MIDI files.
Collected stats for 5278 files (some might have partial data due to parsing errors).

DataFrame created successfully.


In [4]:
print("DataFrame Info:")
midi_stats_df.info()

print("\nDataFrame Head:")
display(midi_stats_df.head())

print("\nNumber of files with Mido parsing errors:", midi_stats_df['parse_error_mido'].sum())
print("Number of files with PrettyMIDI parsing errors:", midi_stats_df['parse_error_pm'].sum())

# Example: Show files that had PrettyMIDI parsing errors
if midi_stats_df['parse_error_pm'].sum() > 0:
    print("\nFiles with PrettyMIDI parsing errors:")
    display(midi_stats_df[midi_stats_df['parse_error_pm']][['file_name', 'folder', 'parse_error_mido']])

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5278 entries, 0 to 5277
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   file_path                      5278 non-null   object 
 1   file_name                      5278 non-null   object 
 2   folder                         5278 non-null   object 
 3   mido_type                      5278 non-null   int64  
 4   mido_num_tracks                5278 non-null   int64  
 5   mido_ticks_per_beat            5278 non-null   int64  
 6   pm_duration_sec                5248 non-null   float64
 7   pm_num_instruments             5248 non-null   float64
 8   pm_total_notes                 5244 non-null   float64
 9   pm_avg_notes_per_instrument    5244 non-null   float64
 10  pm_overall_min_pitch           5244 non-null   float64
 11  pm_overall_max_pitch           5244 non-null   float64
 12  pm_overall_avg_velocity        5

Unnamed: 0,file_path,file_name,folder,mido_type,mido_num_tracks,mido_ticks_per_beat,pm_duration_sec,pm_num_instruments,pm_total_notes,pm_avg_notes_per_instrument,...,pm_overall_avg_velocity,pm_avg_note_duration_sec,pm_initial_tempo,pm_num_tempo_changes,pm_num_time_signature_changes,pm_num_key_signature_changes,pm_has_lyrics,pm_num_text_annotations,parse_error_mido,parse_error_pm
0,../data/nesmdb_midi/train/238_MikeTyson_sPunch...,238_MikeTyson_sPunch_Out___11_12WeHaveaNewCham...,train,1,5,22050,7.453605,3.0,67.0,22.333333,...,3.268657,0.262138,120.0,1.0,2.0,0.0,False,0.0,False,False
1,../data/nesmdb_midi/train/108_Famista_90_10_11...,108_Famista_90_10_11Unknown2.mid,train,1,5,22050,25.658798,4.0,482.0,120.5,...,5.585062,0.091669,120.0,1.0,2.0,0.0,False,0.0,False,False
2,../data/nesmdb_midi/train/134_GanbareGoemon2_0...,134_GanbareGoemon2_07_08Underpass.mid,train,1,5,22050,11.995147,3.0,183.0,61.0,...,4.344262,0.186256,120.0,1.0,2.0,0.0,False,0.0,False,False
3,../data/nesmdb_midi/train/178_Ironsword_Wizard...,178_Ironsword_Wizards_amp_WarriorsII_12_13Fire...,train,1,5,22050,24.426463,4.0,262.0,65.5,...,9.083969,0.202986,120.0,1.0,2.0,0.0,False,0.0,False,False
4,../data/nesmdb_midi/train/387_Wizardry_Proving...,387_Wizardry_ProvingGroundsofTheMadOverlord_16...,train,1,5,22050,51.131156,3.0,183.0,61.0,...,3.52459,0.812176,120.0,1.0,2.0,0.0,False,0.0,False,False



Number of files with Mido parsing errors: 0
Number of files with PrettyMIDI parsing errors: 30

Files with PrettyMIDI parsing errors:


Unnamed: 0,file_name,folder,parse_error_mido
174,122_FireEmblem_AnkokuRyutoHikarinoTsurugi_30_3...,train,False
298,215_Magician_15_16EpiloguePart1.mid,train,False
930,298_SolarJetman_HuntfortheGoldenWarpship_18_19...,train,False
973,122_FireEmblem_AnkokuRyutoHikarinoTsurugi_28_2...,train,False
1009,215_Magician_08_09MountVunarCavernsAbadonsCast...,train,False
1029,405_ZombieNation_03_04VergeofDangerRoundSelect...,train,False
1078,104_FamicomJumpII_Saikyono7_nin_19_20ThemeofFr...,train,False
1198,298_SolarJetman_HuntfortheGoldenWarpship_12_13...,train,False
1338,329_SwordMaster_04_05MapScreen.mid,train,False
1623,314_SummerCarnival_92_Recca_13_14GelgoogScoreA...,train,False


In [5]:
# Optional: Save the DataFrame to a CSV file for later use
output_csv_path = "../data/nesmdb_midi_stats.csv"
midi_stats_df.to_csv(output_csv_path, index=False)
print(f"\nDataFrame saved to {output_csv_path}")


DataFrame saved to ../data/nesmdb_midi_stats.csv
