Methodology
The proposed project will be implemented using the following steps:

Data Collection: The dataset has already been collected and is provided with the project.
Data Pre-processing: Convert the musical scores into a format suitable for deep learning models. This involves converting the musical scores into MIDI files and applying data augmentation techniques.
Feature Extraction: Extract features from the MIDI files, such as notes, chords, and tempo, using music analysis tools.
Model Building: Develop a deep learning model using LSTM and CNN architectures to classify the musical scores according to the composer.
Model Training: Train the deep learning model using the pre-processed and feature-extracted data.
Model Evaluation: Evaluate the performance of the deep learning model using accuracy, precision, and recall metrics.
Model Optimization: Optimize the deep learning model by fine-tuning hyperparameters.

In [24]:

import os
from zipfile import ZipFile
import numpy as np
import pretty_midi
import random

# Data Collection
zip_path = "archive.zip"     # Kaggle archive that holds the MIDI collection
extract_to = "midi_subset"   # destination folder for the extracted .mid files
COMPOSERS = ["Bach", "Beethoven", "Chopin", "Mozart"]

# Pre-processing
MAX_LEN = 1000  # number of time frames kept per piano-roll
FS = 100        # sampling rate used when building the piano-roll


os.makedirs(extract_to, exist_ok=True)
with ZipFile(zip_path, 'r') as zf:
    all_files = zf.namelist()
    print("ZIP sample entries:", all_files[:10])
    # Lower-cased composer names so that folder matching ignores case
    composers_lower = [c.lower() for c in COMPOSERS]
    # Keep only .mid entries with at least one path segment equal to a target
    # composer (back-slashes are treated as path separators too)
    selected = [
        entry
        for entry in all_files
        if entry.lower().endswith('.mid')
        and any(
            part.lower() in composers_lower
            for part in entry.replace('\\', '/').split('/')
        )
    ]
    # Extract the chosen members, preserving directory structure
    for member in selected:
        zf.extract(member, extract_to)
print(f"Extracted {len(selected)} .mid files into '{extract_to}'")


ZIP sample entries: ['Albe╠üniz/Aragon (Fantasia) Op.47 part 6.mid', 'Albe╠üniz/Castilla (Seguidillas) Op.47 part 7.mid', 'Albe╠üniz/Cataluna (Curranda), No.2 from Suite Espanola.mid', 'Albe╠üniz/Catalun╠âa (Curranda), No.2 from Suite Espanola.mid', 'Albe╠üniz/Espana Op. 165.mid', 'Albe╠üniz/Espan╠âa Op. 165 No.1.mid', 'Albe╠üniz/Espan╠âa Op. 165 No.2.mid', 'Albe╠üniz/Espan╠âa Op. 165 No.3.mid', 'Albe╠üniz/Espan╠âa Op. 165 No.4.mid', 'Albe╠üniz/Espan╠âa Op. 165 No.5.mid']
Extracted 1630 .mid files into 'midi_subset'


In [None]:

# Data Pre-processing 

def midi_to_pianoroll(path, fs=FS, max_length=MAX_LEN):
    """
    Turn one .mid file into a fixed-width, peak-normalized piano-roll.

    Parameters
    ----------
    path : location of the MIDI file, handed to ``pretty_midi.PrettyMIDI``.
    fs : sampling rate forwarded to ``get_piano_roll``.
    max_length : exact number of time frames (columns) in the result.

    Returns
    -------
    A float32 array of shape (128, max_length): rolls shorter than
    ``max_length`` are zero-padded on the right, longer ones are cut off.
    Values are divided by the roll's own maximum (after padding/cutting),
    so a non-silent roll peaks at 1; an all-zero roll is returned as zeros.
    ``None`` is returned, after printing a warning, if the file cannot be parsed.
    """
    try:
        midi = pretty_midi.PrettyMIDI(path)
    except Exception as e:
        print(f"Warning: could not parse '{path}': {e}")
        return None

    roll = midi.get_piano_roll(fs)
    n_frames = roll.shape[1]
    if n_frames >= max_length:
        # long enough: keep only the leading max_length frames
        roll = roll[:, :max_length]
    else:
        # too short: append zero frames on the time axis
        missing = max_length - n_frames
        roll = np.pad(roll, ((0, 0), (0, missing)), mode='constant')

    # scale by the peak value; fall back to 1 so silent rolls are not divided by 0
    peak = np.max(roll)
    scale = peak if peak > 0 else 1
    normalized = roll / scale
    return normalized.astype(np.float32)

# Output folder for the converted piano-roll .npy files
pianoroll_dir = "pianorolls"
os.makedirs(pianoroll_dir, exist_ok=True)

# Convert every extracted .mid file, mirroring its location under pianoroll_dir
count = 0
for folder, _subdirs, names in os.walk(extract_to):
    midi_names = [n for n in names if n.lower().endswith('.mid')]
    for midi_name in midi_names:
        mid_path = os.path.join(folder, midi_name)
        pr = midi_to_pianoroll(mid_path)
        if pr is not None:   # unparseable files are skipped
            # same relative path as the source file, with the extension swapped
            rel_stem, _ext = os.path.splitext(os.path.relpath(mid_path, extract_to))
            save_path = os.path.join(pianoroll_dir, rel_stem + '.npy')
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            np.save(save_path, pr)
            count += 1
print(f"Converted and saved {count} piano-roll arrays under '{pianoroll_dir}'")




In [21]:
import os
import pretty_midi
from collections import defaultdict

# Simplified statistics: file count and average duration per composer

def safe_load_midi(path):
    """Attempt to load a MIDI file, returning a PrettyMIDI object or None.

    Args:
        path: Location of the MIDI file, passed to ``pretty_midi.PrettyMIDI``.

    Returns:
        The loaded ``pretty_midi.PrettyMIDI`` object, or ``None`` if loading
        raised an exception.

    A failed load is reported with a printed warning instead of being
    swallowed silently, so files left out of the statistics are visible in
    the cell output.
    """
    try:
        return pretty_midi.PrettyMIDI(path)
    except Exception as e:
        # Best-effort loader: report the failure and let the caller skip the file.
        print(f"Warning: could not parse '{path}': {e}")
        return None

# Root folder of the extracted .mid files
base_dir = "midi_subset"

# Composers to report on
composers = ["Bach", "Beethoven", "Chopin", "Mozart"]

# Maps a composer name to the end times of that composer's parsed files
durations = defaultdict(list)

for folder, _, names in os.walk(base_dir):
    # The composer depends only on the folder path, so resolve it once per folder:
    # first target composer whose name equals a path segment (case-insensitive)
    folder_parts = {part.lower() for part in folder.replace("\\", "/").split("/")}
    composer = "Unknown"
    for candidate in composers:
        if candidate.lower() in folder_parts:
            composer = candidate
            break

    for name in names:
        if name.lower().endswith('.mid'):
            midi = safe_load_midi(os.path.join(folder, name))
            if midi is not None:   # files that fail to load are left out
                durations[composer].append(midi.get_end_time())

# Print summary
print("Composer Statistics:")
for composer in composers:
    durs = durations.get(composer, [])
    if durs:
        avg = sum(durs) / len(durs)
        mn = min(durs)
        mx = max(durs)
        print(f"  {composer}: {len(durs)} files, duration (s) avg={avg:.1f}, min={mn:.1f}, max={mx:.1f}")
    else:
        print(f"  {composer}: 0 files")

Composer Statistics:
  Bach: 1024 files, duration (s) avg=156.2, min=17.5, max=5209.3
  Beethoven: 212 files, duration (s) avg=508.4, min=21.9, max=5032.3
  Chopin: 136 files, duration (s) avg=220.8, min=23.3, max=1352.7
  Mozart: 256 files, duration (s) avg=400.8, min=26.1, max=1478.1
