In [None]:
%pip install mido
%pip install numpy
%pip install pretty_midi
%pip install pandas
%pip install scikit-learn
%pip install faiss-cpu  # for similarity search

In [4]:
import os
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import numpy as np
import pretty_midi
import faiss
from sklearn.preprocessing import StandardScaler


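NOTE: Relative paths in a notebook (.ipynb) are resolved against the kernel's working directory, which may differ from the notebook's location, so use full (absolute) paths for the dataset and query files.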

---

In [5]:
def _syncopation_ratio(onset_times, tempo, beats_per_measure, tolerance=0.1):
    """
    Return the fraction of onsets lying more than `tolerance` beats from a beat.

    onset_times is an array of onset times in seconds, tempo is in BPM and
    beats_per_measure is the time-signature numerator. The beat grid is assumed
    to start at time 0 with a constant tempo.
    """
    beat_duration = 60.0 / tempo
    measure_duration = beats_per_measure * beat_duration
    beat_positions = (onset_times % measure_duration) / beat_duration
    fractional = beat_positions % 1.0
    # Measure the distance to the NEAREST beat: an onset at fraction 0.97 is
    # 0.03 beats early, not 0.97 beats late, so it must count as on-beat.
    distance_to_beat = np.minimum(fractional, 1.0 - fractional)
    return float(np.mean(distance_to_beat > tolerance))


def extract_features(file_path):
    """
    Extract a flat dictionary of summary features from a MIDI file.

    Only non-drum notes contribute to the pitch, duration and rhythm
    statistics. The tempo is the global estimate from pretty_midi.

    Args:
        file_path: Path of the MIDI file to analyse.

    Returns:
        A dict of floats with the keys 'tempo', 'pitch_mean', 'pitch_std',
        'duration_mean', 'duration_std', 'ioi_mean', 'ioi_std' and
        'syncopation_ratio', or None when the file cannot be loaded, contains
        no non-drum notes, or has too few notes for a tempo estimate.
        'ioi_mean'/'ioi_std' are 0.0 when there is a single onset, and
        'syncopation_ratio' is 0.0 when it cannot be computed (single onset or
        no time signature).
    """
    try:
        midi_data = pretty_midi.PrettyMIDI(file_path)
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

    # Collect pitch, duration and onset time of every non-drum note.
    pitches, durations, onsets = [], [], []
    for instrument in midi_data.instruments:
        if instrument.is_drum:
            continue
        for note in instrument.notes:
            pitches.append(note.pitch)
            durations.append(note.end - note.start)
            onsets.append(note.start)

    if not pitches:
        return None

    # The tempo estimate raises ValueError when the file has too few notes.
    # Treat that like any other unusable file instead of letting one short
    # file abort a whole dataset scan.
    try:
        tempo = float(midi_data.estimate_tempo())
    except ValueError as e:
        print(f"Error estimating tempo for {file_path}: {str(e)}")
        return None

    # Only the first time signature is used; its numerator is the number of
    # beats per measure.
    first_time_signature = next(iter(midi_data.time_signature_changes), None)
    beats_per_measure = (
        int(first_time_signature.numerator)
        if first_time_signature is not None else 0
    )

    # Rhythm statistics are based on inter-onset intervals (IOI) of the
    # time-sorted onsets, so at least two onsets are required.
    onset_times = np.sort(np.asarray(onsets, dtype=float))
    ioi_values = np.diff(onset_times)

    ioi_mean = 0.0
    ioi_std = 0.0
    syncopation_ratio = 0.0
    if ioi_values.size > 0:
        ioi_mean = float(np.mean(ioi_values))
        ioi_std = float(np.std(ioi_values))
        # A beat grid needs a time signature and a positive tempo.
        if beats_per_measure > 0 and tempo > 0:
            syncopation_ratio = _syncopation_ratio(
                onset_times, tempo, beats_per_measure
            )

    return {
        'tempo': tempo,
        'pitch_mean': float(np.mean(pitches)),
        'pitch_std': float(np.std(pitches)),
        'duration_mean': float(np.mean(durations)),
        'duration_std': float(np.std(durations)),
        'ioi_mean': ioi_mean,
        'ioi_std': ioi_std,
        'syncopation_ratio': syncopation_ratio
    }

# Musical Feature Explanations

## Basic Features
- **tempo**: Speed of the music in beats per minute (BPM)
  - Example: 188.05 BPM = Very fast, energetic tempo
  - Range: Usually 60-200 BPM

## Pitch Features
- **pitch_mean**: Average MIDI note number (0-127)
  - Example: 50.37 ≈ D3 note
  - Range: 0 (lowest) to 127 (highest)
  - Reference: Middle C (C4) = 60

- **pitch_std**: How spread out the notes are from the mean
  - Example: 7.31 = Notes span about 1 octave
  - Higher value = More melodic variation

## Timing Features
- **duration_mean**: Average note length in seconds
  - Example: 0.22s = Mostly short notes
  - Lower values = Faster, staccato notes
  - Higher values = Longer, sustained notes

- **duration_std**: Variation in note lengths
  - Example: 0.07s = Fairly consistent note lengths
  - Higher value = More rhythmic variety

## Rhythm Features
- **ioi_mean**: Average time between note onsets (Inter-Onset Interval)
  - Example: 1.84s = Relatively sparse notes
  - Lower values = Notes close together
  - Higher values = More space between notes

- **ioi_std**: Variation in timing between notes
  - Example: 14.67s = Very irregular rhythm
  - Higher value = More rhythmic complexity

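- **syncopation_ratio**: Proportion of notes that fall between beats
  - Example: 0.96 = Almost all notes are syncopated
  - Range: 0 (all notes on the beat) to 1 (all notes syncopated)
  - Higher value = More rhythmic tension
  - Also reported as 0 when the MIDI file has no time signature, so 0 can mean "not computed"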

---

### Similarity Search Index
FAISS is a library for efficient similarity search in high-dimensional spaces.

FAISS -- Facebook AI Similarity Search (https://ai.meta.com/tools/faiss/)

In [6]:
def build_faiss_index(feature_list):
    """
    Standardise the feature vectors and index them for L2 nearest-neighbour search.

    Args:
        feature_list: Sequence of equal-length numeric feature vectors, one
            per file (shape: n_files x n_features).

    Returns:
        (faiss_index, scaler, scaled_features): the populated FAISS
        IndexFlatL2, the fitted StandardScaler (needed to scale queries the
        same way) and the scaled float32 feature matrix that was indexed.

    Raises:
        ValueError: If feature_list is empty or is not a 2-D table of features.
    """
    feature_matrix = np.asarray(feature_list, dtype='float32')
    if feature_matrix.ndim != 2 or 0 in feature_matrix.shape:
        raise ValueError(
            "feature_list must be a non-empty 2-D collection of feature "
            f"vectors, got array of shape {feature_matrix.shape}."
        )

    # Fit the scaler on the dataset and scale it in one step.
    scaler = StandardScaler()
    # FAISS only accepts C-contiguous float32 input; enforce it here rather
    # than relying on the scaler to preserve the input dtype and layout.
    scaled_features = np.ascontiguousarray(
        scaler.fit_transform(feature_matrix), dtype='float32'
    )

    # Exact (brute-force) L2 index over the scaled vectors.
    dimension = scaled_features.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(scaled_features)

    return index, scaler, scaled_features


In [7]:
def find_similar_midi(query_file_path, file_names, faiss_index, scaler, k=5):
    """
    Return the files in the index that are most similar to a query MIDI file.

    Args:
        query_file_path: Path of the MIDI file to search for.
        file_names: File paths in the same order as the vectors in faiss_index.
        faiss_index: Index to search; must provide search(vectors, k).
        scaler: Fitted scaler used when the index was built; its transform()
            is applied to the query vector.
        k: Maximum number of neighbours to return.

    Returns:
        (results_files, results_distances): the matching file names and the
        distances reported by the index, closest first. At most k results are
        returned, and fewer when the index holds fewer than k vectors. Returns
        ([], []) when features cannot be extracted from the query file or
        when there is nothing to search for.
    """
    query_feats = extract_features(query_file_path)
    if query_feats is None:
        print("Failed to extract features from query file.")
        return [], []

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with index -1, which would silently select file_names[-1].
    indexed_count = int(getattr(faiss_index, 'ntotal', len(file_names)))
    k = min(int(k), indexed_count)
    if k <= 0:
        return [], []

    # Must match the feature order used when the index was built.
    feature_order = (
        'tempo',
        'pitch_mean',
        'pitch_std',
        'duration_mean',
        'duration_std',
        'ioi_mean',
        'ioi_std',
        'syncopation_ratio',
    )
    query_vector = np.array(
        [query_feats[name] for name in feature_order], dtype='float32'
    ).reshape(1, -1)

    # Scale the query with the same scaler as the indexed data.
    query_scaled = np.ascontiguousarray(
        scaler.transform(query_vector), dtype='float32'
    )

    distances, indices = faiss_index.search(query_scaled, k)
    neighbour_ids = np.asarray(indices)[0]
    neighbour_distances = np.asarray(distances)[0]

    # Defensive filter in case the index still reports "no result" slots (-1).
    found = neighbour_ids >= 0
    results_files = [file_names[i] for i in neighbour_ids[found]]
    results_distances = neighbour_distances[found]

    return results_files, results_distances


In [8]:
def process_dataset(dataset_folder):
    """
    Extract feature vectors from every MIDI file under a folder (recursively).

    Files are visited in sorted order so the returned lists, and any index
    built from them, are reproducible across runs and filesystems. Files for
    which extract_features returns None are skipped.

    Args:
        dataset_folder: Root folder to scan for '.mid' / '.midi' files
            (extension match is case-insensitive).

    Returns:
        (feature_list, file_names): parallel lists holding one 8-value
        feature vector and the corresponding file path per usable MIDI file.

    Raises:
        ValueError: If dataset_folder is not an existing directory, or if no
            usable MIDI file is found in it.
    """
    # os.walk yields nothing for a missing path, which would otherwise be
    # reported as "no valid MIDI files" and hide the real problem.
    if not os.path.isdir(dataset_folder):
        raise ValueError(
            f"Dataset folder does not exist or is not a directory: {dataset_folder}"
        )

    # Order of the values in each feature vector; queries must use the same.
    feature_order = (
        'tempo',
        'pitch_mean',
        'pitch_std',
        'duration_mean',
        'duration_std',
        'ioi_mean',
        'ioi_std',
        'syncopation_ratio',
    )

    feature_list = []
    file_names = []

    for root, dirs, files in os.walk(dataset_folder):
        # Sorting in place makes os.walk descend into sub-folders in a
        # deterministic order.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(('.mid', '.midi')):
                continue
            midi_path = os.path.join(root, file)
            feats = extract_features(midi_path)
            if feats is None:
                continue
            feature_list.append([feats[name] for name in feature_order])
            file_names.append(midi_path)

    if not feature_list:
        raise ValueError("No valid MIDI files found in the dataset folder.")

    return feature_list, file_names


In [None]:
if __name__ == "__main__":
    # End-to-end demo: extract features for a folder of MIDI files, index
    # them, then print the nearest neighbours of one query file.
    # Placeholder path: replace with the folder containing the dataset.
    dataset_path = "/path/to/your/midi/dataset"
    
    # Process dataset -> (feature_list, file_names); raises ValueError when
    # no usable MIDI file is found.
    feature_list, file_names = process_dataset(dataset_path)
    
    # Build FAISS index; the scaled feature matrix (third return value) is
    # not needed here, but the scaler is reused to scale the query.
    faiss_index, scaler, _ = build_faiss_index(feature_list)
    
    # Query (placeholder path: replace with the MIDI file to look up)
    query_midi_path = "/path/to/query/file.mid"
    k_neighbors = 5
    
    # Returns two empty lists when features cannot be extracted from the
    # query file, in which case the loop below prints nothing.
    similar_files, dists = find_similar_midi(query_midi_path, file_names, faiss_index, scaler, k=k_neighbors)
    
    print(f"\nQuery: {query_midi_path}")
    print(f"Top {k_neighbors} similar files:")
    for fpath, dist in zip(similar_files, dists):
        # Map the distance to a similarity score: 1.0 at distance 0, falling
        # towards 0 as the distance grows (for non-negative distances).
        # NOTE(review): IndexFlatL2 presumably reports squared L2 distances;
        # confirm against the FAISS documentation before interpreting values.
        sim = 1.0 / (1.0 + dist)
        print(f"\tFile: {fpath} | Distance: {dist:.4f} | Similarity: {sim:.2%}")
