In [1]:
pip install fastapi uvicorn librosa numpy fastdtw scipy

Collecting fastapi
  Downloading fastapi-0.112.1-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting fastdtw
  Downloading fastdtw-0.3.4.tar.gz (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting starlette<0.39.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.38.2-py3-none-any.whl.metadata (5.9 kB)
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading fastapi-0.112.1-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.30.6-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-p

In [3]:
import librosa
import numpy as np
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
import plotly.graph_objects as go

# Function to load audio and perform Harmonic-Percussive Source Separation (HPSS)
def load_and_separate(audio_path):
    """Load an audio file and split it into harmonic and percussive parts.

    The file is read with ``librosa.load`` using its default arguments, so
    whatever sample rate librosa chooses by default is the one returned.

    Args:
        audio_path: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        A ``(harmonic, percussive, sr)`` tuple: the two components produced by
        ``librosa.effects.hpss`` followed by the sample rate reported by
        ``librosa.load``.
    """
    samples, sample_rate = librosa.load(audio_path)
    separated = librosa.effects.hpss(samples)
    harmonic_part, percussive_part = separated
    return harmonic_part, percussive_part, sample_rate

# Function to extract pitch using YIN method
def extract_yin_pitch(audio, sr):
    """Estimate the per-frame fundamental frequency of ``audio`` with YIN.

    The search range is fixed to the notes C2..C7.

    Args:
        audio: Audio samples to analyse, passed straight to ``librosa.yin``.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        Whatever ``librosa.yin`` returns for the signal (one pitch estimate
        per analysis frame, in Hz).
    """
    # Forward the real sample rate: without it librosa.yin falls back to its
    # own default rate, which yields wrong frequencies for audio recorded or
    # loaded at any other rate.
    return librosa.yin(
        audio,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C7'),
        sr=sr,
    )

# Function to compare pitch accuracy
def compare_pitch(original_pitches, user_pitches):
    """Return the mean absolute pitch difference between two pitch tracks.

    The tracks are compared frame by frame with no time alignment; the longer
    one is truncated to the length of the shorter one.

    Args:
        original_pitches: Pitch values of the reference recording (any
            1-D sequence or array of numbers).
        user_pitches: Pitch values of the user's recording (same kind).

    Returns:
        The mean of ``|original - user|`` over the overlapping frames, or NaN
        when there are no overlapping frames to compare.
    """
    # Coerce to arrays so plain Python lists can be subtracted element-wise.
    reference = np.asarray(original_pitches)
    attempt = np.asarray(user_pitches)

    overlap = min(len(reference), len(attempt))
    if overlap == 0:
        # Nothing to compare: report NaN explicitly instead of letting
        # np.mean warn about an empty slice.
        return float("nan")

    return np.mean(np.abs(reference[:overlap] - attempt[:overlap]))

# Function to extract onset times and compare rhythm/timing
def compare_onsets(original, user, sr):
    """Compare the note-onset timing of two recordings.

    Onsets are detected independently in each signal and paired by position
    (first with first, second with second, ...); surplus onsets in the longer
    list are ignored for the timing figure.

    Args:
        original: Audio samples of the reference recording.
        user: Audio samples of the user's recording.
        sr: Sample rate shared by both signals.

    Returns:
        A ``(avg_timing_diff, original_onsets, user_onsets)`` tuple, where
        ``avg_timing_diff`` is the mean absolute difference between paired
        onset times (NaN when no onsets can be paired) and the two onset
        values are the untruncated results of ``librosa.onset.onset_detect``.
    """
    original_onsets = librosa.onset.onset_detect(y=original, sr=sr)
    user_onsets = librosa.onset.onset_detect(y=user, sr=sr)

    # Convert onset frames to time
    original_onset_times = librosa.frames_to_time(original_onsets, sr=sr)
    user_onset_times = librosa.frames_to_time(user_onsets, sr=sr)

    # Only as many onsets as the shorter recording has can be paired up.
    paired = min(len(original_onset_times), len(user_onset_times))
    if paired == 0:
        # No pairs at all: report NaN explicitly rather than averaging an
        # empty array (which would also emit a RuntimeWarning).
        return float("nan"), original_onsets, user_onsets

    onset_diff = np.abs(original_onset_times[:paired] - user_onset_times[:paired])
    avg_timing_diff = np.mean(onset_diff)

    # The previous inter-onset-interval "tempo consistency" value was computed
    # but never returned or used, so it has been removed.
    return avg_timing_diff, original_onsets, user_onsets

# Function to calculate articulation by comparing note attack times
def compare_articulation(original, user, sr):
    """Measure how differently two recordings attack their notes.

    An onset-strength envelope is computed for each signal and the two
    envelopes are aligned with librosa's dynamic time warping.

    Args:
        original: Audio samples of the reference recording.
        user: Audio samples of the user's recording.
        sr: Sample rate shared by both signals.

    Returns:
        The bottom-right entry of the accumulated-cost matrix returned by
        ``librosa.sequence.dtw``.
    """
    envelopes = {
        "X": librosa.onset.onset_strength(y=original, sr=sr),
        "Y": librosa.onset.onset_strength(y=user, sr=sr),
    }

    # dtw returns (accumulated cost matrix, warping path); only the cost
    # matrix is needed here.
    accumulated_cost, _warping_path = librosa.sequence.dtw(**envelopes)

    # The last cell holds the total cost of the full alignment.
    return accumulated_cost[-1, -1]

# Function to calculate dynamics by comparing loudness
def compare_dynamics(original, user):
    """Measure how differently two recordings are shaped in loudness.

    The RMS energy of each signal is computed with ``librosa.feature.rms`` and
    the two RMS sequences are aligned with ``fastdtw`` using Euclidean
    distance between frames.

    Args:
        original: Audio samples of the reference recording.
        user: Audio samples of the user's recording.

    Returns:
        The distance reported by ``fastdtw`` (the first element of its
        result); the warping path is discarded.
    """
    # Transpose each RMS result before alignment — presumably so that frames
    # run along the first axis, which is what fastdtw iterates over.
    rms_sequences = [librosa.feature.rms(y=signal).T for signal in (original, user)]
    alignment = fastdtw(rms_sequences[0], rms_sequences[1], dist=euclidean)
    return alignment[0]

# Function to generate all comparisons and plot results
# Score ladders: (upper bound, score) pairs checked in order; a metric strictly
# below the bound earns that score. Anything else earns _FALLBACK_SCORE.
_PITCH_SCORE_STEPS = ((10, 10), (20, 8), (50, 6))
_DISTANCE_SCORE_STEPS = ((100, 10), (200, 8), (300, 6))
_FALLBACK_SCORE = 4


def _score_from_steps(value, steps, fallback=_FALLBACK_SCORE):
    """Return the score of the first ``(limit, score)`` step with ``value < limit``."""
    for limit, score in steps:
        if value < limit:
            return score
    # Also reached for NaN, because NaN fails every ``<`` comparison.
    return fallback


def _show_onset_figure(original_times, user_times):
    """Display both recordings' onset times as two marker rows on one time axis."""
    fig = go.Figure()

    # Original recording onsets
    fig.add_trace(go.Scatter(
        x=original_times,
        y=[1] * len(original_times),
        mode='markers+lines',
        name='Original Recording Onsets',
        marker=dict(color='red', size=8, symbol='line-ns'),
        line=dict(dash='dash')
    ))

    # User recording onsets
    fig.add_trace(go.Scatter(
        x=user_times,
        y=[0.5] * len(user_times),
        mode='markers+lines',
        name='User Recording Onsets',
        marker=dict(color='blue', size=8, symbol='line-ns'),
        line=dict(dash='dash')
    ))

    fig.update_layout(
        title="Interactive Onset Detection Comparison",
        xaxis_title="Time (seconds)",
        yaxis=dict(showticklabels=False),
        showlegend=True,
        height=400,
        width=1000
    )

    fig.show()


def _show_pitch_figure(original_pitches, user_pitches):
    """Display both recordings' YIN pitch contours against the frame index."""
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        y=original_pitches,
        mode='lines',
        name='Original YIN Pitch',
        line=dict(color='blue')
    ))

    fig.add_trace(go.Scatter(
        y=user_pitches,
        mode='lines',
        name='User YIN Pitch',
        line=dict(color='orange', dash='dash')
    ))

    fig.update_layout(
        title="Interactive YIN Pitch Contours",
        xaxis_title="Frame",
        yaxis_title="Frequency (Hz)",
        showlegend=True,
        height=400,
        width=1000
    )

    fig.show()


def compare_piano_recordings(original_file, user_file):
    """Score a user's piano recording against a reference and plot the result.

    Both files are loaded and split into harmonic and percussive parts. Four
    metrics are computed (pitch, timing, articulation, dynamics), each is
    mapped to a score of 10/8/6/4 via fixed thresholds, the scores are
    printed, and two interactive figures (onsets and pitch contours) are shown.

    Args:
        original_file: Path of the reference recording.
        user_file: Path of the user's recording.

    Returns:
        None. Results are reported via ``print`` and ``fig.show()`` only.

    Notes:
        * The sample rate of the reference file is used for both recordings;
          the one reported for the user file is ignored.
        * A NaN metric matches no threshold and receives the lowest score.
        * NOTE(review): the 100/200/300 ladder is shared by timing,
          articulation and dynamics even though these metrics are on very
          different scales (the timing metric is a mean onset difference taken
          from times in seconds). The thresholds look uncalibrated — confirm.
    """
    # Load and separate harmonic and percussive components
    original_harmonic, original_percussive, sr = load_and_separate(original_file)
    user_harmonic, user_percussive, _ = load_and_separate(user_file)

    # Pitch: frame-by-frame difference of the YIN contours of the harmonic parts
    original_yin_pitches = extract_yin_pitch(original_harmonic, sr)
    user_yin_pitches = extract_yin_pitch(user_harmonic, sr)
    pitch_accuracy = compare_pitch(original_yin_pitches, user_yin_pitches)

    # Timing/rhythm: position-wise onset differences of the percussive parts
    rhythm_distance, original_onsets, user_onsets = compare_onsets(original_percussive, user_percussive, sr)

    # Articulation: distance between the harmonic parts' attack envelopes
    articulation_distance = compare_articulation(original_harmonic, user_harmonic, sr)

    # Dynamics: distance between the harmonic parts' loudness curves
    dynamics_distance = compare_dynamics(original_harmonic, user_harmonic)

    # Map every raw metric onto the 10/8/6/4 scale.
    pitch_score = _score_from_steps(pitch_accuracy, _PITCH_SCORE_STEPS)
    timing_score = _score_from_steps(rhythm_distance, _DISTANCE_SCORE_STEPS)
    articulation_score = _score_from_steps(articulation_distance, _DISTANCE_SCORE_STEPS)
    dynamics_score = _score_from_steps(dynamics_distance, _DISTANCE_SCORE_STEPS)

    # Tempo consistency has no metric of its own; it mirrors the timing score.
    tempo_score = timing_score

    # Overall consistency is the plain average of the four metric scores.
    consistency_score = np.mean([pitch_score, timing_score, articulation_score, dynamics_score])

    # Print the results
    print(f"Pitch Accuracy: {pitch_score}/10 (Average Pitch Difference: {pitch_accuracy:.2f} Hz)")
    print(f"Timing/Rhythm: {timing_score}/10 (Rhythm Distance: {rhythm_distance:.2f})")
    print(f"Articulation: {articulation_score}/10 (Articulation Distance: {articulation_distance:.2f})")
    print(f"Dynamics: {dynamics_score}/10 (Dynamics Distance: {dynamics_distance:.2f})")
    print(f"Tempo Consistency: {tempo_score}/10")
    print(f"Overall Consistency: {consistency_score:.2f}/10")

    # Interactive Onset Detection Comparison Plot (uses the full onset lists,
    # not the truncated pairs used for scoring)
    original_times = librosa.frames_to_time(original_onsets, sr=sr)
    user_times = librosa.frames_to_time(user_onsets, sr=sr)
    _show_onset_figure(original_times, user_times)

    # Interactive Pitch Contours Plot
    _show_pitch_figure(original_yin_pitches, user_yin_pitches)

# Example usage: the first argument is the reference recording and the second
# is the user's attempt (the parameter order of compare_piano_recordings).
# NOTE(review): both paths are hard-coded absolute paths under /content —
# presumably a Colab session; adjust them when running elsewhere.
compare_piano_recordings('/content/Original_32_notes.wav', '/content/5_notes_high_volume.wav')


Pitch Accuracy: 8/10 (Average Pitch Difference: 10.85 Hz)
Timing/Rhythm: 10/10 (Rhythm Distance: 0.29)
Articulation: 10/10 (Articulation Distance: 28.61)
Dynamics: 10/10 (Dynamics Distance: 0.08)
Tempo Consistency: 10/10
Overall Consistency: 9.50/10


In [5]:
# Repeats the comparison on the same two files as the example call in the
# previous code cell (reference recording first, user recording second).
compare_piano_recordings('/content/Original_32_notes.wav', '/content/5_notes_high_volume.wav')

Pitch Accuracy: 8/10 (Average Pitch Difference: 10.85 Hz)
Timing/Rhythm: 10/10 (Rhythm Distance: 0.29)
Articulation: 10/10 (Articulation Distance: 28.61)
Dynamics: 10/10 (Dynamics Distance: 0.08)
Tempo Consistency: 10/10
Overall Consistency: 9.50/10
