In [None]:
!pip install pretty_midi mir_eval numpy

Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mir_eval
  Downloading mir_eval-0.7.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.7/90.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.2-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting packaging~=23.1 (from mido>=1.1.16->pretty_midi)
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi, mi

In [None]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import subprocess

os.chdir("/content/drive/MyDrive/Dissertation Code/piano-vision")
!pip install -r requirements.txt

os.chdir("/content/drive/MyDrive/Dissertation Code/Skipping-The-Frame-Level")
!pip install -r requirements.txt

Collecting ncls (from -r requirements.txt (line 1))
  Downloading ncls-0.0.68-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from -r requirements.txt (line 7))
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting torch_optimizer (from -r requirements.txt (line 12))
  Downloading torch_optimizer-0.3.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sox (from -r requirements.txt (line 13))
  Downloading sox-1.5.0.tar.gz (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->-r requirements.txt (line 5))
  Downloading nvid

In [None]:
import os
import pretty_midi
import mir_eval
import numpy as np
import itertools
import time
import subprocess

def read_note_data_from_text(file_path):
    """Load reference notes from a whitespace-delimited text file.

    Every non-blank line must contain exactly three fields,
    ``start end pitch``: start and end are parsed as floats and pitch as an
    int (presumably seconds and MIDI note numbers -- confirm against the
    dataset). Blank or whitespace-only lines are skipped so a trailing empty
    line does not abort the load.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(intervals, pitches)`` where ``intervals`` is a float array
        of shape ``(n, 2)`` and ``pitches`` an int array of shape ``(n,)``.
        A file without notes yields shapes ``(0, 2)`` and ``(0,)``.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            a field cannot be converted to a number.
    """
    intervals = []
    pitches = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            start, end, pitch = fields
            intervals.append((float(start), float(end)))
            pitches.append(int(pitch))
    # reshape keeps the interval array two-dimensional even when it is empty;
    # mir_eval rejects interval arrays that are not of shape (n, 2).
    return (
        np.array(intervals, dtype=float).reshape(-1, 2),
        np.array(pitches, dtype=int),
    )

def prepare_data_for_evaluation(midi_file):
    """Extract note intervals and pitches from a MIDI file.

    Notes of every instrument in the file are concatenated in instrument
    order; no instrument is filtered out.

    Args:
        midi_file: Path (or file object) accepted by ``pretty_midi.PrettyMIDI``.

    Returns:
        A tuple ``(intervals, pitches)`` where ``intervals`` is a float array
        of shape ``(n, 2)`` holding each note's ``[start, end]`` and
        ``pitches`` is an int array of shape ``(n,)`` holding each note's
        ``pitch`` attribute (a MIDI note number). A file without notes yields
        shapes ``(0, 2)`` and ``(0,)``.
    """
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = [
        note
        for instrument in midi_data.instruments
        for note in instrument.notes
    ]
    # reshape keeps the array two-dimensional when the transcription is empty;
    # mir_eval raises on interval arrays that are not of shape (n, 2).
    intervals = np.array(
        [(note.start, note.end) for note in notes], dtype=float
    ).reshape(-1, 2)
    pitches = np.array([note.pitch for note in notes], dtype=int)
    return intervals, pitches

def evaluate_midi(predicted_midi_file, truth_midi_file):
    """Score a transcribed MIDI file against reference note annotations.

    Args:
        predicted_midi_file: Path of the predicted MIDI file; loaded with
            ``prepare_data_for_evaluation``.
        truth_midi_file: Path of the reference annotation. Despite the name
            this is the *text* file read by ``read_note_data_from_text``
            (``start end pitch`` per line), not a MIDI file.

    Returns:
        A dict with three ``(precision, recall, f1)`` tuples:

        * ``'full'``   -- onset + pitch matching (50 ms onset tolerance,
          50 cents pitch tolerance). ``offset_ratio=None`` makes mir_eval
          ignore offsets here, so ``offset_min_tolerance`` has no effect.
        * ``'onset'``  -- onset-only matching (50 ms tolerance).
        * ``'offset'`` -- offset-only matching (20 % of note duration or
          50 ms, whichever is larger).
    """
    predicted_intervals, predicted_pitches = prepare_data_for_evaluation(predicted_midi_file)
    truth_intervals, truth_pitches = read_note_data_from_text(truth_midi_file)

    # mir_eval compares pitches in Hz and measures the tolerance in cents.
    # Both loaders return MIDI note numbers (the truth file is assumed to hold
    # MIDI numbers too, since it is parsed as int); fed in unchanged, notes a
    # semitone apart would fall inside the 50-cent window and be counted as
    # correct, so convert to Hz first.
    predicted_hz = mir_eval.util.midi_to_hz(np.asarray(predicted_pitches, dtype=float))
    truth_hz = mir_eval.util.midi_to_hz(np.asarray(truth_pitches, dtype=float))

    full_precision, full_recall, full_f1_score, _ = mir_eval.transcription.precision_recall_f1_overlap(
        truth_intervals, truth_hz, predicted_intervals, predicted_hz,
        onset_tolerance=0.05, pitch_tolerance=50.0, offset_ratio=None, offset_min_tolerance=0.05, strict=False, beta=1.0)

    onset_precision, onset_recall, onset_f1_score = mir_eval.transcription.onset_precision_recall_f1(
        truth_intervals, predicted_intervals, onset_tolerance=0.05, strict=False, beta=1.0)

    offset_precision, offset_recall, offset_f1_score = mir_eval.transcription.offset_precision_recall_f1(
        truth_intervals, predicted_intervals, offset_ratio=0.2, offset_min_tolerance=0.05, strict=False, beta=1.0)

    return {
        'full': (full_precision, full_recall, full_f1_score),
        'onset': (onset_precision, onset_recall, onset_f1_score),
        'offset': (offset_precision, offset_recall, offset_f1_score)
    }

def run_transcription_and_evaluate(input_video_path, output_midi_path, segment_size, segment_hop_size, truth_path=None):
    """Transcribe one video with transkun and return its note-level F1 score.

    Runs ``python -m transkun.transcribe`` on ``input_video_path`` (on the
    ``cuda`` device), writing ``output_midi_path``, then scores that MIDI file
    with ``evaluate_midi``.

    Args:
        input_video_path: Input file handed to transkun.
        output_midi_path: Where transkun writes the transcription.
        segment_size: Value for ``--segmentSize``.
        segment_hop_size: Value for ``--segmentHopSize``.
        truth_path: Reference annotation file for ``evaluate_midi``. When
            omitted, the module-level variable ``truth_midi_path`` is used,
            which is what existing four-argument callers rely on.

    Returns:
        The F1 component of the ``'full'`` entry returned by ``evaluate_midi``.

    Raises:
        NameError: If ``truth_path`` is omitted and no global
            ``truth_midi_path`` exists.
        subprocess.CalledProcessError: If transkun exits with a non-zero status.
    """
    if truth_path is None:
        # Resolve the reference file before the (slow) transcription so a
        # missing global fails fast instead of after the subprocess finished.
        try:
            truth_path = truth_midi_path
        except NameError:
            raise NameError(
                "truth_path was not given and no global 'truth_midi_path' is defined"
            ) from None

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        "python", "-m", "transkun.transcribe",
        str(input_video_path), str(output_midi_path),
        "--segmentHopSize", str(segment_hop_size),
        "--segmentSize", str(segment_size),
        "--device", "cuda",
    ]
    subprocess.run(command, check=True)

    results = evaluate_midi(output_midi_path, truth_path)
    return results['full'][2]


In [None]:
# Grid search over transkun segment size / hop size: every .mp4 in
# input_directory is transcribed and scored for each combination, and the
# combination with the best mean F1 is reported.
results = {}  # (segment_size, segment_hop_size) -> list of per-file F1 scores

os.chdir("/content/drive/MyDrive/Dissertation Code/Skipping-The-Frame-Level")

input_directory = "../OMAPS/complete/mp4-tuning"
output_directory = "../OMAPS/evaluation/audio-visual-fusion-tuning-4"
truth_folder = "../OMAPS/complete/text-tuning"

# MIDI outputs and result files are all written here; make sure it exists.
os.makedirs(output_directory, exist_ok=True)

segment_sizes = [6, 8, 12, 16, 32, 64]
hop_size_ratios = [0.5, 0.75]

# os.listdir order is arbitrary; sort so every run processes files in the same order.
video_filenames = sorted(
    name for name in os.listdir(input_directory) if name.endswith(".mp4")
)

for segment_size, ratio in itertools.product(segment_sizes, hop_size_ratios):
    segment_hop_size = segment_size * ratio
    combination_result_path = os.path.join(output_directory, f"results_seg{segment_size}_hop{segment_hop_size}.txt")

    for filename in video_filenames:
        base_name = os.path.splitext(filename)[0]
        modified_output_filename = f"{base_name}_seg{segment_size}_hop{segment_hop_size}.mid"
        input_video_path = os.path.join(input_directory, filename)
        output_midi_path = os.path.join(output_directory, modified_output_filename)
        # NOTE: run_transcription_and_evaluate reads this module-level name
        # to locate the reference annotation, so it must be set before the call.
        truth_midi_path = os.path.join(truth_folder, base_name + ".txt")
        print(f"Processing with segment size {segment_size} and hop size {segment_hop_size}")

        start_time = time.time()
        try:
            f1_score = run_transcription_and_evaluate(input_video_path, output_midi_path, segment_size, segment_hop_size)
        except (subprocess.CalledProcessError, OSError, ValueError) as e:
            # A failed transcription or an unreadable/invalid annotation
            # should skip this file, not abort the whole grid search.
            print(f"An error occurred: {e}")
            continue
        processing_time = time.time() - start_time

        results.setdefault((segment_size, segment_hop_size), []).append(f1_score)

        # Append mode: one line per file accumulates in the combination's log.
        with open(combination_result_path, 'a') as combo_file:
            combo_file.write(f"{filename}: F1 Score = {f1_score}, Processing Time = {processing_time} seconds\n")

if not results:
    raise RuntimeError(
        f"No file was evaluated successfully; check {input_directory!r} and the errors printed above."
    )

average_f1_scores = {k: sum(v) / len(v) for k, v in results.items()}
best_parameters = max(average_f1_scores, key=average_f1_scores.get)
best_f1_score = average_f1_scores[best_parameters]

# Keep the summary next to this run's other outputs instead of a previous
# run's directory.
output_file_path = os.path.join(output_directory, "evaluation_results.txt")

with open(output_file_path, 'w') as file:
    file.write("Best Parameters:\n")
    file.write(f"Segment Size: {best_parameters[0]}, Hop Size: {best_parameters[1]}\n")
    file.write(f"Best F1 Score: {best_f1_score}\n")

print(f"Best Parameters: Segment Size = {best_parameters[0]}, Hop Size = {best_parameters[1]}")
print(f"Best Average F1 Score: {best_f1_score}")

Processing with segment size 6 and hop size 3.0...
Processing with segment size 6 and hop size 3.0...
Processing with segment size 6 and hop size 3.0...
Processing with segment size 6 and hop size 3.0...
Processing with segment size 6 and hop size 3.0...
Processing with segment size 6 and hop size 4.5...
Processing with segment size 6 and hop size 4.5...
Processing with segment size 6 and hop size 4.5...
Processing with segment size 6 and hop size 4.5...
Processing with segment size 6 and hop size 4.5...
Processing with segment size 8 and hop size 4.0...
Processing with segment size 8 and hop size 4.0...
Processing with segment size 8 and hop size 4.0...
Processing with segment size 8 and hop size 4.0...
Processing with segment size 8 and hop size 4.0...
Processing with segment size 8 and hop size 6.0...
Processing with segment size 8 and hop size 6.0...
Processing with segment size 8 and hop size 6.0...
Processing with segment size 8 and hop size 6.0...
Processing with segment size 8 

In [None]:
# One-off manual run: transcribe 001.mp4 with segment size 3.5 and hop size 3 on
# the GPU. Paths are relative to the current working directory, and the MIDI
# is written to the audio-visual-fusion-tuning-2 folder.
!python -m transkun.transcribe ../OMAPS/complete/mp4-tuning/001.mp4 ../OMAPS/evaluation/audio-visual-fusion-tuning-2/001.mid --segmentHopSize 3 --segmentSize 3.5 --device cuda

error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evalu