# CSV files for annotation_comparison.ipynb

This notebook contains the code that builds every CSV file needed by the notebook `annotation_comparison`.

In [1]:
import ast
import os
import sys
from fractions import Fraction
from glob import glob
from math import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('../python_scripts')

from constants import TRIAD_REDUCTION
from data_types import ChordType,PitchType
from metric import get_distance
from progression import get_progression
from utils import get_chord_pitches

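Creation of the folders `Beethoven_sonata_dcml` and `Beethoven_sonata_fh`. Each folder receives one CSV file per Beethoven sonata (first movement) containing the chord progression of one of the two annotations: the dcml annotation and the fh annotation, respectively. The fh files additionally record, for every fh chord, the dcml chords it overlaps, the overlap durations and the chord distances.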

In [2]:
def overlap(
    i1:list,
    i2:list
) -> bool:
    """Tell whether two time intervals share a span of non-zero length.

    Parameters
    ----------
    i1, i2 : list
        Intervals given as ``[start, end]`` pairs (any object indexable
        with ``0`` and ``1`` works).

    Returns
    -------
    bool
        True when the end of one interval lies in the half-open range
        ``(start, end]`` of the other one, False otherwise. Intervals that
        only touch at an endpoint are therefore not considered overlapping.
    """
    # Whichever interval ends first overlaps the other exactly when its end
    # lies strictly after the other interval's start.
    return bool(i2[0] < i1[1] <= i2[1] or i1[0] < i2[1] <= i1[1])

In [3]:
def duration_overlap(
    i1:list,
    i2:list
):
    """Return the length of the span shared by two time intervals.

    Parameters
    ----------
    i1, i2 : list
        Intervals given as ``[start, end]`` pairs (any object indexable
        with ``0`` and ``1`` works).

    Returns
    -------
    number
        ``min(end1, end2) - max(start1, start2)`` when the end of one
        interval lies in the half-open range ``(start, end]`` of the other
        one, and ``0`` otherwise (disjoint or merely touching intervals).
    """
    i1_ends_inside_i2 = i2[0] < i1[1] <= i2[1]
    i2_ends_inside_i1 = i1[0] < i2[1] <= i1[1]
    if not (i1_ends_inside_i2 or i2_ends_inside_i1):
        return 0

    # The shared span runs from the later of the two starts to the earlier
    # of the two ends.
    return min(i1[1], i2[1]) - max(i1[0], i2[0])

In [None]:
# Align the dcml and the fh annotation of every piece in time and store, for
# each fh chord, the dcml chords it overlaps, the overlap durations and three
# chord distances (SPS, voice leading, tone by tone).

def _load_annotation(path):
    """Read one annotation .tsv file and add the [start, end] interval of each chord."""
    annotation = pd.read_csv(path, sep='\t', converters={'duration': Fraction})
    # The chord type is looked up in ChordType using the text that follows
    # the first "." of the stored value.
    annotation['chord_type'] = annotation['chord_type'].apply(lambda r : ChordType[r.split(".")[1]])

    # The cumulative durations are the chord end times; every chord starts
    # where the previous one ends, the first one at 0.
    end_times = annotation.duration.cumsum().astype(float, copy=False)
    annotation['interval'] = [[i, f] for i, f in zip([0]+list(end_times[:-1]), end_times)]
    return annotation


def _chord_distances(dcml_df, idx, r_fh):
    """Return the (SPS, voice leading, tone by tone) distances between dcml chord `idx` and the fh chord `r_fh`."""
    chords = dict(root1=dcml_df.chord_root_midi[idx],
                  root2=r_fh.chord_root_midi,
                  chord_type1=dcml_df.chord_type[idx],
                  chord_type2=r_fh.chord_type,
                  inversion1=dcml_df.chord_inversion[idx],
                  inversion2=r_fh.chord_inversion)
    sps_dist = get_distance(distance = 'SPS', **chords)
    vl_dist = get_distance(distance = 'voice leading', bass_weight = 3, **chords)
    tbt_dist = get_distance(distance = 'tone by tone', bass_weight = 3, root_weight = 3, **chords)
    return sps_dist, vl_dist, tbt_dist


for dcml_df_path in glob("../Data/Beethoven-labels/dcml*.tsv", recursive=True):

    dcml_df = _load_annotation(dcml_df_path)

    # File names look like dcml-<num>-<movement>.tsv; the fh annotation of
    # the same piece is stored next to it as fh-<num>-<movement>.tsv.
    file_parts = os.path.basename(dcml_df_path).split('-')
    num = file_parts[1]
    movement = file_parts[2]
    fh_df_path = '../Data/Beethoven-labels/fh-'+num+'-'+movement

    fh_df = _load_annotation(fh_df_path)

    ##

    matched_idx_list = []
    matched_duration_list = []
    chords_sps_dist_list = []
    chords_vl_dist_list = []
    chords_tbt_dist_list = []

    # idx_dcml is the first dcml chord that has not been matched yet; it only
    # moves forward because both annotations are walked in time order.
    idx_dcml = 0
    for idx_fh, r_fh in fh_df.iterrows():

        matched_idx = []

        # The last dcml chord matched to the previous fh chord may extend
        # into the current fh chord as well.
        if idx_dcml > 0 and overlap(dcml_df.interval[idx_dcml-1], r_fh.interval):
            matched_idx.append(idx_dcml-1)

        # Consume every following dcml chord that overlaps the current fh chord.
        while idx_dcml < len(dcml_df) and overlap(dcml_df.interval[idx_dcml], r_fh.interval):
            matched_idx.append(idx_dcml)
            idx_dcml += 1

        distances = [_chord_distances(dcml_df, idx, r_fh) for idx in matched_idx]

        matched_idx_list.append(matched_idx)
        matched_duration_list.append([duration_overlap(dcml_df.interval[idx], r_fh.interval)
                                      for idx in matched_idx])
        chords_sps_dist_list.append([sps_dist for sps_dist, _, _ in distances])
        chords_vl_dist_list.append([vl_dist for _, vl_dist, _ in distances])
        chords_tbt_dist_list.append([tbt_dist for _, _, tbt_dist in distances])

    fh_df['matched_idx'] = matched_idx_list
    fh_df['matched_duration'] = matched_duration_list
    fh_df['chords_sps_dist'] = chords_sps_dist_list
    fh_df['chords_vl_dist'] = chords_vl_dist_list
    fh_df['chords_tbt_dist'] = chords_tbt_dist_list

    ##

    # NOTE(review): the output name always ends in "-1" whatever `movement`
    # is; this assumes only first movements are present - confirm.
    target_path_dcml = '../Data/Beethoven_sonata_dcml/'+num+'-1.csv'
    target_path_fh = '../Data/Beethoven_sonata_fh/'+num+'-1.csv'

    # Create the output folders on first use so that to_csv does not fail on
    # a fresh checkout.
    for target_path in (target_path_dcml, target_path_fh):
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

    dcml_df.to_csv(target_path_dcml, index=False)
    fh_df.to_csv(target_path_fh, index=False)


___

Creation of the CSV file `sonata_comparisons.csv`, which contains, for each sonata, the average distances between the two annotations.

In [1]:
# Summarise every matched fh annotation file into duration-weighted average
# distances to the dcml annotation (one row per file) and save the table.

# Columns holding Python lists, stored in the csv as their text representation.
list_columns = ['interval', 'matched_idx', 'matched_duration', 'chords_sps_dist', 'chords_vl_dist', 'chords_tbt_dist']
summary_columns = ['number', 'average_sps', 'average_vl', 'average_tbt', 'average_bin', 'idx_not_matched']

comparison_rows = []

for df_path in glob("../Data/Beethoven_sonata_fh/*.csv", recursive=True):

    results_df = pd.read_csv(df_path, converters={'duration': Fraction})
    results_df['chord_type'] = results_df['chord_type'].apply(lambda r : ChordType[r.split(".")[1]])
    for column in list_columns:
        results_df[column] = results_df[column].apply(ast.literal_eval)

    # File names look like <num>-1.csv. Working on the base name keeps this
    # independent of the path separator that glob returns.
    num = os.path.basename(df_path).split('-')[0]

    weighted_dist_sps = []
    weighted_dist_vl  = []
    weighted_dist_tbt = []
    weighted_dist_bin = []

    duration_length = 0
    not_matched = []
    for idx, r in results_df.iterrows():
        if len(r.matched_duration)>0:
            # Each distance is weighted by the duration shared with the
            # corresponding dcml chord.
            weighted_dist_sps.append(float(np.dot(r.chords_sps_dist, r.matched_duration)))
            weighted_dist_vl.append(float(np.dot(r.chords_vl_dist, r.matched_duration)))
            weighted_dist_tbt.append(float(np.dot(r.chords_tbt_dist, r.matched_duration)))

            # Binary distance: 0 where the tone by tone distance is 0, 1 otherwise.
            dist_bin = [0 if tbt == 0 else 1 for tbt in r.chords_tbt_dist]
            weighted_dist_bin.append(float(np.dot(dist_bin, r.matched_duration)))
        else :
            not_matched.append(idx)

        duration_length += float(sum(r.matched_duration))

    if duration_length > 0:
        average_sps = sum(weighted_dist_sps)/duration_length
        average_vl  = sum(weighted_dist_vl)/duration_length
        average_tbt = sum(weighted_dist_tbt)/duration_length
        average_bin = sum(weighted_dist_bin)/duration_length
    else :
        # No chord was matched at all: the averages are undefined, so store
        # NaN instead of stopping on a division by zero.
        average_sps = average_vl = average_tbt = average_bin = np.nan

    comparison_rows.append({'number':num,
                            'average_sps':average_sps,
                            'average_vl' :average_vl,
                            'average_tbt':average_tbt,
                            'average_bin':average_bin,
                            'idx_not_matched':not_matched})

# Explicit columns keep sort_values working even when no file was found.
df_comparison = pd.DataFrame(comparison_rows, columns=summary_columns).sort_values('number')
df_comparison.to_csv('../Data/sonata_comparisons.csv', index=False)

NameError: name 'glob' is not defined

___

Creation of the folder `Beethoven_sonata_progression` with the CSV files combining both annotations of the first movement of the Beethoven sonatas.

In [None]:
# For every piece, load its dcml and fh annotations, combine them with
# get_progression and save the result as one csv file.
for dcml_df_path in glob("../Data/Beethoven-labels/dcml*.tsv", recursive=True):

    dcml_df = pd.read_csv(dcml_df_path, sep='\t', converters={'duration': Fraction})
    dcml_df['chord_type'] = dcml_df['chord_type'].apply(lambda r : ChordType[r.split(".")[1]])

    ##

    # File names look like dcml-<num>-<movement>.tsv; the fh annotation of
    # the same piece is stored next to it as fh-<num>-<movement>.tsv.
    file_parts = os.path.basename(dcml_df_path).split('-')
    num = file_parts[1]
    movement = file_parts[2]
    fh_df_path = '../Data/Beethoven-labels/fh-'+num+'-'+movement

    fh_df = pd.read_csv(fh_df_path, sep='\t', converters={'duration': Fraction})
    fh_df['chord_type'] = fh_df['chord_type'].apply(lambda r : ChordType[r.split(".")[1]])

    progression = get_progression(dcml_df, fh_df)

    # NOTE(review): the output name always ends in "-1" whatever `movement`
    # is; this assumes only first movements are present - confirm.
    target_path = '../Data/Beethoven_sonata_progression/'+num+'-1.csv'

    # Create the output folder on first use so that to_csv does not fail on
    # a fresh checkout.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    progression.to_csv(target_path, index=False)