In [1]:
import os
from git import Repo
import dimcat as dc
from ms3 import __version__ as ms3_version
# Location of the local clone of the ABC corpus. Defaults to "~/ABC"; set the environment
# variable ABC_CORPUS_PATH to point the notebook at a clone stored elsewhere.
# The path is expanded and normalised once so that every later use (git, dimcat, basename)
# sees the same absolute path, even if the configured value ends with a path separator.
corpus_path = os.path.abspath(os.path.expanduser(os.environ.get("ABC_CORPUS_PATH", "~/ABC")))
repo = Repo(corpus_path)
# The repository containing this notebook is located by searching the parent directories
# of the current working directory.
notebook_repo = Repo('.', search_parent_directories=True)
notebook_repo_path = notebook_repo.git.rev_parse("--show-toplevel")
# Print the commits and library versions that the outputs below were produced with.
print(f"Notebook repository '{os.path.basename(notebook_repo_path)}' @ {notebook_repo.commit().hexsha[:7]}")
print(f"Data repo '{os.path.basename(corpus_path)}' @ {repo.commit().hexsha[:7]}")
print(f"dimcat version {dc.__version__}")
print(f"ms3 version {ms3_version}")

Notebook repository 'dimcat' @ 36fcf12
Data repo 'ABC' @ 560802e
dimcat version 0.2.0.post1.dev64+gda0a036
ms3 version 1.0.2


# Working with Harmonic Annotations

In [2]:
import pandas as pd

## Load dataset

Use dimcat's `Dataset` class to load a dataset.
Each dataset consists of several subcorpora (here only `ABC`),
which in turn consist of several pieces (here `n01op18-1_01`, `n01op18-1_02`, etc.).

A `Dataset` has several representations of each piece (e.g. a list of chord labels or a list of notes) called *facets*.
Each facet is represented by a dataframe.

Corpora can be processed, e.g. by slicing notes according to different criteria (see below).
The output of these operations is again a dataset with facets.

In [3]:
# Create a Dataset and load the corpus located at `corpus_path` into it.
dataset = dc.Dataset()
dataset.load(corpus_path)
# Displaying `.data` prints an overview of the files detected and parsed per corpus.
dataset.data

[default|all]
All corpora
-----------
View: This view is called 'default'. It 
	- excludes fnames that are not contained in the metadata,
	- filters out file extensions requiring conversion (such as .xml), and
	- excludes review files and folders.

            has   active   scores measures           notes        expanded       
       metadata     view detected detected parsed detected parsed detected parsed
corpus                                                                           
ABC         yes  default       70       70     70       70     70       70     70

210/560 files are excluded from this view.

210 files have been excluded based on their subdir.

## Get chord labels

Chord labels are stored in the `expanded` facet.
Using `.get_facet()` returns a single dataframe with all chord labels.
Corpus, piece, and timespan ("interval") are encoded in a hierarchical index.

In [4]:
# Retrieve the "expanded" facet, i.e. the chord labels of all pieces, as one dataframe
# indexed by (corpus, fname, interval).
labels = dataset.get_facet("expanded")
labels

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mc,mn,quarterbeats,duration_qb,mc_onset,mn_onset,timesig,staff,voice,label,...,phraseend,chord_type,globalkey_is_minor,localkey_is_minor,chord_tones,added_tones,root,bass_note,alt_label,volta
corpus,fname,interval,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ABC,n01op18-1_01,"[0.0, 3.0)",1,1,0,3.0,0,0,3/4,4,1,F.I,...,,M,False,False,"(0, 4, 1)",(),0,0,,
ABC,n01op18-1_01,"[3.0, 6.0)",2,2,3,3.0,0,0,3/4,4,1,V,...,,M,False,False,"(1, 5, 2)",(),1,1,,
ABC,n01op18-1_01,"[6.0, 9.0)",3,3,6,3.0,0,0,3/4,4,1,I,...,,M,False,False,"(0, 4, 1)",(),0,0,,
ABC,n01op18-1_01,"[9.0, 15.0)",4,4,9,6.0,0,0,3/4,4,1,IV6,...,,M,False,False,"(3, 0, -1)",(),-1,3,,
ABC,n01op18-1_01,"[15.0, 18.0)",6,6,15,3.0,0,0,3/4,4,1,V65,...,,Mm7,False,False,"(5, 2, -1, 1)",(),1,5,,
ABC,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ABC,n16op135_04,"[1140.0, 1141.0)",281,280,1140,1.0,1/2,1/2,4/4,4,1,I,...,,M,True,False,"(0, 4, 1)",(),0,0,,
ABC,n16op135_04,"[1141.0, 1142.0)",281,280,1141,1.0,3/4,3/4,4/4,4,1,I6,...,,M,True,False,"(4, 1, 0)",(),0,4,,
ABC,n16op135_04,"[1142.0, 1144.0)",282,281,1142,2.0,0,0,4/4,4,1,V(64),...,,M,True,False,"(1, 0, 4)",(),1,1,,
ABC,n16op135_04,"[1144.0, 1146.0)",282,281,1144,2.0,1/2,1/2,4/4,4,1,V7,...,,Mm7,True,False,"(1, 5, 2, -1)",(),1,1,,


## Get salami slices

Use the `NoteSlicer` to obtain a sliced version of the dataset.
Querying the `notes` facet of the sliced dataset returns the sliced notes.

In [5]:
# Slice the whole dataset with a NoteSlicer; this takes some time.
salami_dts = dc.NoteSlicer().process_data(dataset)
# The "notes" facet of the processed dataset holds the sliced notes,
# indexed by (corpus, fname, onset_slice, interval).
salami_notes = salami_dts.get_facet("notes")
salami_notes

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mc,mn,quarterbeats,duration_qb,mc_onset,mn_onset,timesig,staff,voice,duration,...,nominal_duration,scalar,tied,tpc,midi,name,octave,chord_id,tremolo,volta
corpus,fname,onset_slice,interval,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
ABC,n01op18-1_01,"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,3,1,1/4,...,1/4,1,1,-1,53,F3,3,12,,
ABC,n01op18-1_01,"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,4,1,1/4,...,1/4,1,1,-1,53,F3,3,18,,
ABC,n01op18-1_01,"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,1,1,1/4,...,1/4,1,1,-1,65,F4,4,0,,
ABC,n01op18-1_01,"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,2,1,1/4,...,1/4,1,1,-1,65,F4,4,6,,
ABC,n01op18-1_01,"[1.0, 1.5)","[1.0, 1.5)",1,1,1,0.5,1/4,1/4,3/4,3,1,1/8,...,1/8,1,-1,-1,53,F3,3,13,,
ABC,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ABC,n16op135_04,"[1146.0, 1147.0)","[1146.0, 1147.0)",283,282,1146,1.0,0,0,4/4,3,1,1/4,...,1/4,1,,-1,53,F3,3,2731,,
ABC,n16op135_04,"[1146.0, 1147.0)","[1146.0, 1147.0)",283,282,1146,1.0,0,0,4/4,1,1,1/4,...,1/4,1,,3,69,A4,4,2729,,
ABC,n16op135_04,"[1146.0, 1147.0)","[1146.0, 1147.0)",283,282,1146,1.0,0,0,4/4,2,1,1/4,...,1/4,1,,3,69,A4,4,2730,,
ABC,n16op135_04,"[1146.0, 1147.0)","[1146.0, 1147.0)",283,282,1146,1.0,0,0,4/4,2,1,1/4,...,1/4,1,,-1,77,F5,5,2730,,


## Match salami slices with chord labels

Each chord label has an `interval` index that encodes its timespan.
We can use this to find the corresponding slices from the previous step.

Let's try this for a single chord. Start by getting the interval of the first chord in the first piece:

In [6]:
# Zoom in on the chord labels of one piece by selecting its (corpus, fname) key;
# the resulting dataframe is indexed by the `interval` level only.
chords = labels.loc[('ABC', 'n01op18-1_01')]
chords

Unnamed: 0_level_0,mc,mn,quarterbeats,duration_qb,mc_onset,mn_onset,timesig,staff,voice,label,...,phraseend,chord_type,globalkey_is_minor,localkey_is_minor,chord_tones,added_tones,root,bass_note,alt_label,volta
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"[0.0, 3.0)",1,1,0,3.0,0,0,3/4,4,1,F.I,...,,M,False,False,"(0, 4, 1)",(),0,0,,
"[3.0, 6.0)",2,2,3,3.0,0,0,3/4,4,1,V,...,,M,False,False,"(1, 5, 2)",(),1,1,,
"[6.0, 9.0)",3,3,6,3.0,0,0,3/4,4,1,I,...,,M,False,False,"(0, 4, 1)",(),0,0,,
"[9.0, 15.0)",4,4,9,6.0,0,0,3/4,4,1,IV6,...,,M,False,False,"(3, 0, -1)",(),-1,3,,
"[15.0, 18.0)",6,6,15,3.0,0,0,3/4,4,1,V65,...,,Mm7,False,False,"(5, 2, -1, 1)",(),1,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[921.0, 924.0)",308,308,921,3.0,0,0,3/4,4,1,I,...,,M,False,False,"(0, 4, 1)",(),0,0,,
"[924.0, 927.0)",309,309,924,3.0,0,0,3/4,4,1,Fr6,...,,Fr,False,False,"(-4, 0, 2, 6)",(),2,-4,,
"[927.0, 930.0)",310,310,927,3.0,0,0,3/4,4,1,I],...,,M,False,False,"(0, 4, 1)",(),0,0,,
"[930.0, 937.0)",311,311,930,7.0,0,0,3/4,4,1,I,...,,M,False,False,"(0, 4, 1)",(),0,0,,


In [7]:
# Get the first index entry of `chords`, i.e. the timespan of the piece's first chord label...
chord0_interval = chords.index[0]
chord0_interval # this is a pandas Interval, closed on the left

Interval(0.0, 3.0, closed='left')

In [8]:
# ...and look up the chord itself: the row stored under that interval,
# returned as a Series with one entry per column.
chord0 = chords.loc[chord0_interval]
chord0

mc                            1
mn                            1
quarterbeats                  0
duration_qb                 3.0
mc_onset                      0
mn_onset                      0
timesig                     3/4
staff                         4
voice                         1
label                       F.I
globalkey                     F
localkey                      I
pedal                       NaN
chord                         I
special                     NaN
numeral                       I
form                        NaN
figbass                     NaN
changes                     NaN
relativeroot                NaN
cadence                     NaN
phraseend                   NaN
chord_type                    M
globalkey_is_minor        False
localkey_is_minor         False
chord_tones           (0, 4, 1)
added_tones                  ()
root                          0
bass_note                     0
alt_label                   NaN
volta                      <NA>
Name: [0.0, 3.0), dtype: object

Finally, find all slices in the same piece that overlap with the chord:

In [9]:
# Note slices of the same piece; the remaining index levels are (onset_slice, interval).
salamis = salami_notes.loc[("ABC", "n01op18-1_01")]
# Keep only the slices whose onset_slice overlaps the timespan of the first chord.
salamis.loc[salamis.index.get_level_values("onset_slice").overlaps(chord0_interval)]

Unnamed: 0_level_0,Unnamed: 1_level_0,mc,mn,quarterbeats,duration_qb,mc_onset,mn_onset,timesig,staff,voice,duration,...,nominal_duration,scalar,tied,tpc,midi,name,octave,chord_id,tremolo,volta
onset_slice,interval,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,3,1,1/4,...,1/4,1,1.0,-1,53,F3,3,12,,
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,4,1,1/4,...,1/4,1,1.0,-1,53,F3,3,18,,
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,1,1,1/4,...,1/4,1,1.0,-1,65,F4,4,0,,
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,2,1,1/4,...,1/4,1,1.0,-1,65,F4,4,6,,
"[1.0, 1.5)","[1.0, 1.5)",1,1,1,0.5,1/4,1/4,3/4,3,1,1/8,...,1/8,1,-1.0,-1,53,F3,3,13,,
"[1.0, 1.5)","[1.0, 1.5)",1,1,1,0.5,1/4,1/4,3/4,4,1,1/8,...,1/8,1,-1.0,-1,53,F3,3,19,,
"[1.0, 1.5)","[1.0, 1.5)",1,1,1,0.5,1/4,1/4,3/4,1,1,1/8,...,1/8,1,-1.0,-1,65,F4,4,1,,
"[1.0, 1.5)","[1.0, 1.5)",1,1,1,0.5,1/4,1/4,3/4,2,1,1/8,...,1/8,1,-1.0,-1,65,F4,4,7,,
"[1.5, 1.75)","[1.5, 1.75)",1,1,3/2,0.25,3/8,3/8,3/4,3,1,1/16,...,1/16,1,,1,55,G3,3,14,,
"[1.5, 1.75)","[1.5, 1.75)",1,1,3/2,0.25,3/8,3/8,3/4,4,1,1/16,...,1/16,1,,1,55,G3,3,20,,


## Rest: inspecting individual slices and exporting slice information

In [10]:
# All note slices of the first piece, selected by its (corpus, fname) key.
salami_notes.loc[("ABC", "n01op18-1_01")]

Unnamed: 0_level_0,Unnamed: 1_level_0,mc,mn,quarterbeats,duration_qb,mc_onset,mn_onset,timesig,staff,voice,duration,...,nominal_duration,scalar,tied,tpc,midi,name,octave,chord_id,tremolo,volta
onset_slice,interval,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,3,1,1/4,...,1/4,1,1,-1,53,F3,3,12,,
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,4,1,1/4,...,1/4,1,1,-1,53,F3,3,18,,
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,1,1,1/4,...,1/4,1,1,-1,65,F4,4,0,,
"[0.0, 1.0)","[0.0, 1.0)",1,1,0,1.0,0,0,3/4,2,1,1/4,...,1/4,1,1,-1,65,F4,4,6,,
"[1.0, 1.5)","[1.0, 1.5)",1,1,1,0.5,1/4,1/4,3/4,3,1,1/8,...,1/8,1,-1,-1,53,F3,3,13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[937.0, 938.0)","[937.0, 938.0)",313,313,937,1.0,1/4,1/4,3/4,3,1,1/4,...,1/4,1,,-1,53,F3,3,4441,,
"[937.0, 938.0)","[937.0, 938.0)",313,313,937,1.0,1/4,1/4,3/4,2,1,1/4,...,1/4,1,,3,57,A3,3,4439,,
"[937.0, 938.0)","[937.0, 938.0)",313,313,937,1.0,1/4,1/4,3/4,2,1,1/4,...,1/4,1,,-1,65,F4,4,4439,,
"[937.0, 938.0)","[937.0, 938.0)",313,313,937,1.0,1/4,1/4,3/4,1,1,1/4,...,1/4,1,,3,69,A4,4,4437,,


In [11]:
# Notes sounding in one particular slice of the first piece, addressed by
# (corpus, fname, onset_slice) with the slice given as a left-closed Interval.
salami_notes.loc[
    (
        "ABC",
        "n01op18-1_01",
        pd.Interval(left=12.0, right=13.0, closed="left"),
    )
]

Unnamed: 0_level_0,mc,mn,quarterbeats,duration_qb,mc_onset,mn_onset,timesig,staff,voice,duration,...,nominal_duration,scalar,tied,tpc,midi,name,octave,chord_id,tremolo,volta
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"[12.0, 13.0)",5,5,12.0,1.0,0,0,3/4,4,1,3/4,...,1/2,3/2,,2,50,D3,3,64,,
"[12.0, 13.0)",5,5,12.0,1.0,0,0,3/4,3,1,3/4,...,1/2,3/2,,2,62,D4,4,63,,
"[12.0, 13.0)",5,5,12.0,1.0,0,0,3/4,2,1,3/4,...,1/2,3/2,1.0,-2,70,Bb4,4,62,,
"[12.0, 13.0)",5,5,12.0,1.0,0,0,3/4,1,1,1/4,...,1/4,1,1.0,-1,77,F5,5,56,,


In [12]:
# String form of a left-closed Interval, as it appears in the index columns shown above.
str(pd.Interval(left=0.0, right=1.0, closed="left"))

'[0.0, 1.0)'

In [13]:
# Retrieve the slice information of the sliced dataset (one row per onset_slice in the output below).
slice_info = salami_dts.get_slice_info()
# Store it as a zipped TSV file in the current working directory. The compression method and
# the name of the archive member are passed explicitly so that the archive always contains
# 'ABC_chord_slices.tsv', independent of how the installed pandas version would infer
# them from the '.tsv.zip' file name.
slice_info.to_csv(
    'ABC_chord_slices.tsv.zip',
    sep='\t',
    compression={"method": "zip", "archive_name": "ABC_chord_slices.tsv"},
)
slice_info

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mc,mn,quarterbeats,duration_qb,mc_onset,mn_onset,timesig,staff,voice,duration,...,nominal_duration,scalar,tied,tpc,midi,name,octave,chord_id,tremolo,volta
corpus,fname,onset_slice,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ABC,n01op18-1_01,"[0.0, 1.0)",1,1,0,1.0,0,0,3/4,3,1,1/4,...,1/4,1,1,-1,53,F3,3,12,,
ABC,n01op18-1_01,"[1.0, 1.5)",1,1,1,0.5,1/4,1/4,3/4,3,1,1/8,...,1/8,1,-1,-1,53,F3,3,13,,
ABC,n01op18-1_01,"[1.5, 1.75)",1,1,3/2,0.25,3/8,3/8,3/4,3,1,1/16,...,1/16,1,,1,55,G3,3,14,,
ABC,n01op18-1_01,"[1.75, 2.0)",1,1,7/4,0.25,7/16,7/16,3/4,3,1,1/16,...,1/16,1,,-1,53,F3,3,15,,
ABC,n01op18-1_01,"[2.0, 2.5)",1,1,2,0.5,1/2,1/2,3/4,3,1,1/8,...,1/8,1,,4,52,E3,3,16,,
ABC,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ABC,n16op135_04,"[1140.0, 1141.0)",281,280,1140,1.0,1/2,1/2,4/4,4,1,1/4,...,1/4,1,,-1,41,F2,2,2719,,
ABC,n16op135_04,"[1141.0, 1142.0)",281,280,1141,1.0,3/4,3/4,4/4,4,1,1/4,...,1/4,1,,3,45,A2,2,2720,,
ABC,n16op135_04,"[1142.0, 1144.0)",282,281,1142,2.0,0,0,4/4,4,1,1/2,...,1/2,1,,0,48,C3,3,2727,,
ABC,n16op135_04,"[1144.0, 1146.0)",282,281,1144,2.0,1/2,1/2,4/4,4,1,1/2,...,1/2,1,,0,36,C2,2,2728,,
