In [1]:
%load_ext autoreload
%autoreload 2
import os
from dimcat import (
    Corpus,  
    Pipeline,
    IsAnnotatedFilter,
    CorpusGrouper, 
    PieceGrouper, 
    ModeGrouper, 
    ChordSymbolBigrams, 
    ChordSymbolUnigrams,
    LocalKeySlicer,
)
from dimcat import __version__ as dimcat_version
from ms3 import __version__ as ms3_version
from git import Repo
import pandas as pd
# Allow up to 100 columns when DataFrames are displayed (the metadata table below is wide).
pd.set_option("display.max_columns", 100)

In [2]:
# Local clone of the corpus repository that all statistics below are computed from.
corpus_path = "~/romantic_piano_corpus"
repo = Repo(corpus_path)

# Provenance: corpus commit (abbreviated to 7 characters) and library versions.
print("\n".join([
    f"{os.path.basename(corpus_path)} @ {repo.commit().hexsha[:7]}",
    f"dimcat version {dimcat_version}",
    f"ms3 version {ms3_version}",
]))

romantic_piano_corpus @ 025834b
dimcat version 0.2.0.post1.dev35+g9830b67
ms3 version 0.5.3.post0.dev88+g764fcd4.dirty


# Overview

In [3]:
# Create the Corpus object from the directory given by `corpus_path`.
corpus = Corpus(directory=corpus_path)
# Bare last expression: display the overview of detected and parsed files.
corpus.data

1154 files.
KEY                       -> EXTENSIONS
---------------------------------------
beethoven_piano_sonatas   -> {'.mscx': 87, '.tsv': 239}
chopin_mazurkas           -> {'.mscx': 55, '.tsv': 166}
debussy_suite_bergamasque -> {'.mscx': 4, '.tsv': 13}
dvorak_silhouettes        -> {'.mscx': 12, '.tsv': 37}
grieg_lyrical_pieces      -> {'.mscx': 66, '.tsv': 199}
liszt_pelerinage          -> {'.mscx': 19, '.tsv': 77}
medtner_tales             -> {'.mscx': 19, '.tsv': 59}
schumann_kinderszenen     -> {'.mscx': 13, '.tsv': 40}
tchaikovsky_seasons       -> {'.mscx': 12, '.tsv': 37}

None of the 287 score files have been parsed.

All 867 tabular files have been parsed, 283 of them as Annotations object(s).
KEY                       -> ANNOTATION LAYERS
----------------------------------------------
beethoven_piano_sonatas   -> staff  voice  harmony_layer  color  
                          -> 2      1      0 (dcml)       default    20720
                          ->        2      0 (dcml

## Metadata

In [4]:
# Metadata requested with from_tsv=True (see the printed message: concatenated 'metadata.tsv' files).
all_metadata = corpus.data.metadata(from_tsv=True)
n_metadata_rows = len(all_metadata)
n_score_ids = len(corpus.data._score_ids())
print(f"Concatenated 'metadata.tsv' files cover {n_metadata_rows} of the {n_score_ids} scores.")
# Display the first metadata row of every group of index level 0.
all_metadata.groupby(level=0).nth(0)

Concatenated 'metadata.tsv' files cover 287 of the 287 scores.


Unnamed: 0_level_0,rel_paths,fnames,last_mc,last_mn,length_qb,length_qb_unfolded,all_notes_qb,n_onsets,n_onset_positions,TimeSig,KeySig,label_count,annotated_key,annotators,reviewers,composer,workTitle,movementNumber,movementTitle,workNumber,poet,lyricist,arranger,copyright,creationDate,mscVersion,platform,source,translator,musescore,ambitus,PDF,Reviewers,annotator,comments,composed_end,composed_start,harmony_version,imslp,key,mode,originalFormat,pdf,score integrity,score_integrity,staff_1_ambitus,staff_1_instrument,staff_2_ambitus,staff_2_instrument,staff_3_ambitus,staff_3_instrument,staff_4_ambitus,staff_4_instrument,typesetter
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
beethoven_piano_sonatas,MS3,01-1,154,152,608.0,1216.0,1476.0,1679.0,,,1: -4,241,f,"Lars & Ya-Chuan (2.2.0), John Heilig (2.3.0)",AN,,,,,,,,,,2019-03-05,3.02,Apple Macintosh,,,3.6.2,32-89 (Ab1-F6),,,,,1795,1794,2.3.0,,,,xml,,,,51-89 (Eb3-F6),piano,32-73 (Ab1-Db5),piano,,,,,
chopin_mazurkas,MS3,BI105-2op30-2,65,64,193.0,193.0,711.0,810.0,,,1: 2,116,b,"Wendelin Bitzan (1.0.0), Adrian Nagel (2.2.0),...","JH, AN, DK",Frédéric Chopin (1810-1849),,,,Op. 30 No. 2,,,,,2019-02-08,3.02,Apple Macintosh,https://github.com/craigsapp/chopin-mazurkas,,3.6.2,35-90 (B1-F#6),,,,,1837,1836,2.3.0,,B minor / F sharp,minor,xml,,,Cédric Koller,59-90 (B3-F#6),piano,35-71 (B1-B4),piano,,,,,
debussy_suite_bergamasque,MS3,l075-01_suite_prelude,89,89,356.0,356.0,1533.67,1721.0,,,1: -1,274,F,"Adrian Nagel (2.1.1), Amelia Brey (2.3.0)","AB, AN",Claude Debussy,Suite Bergamasque,,,,,,,,2015-05-19,3.02,Microsoft Windows,http://musescore.com/score/890041,,3.6.2,24-94 (C1-Bb6),,,,,1905,1890,2.3.0,,,,,,,,48-94 (C3-Bb6),Piano,24-90 (C1-F#6),Piano,,,,,
dvorak_silhouettes,MS3,op08n01,54,52,,,,,,1: 6/8,"1: 4, 7: -5, 49: 4",80,c#,"Daniel Grote (2.1.1), Hanné Becker (2.3.0)","Johannes Hentschel (2.1.1), AN",Antonín Dvořák,Silhouettes,1.0,Allegro feroce,op. 8,,,,,2018-05-26,3.02,Microsoft Windows,,,3.6.2,32-92 (G#1-Ab6),,,,,1879,1870,2.3.0,,,,xml,https://imslp.org/wiki/Special:ReverseLookup/5...,Tom Schreyer,,56-92 (G#3-Ab6),Piano,32-68 (G#1-G#4),Piano,,,,,
grieg_lyrical_pieces,MS3,op12n01,23,23,,,,,,1: 2/4,1: -3,43,Eb,"Adrian Nagel (2.1.1), John Heilig (2.30)",Adrian Nagel,Edvard H. Grieg (1843-1907),,1.0,,Op. 12,,,,,2018-10-11,3.02,Microsoft Windows,,,3.6.2,39-79 (Eb2-G5),,,,,1901,1867,2.3.0,,,,mxl,https://imslp.eu/files/imglnks/euimg/8/8e/IMSL...,,Tom Schreyer,55-79 (G3-G5),,39-71 (Eb2-Cb5),,,,,,
liszt_pelerinage,MS3,160.01_Chapelle_de_Guillaume_Tell,97,97,388.0,388.0,1902.42,2879.0,,1: 4/4,1: 0,174,C,"Adrian Nagel (2.1.1), Amelia Brey (2.3.0)","JH, AB, AN",Franz Liszt,Au bord d'une source,,,,,,,,2019-01-26,3.02,Microsoft Windows,https://musescore.com/score/3987861,,3.6.2,24-96 (C1-C7),https://imslp.org/wiki/Special:ReverseLookup/1...,,,,1855,1848,2.3.0,,,,xml,,,Tom Schreyer,40-96 (E2-C7),Piano,24-79 (C1-G5),Piano,,,,,
medtner_tales,MS3,op08n01,81,81,,,,,,1: 4/8,1: -3,213,c,"Wendelin Bitzan (2.2.0), John Heilig (2.3.0)","Adrian Nagel, DK",Medtner,Zwei Märchen,,,,,,,,2017-10-21,3.02,Apple Macintosh,,,3.6.2,22-87 (Bb0-Eb6),,,,,1925,1904,2.3.0,,,,,https://imslp.org/wiki/Special:ReverseLookup/5790,,Tom Schreyer,47-87 (B2-Eb6),Piano,22-77 (Bb0-F5),Piano,,,,,
schumann_kinderszenen,MS3,n01,22,22,44.0,88.0,134.33,241.0,141.0,1: 2/4,1: 1,44,G,"Tal Soker (2.1.1), John Heilig (2.3.0)","AN, JHei, JH",Robert Schumann,Kinderszenen Nos. 7 and 8,,,,,,,,2017-03-11,3.02,Microsoft Windows,http://musescore.com/user/22249306/scores/4778176,,3.6.2,42-79 (F#2-G5),,,,,1838,1838,2.3.0,,,,,https://imslp.org/wiki/Special:ReverseLookup/6...,,Tom Schreyer,62-79 (D4-G5),Piano,42-69 (F#2-A4),Piano,,,,,
tchaikovsky_seasons,MS3,op37a01,103,103,,,,,,1: 3/4,"1: 3, 29: 1, 63: 3",313,A,"Adrian Nagel (2.1.1), John Heilig (2.3.0)","Johannes Hentschel, AN",P. Tchaikovsky,January,,,,,,,,2018-11-29,3.02,Linux,http://musescore.com/user/12839876/scores/3444321,,3.6.2,33-88 (A1-E6),,,,,1876,1876,2.3.0,,,,mxl,https://imslp.org/wiki/Special:ReverseLookup/1...,Tom Schreyer,,53-88 (E#3-E6),Piano,33-88 (A1-E6),Piano,,,,,


In [5]:
# How often each distinct 'annotators' entry occurs in the metadata.
print("VALUE COUNTS OF THE COLUMN 'annotators'")
all_metadata["annotators"].value_counts()

VALUE COUNTS OF THE COLUMN 'annotators'


Adrian Nagel (2.1.1), John Heilig (2.3.0)                               74
Wendelin Bitzan (1.0.0), Adrian Nagel (2.2.0), Davor Krkljus (2.3.0)    49
Adrian Nagel (2.3.0)                                                    16
Adrian Nagel (2.1.1), Amelia Brey (2.3.0)                               15
Wendelin Bitzan (2.2.0), John Heilig (2.3.0)                            15
Tal Soker (2.1.1), John Heilig (2.3.0)                                  13
Lydia Carlisi (2.2.0), Adrian Nagel (2.3.0)                             11
Adrian Nagel                                                             9
Adrian Nagel (2.1.1), Hanné Becker (2.3.0)                               9
Adrian Nagel (2.2.0), Amelia Brey (2.3.0)                                6
Daniel Grote (2.2.0), Adrian Nagel (2.3.0)                               6
Adrian Nagel (2.2.0), Hanné Becker (2.3.0)                               5
Wendelin Bitzan                                                          5
Lydia Carlisi (2.2.0), Vi

In [6]:
# Earliest 'composed_start' and latest 'composed_end', each reported with the
# file name of the row in which the extreme value occurs.
start_col = all_metadata["composed_start"]
end_col = all_metadata["composed_end"]
earliest_piece = all_metadata.loc[start_col.idxmin(), "fnames"]
latest_piece = all_metadata.loc[end_col.idxmax(), "fnames"]
print(f"Composition dates range from {start_col.min()} ({earliest_piece}) "
      f"to {end_col.max()} ({latest_piece}).")

Composition dates range from 1794 (01-1) to 1925 (op08n01).


In [7]:
# Apply IsAnnotatedFilter to the corpus and report how many IDs remain.
# NOTE(review): `indices[()]` presumably holds the IDs of the (single) ungrouped group - confirm.
annotated = IsAnnotatedFilter().process_data(corpus)
n_before = len(corpus.indices[()])
n_after = len(annotated.indices[()])
print(f"Before: {n_before} IDs, after filtering: {n_after}")

Before: 287 IDs, after filtering: 264


**Choose here if you want to see stats for all or only for annotated scores.**

In [8]:
# Toggle for the statistics below: uncomment the first assignment (and comment out
# the second) to use all scores instead of only the annotated ones.
# Note that the harmony-label cells always use `annotated`, regardless of this choice.
#selected = corpus
selected = annotated

## Measures

In [9]:
# Concatenated 'measures' facet of the selected data; index levels 0 and 1 identify a file.
all_measures = selected.get_facet('measures')
n_measures = len(all_measures.index)
n_files = len(all_measures.groupby(level=[0,1]))
print(f"{n_measures} measures over {n_files} files.")
all_measures.head()

29956 measures over 264 files.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quarterbeats,duration_qb,mc,mn,keysig,timesig,act_dur,mc_offset,volta,numbering_offset,dont_count,barline,breaks,repeats,next,markers,jump_bwd,jump_fwd,play_until
corpus,fname,interval,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
beethoven_piano_sonatas,01-1,"[0.0, 1.0)",0,1.0,1,0,-4,2/2,1/4,3/4,,,1.0,,,firstMeasure,"(2,)",,,,
beethoven_piano_sonatas,01-1,"[1.0, 5.0)",1,4.0,2,1,-4,2/2,1,0,,,,,,,"(3,)",,,,
beethoven_piano_sonatas,01-1,"[5.0, 9.0)",5,4.0,3,2,-4,2/2,1,0,,,,,,,"(4,)",,,,
beethoven_piano_sonatas,01-1,"[9.0, 13.0)",9,4.0,4,3,-4,2/2,1,0,,,,,,,"(5,)",,,,
beethoven_piano_sonatas,01-1,"[13.0, 17.0)",13,4.0,5,4,-4,2/2,1,0,,,,,,,"(6,)",,,,


In [10]:
# Count rows of the measures table per time signature; missing values are counted too.
print("Distribution of time signatures per XML measure (MC):")
all_measures["timesig"].value_counts(dropna=False)

Distribution of time signatures per XML measure (MC):


3/4      10130
2/4       6073
4/4       5186
6/8       3432
2/2       2523
3/8       1278
12/8       351
9/8        304
6/4        283
9/16       148
4/8         81
2/8         75
12/16       43
12/32       17
6/16        17
17/4         3
33/32        2
15/4         1
14/4         1
10/4         1
10/8         1
12/4         1
7/4          1
5/4          1
3/2          1
11/4         1
18/4         1
Name: timesig, dtype: int64

## Notes

In [11]:
# Concatenated 'notes' facet of the selected data; index levels 0 and 1 identify a file.
all_notes = selected.get_facet('notes')
n_notes = len(all_notes.index)
n_files = len(all_notes.groupby(level=[0,1]))
print(f"{n_notes} notes over {n_files} files.")
all_notes.head()

434220 notes over 264 files.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quarterbeats,duration_qb,mc,mn,mc_onset,mn_onset,timesig,staff,voice,duration,gracenote,nominal_duration,scalar,tied,tpc,midi,volta,chord_id
corpus,fname,interval,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
beethoven_piano_sonatas,01-1,"[0.0, 1.0)",0,1.0,1,0,0,3/4,2/2,1,1,1/4,,1/4,1,,0,60,,0
beethoven_piano_sonatas,01-1,"[1.0, 2.0)",1,1.0,2,1,0,0,2/2,1,1,1/4,,1/4,1,,-1,65,,1
beethoven_piano_sonatas,01-1,"[2.0, 3.0)",2,1.0,2,1,1/4,1/4,2/2,1,1,1/4,,1/4,1,,-4,68,,2
beethoven_piano_sonatas,01-1,"[3.0, 4.0)",3,1.0,2,1,1/2,1/2,2/2,1,1,1/4,,1/4,1,,0,72,,3
beethoven_piano_sonatas,01-1,"[4.0, 5.0)",4,1.0,2,1,3/4,3/4,2/2,1,1,1/4,,1/4,1,,-1,77,,4


In [12]:
# Number of notes per staff across the whole selection.
print("Distribution of notes over staves:")
all_notes["staff"].value_counts()

Distribution of notes over staves:


1    230221
2    200618
3      2397
4       984
Name: staff, dtype: Int64

In [13]:
print("Distribution of notes over staves for all pieces with more than two staves\n")
# Visit every (corpus, fname) group and report only those with notes on a staff above 2.
for piece_id, piece_notes in all_notes.groupby(level=[0,1]):
    staves = piece_notes["staff"]
    if not (staves > 2).any():
        continue
    print(piece_id)
    print(staves.value_counts().to_dict())

Distribution of notes over staves for all pieces with more than two staves

('grieg_lyrical_pieces', 'op43n06')
{2: 769, 3: 422, 1: 180}
('liszt_pelerinage', '161.04_Sonetto_47_del_Petrarca')
{1: 1076, 2: 628, 3: 42, 4: 29}
('liszt_pelerinage', '161.07_Apres_une_lecture_du_Dante')
{1: 6638, 2: 5181, 3: 50}
('liszt_pelerinage', '162.01_Gondoliera')
{3: 1745, 4: 955}
('medtner_tales', 'op34n03')
{1: 1219, 2: 816, 3: 89}
('medtner_tales', 'op35n04')
{1: 1678, 2: 1632, 3: 49}


In [14]:
# Per piece, count the notes located on any staff above the second one.
on_extra_staff = all_notes["staff"] > 2
all_notes[on_extra_staff].groupby(level=[0,1])["staff"].value_counts()

corpus                fname                              staff
grieg_lyrical_pieces  op43n06                            3         422
liszt_pelerinage      161.04_Sonetto_47_del_Petrarca     3          42
                                                         4          29
                      161.07_Apres_une_lecture_du_Dante  3          50
                      162.01_Gondoliera                  3        1745
                                                         4         955
medtner_tales         op34n03                            3          89
                      op35n04                            3          49
Name: staff, dtype: int64

## Harmony labels

All symbols, independent of the local key (the mode of which changes their semantics).

In [15]:
# Concatenated 'expanded' facet (the harmony labels split into feature columns).
# This always reads from `annotated`, independent of the `selected` toggle above.
all_annotations = annotated.get_facet('expanded')
all_annotations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quarterbeats,duration_qb,mc,mn,mc_onset,mn_onset,timesig,staff,voice,volta,label,alt_label,globalkey,localkey,pedal,chord,special,numeral,form,figbass,changes,relativeroot,cadence,phraseend,chord_type,globalkey_is_minor,localkey_is_minor,chord_tones,added_tones,root,bass_note,pedalend
corpus,fname,interval,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
beethoven_piano_sonatas,01-1,"[0.0, 9.0)",0,9.0,1,0,0,3/4,2/2,2,1,,f.i{,,f,i,,i,,i,,,,,,{,m,True,True,"(0, -3, 1)",(),0,0,
beethoven_piano_sonatas,01-1,"[9.0, 17.0)",9,8.0,4,3,0,0,2/2,2,1,,V65,,f,i,,V65,,V,,65.0,,,,,Mm7,True,True,"(5, 2, -1, 1)",(),1,5,
beethoven_piano_sonatas,01-1,"[17.0, 21.0)",17,4.0,6,5,0,0,2/2,2,1,,i,,f,i,,i,,i,,,,,,,m,True,True,"(0, -3, 1)",(),0,0,
beethoven_piano_sonatas,01-1,"[21.0, 25.0)",21,4.0,7,6,0,0,2/2,2,1,,#viio6,,f,i,,#viio6,,#vii,o,6.0,,,,,o,True,True,"(2, -1, 5)",(),5,2,
beethoven_piano_sonatas,01-1,"[25.0, 27.0)",25,2.0,8,7,0,0,2/2,2,1,,i6,,f,i,,i6,,i,,6.0,,,,,m,True,True,"(-3, 1, 0)",(),0,-3,


In [16]:
# Rows without a 'root' value are treated as non-chord labels.
no_chord = all_annotations["root"].isna()
n_rows, n_non_chords = all_annotations.shape[0], no_chord.sum()
print(f"Concatenated annotation tables contains {n_rows} rows. {n_non_chords} of them are not chords. Their values are:")
# Frequency of each non-chord label, as a plain dict.
all_annotations.loc[no_chord, "label"].value_counts(dropna=False).to_dict()

Concatenated annotation tables contains 57566 rows. 677 of them are not chords. Their values are:


{'{': 640, '}': 28, '|PAC}': 6, '|HC': 2, '|PAC': 1}

In [17]:
# Keep only the rows that are chords (i.e. that have a root).
all_chords = all_annotations[~no_chord]
n_tokens = all_chords.shape[0]
n_types = len(all_chords["chord"].unique())
n_documents = len(all_chords.groupby(level=[0,1]))
print(f"Corpus contains {n_tokens} tokens and {n_types} types over {n_documents} documents.")

Corpus contains 56889 tokens and 3115 types over 264 documents.


In [18]:
# Disabled helper: uncomment both lines to write all annotation rows that have a
# 'pedalend' value to './issues/pedalpoints.tsv'.
#from ms3 import write_tsv
#write_tsv(all_annotations[all_annotations.pedalend.notna()], './issues/pedalpoints.tsv', pre_process=False)

## Corpus summary

In [19]:
# Re-index the metadata so that every row is identified by (corpus, fname).
summary = all_metadata.set_index('fnames', append=True).reset_index(level=[1,2], drop=True)
# `selected` is bound to either `corpus` or `annotated` in the toggle cell, so the
# intended test is object identity, not value equality (which depends on `__eq__`).
if selected is annotated:
    # Restrict the metadata to pieces that actually contain labels.
    summary = summary[summary.label_count > 0].copy()
summary.index = summary.index.rename(['corpus', 'fname'])
# Recompute the length in quarter notes from the measures: `act_dur` is given in
# fractions of a whole note, hence the factor 4. Bracket assignment is used because
# attribute assignment would silently fail to create the column if it were missing.
summary['length_qb'] = all_measures.groupby(level=[0,1]).act_dur.sum() * 4.0
# Append the number of notes per piece as the column 'notes'.
notes_per_piece = all_notes.groupby(level=[0,1]).size().rename('notes')
summary = pd.concat([summary, notes_per_piece], axis=1)
# Descriptive statistics per corpus; statistics columns that are entirely empty are dropped.
summary.groupby(level=0).describe().dropna(axis=1, how='all')

Unnamed: 0_level_0,last_mc,last_mc,last_mc,last_mc,last_mc,last_mc,last_mc,last_mc,last_mn,last_mn,last_mn,last_mn,last_mn,last_mn,last_mn,last_mn,length_qb_unfolded,length_qb_unfolded,length_qb_unfolded,length_qb_unfolded,length_qb_unfolded,length_qb_unfolded,length_qb_unfolded,length_qb_unfolded,all_notes_qb,all_notes_qb,all_notes_qb,all_notes_qb,all_notes_qb,all_notes_qb,all_notes_qb,all_notes_qb,n_onsets,n_onsets,n_onsets,n_onsets,n_onsets,n_onsets,n_onsets,n_onsets,n_onset_positions,n_onset_positions,n_onset_positions,n_onset_positions,n_onset_positions,n_onset_positions,n_onset_positions,n_onset_positions,label_count,label_count,label_count,label_count,label_count,label_count,label_count,label_count,poet,arranger,mscVersion,mscVersion,mscVersion,mscVersion,mscVersion,mscVersion,mscVersion,mscVersion,translator,composed_end,composed_end,composed_end,composed_end,composed_end,composed_end,composed_end,composed_end,composed_start,composed_start,composed_start,composed_start,composed_start,composed_start,composed_start,composed_start,notes,notes,notes,notes,notes,notes,notes,notes
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,count,count,mean,std,min,25%,50%,75%,max,count,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
corpus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2
beethoven_piano_sonatas,64.0,184.703125,99.997409,28.0,111.25,169.0,253.5,543.0,64.0,182.21875,99.937674,28.0,108.75,166.5,253.25,543.0,64.0,719.6425,440.937916,84.0,365.5,603.5,958.125,1872.0,64.0,1559.357031,966.510342,232.12,790.72,1365.04,2032.4375,4265.05,64.0,2532.3125,1455.761066,494.0,1477.5,2108.0,3461.5,6652.0,4.0,1255.0,956.210577,343.0,529.0,1148.0,1874.0,2381.0,64.0,343.203125,171.867167,82.0,234.5,311.5,399.25,869.0,0.0,0.0,64.0,3.02,0.0,3.02,3.02,3.02,3.02,3.02,0.0,64.0,1802.5625,8.17832,1795.0,1798.0,1799.0,1804.25,1822.0,64.0,1801.34375,8.36322,1794.0,1795.75,1798.0,1803.25,1821.0,64.0,2592.9375,1507.781181,517.0,1486.0,2113.5,3663.0,6817.0
chopin_mazurkas,55.0,89.509091,47.668171,20.0,58.5,75.0,112.0,225.0,55.0,92.527273,55.124054,20.0,56.5,72.0,115.5,247.0,53.0,304.533019,146.421218,35.0,204.0,289.0,360.0,673.0,55.0,923.672545,473.585484,208.0,583.625,782.0,1206.5,2207.0,55.0,1017.563636,508.305892,234.0,672.5,837.0,1322.5,2183.0,0.0,,,,,,,,55.0,165.945455,97.501836,38.0,104.0,139.0,213.5,481.0,0.0,0.0,55.0,3.02,1.344547e-15,3.02,3.02,3.02,3.02,3.02,0.0,55.0,1836.636364,6.126335,1826.0,1831.5,1837.0,1841.5,1849.0,55.0,1836.290909,6.193404,1826.0,1831.0,1836.0,1841.0,1849.0,55.0,1040.036364,525.956552,252.0,680.0,921.0,1353.5,2265.0
debussy_suite_bergamasque,4.0,105.25,36.270971,72.0,84.75,96.5,117.0,156.0,4.0,105.25,36.270971,72.0,84.75,96.5,117.0,156.0,4.0,404.0,147.837749,312.0,321.0,340.0,423.0,624.0,4.0,1522.25,231.693064,1266.0,1414.5,1498.835,1606.585,1825.33,4.0,1943.0,362.927908,1559.0,1680.5,1940.5,2203.0,2332.0,0.0,,,,,,,,4.0,253.25,70.03511,150.0,243.0,279.0,289.25,305.0,0.0,0.0,4.0,3.02,0.0,3.02,3.02,3.02,3.02,3.02,0.0,4.0,1905.0,0.0,1905.0,1905.0,1905.0,1905.0,1905.0,4.0,1890.0,0.0,1890.0,1890.0,1890.0,1890.0,1890.0,4.0,2052.5,327.47977,1680.0,1839.75,2061.0,2273.75,2408.0
dvorak_silhouettes,12.0,56.5,18.12833,15.0,51.5,58.5,63.75,81.0,12.0,56.166667,17.928308,15.0,50.0,58.5,63.75,80.0,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,12.0,128.25,52.82927,67.0,94.75,108.5,146.0,238.0,0.0,0.0,12.0,3.02,0.0,3.02,3.02,3.02,3.02,3.02,0.0,12.0,1879.0,0.0,1879.0,1879.0,1879.0,1879.0,1879.0,12.0,1870.0,0.0,1870.0,1870.0,1870.0,1870.0,1870.0,12.0,887.416667,294.495704,382.0,684.25,877.0,1074.25,1440.0
grieg_lyrical_pieces,66.0,82.5,47.123161,23.0,47.25,72.0,96.75,204.0,66.0,82.030303,47.118499,23.0,47.25,72.0,95.0,204.0,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,66.0,124.712121,57.222579,21.0,87.0,111.0,156.0,313.0,0.0,0.0,66.0,3.02,4.474922e-16,3.02,3.02,3.02,3.02,3.02,0.0,66.0,1901.0,0.0,1901.0,1901.0,1901.0,1901.0,1901.0,66.0,1867.0,0.0,1867.0,1867.0,1867.0,1867.0,1867.0,66.0,996.575758,626.270329,273.0,548.75,780.5,1228.25,3691.0
liszt_pelerinage,19.0,138.947368,113.069533,49.0,72.5,97.0,148.0,481.0,19.0,138.157895,112.51482,48.0,72.5,97.0,146.5,479.0,19.0,510.855263,363.849391,120.0,276.0,388.0,647.5,1505.25,19.0,1764.118947,1402.357903,289.5,867.085,1169.75,2369.46,5901.89,19.0,3081.684211,2844.240321,737.0,1472.0,1996.0,2832.0,11681.0,0.0,,,,,,,,19.0,266.789474,196.779227,84.0,135.0,200.0,290.5,716.0,0.0,0.0,19.0,3.02,4.562583e-16,3.02,3.02,3.02,3.02,3.02,0.0,19.0,1853.421053,3.746343,1849.0,1849.0,1855.0,1855.0,1859.0,19.0,1849.0,4.546061,1846.0,1846.0,1848.0,1848.0,1859.0,19.0,3133.368421,2888.257764,749.0,1486.5,2061.0,2861.0,11869.0
medtner_tales,19.0,130.210526,115.216983,48.0,71.5,81.0,142.5,554.0,19.0,129.684211,115.060879,47.0,71.0,81.0,141.5,553.0,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,19.0,360.789474,215.769831,116.0,204.0,307.0,446.0,1020.0,0.0,0.0,19.0,3.02,4.562583e-16,3.02,3.02,3.02,3.02,3.02,0.0,19.0,1925.0,0.0,1925.0,1925.0,1925.0,1925.0,1925.0,19.0,1904.0,0.0,1904.0,1904.0,1904.0,1904.0,1904.0,19.0,2259.421053,1394.984003,826.0,1402.0,1917.0,2860.0,6944.0
schumann_kinderszenen,13.0,31.230769,12.00801,17.0,24.0,27.0,34.0,58.0,13.0,30.153846,11.985033,16.0,24.0,25.0,32.0,57.0,12.0,94.791667,34.041224,34.0,64.0,96.25,122.0,144.0,13.0,278.82,119.65065,106.0,173.0,305.5,365.5,471.0,13.0,385.538462,118.159225,241.0,282.0,375.0,440.0,622.0,13.0,163.538462,51.648839,100.0,126.0,144.0,207.0,253.0,13.0,72.923077,30.148139,44.0,49.0,67.0,84.0,140.0,0.0,0.0,13.0,3.02,4.622227e-16,3.02,3.02,3.02,3.02,3.02,0.0,13.0,1838.0,0.0,1838.0,1838.0,1838.0,1838.0,1838.0,13.0,1838.0,0.0,1838.0,1838.0,1838.0,1838.0,1838.0,13.0,401.769231,124.614843,244.0,292.0,402.0,466.0,632.0
tchaikovsky_seasons,12.0,104.333333,50.10232,46.0,76.25,89.0,119.5,199.0,12.0,104.166667,49.96332,46.0,76.25,89.0,119.5,198.0,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,12.0,254.583333,106.98637,119.0,186.5,235.5,286.75,514.0,0.0,0.0,12.0,3.02,0.0,3.02,3.02,3.02,3.02,3.02,0.0,12.0,1876.0,0.0,1876.0,1876.0,1876.0,1876.0,1876.0,12.0,1876.0,0.0,1876.0,1876.0,1876.0,1876.0,1876.0,12.0,1562.583333,573.170998,626.0,1113.25,1585.5,1910.0,2523.0


In [20]:
# Group the per-piece summary by corpus (index level 0).
corpus_metadata = summary.groupby(level=0)
n_pieces = corpus_metadata.size().rename('pieces')

# Per-corpus totals; the dict keys become the column names of the table.
absolute_numbers = {
    'measures': corpus_metadata['last_mn'].sum(),
    'length': corpus_metadata['length_qb'].sum(),
    'notes': corpus_metadata['notes'].sum(),
    'labels': corpus_metadata['label_count'].sum(),
}
absolute = pd.DataFrame.from_dict(absolute_numbers)

# The same quantities divided by the number of pieces in each corpus.
relative = absolute.div(n_pieces, axis=0).astype(float).round(1)

# Side-by-side table: totals (including the piece count) and per-piece averages.
absolute_with_counts = pd.concat([n_pieces, absolute], axis=1)
complete_summary = pd.concat(
    [absolute_with_counts, relative],
    axis=1,
    keys=['absolute', 'per piece'],
)
complete_summary

Unnamed: 0_level_0,absolute,absolute,absolute,absolute,absolute,per piece,per piece,per piece,per piece
Unnamed: 0_level_1,pieces,measures,length,notes,labels,measures,length,notes,labels
corpus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
beethoven_piano_sonatas,64,11662,35663.375,165948,21965,182.2,557.2,2592.9,343.2
chopin_mazurkas,55,5089,14605.25,57202,9127,92.5,265.6,1040.0,165.9
debussy_suite_bergamasque,4,421,1616.0,8210,1013,105.2,404.0,2052.5,253.2
dvorak_silhouettes,12,674,1852.5,10649,1539,56.2,154.4,887.4,128.2
grieg_lyrical_pieces,66,5414,16485.25,65774,8231,82.0,249.8,996.6,124.7
liszt_pelerinage,19,2625,9709.25,59534,5069,138.2,511.0,3133.4,266.8
medtner_tales,19,2464,6598.0,42929,6855,129.7,347.3,2259.4,360.8
schumann_kinderszenen,13,392,934.0,5223,948,30.2,71.8,401.8,72.9
tchaikovsky_seasons,12,1250,3919.5,18751,3055,104.2,326.6,1562.6,254.6


# Harmony labels
## Unigrams
For computing unigram statistics, the tokens need to be grouped by their occurrence within a major or a minor key because this changes their meaning. To that end, the annotated corpus is sliced into contiguous localkey segments, which are then grouped into a major (`is_minor=False`) and a minor group.

In [21]:
# Slice the annotated data into contiguous local-key segments.
localkey_slices = LocalKeySlicer().process_data(annotated)

In [22]:
# Group the local-key segments by mode (major vs. minor).
mode_slices = ModeGrouper().process_data(localkey_slices)

### Whole dataset

In [23]:
# Chord-symbol unigram counts over the mode-grouped slices.
# NOTE(review): `once_per_group=True` presumably aggregates one result per group
# rather than per slice - confirm against the dimcat documentation.
unigrams = ChordSymbolUnigrams(once_per_group=True).process_data(mode_slices)

In [24]:
# Display the unigram counts (shown below as a Series indexed by localkey_is_minor and chord).
unigrams.get()

localkey_is_minor  chord        
False              I                5229
                   V7               2597
                   V                2176
                   I6               1757
                   IV                981
                                    ... 
True               I(#6)               1
                   V(642)              1
                   I(94)               1
                   It/iv               1
                   #viio2(+2)/ii       1
Name: count, Length: 3715, dtype: int64

In [25]:
# Display names for the boolean `localkey_is_minor` group key.
modes = {True: 'MINOR', False: 'MAJOR'}
# One iteration per mode group: print vocabulary size, token count, and the first 20 entries.
for (is_minor,), counts in unigrams.iter():
    mode_name = modes[is_minor]
    n_types, n_tokens = counts.shape[0], counts.sum()
    print(f"{mode_name} UNIGRAMS\n{n_types} types, {n_tokens} tokens")
    print(counts.head(20).to_string())

MAJOR UNIGRAMS
2004 types, 35610 tokens
chord
I        5229
V7       2597
V        2176
I6       1757
IV        981
V(64)     722
V43       670
V2        647
V6        638
vi        610
ii        570
ii6       548
V65       541
IV6       469
I64       445
V7(9)     363
viio6     347
V7/V      324
ii7       278
V7/IV     236
MINOR UNIGRAMS
1711 types, 21279 tokens
chord
i          2703
V          1413
V7         1173
i6          914
iv          489
V(64)       458
V6          354
VI          321
iv6         314
V43         303
i64         243
III         233
V65         222
V2          213
#viio43     209
I           208
v           170
bII         161
V7/III      158
#viio7      156


### Per corpus

In [26]:
# Additionally group by corpus before counting, giving unigram counts per mode and corpus.
corpus_wise_unigrams = Pipeline([CorpusGrouper(), ChordSymbolUnigrams(once_per_group=True)]).process_data(mode_slices)

In [27]:
# Display the counts (shown below as a Series indexed by localkey_is_minor, corpus and chord).
corpus_wise_unigrams.get()

localkey_is_minor  corpus                   chord   
False              beethoven_piano_sonatas  I           2232
                                            V           1189
                                            I6          1125
                                            V7          1041
                                            V6           468
                                                        ... 
True               tchaikovsky_seasons      i(4)/v         1
                                            i/v            1
                                            ii64           1
                                            v6/iv          1
                                            ii%65(2)       1
Name: count, Length: 6705, dtype: int64

In [28]:
# For every (mode, corpus) group: one header line followed by the first five unigrams.
for (is_minor, corpus_name), counts in corpus_wise_unigrams.iter():
    header = f"{corpus_name} {modes[is_minor]} unigrams ({counts.shape[0]} types, {counts.sum()} tokens)"
    print(header)
    print(counts.head(5).to_string())

beethoven_piano_sonatas MAJOR unigrams (842 types, 15275 tokens)
chord
I     2232
V     1189
I6    1125
V7    1041
V6     468
chopin_mazurkas MAJOR unigrams (451 types, 5256 tokens)
chord
I       949
V7      659
V       252
IV      189
V7/V    128
debussy_suite_bergamasque MAJOR unigrams (106 types, 363 tokens)
chord
I         28
V7        22
iii       19
V7(+2)    12
IV6       12
dvorak_silhouettes MAJOR unigrams (140 types, 1067 tokens)
chord
I     254
I6    103
V7     89
V      79
IV     46
grieg_lyrical_pieces MAJOR unigrams (659 types, 4882 tokens)
chord
I     566
V7    264
V     224
IV    127
vi    116
liszt_pelerinage MAJOR unigrams (552 types, 3455 tokens)
chord
I     501
V7    250
V     155
I6    109
IV    108
medtner_tales MAJOR unigrams (636 types, 2864 tokens)
chord
I     263
V     139
V7    122
I6     83
IV     61
schumann_kinderszenen MAJOR unigrams (114 types, 714 tokens)
chord
I      106
I6      69
V       65
V7      50
V43     30
tchaikovsky_seasons MAJOR unigrams (194

### Per piece

In [29]:
# Additionally group by piece before counting, giving unigram counts per mode and piece.
# NOTE(review): the result index displayed below contains `fname` but no corpus level,
# while file names such as 'op08n01' occur in more than one corpus in the metadata -
# verify that pieces from different corpora are not merged.
piece_wise_unigrams = Pipeline([PieceGrouper(), ChordSymbolUnigrams(once_per_group=True)]).process_data(mode_slices)

In [30]:
# Display the counts (shown below as a Series indexed by localkey_is_minor, fname and chord).
piece_wise_unigrams.get()

localkey_is_minor  fname    chord 
False              01-1     V7        11
                            I6         8
                            I          7
                            ii6        5
                            V          5
                                      ..
True               op71n07  ii%65      3
                            VIM43      2
                            ii%7       2
                            V(964)     2
                            #viio2     1
Name: count, Length: 16090, dtype: int64

## Bigrams

### Whole dataset

In [31]:
# Chord-symbol bigram (transition) counts over the mode-grouped slices.
bigrams = ChordSymbolBigrams(once_per_group=True).process_data(mode_slices)

In [32]:
# Display the bigram counts (shown below as a Series indexed by localkey_is_minor, from and to).
bigrams.get()

localkey_is_minor  from  to      
False              V7    I           1254
                   V     I            586
                   I     V            443
                         V7           406
                   V2    I6           344
                                     ... 
True               V7    bVI            1
                         VIM7(+2)       1
                         VIM7           1
                         VI(11)         1
                   vo7   i              1
Name: count, Length: 15149, dtype: int64

In [33]:
# Display names for the boolean `localkey_is_minor` group key.
modes = {True: 'MINOR', False: 'MAJOR'}
# One iteration per mode group: print the number of distinct transitions, the token
# count, and the first 20 transitions.
for (is_minor,), transitions in bigrams.iter():
    mode_name = modes[is_minor]
    n_types, n_tokens = transitions.shape[0], transitions.sum()
    print(f"{mode_name} BIGRAMS\n{n_types} transition types, {n_tokens} tokens")
    print(transitions.head(20).to_string())

MAJOR BIGRAMS
8539 transition types, 34721 tokens
from   to 
V7     I      1254
V      I       586
I      V       443
       V7      406
V2     I6      344
V(64)  V7      310
I      I6      294
I6     I       292
V65    I       266
I      IV      245
V      V7      233
V43    I       218
I      V43     210
IV     I       164
V6     I       164
I      V6      161
V      I6      147
V(64)  V       142
I6     IV      139
I      V2      132
MINOR BIGRAMS
6610 transition types, 20690 tokens
from   to   
V7     i        573
V      i        417
i      V        209
       V7       192
       i6       172
V(64)  V7       149
i6     i        145
V      V7       142
V(64)  V        138
V43    i        133
V65    i        129
V6     i        118
i      V6       104
V2     i6        94
i      iv        89
       VI        80
       V43       75
V7(6)  V7        65
V7     V(64)     61
i(9)   i         57


### Per corpus

In [34]:
# Chord-symbol bigram counts per corpus: group the mode slices by corpus, then
# count transitions. NOTE(review): once_per_group=True presumably yields one
# result per group instead of one per slice — confirm against the dimcat docs.
corpus_wise_bigrams = Pipeline(
    [
        CorpusGrouper(),
        ChordSymbolBigrams(once_per_group=True),
    ]
).process_data(mode_slices)

In [35]:
# Show the corpus-wise bigram counts; the displayed result is a `count` Series
# indexed by (localkey_is_minor, corpus, from, to).
corpus_wise_bigrams.get()

localkey_is_minor  corpus                   from    to      
False              beethoven_piano_sonatas  V7      I           532
                                            V       I           323
                                            V2      I6          238
                                            I       V           230
                                            V(64)   V7          213
                                                               ... 
True               tchaikovsky_seasons      i(4)/v  i/v           1
                                            V2/bII  ii%65(2)      1
                                            V7(2)   V7            1
                                            ii%43   V             1
                                            ii%65   ii%7(4)       1
Name: count, Length: 19367, dtype: int64

In [36]:
# For each (mode, corpus) group, print the number of distinct transitions
# (types), the total number of transitions (tokens), and the first five rows.
# `modes` (bool -> 'MINOR'/'MAJOR' label) comes from an earlier cell.
for (is_minor, corpus_name), transition_counts in corpus_wise_bigrams.iter():
    n_types = transition_counts.shape[0]
    n_tokens = transition_counts.sum()
    print(f"{corpus_name} {modes[is_minor]} bigrams ({n_types} transition types, {n_tokens} tokens)")
    print(transition_counts.head(5).to_string())

beethoven_piano_sonatas MAJOR bigrams (3509 transition types, 15028 tokens)
from   to
V7     I     532
V      I     323
V2     I6    238
I      V     230
V(64)  V7    213
chopin_mazurkas MAJOR bigrams (1394 transition types, 5135 tokens)
from  to
V7    I     338
I     IV     96
      V7     93
      V      70
IV    I      67
debussy_suite_bergamasque MAJOR bigrams (245 transition types, 354 tokens)
from  to   
I     iii      8
V7    I        7
iii6  V7       5
iii   bIII6    5
      V64      4
dvorak_silhouettes MAJOR bigrams (347 transition types, 1013 tokens)
from  to
V7    I     64
I     V7    45
V     I     37
V2    I6    29
I     I6    21
grieg_lyrical_pieces MAJOR bigrams (1735 transition types, 4755 tokens)
from   to
V7     I     102
I      V      39
V      I      37
V7(9)  I      31
I      V7     30
liszt_pelerinage MAJOR bigrams (1471 transition types, 3323 tokens)
from  to
V7    I     97
V     I     47
I     V     44
      V7    42
I(9)  I     41
medtner_tales MAJOR bigrams (

### Per piece

In [37]:
# Chord-symbol bigram counts per piece: group the mode slices by piece, then
# count transitions.
# In the recorded run this cell reported that one slice of piece '23-2' had
# only a single row, so no bigram could be computed and the group
# (True, '23-2') is missing from the result.
piece_wise_bigrams = Pipeline(
    [
        PieceGrouper(),
        ChordSymbolBigrams(once_per_group=True),
    ]
).process_data(mode_slices)

(('beethoven_piano_sonatas', '23-2', Interval(190.0, 194.0, closed='left')),): DataFrame has only one row, cannot compute bigram.
Group '(True, '23-2')' will be missing from the processed data.


In [38]:
# Show the piece-wise bigram counts; the displayed result is a `count` Series
# indexed by (localkey_is_minor, fname, from, to).
piece_wise_bigrams.get()

localkey_is_minor  fname    from     to    
False              01-1     V(64)    V7        4
                            viio7/V  V(64)     3
                            ii6(2)   ii6       3
                            V65/V    V         3
                            V7(+b9)  V7        3
                                              ..
True               op71n07  V(964)   ii%65     2
                            VIM43    ii%7      2
                            ii%65    VIM43     2
                            ii%7     V(964)    2
                            ii%65    #viio2    1
Name: count, Length: 27544, dtype: int64