In [None]:
%load_ext autoreload
%autoreload 2
import os
from dimcat import (
    Corpus,  
    Pipeline,
    IsAnnotatedFilter,
    CorpusGrouper, 
    PieceGrouper, 
    ModeGrouper, 
    ChordFeatureSlicer,
    ChordSymbolBigrams, 
    ChordSymbolUnigrams,
    LocalKeySlicer,
)
from dimcat import __version__ as dimcat_version
from ms3 import __version__ as ms3_version
from git import Repo
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
pd.set_option("display.max_columns", 100)

In [None]:
# Location of the local corpus clone; it is used for the git lookup in this cell.
corpus_path = "~/romantic_piano_corpus"

# Print the corpus folder name together with the first 7 characters of the hash
# of the commit returned by repo.commit(), followed by the versions of the two
# libraries imported above, so the output records what the notebook was run on.
repo = Repo(corpus_path)
print(f"{os.path.basename(corpus_path)} @ {repo.commit().hexsha[:7]}")
print(f"dimcat version {dimcat_version}")
print(f"ms3 version {ms3_version}")

In [None]:
# Layout settings passed to `fig.update_layout(**STD_LAYOUT)` by the plotting cells:
# white backgrounds, margins only at the left and top, and a font size of 15.
STD_LAYOUT = dict(
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='#FFFFFF',
    margin=dict(l=40, r=0, b=0, t=40, pad=0),
    font=dict(size=15),
)
# Folder into which the plotting cells write their PNG files.
# NOTE(review): this is an absolute path on one machine; adapt it before running elsewhere.
OUTPUT_DIR = "/home/hentsche/Documents/phd/romantic_piano_corpus_report/figures/"

# Overview

In [None]:
# Create the dimcat Corpus from the directory given by corpus_path and show its
# `data` attribute as the cell output.
corpus = Corpus(directory=corpus_path)
corpus.data

In [None]:
# Apply dimcat's IsAnnotatedFilter to the corpus and report how many IDs are
# present before and after filtering.
# NOTE(review): presumably only pieces with annotations remain — confirm in the dimcat docs.
annotated = IsAnnotatedFilter().process_data(corpus)
print(f"Before: {len(corpus.indices[()])} IDs, after filtering: {len(annotated.indices[()])}")

**Choose here if you want to see stats for all or only for annotated scores.**

In [None]:
# Switch for the dataset used by the cells below: exactly one of the two
# assignments should be active. Swap the comment marker to use the unfiltered corpus.
#selected = corpus
selected = annotated

## Notes

In [None]:
# Fetch the 'notes' facet of the selected dataset and report its size.
# Files are counted by grouping on the first two index levels
# (presumably corpus and piece — TODO confirm against the dimcat index layout).
all_notes = selected.get_facet('notes')
print(f"{len(all_notes.index)} notes over {len(all_notes.groupby(level=[0,1]))} files.")
all_notes.head()

In [None]:
def weight_notes(nl, group_col='midi', precise=True):
    """Express the summed durations of a note list as repeated pitch values.

    The ``duration_qb`` values of ``nl`` are summed per distinct value of
    ``group_col`` and divided by the smallest of these sums, so that the value
    with the shortest total duration receives a weight of 1. Each value is then
    repeated as often as its rounded weight indicates.

    Parameters
    ----------
    nl : pd.DataFrame
        Note list containing the columns ``group_col`` and ``duration_qb``.
    group_col : str, optional
        Column whose values are weighted. Defaults to ``'midi'``.
    precise : bool, optional
        If False, all weights are divided by 1.9999999 before rounding. This
        roughly halves the length of the result (less compute downstream) at
        the cost of precision.

    Returns
    -------
    pd.Series
        The values of ``group_col``, each repeated according to its weight,
        with a default integer index.
    """
    summed_durations = nl.groupby(group_col).duration_qb.sum()
    # Normalize such that the shortest summed duration results in 1 occurrence.
    summed_durations /= summed_durations.min()
    if not precise:
        # This simple trick reduces compute time but also precision:
        # the divisor is slightly below 2 so that the smallest weight ends up
        # slightly above 0.5. If it was exactly 0.5 it would be rounded down
        # to 0 by repeat_notes_according_to_weights().
        summed_durations /= 1.9999999
    return repeat_notes_according_to_weights(summed_durations)


def repeat_notes_according_to_weights(weights):
    """Repeat each index label of ``weights`` as often as its rounded weight.

    Parameters
    ----------
    weights : pd.Series
        Numeric weights indexed by the values to be repeated.

    Returns
    -------
    pd.Series
        The index labels of ``weights`` in their original order, each one
        repeated ``round(weight)`` times, with a default integer index.
        Labels whose weight rounds to 0 do not occur.
    """
    counts = weights.round().astype(int)
    # Vectorised repetition. This replaces a loop over `Series.iteritems()`,
    # a method that was removed in pandas 2.0.
    repeated = counts.index.to_numpy().repeat(counts.to_numpy())
    return pd.Series(repeated)

# One group of notes per value of the first index level.
grouped_notes = all_notes.groupby(level=0)
# One column of duration-weighted MIDI pitches per group (fast, less precise variant).
midi_columns = [weight_notes(nl, 'midi', precise=False) for _, nl in grouped_notes]
weighted_midi = pd.concat(midi_columns, axis=1, keys=grouped_notes.groups.keys())
weighted_midi

In [None]:
# Label the pitch axis at every multiple of 12 with the corresponding C (60 -> "C4").
yaxis = {
    'tickmode': 'array',
    'tickvals': [12, 24, 36, 48, 60, 72, 84, 96],
    'ticktext': ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7"],
    'gridcolor': 'lightgrey',
}
# No title is set; a possible one would be "Distribution of pitches per corpus".
violin_labels = {'variable': '', 'value': 'pitch'}
fig = px.violin(weighted_midi, labels=violin_labels, box=True, height=500)
fig.update_layout(yaxis=yaxis, **STD_LAYOUT)
ambitus_png = os.path.join(OUTPUT_DIR, "ambitus_per_corpus.png")
fig.write_image(ambitus_png, scale=2)
fig.show()

In [None]:
# One column of duration-weighted 'tpc' values per group, using the precise weighting.
tpc_columns = [weight_notes(nl, 'tpc') for _, nl in grouped_notes]
weighted_tpc = pd.concat(tpc_columns, axis=1, keys=grouped_notes.groups.keys())
weighted_tpc

In [None]:
# Label every third tpc value with its note name (0 -> "C") and draw the zero line.
yaxis = {
    'tickmode': 'array',
    'tickvals': [-12, -9, -6, -3, 0, 3, 6, 9, 12, 15, 18],
    'ticktext': ["Dbb", "Bbb", "Gb", "Eb", "C", "A", "F#", "D#", "B#", "G##", "E##"],
    'gridcolor': 'lightgrey',
    'zerolinecolor': 'lightgrey',
    'zeroline': True,
}
violin_labels = {'variable': '', 'value': 'pitch class'}
fig = px.violin(weighted_tpc, labels=violin_labels, box=True, height=500)
fig.update_layout(yaxis=yaxis, **STD_LAYOUT)
tpc_png = os.path.join(OUTPUT_DIR, "tpc_per_corpus.png")
fig.write_image(tpc_png, scale=2)
fig.show()

In [None]:
# adapted from https://plotly.com/python/violin/#ridgeline-plot
# One horizontal half-violin per column of weighted_tpc.
fig = go.Figure()
# `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
# The loop variable is deliberately not called `corpus`: that name holds the
# Corpus object created earlier and must not be overwritten with a column label.
for corpus_name, tpc_column in weighted_tpc.items():
    fig.add_trace(go.Violin(x=tpc_column, name=corpus_name))

fig.update_traces(side='positive', orientation='h', width=2, points=False)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=True, height=600)
fig.show()

In [None]:
# One subplot row per group of notes, each showing the share of the summed
# duration that falls on every 'tpc' value.
corpus_names = list(grouped_notes.groups.keys())
fig = make_subplots(rows=len(grouped_notes), cols=1, subplot_titles=corpus_names, shared_xaxes=True)
# The loop variable is deliberately not called `corpus`: that name holds the
# Corpus object created earlier and must not be overwritten with a group label.
for row, (corpus_name, notes) in enumerate(grouped_notes, 1):
    tpc_durations = notes.groupby('tpc').duration_qb.sum()
    tpc_durations /= tpc_durations.sum()  # normalize to proportions that sum to 1
    fig.add_trace(
        go.Scatter(x=tpc_durations.index, y=tpc_durations, name=corpus_name, mode='lines+markers'),
        row=row,
        col=1,
    )

fig.update_layout(**STD_LAYOUT, showlegend=False, height=800, width=300)
fig.update_xaxes(
    gridcolor='lightgrey',
    zerolinecolor='lightgrey',
    tickmode='array',
    tickvals=[-12, -6, 0, 6, 12, 18],
    ticktext=["Dbb", "Gb", "C", "F#", "B#", "E##"],
)
fig.update_yaxes(showgrid=False, zeroline=False)
# `scale` is an argument of write_image(), not of os.path.join(); the original
# line was missing the parenthesis that closes the join call.
fig.write_image(os.path.join(OUTPUT_DIR, "tpc_line_per_corpus.png"), scale=2)
fig.show()

### Notes and staves

In [None]:
# Number of notes per value of the 'staff' column, over all selected notes.
print("Distribution of notes over staves:")
all_notes['staff'].value_counts()

In [None]:
print("Distribution of notes over staves for all pieces with more than two staves\n")
# Group on the first two index levels and report only groups containing a staff number above 2.
for piece_id, piece_notes in all_notes.groupby(level=[0, 1]):
    staff_numbers = piece_notes['staff']
    if not (staff_numbers > 2).any():
        continue
    print(piece_id)
    print(staff_numbers.value_counts().to_dict())

In [None]:
# Counts per staff number, restricted to notes on staves above 2, per group of the first two index levels.
notes_above_staff_two = all_notes.loc[all_notes['staff'] > 2]
notes_above_staff_two.groupby(level=[0, 1])['staff'].value_counts()