In [None]:
import sys, os
from fractions import Fraction
import pandas as pd
import seaborn as sns
from ms3 import transform, name2fifths, fifths2pc
sys.path.append(os.path.abspath('../python_scripts/'))
from data_types import ChordType, PitchType
from constants import TRIAD_REDUCTION

def vc(S):
    """Tabulate how often each value of ``S`` occurs, absolutely and relatively.

    Missing values are counted as a category of their own.

    Parameters
    ----------
    S : pd.Series
        The values to tabulate.

    Returns
    -------
    pd.DataFrame
        Indexed by the distinct values of ``S`` with the columns ``counts``
        (number of occurrences) and ``fraction`` (share of all occurrences).
    """
    counts = S.value_counts(dropna=False)
    fractions = counts / counts.sum()
    return pd.concat([counts, fractions], axis=1, keys=['counts', 'fraction'])

In [None]:
data = '../Data/Beethoven_sonata_progression/'
# Keep only CSV files: os.listdir() returns every directory entry, including
# hidden files and sub-directories (e.g. `.DS_Store`, `.ipynb_checkpoints`),
# which pd.read_csv cannot parse. Sorting makes the load order reproducible.
csv_files = sorted(f for f in os.listdir(data) if f.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {os.path.abspath(data)}")
fnames = [os.path.splitext(f)[0] for f in csv_files]
csv_paths = [os.path.join(data, f) for f in csv_files]
dtypes = dict(binary=bool, vl=int)


def to_frac(f):
    """Parse a raw `tbt` cell into a Fraction whose denominator is at most 8."""
    return Fraction(f).limit_denominator(8)


converters = dict(tbt=to_frac)
# One frame per file, stacked under a (file name, row number) MultiIndex.
D = pd.concat([pd.read_csv(p, dtype=dtypes, converters=converters) for p in csv_paths], keys=fnames).sort_index()
D.head()

**Percentage of diverging labels** (binary=True)

In [None]:
vc(D.binary)

### Durations or counts?
Set `delete_repetitions` to False to have the notebook account for the durations of divergences.

Set it to True if you only care about their counts. In this case you will see the new proportion between identical and diverging labels.

In [None]:
delete_repetitions = True

if delete_repetitions:
    # Compare every row with the previous row *of the same file* (index level 0).
    # A plain `.shift()` on the concatenated frame would compare the first row of
    # a piece with the last row of the preceding piece and could drop it.
    previous = D.groupby(level=0)[['annotation1_chord', 'annotation2_chord']].shift()
    changed1 = D.annotation1_chord != previous.annotation1_chord
    changed2 = D.annotation2_chord != previous.annotation2_chord
    # Keep a row as soon as either annotation changes; the first row of each
    # file is compared against NaN and therefore always kept.
    D = D[changed1 | changed2]
    print(D.binary.value_counts(normalize=True))

In [None]:
df = D[D.binary] # select only diverging

**Overview Spectral Pitch distances**

In [None]:
# Summary statistics of the `sps` column of the diverging rows,
# followed by its distribution as a 50-bin histogram.
print(df.sps.describe())
sns.histplot(data=df, x='sps', bins=50);

**Overview Voice Leading distances**

In [None]:
# Summary statistics of the `vl` column, then a bar chart showing how often
# each distinct value occurs.
print(df.vl.describe())
vl = df.vl.value_counts()
sns.barplot(x=vl.index, y=vl);

**Overview tone-by-tone distances**

In [None]:
# `tbt` was read through the Fraction converter, so it is cast to float
# before computing summary statistics.
print(df.tbt.astype(float).describe())
# Bar chart of how often each distinct `tbt` value occurs.
tbt = df.tbt.value_counts()
sns.barplot(x=tbt.index, y=tbt);

## Expanding the data to inspect different categories of chord divergence

In [None]:
# Capture groups: 0 = root name, 1 = its accidentals, 2 = chord type, 3 = inversion digit.
regex = r"^([A-G](b*|#*)?)_(.*)_inv(\d)$"


def is_triad(type_str):
    """Return True if the ChordType member named ``type_str`` has a value below 4.

    NOTE(review): presumably the triad members are the first ones enumerated in
    ChordType -- confirm against data_types.py.
    """
    return ChordType[type_str].value < 4


# Start again from the diverging rows so that re-running this cell does not
# append the derived columns a second time.
df = D[D.binary]
for n, col in enumerate(('annotation1_chord', 'annotation2_chord'), start=1):
    renaming = {
        0: f"root{n}",
        2: f"type{n}",
        3: f"inv{n}",
        'triad': f"triad{n}",
        'tpc': f"tpc{n}",
        'pc': f"pc{n}",
    }
    # Split the label into root, type and inversion (group 1 is not needed).
    parts = df[col].str.extract(regex).loc[:, [0, 2, 3]]
    expanded = parts.assign(
        triad=transform(parts[2], is_triad),
        tpc=transform(parts[0], name2fifths),
    )
    expanded = expanded.assign(pc=transform(expanded.tpc, fifths2pc))
    df = pd.concat([df, expanded.rename(columns=renaming)], axis=1)

# Absolute root distance between the two annotations, from the tpc and pc columns.
tpc_iv = (df.tpc1 - df.tpc2).abs().rename('tpc_iv')
pc_iv = (df.pc1 - df.pc2).abs().rename('pc_iv')
# Fold semitone distances above 6 back into the range 0..6.
pc_iv = pc_iv.where(pc_iv <= 6, 12 - pc_iv)
df = pd.concat([df, tpc_iv, pc_iv], axis=1)
df.head()

In [None]:
# Row-wise boolean masks stating in which respect the two annotations agree.
same_root = df['tpc_iv'].eq(0)
same_type = df['type1'].eq(df['type2'])
same_inversion = df['inv1'].eq(df['inv2'])

### Inspecting chords diverging only by inversion
**Proportion of diverging chords that have the same root and type but different inversion (True)**

In [None]:
diverging_inv = same_root & same_type
vc(diverging_inv)

In [None]:
def _sorted_inversions(row):
    """Return both inversion figures of a row as an ascending tuple of ints."""
    return tuple(sorted(row.astype(int)))


# Unordered pair of inversions per row, e.g. (0, 2) for both 0/2 and 2/0.
inv_groups = (
    df.loc[diverging_inv, ['inv1', 'inv2']]
    .apply(_sorted_inversions, axis=1)
    .rename('inversions')
)
# Absolute difference between the two inversion figures.
inv_dist = inv_groups.map(lambda pair: abs(pair[0] - pair[1])).rename('inv_dist')
different_inversions = pd.concat([df[diverging_inv], inv_groups, inv_dist], axis=1)
vc(different_inversions.inv_dist)

In [None]:
different_inversions.groupby(['inv_dist', 'triad1', 'inversions']).size()

#### Tone by tone

In [None]:
vc(different_inversions.tbt)

In [None]:
different_inversions.groupby(['tbt', 'triad1', 'inversions']).size()

**Grouping `tbt` values by combinations of inversions shows that all tetrads have the distance `1/4` and all triads the distance `2/7`** Note that the chords in question all share the same root.

#### Voice leading

In [None]:
vc(different_inversions.vl)

In [None]:
different_inversions.groupby(['vl', 'inv_dist', 'triad1']).size()

In [None]:
different_inversions.groupby(['inv_dist', 'triad1', 'type1', 'inversions', 'vl']).size()

In [None]:
different_inversions.groupby(['vl', 'type1', 'inversions']).size()

**`18` is among the highest values that the voice leading distance yields, so it is rather surprising that chords with the same root and type receive this value.**

#### SPS

In [None]:
different_inversions.groupby(['type1', 'inversions', 'sps']).size().head(60)

**Surprising to see such different distances for transpositionally identical chord combinations**
As an example, let's inspect all cases where both datasets have a diminished 7th chord on the same root, one in first, one in second inversion:

In [None]:
# `inversions` holds tuples of ints (the figures are cast with `astype(int)`
# before sorting), so the pair has to be given as ints: comparing against
# ('1', '2') never matches and always yields an empty selection.
# The comparison is done element by element so that the tuple is treated as a
# single value rather than as a sequence to align with the Series.
is_dim7 = different_inversions.type1 == 'DIM7'
is_first_vs_second = different_inversions.inversions.map(lambda pair: pair == (1, 2)).astype(bool)
different_inversions[is_dim7 & is_first_vs_second]

In [None]:
# Smallest and largest `sps` value within each `triad1` group.
gb = different_inversions.groupby('triad1')['sps']
gb.agg(min_sps='min', max_sps='max')

### Inspecting chords diverging only by type
**Proportion of diverging chords that have the same root and inversion but different type (True)**

In [None]:
diverging_type = same_root & same_inversion
vc(diverging_type)

#### Tone by tone

In [None]:
def _sorted_pair(row):
    """Return the row's values as an ascending tuple, so (a, b) and (b, a) coincide."""
    return tuple(sorted(row))


rows_diverging_by_type = df[diverging_type]
# Unordered pair of chord types per row.
type_groups = (
    rows_diverging_by_type[['type1', 'type2']]
    .apply(_sorted_pair, axis=1)
    .rename('types')
)
# Unordered pair of the triad flags per row.
triad_groups = (
    rows_diverging_by_type[['triad1', 'triad2']]
    .apply(_sorted_pair, axis=1)
    .rename('triads')
)
different_types = pd.concat([rows_diverging_by_type, type_groups, triad_groups], axis=1)
vc(different_types.tbt)

In [None]:
different_types.groupby(['tbt', 'triads', 'inv1']).size()

In [None]:
different_types.groupby(['tbt', 'inv1', 'types']).size()

In [None]:
vc(different_types.vl)

In [None]:
different_types.groupby(['vl', 'triads', 'types']).size()

#### SPS

In [None]:
different_types.sps.describe()

In [None]:
different_types[different_types.sps > 0.4]

### Filtering out tetrad-triad correspondence (e.g. M <-> Mm7)

In [None]:
def reduce_tetrad(type_str):
    """Look up the ChordType member named ``type_str`` and return its entry in TRIAD_REDUCTION."""
    return TRIAD_REDUCTION[ChordType[type_str]]


# True where both annotations map to the same TRIAD_REDUCTION entry.
reduced_type1 = transform(df.type1, reduce_tetrad)
reduced_type2 = transform(df.type2, reduce_tetrad)
same_type_reduced = reduced_type1 == reduced_type2

In [None]:
diverging_type_reduced = diverging_type & ~same_type_reduced
vc(diverging_type_reduced)

In [None]:
different_types_reduced = df[diverging_type_reduced]
vc(different_types_reduced.tbt).sort_index()

In [None]:
vc(different_types_reduced.vl).sort_index()

In [None]:
different_types_reduced.sps.describe()

In [None]:
different_types_reduced.sort_values('sps').head(50)

### Inspecting chords diverging only by root
**Proportion of diverging chords that have the same type and inversion but different root (True)**

In [None]:
diverging_root = same_type & same_inversion
vc(diverging_root)

**Divergence by fifths**

In [None]:
different_roots = df[diverging_root].copy()
vc(different_roots.tpc_iv)

In [None]:
different_roots[different_roots.tpc_iv == 11]

**Divergence by semitones**

In [None]:
vc(different_roots.pc_iv)

In [None]:
vc(different_roots.tbt)

In [None]:
vc(different_roots.vl).sort_index()

In [None]:
different_roots.sps.describe()