In [None]:
import sys, os
from fractions import Fraction
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ms3 import transform, name2fifths, fifths2pc
sys.path.append(os.path.abspath('../python_scripts/'))
from data_types import ChordType, PitchType
from utils import get_chord_pitches
from constants import TRIAD_REDUCTION
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)

def vc(S):
    """Tabulate the values of ``S`` with absolute and relative frequencies.

    Missing values are counted as their own category.

    Returns a DataFrame indexed by the distinct values with the columns
    'counts' (number of occurrences) and 'fraction' (share of the total).
    """
    counts = S.value_counts(dropna=False)
    fractions = counts / counts.sum()
    return pd.concat([counts, fractions], axis=1, keys=['counts', 'fraction'])

In [None]:
# Directory holding one CSV file per annotated piece.
data = '../Data/Beethoven_sonata_progression/'
csv_files = os.listdir(data)
# File names without extension become the outer index level of D.
fnames = [os.path.splitext(name)[0] for name in csv_files]
csv_paths = [os.path.join(data, name) for name in csv_files]
dtypes = {'binary': bool, 'vl': int}


def to_frac(f):
    """Parse a cell value into a Fraction whose denominator is at most 8."""
    return Fraction(f).limit_denominator(8)


# The 'tbt' column is read as exact fractions rather than floats.
converters = {'tbt': to_frac}
D = pd.concat(
    [pd.read_csv(path, dtype=dtypes, converters=converters) for path in csv_paths],
    keys=fnames,
).sort_index()
D.head()

### Durations or counts?
Set `delete_repetitions` to False to have the notebook account for the durations of divergences.

Set it to True if you only care about their counts. In this case you will see the new proportion between identical and diverging labels.

In [None]:
# True: keep only the rows where at least one of the two annotations changes,
# so that every divergence is counted once regardless of its duration.
# False: keep all rows, so that durations are reflected in the statistics.
delete_repetitions = True

if delete_repetitions:
    # Compare each row with its predecessor *within the same file* (index
    # level 0 holds the file name). Shifting over the concatenated frame would
    # compare the first row of a file with the last row of the previous file
    # and could wrongly discard it as a repetition.
    by_file = D.groupby(level=0)
    changed1 = D.annotation1_chord != by_file['annotation1_chord'].shift()
    changed2 = D.annotation2_chord != by_file['annotation2_chord'].shift()
    D = D[changed1 | changed2]
    # Proportion of the values in the `binary` column after the reduction.
    print(D.binary.value_counts(normalize=True))

### Data Expansion

In [None]:
def is_triad(type_str):
    """Return True if the ChordType member named ``type_str`` has a value below 4.

    NOTE(review): presumably the triad types are the first four enum members;
    confirm against ``data_types.ChordType``.
    """
    return ChordType[type_str].value < 4


# Captures: group 0 = root name (letter plus accidentals), group 1 = accidentals
# only (discarded below), group 2 = chord type, group 3 = inversion digit.
regex = r"^([A-G](b*|#*)?)_(.*)_inv(\d)$"

df = D[D.binary]  # select only diverging
for n, col in enumerate(('annotation1_chord', 'annotation2_chord'), start=1):
    renaming = {
        0: f"root{n}",
        2: f"type{n}",
        3: f"inv{n}",
        'triad': f"triad{n}",
        'tpc': f"tpc{n}",
        'pc': f"pc{n}",
    }
    # Split the chord label into root, type and inversion.
    expanded = df[col].str.extract(regex).loc[:, [0, 2, 3]]
    # Derived features: triad flag, root via name2fifths, and its fifths2pc value.
    expanded['triad'] = transform(expanded[2], is_triad)
    expanded['tpc'] = transform(expanded[0], name2fifths)
    expanded['pc'] = transform(expanded.tpc, fifths2pc)
    df = pd.concat([df, expanded.rename(columns=renaming)], axis=1)

In [None]:
def tbt_relaxed(set_a, set_b, as_distance=False):
    """Share of common tones, normalized by the size of the smaller set.

    Because the smaller set is the reference, a triad that is fully contained
    in a tetrad yields an overlap of 1 (i.e. a triad is normalized by 3).

    Parameters
    ----------
    set_a, set_b : set
        The two collections of tones to compare.
    as_distance : bool, optional
        If True, return ``1 - overlap`` instead of the overlap.

    Returns
    -------
    Fraction
        Exact ratio in lowest terms.

    Raises
    ------
    ZeroDivisionError
        If one of the sets is empty.
    """
    n = min(len(set_a), len(set_b))
    # Build the ratio exactly instead of going through a float and repairing
    # the rounding error with limit_denominator().
    overlap = Fraction(len(set_a.intersection(set_b)), n)
    if as_distance:
        overlap = 1 - overlap
    return overlap
    
def tbt_relaxed_cacheable(pc1, pc2, type1, type2, as_distance=False):
    """Relaxed tone-by-tone overlap of two chords given as (root, type name) pairs.

    Takes only plain values as arguments. For each chord the pitches are
    obtained from ``get_chord_pitches`` for the given root and the ChordType
    member named by the type string, reduced modulo 12 and collected in a set.
    The two sets are then compared with ``tbt_relaxed``.
    """
    pitch_class_sets = [
        set(get_chord_pitches(root, ChordType[type_name], PitchType.MIDI) % 12)
        for root, type_name in ((pc1, type1), (pc2, type2))
    ]
    return tbt_relaxed(*pitch_class_sets, as_distance=as_distance)

# Relaxed tone-by-tone overlap for every pair of diverging labels.
overlap = transform(df, tbt_relaxed_cacheable, ['pc1', 'pc2', 'type1', 'type2']).rename('overlap')
# Absolute difference between the two tpc values.
tpc_iv = (df.tpc1 - df.tpc2).abs().rename('tpc_iv')
# Absolute difference between the two pc values, folded so that it never
# exceeds 6 (values above 6 are replaced by their complement to 12).
pc_iv = (df.pc1 - df.pc2).abs().rename('pc_iv')
pc_iv = pc_iv.where(pc_iv <= 6, 12 - pc_iv)
df = pd.concat([df, tpc_iv, pc_iv, overlap], axis=1)
df.head()

### SPS by shared tones ('relaxed tbt')
The number of shared tones is divided by the size of the smaller chord, so if all notes of a triad are contained in a tetrad, the overlap is 1.

In [None]:
# Distribution of sps for each overlap value, largest overlap first.
sns.set(rc={"figure.figsize": (16, 9)})
overlap_order = sorted(df.overlap.unique(), reverse=True)
sns.violinplot(data=df, x='overlap', y='sps', order=overlap_order, cut=0)
plt.xticks(rotation=45)
plt.show()

### Categorize divergences

**Create boolean masks**

In [None]:
def reduce_tetrad(type_str):
    """Return the TRIAD_REDUCTION entry for the ChordType member named ``type_str``."""
    return TRIAD_REDUCTION[ChordType[type_str]]

# Both chord types map to the same TRIAD_REDUCTION entry.
reduced_type1 = transform(df.type1, reduce_tetrad)
reduced_type2 = transform(df.type2, reduce_tetrad)
same_type_reduced = reduced_type1 == reduced_type2

# Elementary agreements between the two annotations.
same_root = df['pc_iv'] == 0
same_type = df['type1'] == df['type2']
same_inversion = df['inv1'] == df['inv2']

# Same root and type (so the labels can differ only in inversion), and
# same root and inversion (so they can differ only in type).
diverging_inv = same_root & same_type
diverging_type = same_root & same_inversion

In [None]:
# Assign exactly one category label to every row of df.
# `selectors` is an ORDERED sequence of (label, boolean mask) pairs: the loop
# below only fills rows that are still uncategorized, so the first matching
# selector wins and later, broader masks act as fall-backs for what remains.
ctgs = pd.Series(index=df.index, dtype='string', name='divergence')
triad_tetrad = same_root & same_type_reduced & (df.triad1 != df.triad2) # the last term ensures that exactly one of the two chords is a triad; it was missing in the first version of the plots
triad_tetrad_root = overlap == 1
both_triads = df.triad1 & df.triad2
selectors = (
    # --- same root ---
    ('inversion', diverging_inv),
    ('triad-tetrad', triad_tetrad & same_inversion),
    ('triad-tetrad & inversion', triad_tetrad & ~same_inversion),
    ('type', diverging_type),
    # everything else sharing the root, not caught by the selectors above
    ('type & inversion', same_root),
    # --- different roots, grouped by relaxed tone-by-tone overlap ---
    ('triad-tetrad & root', triad_tetrad_root),
    ('no_overlap', overlap == 0),
    # the float literals below compare equal to the corresponding Fraction
    # values because 3/4, 1/2 and 1/4 are exactly representable as floats
    ('tetrads_sharing_3', overlap == 3/4),
    ('tetrads_sharing_2', overlap == 1/2),
    ('tetrads_sharing_1', overlap== 1/4),
    ('triads_sharing_2',  (overlap == Fraction(2,3)) & both_triads),
    ('triad_tetrad_sharing_2', (overlap == Fraction(2,3)) & ~both_triads),
    ('triads_sharing_1', (overlap == Fraction(1,3)) & both_triads),
    ('triad_tetrad_sharing_1', (overlap == Fraction(1,3)) & ~both_triads),
    )
for c, sel in selectors:
    # keep labels that are already set; elsewhere use c where the mask is True
    ctgs = ctgs.where(ctgs.notna(), sel.map({True: c, False: pd.NA}))
# Rows that are still uncategorized are labelled with the fraction of diverging
# notes (relaxed tone-by-tone distance), converted to a string.
tbt_rel = transform(df, tbt_relaxed_cacheable, ['pc1', 'pc2', 'type1', 'type2'], as_distance=True)
ctgs = ctgs.where(ctgs.notna(), tbt_rel.astype(str))
vc(ctgs)

#### SPS distributions for the different categories

In [None]:
# Categories ordered by ascending median sps; this order is used on the x-axis
# of the category plots.
median_sps = df.groupby(ctgs).sps.median().sort_values()
order_by_median = median_sps.index.to_list()
# Tick labels: category name followed by its share of all categorized rows.
category_shares = ctgs.value_counts(normalize=True)
labels = [f"{category} ({category_shares[category]:.1%})" for category in order_by_median]

In [None]:
# Violin plot of sps per divergence category, saved to the figures folder.
sns.set(rc={"figure.figsize": (16, 9)})
ax = sns.violinplot(data=df, x=ctgs, y='sps', order=order_by_median, cut=0)
ax.set_xticklabels(labels)
plt.xticks(rotation=35)
ax.set_ylabel('sps including changes')
plt.savefig('../figures/categories_sps_with_changes.png', bbox_inches='tight')
plt.show()

In [None]:
# Box plot of tbt per divergence category with the individual observations
# overlaid, saved to the figures folder.
ax = sns.boxplot(data=df, x=ctgs, y='tbt', order=order_by_median)
ax = sns.stripplot(data=df, x=ctgs, y='tbt', order=order_by_median, marker="o", alpha=0.3, color="black")
ax.set_xticklabels(labels)
plt.xticks(rotation=35)
# Put one tick at every occurring tbt value and label it with the exact
# fraction. The sorted values are used for both positions and labels
# (previously `yticks` was computed but never used).
yticks = sorted(df.tbt.unique())
plt.yticks([float(tick) for tick in yticks], [str(tick) for tick in yticks])
plt.ylabel('tbt including changes')
plt.savefig('../figures/categories_tbt_with_changes.png', bbox_inches='tight')
plt.show()

In [None]:
# Individual vl observations plus a box plot per divergence category, saved to
# the figures folder.
vl_plot_args = dict(data=df, x=ctgs, y='vl', order=order_by_median)
ax = sns.stripplot(**vl_plot_args, marker="o", alpha=0.3, color="black")
ax = sns.boxplot(**vl_plot_args)
ax.set_xticklabels(labels)
plt.xticks(rotation=35)
ax.set_ylabel('vl including changes')
plt.savefig('../figures/categories_vl_with_changes.png', bbox_inches='tight')
plt.show()

# Combining metrics for more fine-grained error categories
## Triad-tetrad SPS divided by VL values

In [None]:
# Independent copy of the rows categorized as 'triad-tetrad'.
tri_tet = df.loc[ctgs == 'triad-tetrad'].copy()
# Summary statistics of sps for each vl value.
tri_tet.groupby('vl')['sps'].describe()

In [None]:
# sps of the 'triad-tetrad' divergences, split by vl value.
ax = sns.boxplot(data=tri_tet, x='vl', y='sps')
ax = sns.stripplot(data=tri_tet, x='vl', y='sps', marker="o", alpha=0.3, color="black")
# Label every box with its vl value and group size. Series.items() replaces
# Series.iteritems(), which was removed in pandas 2.0.
vl_group_sizes = tri_tet.groupby('vl').size()
ax.set_xticklabels([f"{i} (n = {n})" for i, n in vl_group_sizes.items()])
plt.show()

In [None]:
# For every compared feature, pair the values of both annotations into a sorted
# tuple so that (a, b) and (b, a) fall into the same group.
comparisons = ('tpc', 'pc', 'type', 'inv', 'triad')
tri_tet_groups = {}
for compare in comparisons:
    pair_columns = [f"{compare}1", f"{compare}2"]
    tri_tet_groups[compare] = tri_tet[pair_columns].apply(lambda row: tuple(sorted(row)), axis=1)

### Chord type combinations for the different VL values
Chord combinations in this category share the same root, inversion, and reduced chord type.

In [None]:
# Number of 'triad-tetrad' rows per vl value and (sorted) pair of chord types.
vl_and_types = [tri_tet.vl, tri_tet_groups['type']]
tri_tet.groupby(vl_and_types).size()

**The three different VL values seem to depend on the size of the 7th**

In [None]:
# sps summary statistics per vl value, chord type pair and inversion pair.
vl_types_inversions = [tri_tet.vl, tri_tet_groups['type'], tri_tet_groups['inv']]
tri_tet.groupby(vl_types_inversions)['sps'].describe()

In [None]:
# 'triad-tetrad' rows with vl == 3 in which both tpc values are -1.
has_vl_3 = tri_tet.vl == 3
both_tpc_minus_1 = tri_tet_groups['tpc'] == (-1, -1)
tri_tet[has_vl_3 & both_tpc_minus_1]

# Older code
#### Checking outliers

In [None]:
# Smallest sps per vl value among the rows categorized as 'triads_sharing_1'.
df.loc[ctgs == 'triads_sharing_1'].groupby('vl')['sps'].min()

In [None]:
# 'triads_sharing_1' rows whose vl value is at most 3.
is_triads_sharing_1 = ctgs == 'triads_sharing_1'
has_low_vl = df.vl <= 3
df[is_triads_sharing_1 & has_low_vl]

In [None]:
# All 'triads_sharing_1' rows, ordered by ascending sps.
df.loc[ctgs == 'triads_sharing_1'].sort_values(by='sps')

In [None]:
def get_value_pair(row, col):
    """Return the two ``col`` values of a row, ordered by the position of the roots.

    The value belonging to the lower root comes first: the original order
    (``{col}1``, ``{col}2``) is kept if adding ``pc_iv`` to ``pc1`` (modulo 12)
    yields ``pc2``; otherwise the pair is swapped.

    Parameters
    ----------
    row : mapping with attribute access (e.g. a DataFrame row)
        Needs to provide ``pc1``, ``pc2``, ``pc_iv``, ``{col}1`` and ``{col}2``.
    col : str
        Column prefix, e.g. 'type' for the columns 'type1' and 'type2'.
    """
    first, second = row[f"{col}1"], row[f"{col}2"]
    root1_is_lower = (row.pc1 + row.pc_iv) % 12 == row.pc2
    return (first, second) if root1_is_lower else (second, first)

### Inspect particular divergences
#### Two seventh chords with different roots and 3 common tones
For example, the first row, `1  (MAJ_MIN7, DIM7)  4`, means: 4 instances of a dominant 7th chord paired with a diminished 7th chord whose root is 1 semitone higher.

In [None]:
# Rows categorized as 'tetrads_sharing_3', extended by a 'types' column holding
# the pair of chord types as returned by get_value_pair (lower root first).
tetrads_sharing_3 = df.loc[ctgs == 'tetrads_sharing_3']
type_groups = tetrads_sharing_3.apply(
    lambda row: get_value_pair(row, col='type'), axis=1
).rename('types')
tetrads_sharing_3 = pd.concat([tetrads_sharing_3, type_groups], axis=1)
# Number of occurrences per root interval and type pair.
tetrads_sharing_3.groupby(['pc_iv', 'types']).size()

#### Two triads with different roots and 2 common tones

In [None]:
# Rows categorized as 'triads_sharing_2', extended by a 'types' column.
triads_sharing_2 = df.loc[ctgs == 'triads_sharing_2']
# here, the order of types is always (lower_root, higher_root)
type_groups = triads_sharing_2.apply(
    lambda row: get_value_pair(row, col='type'), axis=1
).rename('types')
triads_sharing_2 = pd.concat([triads_sharing_2, type_groups], axis=1)
# Number of occurrences per root interval and type pair.
triads_sharing_2.groupby(['pc_iv', 'types']).size()