In [1036]:
import pandas as pd
from model.PrepareData import PrepareData
import re
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter


In [1037]:
# Maximum number of chords drawn by plot_distribution (read there as a global).
topn = 50
# n-gram sizes passed to PrepareData when the corpus is loaded.
ngrams = [1,2]

In [1038]:
def raw_chords_to_df(tunes, remove_root=False):
    """Count how often each chord token occurs across all tunes.

    Args:
        tunes: iterable of tunes, each itself an iterable of chord tokens.
        remove_root: when True, every match of ``[A-G][#b]?`` inside a token
            is replaced by ``'*'`` before counting, so tokens that differ only
            in those note names are pooled together.

    Returns:
        DataFrame with the columns ``chord`` and ``count``, ordered by
        ``count`` from highest to lowest.
    """
    # Flatten the per-tune sequences into one long list of tokens.
    flat_tokens = []
    for tune in tunes:
        flat_tokens.extend(tune)

    if remove_root:
        note_name = re.compile('[A-G][#b]?')
        flat_tokens = [note_name.sub('*', token) for token in flat_tokens]

    tally = Counter(flat_tokens)
    frame = pd.DataFrame(tally.items(), columns=['chord', 'count'])
    return frame.sort_values(by='count', ascending=False)


In [1039]:
def plot_zipf(chords, n=30):
    """Draw a top-chords bar chart next to a Zipf (log-log rank/count) plot.

    Args:
        chords: DataFrame with a ``chord`` column (labels) and a ``count``
            column (absolute frequencies), one row per chord. Row order does
            not matter and the frame is not modified.
        n: maximum number of chords shown in the bar chart.

    Returns:
        The ``matplotlib.pyplot`` module, as before, so existing callers that
        use the return value (e.g. ``plot_zipf(df).savefig(...)``) keep working.
    """
    # Rank by count here instead of trusting the caller's row order. A stable
    # sort keeps the incoming order among chords that share the same count.
    ranked = chords.sort_values(by='count', ascending=False, kind='stable')
    top = ranked.head(n)

    counts = ranked['count'].to_numpy()
    ranks = np.arange(1, len(counts) + 1)

    fig, (ax_bar, ax_zipf) = plt.subplots(1, 2, figsize=(20, 3))

    # Left panel: log counts of the most frequent chords.
    ax_bar.bar(top['chord'], np.log(top['count']))
    ax_bar.set_xlabel('')
    ax_bar.set_ylabel('Absolute Counts (log)')
    ax_bar.set_title(f'Top {len(top)} Most Frequent Chords in the Corpus')
    # Rotate the existing tick labels rather than overwriting them with the
    # labels of the whole table, which need not match the plotted bars.
    ax_bar.tick_params(axis='x', labelrotation=90)

    # Right panel: log rank against log count for every chord.
    ax_zipf.scatter(np.log(ranks), np.log(counts))
    ax_zipf.set_xlabel('Frequency Rank of Token (log)')
    ax_zipf.set_ylabel('Absolute Count of Token (log)')
    ax_zipf.set_title('Zipf Plot for Chord Frequencies')

    return plt


In [1040]:
def corpus_chord_ngram(obj, ngrams):
    """Preprocess the chords of every section into one row per section.

    Args:
        obj: object exposing ``df_section`` (a DataFrame with ``sectionid``
            and ``chords`` columns) and a ``preprocess_input(chords, ngrams=...)``
            method.
        ngrams: passed through unchanged to ``obj.preprocess_input``.

    Returns:
        DataFrame indexed by ``sectionid`` with a single ``chords`` column
        holding whatever ``preprocess_input`` returned for that section.
    """
    sections = obj.df_section

    # Walk the two needed columns directly: iterrows() builds a Series per row,
    # which is slow and may upcast values to a common dtype.
    records = [
        (sectionid, obj.preprocess_input(chords, ngrams=ngrams))
        for sectionid, chords in zip(sections['sectionid'], sections['chords'])
    ]

    result = pd.DataFrame(records, columns=['sectionid', 'chords'])
    return result.set_index('sectionid')

In [1041]:
def plot_distribution(df, root_removed=False, chords_label=None, top_n=None):
    """Show a log-scale Plotly bar chart of the most frequent chords.

    Args:
        df: DataFrame with ``chord`` and ``count`` columns. It is left
            untouched; the sorting is done on a copy.
        root_removed: when True, ", with Root removed." is appended to the
            figure title.
        chords_label: name of the chord representation shown in the title.
            Defaults to the notebook-level ``chords_preprocess`` variable.
        top_n: maximum number of bars. Defaults to the notebook-level
            ``topn`` variable.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    if chords_label is None:
        chords_label = chords_preprocess
    if top_n is None:
        top_n = topn

    # Sort a copy so the caller's frame is not reordered as a side effect.
    ranked = df.sort_values(by=['count'], ascending=False)
    df_top = ranked.head(top_n)

    # Compare the full table with the limit: the truncated frame can never be
    # longer than top_n, so testing it would always report "all shown".
    if len(ranked) > top_n:
        text = f'Only the top {top_n} chords are shown.'
    else:
        text = 'All chords are shown.'

    text2 = ', with Root removed.' if root_removed else ''

    fig = px.bar(df_top,
                 x='chord',
                 y='count',
                 log_y=True,
                 width=600, height=300,
                 )
    fig.update_layout(
        barmode='stack',
        xaxis={'categoryorder': 'total descending'},
        title={'text': f"Absolute Counts of {chords_label} Chords{text2}<br><sup>{text}</sup>",
               'font': {'size': 12}
               },
        xaxis_title="",
        yaxis_title="Count (logarithmic)",
        margin=dict(l=0, r=20, t=20, b=20),
        font=dict(
            size=8,
        ),
        plot_bgcolor="white",
    )
    fig.show()



# Read Full Chords

In [None]:
# Chord representation for this section; plot_distribution also reads this
# global for the figure title.
chords_preprocess = 'chordsFull'
# NOTE(review): PrepareData is project code not shown here; presumably it loads
# the corpus for the named representation. Confirm in model.PrepareData.
prep = PrepareData(chords_preprocess, ngrams=ngrams)
# Keep only the per-section 'chords' column (a Series indexed by sectionid).
df = corpus_chord_ngram(prep, prep.ngrams)['chords']

### Prepare Overview for Full Chords, any Root

In [None]:
# Tally every chord token in the corpus (roots kept) and preview the most
# frequent ones; the table comes back sorted by count, descending.
df_chords = raw_chords_to_df(df)
df_chords.head()

In [None]:
# Last rows of the count-sorted table, i.e. the rarest chord tokens.
df_chords.tail()

In [None]:
# Bar chart of the most frequent chords next to a log-log rank/count scatter.
plot_zipf(df_chords)


In [None]:
# Log-scale Plotly bar chart of the most frequent chords.
plot_distribution(df_chords)


### Prepare Overview for Full Chords, Roots removed

In [None]:
# Same tally, but every [A-G][#b]? match inside a token is replaced by '*'
# first, so tokens that differ only in note names are counted together.
df_noroot = raw_chords_to_df(df, remove_root=True)
df_noroot.head(50)


In [None]:
# Ten most frequent root-free chord tokens.
df_noroot.head(10)

In [None]:
# root_removed=True appends ", with Root removed." to the figure title.
plot_distribution(df_noroot, root_removed=True)


# Read Simplified Chords

In [None]:
# Chord representation for this section; plot_distribution also reads this
# global for the figure title.
chords_preprocess = 'chordsSimplified'
# NOTE(review): PrepareData is project code not shown here; presumably it loads
# the corpus for the named representation. Confirm in model.PrepareData.
prep = PrepareData(chords_preprocess, ngrams=ngrams)
# Keep only the per-section 'chords' column (a Series indexed by sectionid).
df = corpus_chord_ngram(prep, prep.ngrams)['chords']

### Prepare Overview for Simplified Chords, any Root

In [None]:
# Tally every chord token in the corpus (roots kept) and preview the most
# frequent ones; the table comes back sorted by count, descending.
df_chords = raw_chords_to_df(df)
df_chords.head()


In [None]:
# Bar chart of the most frequent chords next to a log-log rank/count scatter.
plot_zipf(df_chords)


In [None]:
# Log-scale Plotly bar chart of the most frequent chords.
plot_distribution(df_chords)

### Prepare Overview for Simplified Chords, Roots removed

In [None]:
# Same tally, but every [A-G][#b]? match inside a token is replaced by '*'
# first, so tokens that differ only in note names are counted together.
df_noroot = raw_chords_to_df(df, remove_root=True)

In [None]:
# Ten most frequent root-free chord tokens.
df_noroot.head(10)

In [None]:
# root_removed=True appends ", with Root removed." to the figure title.
plot_distribution(df_noroot, root_removed=True)




# Read Basic Chords

In [None]:
# Chord representation for this section; plot_distribution also reads this
# global for the figure title.
chords_preprocess = 'chordsBasic'
# NOTE(review): PrepareData is project code not shown here; presumably it loads
# the corpus for the named representation. Confirm in model.PrepareData.
prep = PrepareData(chords_preprocess, ngrams=ngrams)
# Keep only the per-section 'chords' column (a Series indexed by sectionid).
df = corpus_chord_ngram(prep, prep.ngrams)['chords']

### Prepare Overview for Basic Chords, any Root

In [None]:
# Tally every chord token in the corpus (roots kept) and preview the most
# frequent ones; the table comes back sorted by count, descending.
df_chords = raw_chords_to_df(df)
df_chords.head()


In [None]:
# Bar chart of the most frequent chords next to a log-log rank/count scatter.
plot_zipf(df_chords)


In [None]:
# Log-scale Plotly bar chart of the most frequent chords.
plot_distribution(df_chords)

### Prepare Overview for Basic Chords, Roots removed

In [None]:
# Same tally, but every [A-G][#b]? match inside a token is replaced by '*'
# first, so tokens that differ only in note names are counted together.
df_noroot = raw_chords_to_df(df, remove_root=True)

In [None]:
# Ten most frequent root-free chord tokens.
df_noroot.head(10)

In [None]:
# root_removed=True appends ", with Root removed." to the figure title.
plot_distribution(df_noroot, root_removed=True)

