In [282]:
import pandas as pd
from model.PrepareData import PrepareData
import re
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from scipy.optimize import minimize



In [283]:
# Number of most frequent chord n-grams shown by plot_distribution.
topn = 30
# n-gram orders passed to PrepareData in the per-vocabulary sections below.
ngrams = [1,2,3,4]

# Width and height passed to the plotly figures and image exports.
WIDTH = 400
HEIGHT = 250
# NOTE(review): SCALE is not referenced by any visible cell; plot_distribution
# exports with a hard-coded scale=5 — confirm whether this constant is needed.
SCALE = 3

In [284]:
def get_ngram_string():
    """Return the file-name tag for the configured n-gram orders.

    Reads the module-level ``ngrams`` list and joins its entries with dashes
    behind an ``ngrams-`` prefix, e.g. ``[1, 2]`` -> ``"ngrams-1-2"``.
    """
    joined_orders = "-".join(map(str, ngrams))
    return "ngrams-" + joined_orders


In [285]:
def raw_chords_to_df(tunes, remove_root=False):
  """Count how often each chord token occurs across all tunes.

  Args:
    tunes: Iterable of tunes, each an iterable of chord tokens (strings).
    remove_root: When True, every match of ``[A-G][#b]?`` in a token is
      replaced by ``*`` and every ``-`` by `` – `` before counting.

  Returns:
    DataFrame with columns ``chord`` and ``count``, sorted by descending
    count.
  """
  flat_tokens = []
  for tune in tunes:
    flat_tokens.extend(tune)

  if remove_root:
    # Strip the root first, then widen the n-gram separator.
    flat_tokens = [re.sub('[A-G][#b]?', '*', token).replace('-', ' – ')
                   for token in flat_tokens]

  tally = Counter(flat_tokens)
  table = pd.DataFrame(tally.items(), columns=['chord', 'count'])
  return table.sort_values(by='count', ascending=False)

In [286]:
def plot_zipf(chords):
    """Draw and export a log-log rank/frequency (Zipf) scatter plot.

    Args:
        chords: DataFrame with a ``chord`` column and a ``count`` column.

    Returns:
        Tuple ``(word_counts, frequency_rank)`` of numpy arrays: the counts
        sorted in descending order and the ranks ``1..len(word_counts)``.

    Side effects:
        Shows the figure and writes it as a PDF into ``images/``. The title
        and the file name read the module-level ``chords_preprocess`` and
        ``ngrams``, which must be defined before the call.
    """
    # Mapping chord -> count, taken from the 'count' column.
    count_by_chord = chords.set_index('chord').to_dict(orient='dict')['count']

    word_counts = np.array(sorted(count_by_chord.values(), reverse=True))
    frequency_rank = np.array(list(range(1, len(word_counts) + 1)))

    fig = px.scatter(
        pd.DataFrame({'word_counts': word_counts, 'rank': frequency_rank}),
        x='rank',
        y='word_counts',
        log_x=True,
        log_y=True,
        labels={"word_counts": "Absolute Frequency", "rank": "Frequency Rank"},
        width=WIDTH,
        height=HEIGHT,
    )

    # Both axes share the same styling; each gets its own copy of the dict.
    axis_style = {'dtick': 1, 'showline': True, 'linewidth': 1,
                  'linecolor': 'black', 'showgrid': True, 'showticklabels': True}
    fig.update_layout(
        title={
            'text': f"Zipf Plot for {chords_preprocess} Vocabulary<br><sup>n-grams={ngrams}</sup>",
            'font': {'size': 12},
        },
        yaxis=dict(axis_style),
        xaxis=dict(axis_style),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor='rgba(0,0,0,0)',
        font={'size': 8},
    )

    fig.show()

    for extension in ["pdf"]:
        fig.write_image(f"images/92a_{chords_preprocess}_{get_ngram_string()}_zipf.{extension}")

    return word_counts, frequency_rank


In [287]:
def loglik(b):
    """Negative log-likelihood of a power law with exponent ``b``.

    Used as the objective for ``scipy.optimize.minimize``. It reads two
    module-level arrays that must be set before the call: ``word_counts``
    (the values the power law is evaluated on) and ``freq_of_word_counts``
    (the per-value weights).

    NOTE(review): in the visible cells both arrays come from ``plot_zipf``,
    which returns the sorted counts and their ranks, so the weights used
    here are ranks — confirm this is the intended fit.

    Args:
        b: Exponent of the power law (scalar or length-1 array, as passed
            by the optimiser).

    Returns:
        ``-sum(log(p) * freq_of_word_counts)`` where
        ``p = word_counts**(-b) / sum(word_counts**(-b))``.
    """
    # Unnormalised power-law values, then normalised so they sum to one.
    probabilities = word_counts ** (-b)
    probabilities = probabilities / probabilities.sum()

    # Weighted log-likelihood, summed over all entries.
    log_likelihood = (np.log(probabilities) * freq_of_word_counts).sum()

    # Minimising the negative is equivalent to maximising the likelihood.
    return -log_likelihood

In [288]:
def corpus_chord_ngram(obj, ngrams):
    """Preprocess the chords of every section into n-gram tokens.

    Args:
        obj: Object exposing a ``df_section`` DataFrame with ``sectionid``
            and ``chords`` columns, and a ``preprocess_input(chords, ngrams=...)``
            method.
        ngrams: Forwarded unchanged to ``obj.preprocess_input``.

    Returns:
        DataFrame indexed by ``sectionid`` with a single ``chords`` column
        holding the value returned by ``preprocess_input`` for each section.
    """
    sections = obj.df_section
    section_ids = []
    section_tokens = []

    # Walk the two columns directly: iterrows would build a Series per row
    # and may upcast dtypes across the row.
    for sectionid, chords in zip(sections['sectionid'], sections['chords']):
        section_ids.append(sectionid)
        section_tokens.append(obj.preprocess_input(chords, ngrams=ngrams))

    _df = pd.DataFrame(list(zip(section_ids, section_tokens)),
                       columns=['sectionid', 'chords'])
    return _df.set_index('sectionid')

In [289]:
def plot_distribution(df, root_removed=False):
    """Plot and export a bar chart of the most frequent chord n-grams.

    Args:
        df: DataFrame with a ``chord`` and a ``count`` column. It is not
            modified; sorting produces a new frame.
        root_removed: When True, ", Root removed." is appended to the title
            and becomes part of the exported file name.

    Side effects:
        Shows the figure and writes it as a PDF into ``images/``. Reads the
        module-level ``topn``, ``ngrams``, ``chords_preprocess``, ``WIDTH``
        and ``HEIGHT``, which must be defined before the call.
    """
    # Sort into a new frame so the caller's DataFrame is left untouched.
    df_sorted = df.sort_values(by=['count'], ascending=False)
    df_top = df_sorted.head(topn)

    # The chart is truncated only when there are more rows than bars shown;
    # a frame with exactly topn rows is displayed completely.
    if len(df_sorted) > topn:
        text = f'Only the top {topn} chord n-grams are shown.'
    else:
        text = 'All chord n-grams are shown.'

    text2 = ', Root removed.' if root_removed else ''

    fig = px.bar(df_top,
                 x='chord',
                 y='count',
                 log_y=True,
                 labels={
                     "chord": "",
                     "count": "Absolute Frequency",
                 },
                 width=WIDTH, height=HEIGHT,
                 )
    fig.update_layout(
        barmode='stack',
        yaxis={'showline': False, 'linewidth': 1, 'linecolor': 'black', 'showgrid': True, 'showticklabels': True},
        xaxis={'showline': True, 'linewidth': 1, 'linecolor': 'black', 'showgrid': True, 'showticklabels': True, 'categoryorder':'total descending'},
        title={'text': f"Distribution of {chords_preprocess} n-grams{text2}<br><sup>n-grams={ngrams}. {text}</sup>",
               'font': {'size': 12},
               'yanchor': 'top',
               'pad': {'b': 100}
               },
        xaxis_title="",
        yaxis_title="Absolute Frequency",

        margin=dict(l=0, r=20, t=20, b=20),
        font=dict(
            size=8,
        ),
        plot_bgcolor="white",
    )
    fig.show()

    # The file name embeds both caption fragments verbatim.
    for fmt in ["pdf"]:
        fig.write_image(f"images/92_{chords_preprocess}_{get_ngram_string()}_{text2}_{text}.{fmt}",
                        width=WIDTH, height=HEIGHT,
                        scale=5)


In [None]:
# Corpus statistics for every vocabulary / n-gram combination.
# Rows are collected as plain dicts and turned into a DataFrame once at the
# end instead of enlarging the frame row by row.
summary_columns = ['vocab',
                   'ngram',
                   'total_tokens',
                   'unique_tokens',
                   'num_sections',
                   'mean_tokens_per_section',
                   'prop_vocab_corpus']
summary_rows = []

for vocab in ['chordsBasic', 'chordsSimplified', 'chordsFull']:
    for ngram in [[1], [1,2], [1,2,3], [1,2,3,4]]:
        prep = PrepareData(vocab, ngrams=ngram)
        df = corpus_chord_ngram(prep, ngram)

        data = list(df['chords'])

        flat_list = [item for section in data for item in section]
        total_tokens = len(flat_list)
        unique_tokens = len(set(flat_list))
        num_sections = len(data)
        # The flattened list already holds every token of every section.
        mean_tokens_per_section = total_tokens / num_sections

        summary_rows.append({
            'vocab': vocab,
            'ngram': ngram,
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'num_sections': num_sections,
            'mean_tokens_per_section': round(mean_tokens_per_section, 2),
            # Share of distinct tokens among all tokens, in percent.
            'prop_vocab_corpus': round(100*(unique_tokens / total_tokens), 2)
        })

summary = pd.DataFrame(summary_rows, columns=summary_columns)
summary




Train Corpus: 3225
Test Corpus: 186
Train Corpus: 3225
Test Corpus: 186


# Read Full Chords

In [None]:
# Prepare the corpus for the 'chordsFull' vocabulary and keep only the
# 'chords' column (indexed by sectionid) of the preprocessed sections.
chords_preprocess = 'chordsFull'
prep = PrepareData(chords_preprocess, ngrams=ngrams)
df = corpus_chord_ngram(prep, prep.ngrams)['chords']


In [None]:
# Report the number of rows in prep.df_section next to the number of rows in prep.df.
print(f"Number of sections used for training: {len(prep.df_section)}")

print(f"Number of total sections: {len(prep.df)}")

### Prepare Overview for Full Chords, any Root

In [None]:
# Count every token of the corpus (roots kept) and preview the most frequent ones;
# raw_chords_to_df returns the frame sorted by descending count.
df_chords = raw_chords_to_df(df)
df_chords.head()

In [None]:
# Least frequent tokens (the frame is sorted by descending count).
df_chords.tail()

In [None]:
# Draw the Zipf plot and keep its return values as module-level arrays for loglik.
# Note: plot_zipf returns (sorted counts, ranks), so freq_of_word_counts holds ranks.
word_counts, freq_of_word_counts = plot_zipf(df_chords)


In [None]:
# Disabled cell (guarded by `if False:`): matplotlib log-log scatter of the
# arrays returned by plot_zipf with a fitted power-law line drawn on top.
# NOTE(review): when enabled, this cell rebinds the module-level word_counts
# and freq_of_word_counts to sliced copies, so running it twice slices the
# already-sliced arrays again.
if False:
    f,ax = plt.subplots()
    ax.scatter(freq_of_word_counts, word_counts,
               marker='.',
               label = "data")
    ax.set_xlabel('Log Frequency Rank of Token')
    ax.set_ylabel('Log Absolute Frequency of Token')
    ax.set_xscale("log")
    ax.set_yscale("log")

    # Hand-picked scale factor c for the drawn line and the slice
    # [min_rank:max_rank] used for the fit, chosen per n-gram setting.
    # NOTE(review): there is no else branch — any other value of ngrams leaves
    # c, min_rank and max_rank undefined and the code below raises NameError.
    if ngrams == [1]:
        c = 0.1 # for ngram=[1]
        min_rank = 1
        # As a slice end, -1 excludes the last element.
        max_rank = -1
    elif ngrams == [1,2]:
        c = 0.8 # for ngram=[1,2]
        min_rank = 10
        max_rank = 1000
    elif ngrams == [1,2,3]:
        c = 2 # for ngram=[1,2,3]
        min_rank = 10
        max_rank = 4000
    elif ngrams == [1,2,3,4]:
        c = 4 # for ngram=[1,2,3,4]
        min_rank = 10
        max_rank = 10000

    print(f'Limiting data fit to {min_rank} to {max_rank}')
    word_counts = word_counts[min_rank:max_rank]
    freq_of_word_counts = freq_of_word_counts[min_rank:max_rank]

    # Fit the exponent by minimising loglik, starting from an initial guess of 2.
    s_best = minimize(loglik, [2])
    print(s_best)
    print(s_best.x[0])

    # Earlier hand-tuned values of c; they differ from the branch above.
    #c = 0.1 # for ngram=[1]
    #c = 0.8 # for ngram=[1,2]
    #c = 4 # for ngram=[1,2,3]
    #c = 5 # for ngram=[1,2,3,4]
    alpha = r'$\alpha$'
    # The line is drawn over the sliced word_counts only.
    ax.plot(c*10**4 * word_counts**-s_best.x,
            word_counts,
            '--',
            color="orange",
            lw=1,
            label = f'fitted {alpha}')
    ax.set_title(f'Zipf Law for {chords_preprocess}, ngrams={ngrams}\nc={c}, {alpha}={round(s_best.x[0],2)}')
    ax.legend()

    plt.savefig(f"images/92_zipf_{chords_preprocess}_{get_ngram_string()}.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Bar chart of the most frequent tokens (at most topn bars), roots kept.
plot_distribution(df_chords)


### Prepare Overview for Full Chords, Roots removed

In [None]:
# Same counting with remove_root=True, so tokens that differ only in their
# root are counted together; preview the 50 most frequent.
df_noroot = raw_chords_to_df(df, remove_root=True)
df_noroot.head(50)


In [None]:
# Ten most frequent root-stripped tokens.
df_noroot.head(10)

In [None]:
# Bar chart of the root-stripped tokens; root_removed=True adds the note to the title and file name.
plot_distribution(df_noroot, root_removed=True)


# Read Simplified Chords

In [None]:
# Prepare the corpus for the 'chordsSimplified' vocabulary and keep only the
# 'chords' column (indexed by sectionid) of the preprocessed sections.
chords_preprocess = 'chordsSimplified'
prep = PrepareData(chords_preprocess, ngrams=ngrams)
df = corpus_chord_ngram(prep, prep.ngrams)['chords']

### Prepare Overview for Simplified Chords, any Root

In [None]:
# Count every token of the corpus (roots kept) and preview the most frequent ones;
# raw_chords_to_df returns the frame sorted by descending count.
df_chords = raw_chords_to_df(df)
df_chords.head()



In [None]:
# Draw the Zipf plot and keep its return values as module-level arrays for loglik.
# Note: plot_zipf returns (sorted counts, ranks), so freq_of_word_counts holds ranks.
word_counts, freq_of_word_counts = plot_zipf(df_chords)


In [None]:
# Disabled cell (guarded by `if False:`): matplotlib log-log scatter of the
# arrays returned by plot_zipf with a fitted power-law line drawn on top.
# NOTE(review): when enabled, this cell rebinds the module-level word_counts
# and freq_of_word_counts to sliced copies, so running it twice slices the
# already-sliced arrays again.
if False:
    f,ax = plt.subplots()
    ax.scatter(freq_of_word_counts, word_counts,
               marker='.',
               label = "data")
    ax.set_xlabel('Log Frequency Rank of Token')
    ax.set_ylabel('Log Absolute Frequency of Token')
    ax.set_xscale("log")
    ax.set_yscale("log")

    # Hand-picked scale factor c for the drawn line and the slice
    # [min_rank:max_rank] used for the fit, chosen per n-gram setting.
    # NOTE(review): there is no else branch — any other value of ngrams leaves
    # c, min_rank and max_rank undefined and the code below raises NameError.
    if ngrams == [1]:
        c = 0.04 # for ngram=[1]
        min_rank = 1
        # As a slice end, -1 excludes the last element.
        max_rank = -1
    elif ngrams == [1,2]:
        c = 0.5 # for ngram=[1,2]
        min_rank = 10
        max_rank = 1000
    elif ngrams == [1,2,3]:
        c = 1 # for ngram=[1,2,3]
        min_rank = 10
        max_rank = 4000
    elif ngrams == [1,2,3,4]:
        c = 2 # for ngram=[1,2,3,4]
        min_rank = 10
        max_rank = 8000

    print(f'Limiting data fit to {min_rank} to {max_rank}')
    word_counts = word_counts[min_rank:max_rank]
    freq_of_word_counts = freq_of_word_counts[min_rank:max_rank]

    # Fit the exponent by minimising loglik, starting from an initial guess of 2.
    s_best = minimize(loglik, [2])
    print(s_best)
    print(s_best.x[0])


    alpha = r'$\alpha$'
    # The line is drawn over the sliced word_counts only.
    ax.plot(c*10**4 * word_counts**-s_best.x,
            word_counts,
            '--', color="orange", lw=1, label = f'fitted {alpha}')
    ax.set_title(f'Zipf Law for {chords_preprocess}, ngrams={ngrams}\nc={c}, {alpha}={round(s_best.x[0],2)}')
    ax.legend()

    plt.savefig(f"images/92_zipf_{chords_preprocess}_{get_ngram_string()}.pdf", format="pdf", bbox_inches="tight")


In [None]:
# Bar chart of the most frequent tokens (at most topn bars), roots kept.
plot_distribution(df_chords)

### Prepare Overview for Simplified Chords, Roots removed

In [None]:
# Same counting with remove_root=True, so tokens that differ only in their
# root are counted together.
df_noroot = raw_chords_to_df(df, remove_root=True)

In [None]:
# Ten most frequent root-stripped tokens.
df_noroot.head(10)

In [None]:
# Bar chart of the root-stripped tokens; root_removed=True adds the note to the title and file name.
plot_distribution(df_noroot, root_removed=True)




# Read Basic Chords

In [None]:
# Prepare the corpus for the 'chordsBasic' vocabulary and keep only the
# 'chords' column (indexed by sectionid) of the preprocessed sections.
chords_preprocess = 'chordsBasic'
prep = PrepareData(chords_preprocess, ngrams=ngrams)
df = corpus_chord_ngram(prep, prep.ngrams)['chords']

### Prepare Overview for Basic Chords, any Root

In [None]:
# Count every token of the corpus (roots kept) and preview the most frequent ones;
# raw_chords_to_df returns the frame sorted by descending count.
df_chords = raw_chords_to_df(df)
df_chords.head()



In [None]:
# Draw the Zipf plot and keep its return values as module-level arrays for loglik.
# Note: plot_zipf returns (sorted counts, ranks), so freq_of_word_counts holds ranks.
word_counts, freq_of_word_counts = plot_zipf(df_chords)


In [None]:
# Disabled cell (guarded by `if False:`): matplotlib log-log scatter of the
# arrays returned by plot_zipf with a fitted power-law line drawn on top.
# NOTE(review): when enabled, this cell rebinds the module-level word_counts
# and freq_of_word_counts to sliced copies, so running it twice slices the
# already-sliced arrays again.
if False:
    f,ax = plt.subplots()
    ax.scatter(freq_of_word_counts, word_counts,
               marker='.',
               label = "data")
    ax.set_xlabel('Log Frequency Rank of Token')
    ax.set_ylabel('Log Absolute Frequency of Token')
    ax.set_xscale("log")
    ax.set_yscale("log")

    # Hand-picked scale factor c for the drawn line and the slice
    # [min_rank:max_rank] used for the fit, chosen per n-gram setting.
    # NOTE(review): there is no else branch — any other value of ngrams leaves
    # c, min_rank and max_rank undefined and the code below raises NameError.
    if ngrams == [1]:
        c = 0.03 # for ngram=[1]
        min_rank = 1
        # As a slice end, -1 excludes the last element.
        max_rank = -1
    elif ngrams == [1,2]:
        c = 0.3 # for ngram=[1,2]
        min_rank = 10
        max_rank = 1000
    elif ngrams == [1,2,3]:
        c = 1 # for ngram=[1,2,3]
        min_rank = 0
        max_rank = 8000
    elif ngrams == [1,2,3,4]:
        c = 2 # for ngram=[1,2,3,4]
        min_rank = 10
        max_rank = 8000

    print(f'Limiting data fit to {min_rank} to {max_rank}')
    print(freq_of_word_counts[:5])
    # Keep the unsliced counts so the fitted line can span the whole range.
    word_counts_all = word_counts
    word_counts = word_counts[min_rank:max_rank]
    freq_of_word_counts = freq_of_word_counts[min_rank:max_rank]
    print(freq_of_word_counts[:5])

    # Fit the exponent by minimising loglik, starting from an initial guess of 2.
    s_best = minimize(loglik, [2])
    print(s_best)
    print(s_best.x[0])

    # Earlier hand-tuned values of c; they differ from the branch above.
    #c = 0.03 # for ngram=[1]
    #c = 0.3 # for ngram=[1,2]
    #c = 2 # for ngram=[1,2,3]
    #c = 5 # for ngram=[1,2,3,4]
    alpha = r'$\alpha$'
    # Unlike the fit, the line is drawn over the unsliced counts.
    ax.plot(c*10**4 * word_counts_all**-s_best.x,
            word_counts_all,
            '--', color="orange", lw=1, label = f'fitted {alpha}')
    ax.set_title(f'Zipf Law for {chords_preprocess}, ngrams={ngrams}\nc={c}, {alpha}={round(s_best.x[0],2)}')
    ax.legend()

    plt.savefig(f"images/92_zipf_{chords_preprocess}_{get_ngram_string()}.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Bar chart of the most frequent tokens (at most topn bars), roots kept.
plot_distribution(df_chords)

### Prepare Overview for Basic Chords, Roots removed

In [None]:
# Same counting with remove_root=True, so tokens that differ only in their
# root are counted together.
df_noroot = raw_chords_to_df(df, remove_root=True)

In [None]:
# Ten most frequent root-stripped tokens.
df_noroot.head(10)

In [None]:
# Bar chart of the root-stripped tokens; root_removed=True adds the note to the title and file name.
plot_distribution(df_noroot, root_removed=True)
