In [None]:
import os
import json

from PIL import Image, ImageFilter
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

import guitarpro

from wordcloud import WordCloud

from tqdm import tqdm

In [None]:
# Root directory of the DadaGP dataset. The metadata JSON files and the
# per-song token files read later in this notebook are joined onto this path.
dataset_path = '../datasets/DadaGP-v1.1/'

# Genres Figure

In [None]:
# Load the genre-frequency JSON stored under dataset_path. It is used below as
# a {key: frequency} mapping to drive the word cloud.
with open(os.path.join(dataset_path, '_DadaGP_genre_frequency.json')) as f:
    genre_frequency = json.load(f)

In [None]:
# Order genres from most to least frequent and keep only the text after the
# last ':' of each key as the displayed word.
# NOTE(review): if two keys share the same text after the last ':', the later
# (less frequent) entry silently overwrites the earlier count — confirm that
# keys stay unique once the prefix is stripped.
genre_frequency = {k.split(':')[-1]: v for k, v in sorted(genre_frequency.items(), key=lambda item: item[1], reverse=True)}

In [None]:
# Read the guitar silhouette as 8-bit greyscale and blur it slightly.
mask_image = (
    Image.open('guitar_mask.jpg')
    .convert('L')
    .filter(ImageFilter.GaussianBlur(3))
)

# Build a white (255) canvas that is 1.2x the silhouette in each dimension so
# the shape gets a margin on every side.
width, height = mask_image.size
new_width, new_height = int(width * 1.2), int(height * 1.2)
background = Image.new('L', (new_width, new_height), 255)

# Top-left corner that centres the silhouette on the canvas.
x, y = (new_width - width) // 2, (new_height - height) // 2
background.paste(mask_image, (x, y))

# WordCloud expects the mask as a NumPy array.
guitar_mask = np.array(background)

In [None]:
# Render the genre frequencies as a word cloud shaped by the guitar mask, with
# a black contour drawn around the mask shape.
# NOTE(review): per the wordcloud documentation, `width` and `height` are
# ignored when `mask` is given (the mask's shape is used instead) — confirm
# and consider dropping them.
wordcloud = WordCloud(
    mask=guitar_mask,
    contour_color='black',
    contour_width=7,
    colormap='copper',    # gnuplot2
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(genre_frequency)

In [None]:
# Preview the word cloud inline: no axes, bilinear interpolation for smoother
# rendering of the raster image.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Write the rendered word cloud to 'genre_wordcloud.png' (path is relative to
# the current working directory).
wordcloud.to_file('genre_wordcloud.png')

# Statistics of DadaGP

In [None]:
def get_measures(ax, c):
    """Draw a histogram of the number of measures per song on ``ax``.

    For every entry of ``_DadaGP_all_metadata.json`` the token file named by
    its ``'tokens.txt'`` field is read and its ``new_measure`` tokens are
    counted. Songs are grouped into fixed-width bins of ``STEP_MEASURES``
    measures; everything at or above ``MAXN * STEP_MEASURES`` goes into one
    open-ended overflow bar. Entries whose token file is missing are skipped.

    The bins are fixed (``0 .. MAXN-1`` plus overflow) rather than "the first
    MAXN bins that happen to be populated", so an empty bin is drawn with
    height 0 instead of shifting every following label.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    STEP_MEASURES = 25  # bin width, in measures
    MAXN = 9            # number of regular bins before the overflow bin

    with open(os.path.join(dataset_path, '_DadaGP_all_metadata.json')) as f:
        metadata = json.load(f)

    measures_counter = Counter()
    for v in tqdm(metadata.values(), desc='Number of Measures per Song', total=len(metadata)):
        path = os.path.join(dataset_path, v['tokens.txt'])
        try:
            with open(path) as f:
                tokens = f.read().split()
        except FileNotFoundError:
            # Best effort: metadata entries without a token file are ignored.
            continue
        # Clamp to MAXN so that every long song lands in the overflow bin.
        measures_counter[min(tokens.count('new_measure') // STEP_MEASURES, MAXN)] += 1

    x_labels = [f'{b * STEP_MEASURES}-{(b + 1) * STEP_MEASURES}' for b in range(MAXN)]
    y_values = [measures_counter[b] for b in range(MAXN)]

    x_labels.append(f'{MAXN * STEP_MEASURES}+')
    y_values.append(measures_counter[MAXN])

    ax.bar(x_labels, y_values, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Number of Measures per Song')
    # ax.set_xlabel('Number of Measures')
    # ax.set_ylabel('Frequency')

    idx = idx + 1

    return ax

In [None]:
def get_tracks(ax, c):
    """Draw a histogram of the number of tracks per song on ``ax``.

    For every entry of ``_DadaGP_all_metadata.json`` the path in its
    ``'tokens.txt'`` field is stripped of its last two dot-separated
    components (presumably the ``.tokens.txt`` suffix, leaving the Guitar Pro
    file), the result is parsed with ``guitarpro.parse`` and the length of
    ``song.tracks`` is recorded. Entries whose file is missing are skipped.

    One bar is drawn per track count from 1 to ``MAXN`` (a ``0`` bar is added
    only if some song has no tracks), followed by an overflow bar labelled
    ``'{MAXN + 1}+'`` that holds every song with more than ``MAXN`` tracks.
    The overflow label therefore never collides with the ``MAXN`` bar, and a
    track count that never occurs is drawn with height 0 instead of shifting
    the labels.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    MAXN = 9  # largest track count that gets its own bar

    with open(os.path.join(dataset_path, '_DadaGP_all_metadata.json')) as f:
        metadata = json.load(f)

    tracks_counter = Counter()
    for v in tqdm(metadata.values(), desc='Number of Tracks per Song', total=len(metadata)):
        # Drop the last two dot-separated components of the token-file path.
        path = '.'.join(os.path.join(dataset_path, v['tokens.txt']).split('.')[:-2])
        try:
            song = guitarpro.parse(path)
        except FileNotFoundError:
            # Best effort: metadata entries without a source file are ignored.
            continue
        # Clamp to MAXN + 1 so every larger song lands in the overflow bin.
        tracks_counter[min(len(song.tracks), MAXN + 1)] += 1

    # Only show a "0" bar when a track-less song was actually seen.
    first_bin = 0 if tracks_counter[0] else 1
    bins = range(first_bin, MAXN + 1)

    x_labels = [str(n) for n in bins]
    y_values = [tracks_counter[n] for n in bins]

    x_labels.append(f'{MAXN + 1}+')
    y_values.append(tracks_counter[MAXN + 1])

    ax.bar(x_labels, y_values, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Number of Tracks per Song')
    # ax.set_xlabel('Number of Tracks')
    # ax.set_ylabel('Frequency')

    idx = idx + 1

    return ax

In [None]:
def get_instruments(ax, c):
    """Draw, on ``ax``, how many songs use each instrument family.

    Every token file listed in ``_DadaGP_all_metadata.json`` is scanned; a
    song counts once for a family if at least one of its tokens starts with
    that family's prefix. Bars are ordered from the most to the least common
    family. Entries whose token file is missing are skipped.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    # Token prefixes checked, in this order, to recognise an instrument family.
    families = ('clean', 'distorted', 'bass', 'drums', 'pads', 'leads')

    metadata_file = os.path.join(dataset_path, '_DadaGP_all_metadata.json')
    with open(metadata_file) as fh:
        metadata = json.load(fh)

    seen_per_song = []
    for _, entry in tqdm(metadata.items(), desc='Instruments Distribution', total=len(metadata)):
        try:
            with open(os.path.join(dataset_path, entry['tokens.txt'])) as fh:
                words = fh.read().split()
        except FileNotFoundError:
            continue

        present = set()
        for word in words:
            family = next((name for name in families if word.startswith(name)), None)
            if family is not None:
                present.add(family)
        # One vote per family per song, however many tokens matched.
        seen_per_song.extend(present)

    ranked = sorted(Counter(seen_per_song).items(), key=lambda pair: pair[1], reverse=True)
    names = [str(name) for name, _ in ranked]
    totals = [total for _, total in ranked]

    ax.bar(names, totals, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Frequency of Instruments')

    idx += 1
    return ax

In [None]:
def get_tokens(ax, c, top_n=10):
    """Draw the ``top_n`` most frequent tokens of the dataset on ``ax``.

    Frequencies are read from ``_DadaGP_song_tokens_frequency.json`` and
    sorted in descending order; ties keep the order in which they appear in
    the file. The number of bars and the number shown in the title both come
    from ``top_n``, so the "Top 10" title is matched by ten bars.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.
        top_n: How many tokens to show (default 10).

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    with open(os.path.join(dataset_path, '_DadaGP_song_tokens_frequency.json')) as f:
        song_tokens_frequency = json.load(f)

    top_tokens = sorted(song_tokens_frequency.items(), key=lambda item: item[1], reverse=True)[:top_n]

    x_labels = [token for token, _ in top_tokens]
    y_values = [count for _, count in top_tokens]

    ax.bar(x_labels, y_values, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Top {top_n} Tokens Frequency')
    # ax.set_xlabel('Tokens')
    # ax.set_ylabel('Frequency')

    idx = idx + 1

    return ax

In [None]:
def get_nfx(ax, c, top_n=10):
    """Draw the ``top_n`` most frequent ``nfx``-prefixed tokens on ``ax``.

    Every token file listed in ``_DadaGP_all_metadata.json`` is read and each
    token starting with ``'nfx'`` is counted across all songs. Entries whose
    token file is missing are skipped. The number of bars and the number in
    the title both come from ``top_n``, so the "Top 10" title is matched by
    ten bars.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.
        top_n: How many tokens to show (default 10).

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    with open(os.path.join(dataset_path, '_DadaGP_all_metadata.json')) as f:
        metadata = json.load(f)

    nfx_counter = Counter()
    for v in tqdm(metadata.values(), desc='Number of Effects per Song', total=len(metadata)):
        path = os.path.join(dataset_path, v['tokens.txt'])
        try:
            with open(path) as f:
                tokens = f.read().split()
        except FileNotFoundError:
            # Best effort: metadata entries without a token file are ignored.
            continue
        nfx_counter.update(token for token in tokens if token.startswith('nfx'))

    # most_common keeps first-seen order among equal counts.
    top_tokens = nfx_counter.most_common(top_n)

    x_labels = [token for token, _ in top_tokens]
    y_values = [count for _, count in top_tokens]

    ax.bar(x_labels, y_values, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Top {top_n} nfx Frequency')
    # ax.set_xlabel('nfx Tokens')
    # ax.set_ylabel('Frequency')

    idx = idx + 1

    return ax

In [None]:
def get_bfx(ax, c, top_n=10):
    """Draw the ``top_n`` most frequent ``bfx``-prefixed tokens on ``ax``.

    Every token file listed in ``_DadaGP_all_metadata.json`` is read and each
    token starting with ``'bfx'`` is counted across all songs. Entries whose
    token file is missing are skipped. The number of bars and the number in
    the title both come from ``top_n``, so the "Top 10" title is matched by
    ten bars.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.
        top_n: How many tokens to show (default 10).

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    with open(os.path.join(dataset_path, '_DadaGP_all_metadata.json')) as f:
        metadata = json.load(f)

    bfx_counter = Counter()
    for v in tqdm(metadata.values(), desc='Number of Effects per Song', total=len(metadata)):
        path = os.path.join(dataset_path, v['tokens.txt'])
        try:
            with open(path) as f:
                tokens = f.read().split()
        except FileNotFoundError:
            # Best effort: metadata entries without a token file are ignored.
            continue
        bfx_counter.update(token for token in tokens if token.startswith('bfx'))

    # most_common keeps first-seen order among equal counts.
    top_tokens = bfx_counter.most_common(top_n)

    x_labels = [token for token, _ in top_tokens]
    y_values = [count for _, count in top_tokens]

    ax.bar(x_labels, y_values, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Top {top_n} bfx Frequency')
    # ax.set_xlabel('bfx Tokens')
    # ax.set_ylabel('Frequency')

    idx = idx + 1

    return ax

In [None]:
def get_tempo(ax, c):
    """Draw a histogram of song tempos on ``ax``.

    For every token file listed in ``_DadaGP_all_metadata.json`` the tempo is
    taken from the third whitespace-separated token (the integer after its
    last ``':'``). Tempos are grouped into bins of width ``STEP_TEMPOS``;
    everything at or above 300 goes into one open-ended overflow bar. Entries
    whose token file is missing are skipped.

    The bins are fixed (``0 .. MAXN-1`` plus overflow) rather than "the first
    MAXN bins that happen to be populated", so an empty bin (for example no
    song below 50) is drawn with height 0 instead of shifting every following
    label and mislabelling the overflow bar.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.

    Returns:
        The same ``ax`` that was passed in.

    Raises:
        IndexError, ValueError: If a token file has fewer than three tokens or
            its third token does not end in an integer (unchanged from the
            previous behaviour).

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    STEP_TEMPOS = 50            # bin width
    MAXN = 300 // STEP_TEMPOS   # number of regular bins before the overflow bin

    with open(os.path.join(dataset_path, '_DadaGP_all_metadata.json')) as f:
        metadata = json.load(f)

    tempos_counter = Counter()
    for v in tqdm(metadata.values(), desc='Tempos', total=len(metadata)):
        path = os.path.join(dataset_path, v['tokens.txt'])
        try:
            with open(path) as f:
                tokens = f.read().split()
        except FileNotFoundError:
            # Best effort: metadata entries without a token file are ignored.
            continue
        tempo_bin = int(tokens[2].split(':')[-1]) // STEP_TEMPOS
        # Clamp so that every parsed tempo lands in one of the displayed bins.
        tempos_counter[min(max(tempo_bin, 0), MAXN)] += 1

    x_labels = [f'{b * STEP_TEMPOS}-{(b + 1) * STEP_TEMPOS}' for b in range(MAXN)]
    y_values = [tempos_counter[b] for b in range(MAXN)]

    x_labels.append(f'{MAXN * STEP_TEMPOS}+')
    y_values.append(tempos_counter[MAXN])

    ax.bar(x_labels, y_values, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Tempo Distribution')
    # ax.set_xlabel('Tempo')
    # ax.set_ylabel('Frequency')

    idx = idx + 1

    return ax

In [None]:
def get_time_signatures(ax, c):
    """Draw, on ``ax``, how often each time signature occurs across measures.

    For every entry of ``_DadaGP_all_metadata.json`` the path in its
    ``'tokens.txt'`` field is stripped of its last two dot-separated
    components, the result is parsed with ``guitarpro.parse`` and the time
    signature of every measure of the song's first track is tallied as
    ``'numerator/denominator'``. The nine most common signatures get their own
    bar; all remaining ones are summed into a final ``'Others'`` bar. Entries
    whose file is missing are skipped.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    MAXN = 9  # signatures shown individually before the 'Others' bar

    metadata_file = os.path.join(dataset_path, '_DadaGP_all_metadata.json')
    with open(metadata_file) as fh:
        metadata = json.load(fh)

    signature_tally = Counter()
    for _, entry in tqdm(metadata.items(), desc='Time Signatures Distribution', total=len(metadata)):
        tokens_path = os.path.join(dataset_path, entry['tokens.txt'])
        # Drop the last two dot-separated components of the token-file path.
        gp_path = '.'.join(tokens_path.split('.')[:-2])
        try:
            song = guitarpro.parse(gp_path)
        except FileNotFoundError:
            continue
        for measure in song.tracks[0].measures:
            signature = measure.timeSignature
            signature_tally[f'{signature.numerator}/{signature.denominator.value}'] += 1

    ranked = sorted(signature_tally.items(), key=lambda pair: pair[1], reverse=True)
    shown, remainder = ranked[:MAXN], ranked[MAXN:]

    names = [str(name) for name, _ in shown] + ['Others']
    totals = [total for _, total in shown] + [sum(total for _, total in remainder)]

    ax.bar(names, totals, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Time Signatures Distribution')

    idx += 1
    return ax

In [None]:
def get_duration(ax, c):
    """Draw, on ``ax``, the distribution of ``wait:<n>`` token values.

    Every token file listed in ``_DadaGP_all_metadata.json`` is scanned and
    the integer after the last ``':'`` of each token starting with ``'wait:'``
    is tallied across all songs. The nine most common values get their own
    bar; all remaining ones are summed into a final ``'Others'`` bar. Entries
    whose token file is missing are skipped.

    Args:
        ax: Axes-like object providing ``bar`` and ``set_title``.
        c: Bar colour, forwarded to ``ax.bar``.

    Returns:
        The same ``ax`` that was passed in.

    Side effects:
        Reads the module-level ``dataset_path`` and uses the module-level
        ``idx`` to pick the panel letter in the title, then increments it.
    """
    global idx

    MAXN = 9  # wait values shown individually before the 'Others' bar

    metadata_file = os.path.join(dataset_path, '_DadaGP_all_metadata.json')
    with open(metadata_file) as fh:
        metadata = json.load(fh)

    wait_tally = Counter()
    for _, entry in tqdm(metadata.items(), desc='Duration Distribution', total=len(metadata)):
        try:
            with open(os.path.join(dataset_path, entry['tokens.txt'])) as fh:
                words = fh.read().split()
        except FileNotFoundError:
            continue
        wait_tally.update(
            int(word.split(':')[-1]) for word in words if word.startswith('wait:')
        )

    ranked = sorted(wait_tally.items(), key=lambda pair: pair[1], reverse=True)
    shown, remainder = ranked[:MAXN], ranked[MAXN:]

    names = [str(value) for value, _ in shown] + ['Others']
    totals = [total for _, total in shown] + [sum(total for _, total in remainder)]

    ax.bar(names, totals, color=c)
    ax.set_title(f'({chr(idx+ord("a"))}) Duration Distribution (wait:xx)')

    idx += 1
    return ax

In [None]:
# One colour per panel. `idx` is the shared panel counter: every get_* helper
# uses it for the "(a)", "(b)", ... title prefix and increments it, so
# `colors[idx]` must be evaluated right before each call.
colors = ['#FF0000', '#FFA500', '#FFFF00', '#008000', '#0000FF', '#4B0082', '#800080', '#00FFFF', '#FFC0CB']
idx = 0

fig, axs = plt.subplots(3, 3, figsize=(10, 10))

# Panels in row-major order, matching the order of `axs.flat`.
panel_plotters = (
    get_measures, get_tracks, get_instruments,
    get_tokens, get_nfx, get_bfx,
    get_tempo, get_time_signatures, get_duration,
)
for draw_panel, panel_ax in zip(panel_plotters, axs.flat):
    draw_panel(panel_ax, colors[idx])

for ax in axs.flat:
    # Scientific notation on the y axis, rendered with math text.
    ax.yaxis.set_major_formatter(ScalarFormatter(useMathText=True))
    ax.ticklabel_format(style='sci', axis='y', scilimits=(0,0))

    # Slant the category labels so long ones do not overlap.
    for label in ax.get_xticklabels():
        label.set_rotation(45)
        label.set_horizontalalignment('right')

plt.tight_layout()
plt.savefig('all_statistics.png')
plt.show()