In [None]:
import pandas as pd
import numpy as np
import re

def load_dataframe(relative_path, dataframe_name):
    """Load the pickled object stored at ``<relative_path>/<dataframe_name>.pkl``.

    Args:
        relative_path: Directory that contains the pickle file.
        dataframe_name: File name without the ``.pkl`` extension.

    Returns:
        The object reconstructed by ``pandas.read_pickle``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    pickle_path = f'{relative_path}/{dataframe_name}.pkl'
    return pd.read_pickle(pickle_path)

def read_file(relative_path, file_name, encoding='utf-8'):
    """Return the full text of ``<relative_path>/<file_name>.abc``.

    Args:
        relative_path: Directory that contains the file.
        file_name: File name without the ``.abc`` extension.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    with open(f'{relative_path}/{file_name}.abc', 'r', encoding=encoding) as f:
        return f.read()

In [None]:
# Dataset selection: directory and base file name (without extension).
# The same pair is passed to load_dataframe here and to read_file in a later cell.
relative_path ="notebooks/data/final_dataset"
filename_name = 'clean_augmented_data'
# Alternative dataset -- uncomment BOTH lines below to switch to it:
#filename_name = 'clean_original_training_data'
#relative_path ="notebooks/data/original_dataset"
training_data_df = load_dataframe(relative_path,filename_name)
# Last expression of the cell: display the column names of the loaded frame.
training_data_df.columns


In [None]:
# Preview the last rows of the loaded frame.
training_data_df.tail()

In [None]:
# Length (in characters) of the longest entry in the "clean_header" column.
training_data_df["clean_header"].str.len().max()

In [None]:
# Length (in characters) of the longest entry in the "clean_body" column.
training_data_df["clean_body"].str.len().max()

In [None]:
# Build the character vocabulary of the "clean_body" column and count how many
# bodies contain the character 'z' (presumably the ABC rest symbol, hence the
# name `silences` -- confirm against the dataset).
clean_bodies = training_data_df["clean_body"]

# Booleans sum as 0/1, so this counts the bodies that contain a 'z'.
silences = sum('z' in body for body in clean_bodies)

# Join once instead of growing a string with `+=` inside a loop, which can
# re-copy the accumulated text on every iteration. Each body is still followed
# by a newline, so "\n" is part of the vocabulary whenever a body exists.
bodies = "".join(body + "\n" for body in clean_bodies)

# sorted() accepts any iterable, so the intermediate list() is unnecessary.
chars = sorted(set(bodies))
vocab_size = len(chars)
print('vocab: ',''.join(chars))
print('vocab_size',vocab_size)
print("silences ",silences)

In [None]:
# Read the .abc text file that shares the directory and base name of the
# pickle loaded earlier (read_file appends the ".abc" extension).
training_data_text = read_file(relative_path,filename_name)

# Total number of characters in the text file.
print("number of chars:",len(training_data_text))

In [None]:
# Character vocabulary of the whole .abc text.
# NOTE: this rebinds `chars` and `vocab_size`, which an earlier cell computed
# from the "clean_body" column only.
chars = sorted(set(training_data_text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

In [None]:
# Print the installed PyTorch version.
import torch

print(torch.__version__)

In [None]:
# Print the installed transformers version.
import transformers
print(transformers.__version__)

In [None]:
# Print the installed wandb version.
# NOTE(review): tiktoken is imported here but not referenced by any visible
# cell; the import only verifies that the package is installed.
import wandb
import tiktoken

print(wandb.__version__)

In [None]:
# List the current working directory (before the os.chdir in the next code cell).
!ls

In [None]:
from dotenv import load_dotenv
import os

nano_path = 'notebooks/nanoGPT'

# os.chdir with a relative path is not idempotent: running this cell a second
# time would look for notebooks/nanoGPT *inside* notebooks/nanoGPT and raise
# FileNotFoundError. Skip the chdir when the kernel is already in that folder.
# NOTE: once the working directory has changed, relative paths used by earlier
# cells (e.g. "notebooks/data/...") no longer resolve from here.
already_inside = os.path.normpath(os.getcwd()).endswith(
    os.sep + os.path.normpath(nano_path)
)
if not already_inside:
    os.chdir(nano_path)

# Load environment variables via python-dotenv's default lookup. Kept as the
# last expression so the cell still displays the call's return value.
load_dotenv()

In [None]:
# List the working directory again, now that the previous cell changed into nano_path.
!ls

In [None]:
# Dataset with multiple voices present
#length of dataset in characters: 4,149,703
#all the unique characters: 
#"#'()+,-/123456789:=ABCDEFGKLM[]^_abcdefgmz|~
#vocab size: 46
#train has 3,734,732 tokens
#val has 414,971 tokens

In [14]:
# Run the abc_char data preparation script (path is relative to the directory
# entered with os.chdir earlier). Its printed summary is kept as the cell output.
!python3 data/abc_char/prepare.py

length of dataset in characters: 4,062,773
all the unique characters: 
"#'(),-/123456789:=ABCDEFGKLM[]^_abcdefgmz|~
vocab size: 45
train has 3,656,495 tokens
val has 406,278 tokens


In [15]:
# Launch training with the settings from config/train_abc_char.py; the script
# echoes that config file, which is what the cell output below shows.
!python3 train.py config/train_abc_char.py

Overriding config with config/train_abc_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-abc-char'
eval_interval = 10 # keep frequent because we'll overfit
eval_iters = 500
log_interval = 5 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'abc-char'
wandb_run_name = 'mini-char-gpt-hd-8-ly-12-bt-1'

dataset = 'abc_char'
batch_size = 1
block_size = 512 # context of up to 512 previous characters

# baby GPT model :)
n_layer = 12
n_head = 8
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 5 # n

## Test key with most occurrences: G

In [16]:
# Sample from the out-abc-char run, priming generation with a 4/4 header in
# K:G followed by a chord-only bar grid.
!python3 sample.py --out_dir=out-abc-char --start='M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]'

Overriding: out_dir = out-abc-char
Overriding: start = M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]
number of parameters: 21.26M
abc_char
Loading meta from data/abc_char/meta.pkl...
M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]
B/2c/2|"G"ddb|"C"a/2g/2e/2gg/2a/2|"D"ff/2e/2d/2e/2f/2|"D"F/2A/2B/2G/2AB/2c/2|"G"ddb|"C"a/2g/2e/2g/2f/2g3/2e/2|"D"d/2c/2B/2A/2G/2A/2c/2B/2A/2|"G"GBG|]

M:4/4
L:1/4
K:G
|"G"|"G"|"Am"|"D"|"G"|"G"|"D"|"G"|"G"|"Am"|"D"|"G"|"G"|"G"|"Am"|"D"|"G"|]
"G"G3/2B/2d3/2B/2|"G"d3/2B/2g3/2B/2|"Am"e3/2A/2b3/2a/2|"D"g3/2e/2f3/2e/2|"G"d3/2c/2B3/2d/2|"D"A/2G/2F/2A/2d3/2B/2|"G"d3/2B/2g3/2B/2|"G"d3/2B/2g3/2B/2|"Am"e3/2A/2b3/2a/2|"D"d3/2c/2a3/2g/2|"G"g3/2d/2B3/2g/2|"G"d3/2B/2g3/2B/2|"Am"A3/2G/2F3/2A/2|"D"f3/2e/2f3/2g/
---------------
M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]
D|"G"G/2B/2B/2d/2|"C"e/2d/2c/2d/2|"D"DF|"G"G/2B/2B/2d/2|"C"ece/2d/2|"D"f/2e/2d/2c/2|"G"B3|B/2c/2|"G"d/2d/2d/2d/2B/2|"C"ecA|"D"A/2B/2c/2A/2|"G"B/2A/2G/2F/2G/2|"C"EGG/2B/2|"D"A/2G/2F/2E/2|"G"DGG/2A/2|"C"B/

## Test major key with low samples: C

In [17]:
# Same sampling command as for K:G, with the prompt's key and chord grid
# switched to K:C.
!python3 sample.py --out_dir=out-abc-char --start='M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]'

Overriding: out_dir = out-abc-char
Overriding: start = M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]
number of parameters: 21.26M
abc_char
Loading meta from data/abc_char/meta.pkl...
M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]
A/2B/2|"C"c/2ed/2c/2|B/2A/2G/2EA/2B/2|"F"c/2A/2F/2A/2B/2|c/2A/2F/2A/2B/2c/2|"G"d/2cB/2A/2|"G"G/2ed/2c/2|B/2A/2G/2A/2G/2E/2G/2|"C"c/2ed/2c/2|B/2A/2G/2EA/2B/2|"F"c/2A/2F/2A/2B/2c/2|"G"d/2cB/2A/2|G/2e/2d/2c/2B/2A/2G/2|"C"c/2ed/2c/2|B/2A/2G/2EA/2B/2|"F"c/2A/2F/2A/2c/2F/2A/2|B/2A/2G/2Fe/2d/2|"G"g/2fe/2d/2|"C"ced/2c/2|"G"Bd/2c/2B/2A/2G/2e/2|"C"ced/2c/2|B/2A/2G/2EA/2B/2|"F"c/2A/2F/2A/2c/2F/2A/2|B/2A/2G/2F/2A/2B/2c/2|d/2f/2e/2d/2c/2B/2A/2G/2|"G"B/2G/2e/2d/2c/2B/2A/2G/2|"C"cede/2d/2|"F"ced2|"G"Bd/2c/2B/
---------------
M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]
G|"C"cc/2d/2|c3B|A3/2G/2EC|"F"FA"C"G2|"F"F3/2G/2AF|A3/2G/2FC|"G"=EG3/2G/2|"G"cB3|"C"E3/2C/2DC|E3C|"F"FA"C"G2|"F"A3/2G/2FC|"G"=EGA|"C"BGE|"F"F3/2G/2AF|A3/2G/2FC|"G"=EGA|"G"=BGD|GAB|"C"c3/2B/2cE|"F"fff|"G"

## Test minor key with low samples: Am

In [18]:
# Same sampling command again, with the prompt's key and chord grid switched
# to K:Am.
!python3 sample.py --out_dir=out-abc-char --start='M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]'

Overriding: out_dir = out-abc-char
Overriding: start = M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]
number of parameters: 21.26M
abc_char
Loading meta from data/abc_char/meta.pkl...
M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]
A/2|"Am"aae/2g/2|"Dm"a/2g/2f/2e/2f/2g/2|"E"e/2c/2B/2A/2B/2c/2|"E"B/2c/2B/2A/2G/2E/2B/2|"Am"c/2A/2E/2F/2G/2A/2E/2|"Dm"F/2G/2A/2G/2F/2G/2A/2|"E"B/2c/2B/2A/2"E7"G/2A/2B/2G/2|"Am"A2A|]

M:4/4
L:1/4
K:Am
|"Am"|"Am"|"G"|"G"|"Am"|"Am""Em"|"Am"|"Am""Am"|"G"|"G"|"Am"|"Am""G"|"Am"|]
A/2d/2|"Am"e/2d/2e/2f/2e/2f/2g/2|"Am"a/2e/2g/2f/2e/2d/2c/2B/2|"G"d/2c/2d/2B/2GA/2B/2|"G"d/2e/2d/2e/2f/2g/2|"Am"a/2e/2g/2f/2e/2d/2c/2B/2|"Am""Am"cAA|c/2d/2|"Am"e/2a/2b/2c'/2b/2a/2g/2f/2|"Am"e/2a/2b/2c'/2"Em"b/2a/2g/2b/2|"Am"a2
---------------
M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]
E|"Am"A3/2B/2c/2B/2A/2|"Dm"EDE|"E"B3/2A/2G/2F/2|"E"EDE|"E"B3/2A/2G/2F/2|"Am"EDE|"Dm"F3/2E/2DC|"E"B,ED|"E"B3/2A/2GF|"Am"EDE|"Am"A2AB/2c/2|"Dm"d3|]

M:4/4
L:1/4
K:F#
|"F"|"F""C7"|"F"|"F""

## Test older checkpoint

In [None]:
 # Sample with the K:G prompt from the older checkpoint directory instead of out-abc-char.
 # NOTE(review): --path_meta is presumably a locally added sample.py option that
 # points at the directory holding that checkpoint's meta file -- confirm in sample.py.
 !python3 sample.py --out_dir=older_ckpt/m_voices --path_meta=older_ckpt/m_voices --start='M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]'

In [None]:
# NOTE(review): there is no `!` prefix, so this line presumably relies on
# IPython resolving `cat` as a shell alias. The target is a .pkl file,
# presumably binary, so printing it likely floods the output with raw bytes --
# consider inspecting it with `ls -l` or by loading it in Python instead.
cat older_ckpt/m_voices/ckpt.pkl

In [None]:
# Long listing (sizes, permissions, owners) of the older checkpoint directory.
!ls -l older_ckpt/m_voices

In [None]:
# Long listing of the current run's output directory, for comparison with the older checkpoint.
!ls -l out-abc-char/

In [None]:
# Print the user and group IDs that the notebook's shell commands run as.
!id