In [None]:
import pandas as pd
import numpy as np
import re

def load_dataframe(relative_path,dataframe_name):
    """Load the pickled object stored at ``<relative_path>/<dataframe_name>.pkl``.

    Parameters
    ----------
    relative_path : str
        Directory containing the pickle file.
    dataframe_name : str
        File name without the ``.pkl`` extension.

    Returns
    -------
    The object restored by ``pd.read_pickle`` from that file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    pickle_path = f'{relative_path}/{dataframe_name}.pkl'
    return pd.read_pickle(pickle_path)

def read_file(relative_path,file_name,encoding='utf-8'):
    """Return the full text of ``<relative_path>/<file_name>.abc``.

    Parameters
    ----------
    relative_path : str
        Directory containing the file.
    file_name : str
        File name without the ``.abc`` extension (the extension is appended here).
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's locale default.

    Returns
    -------
    str
        The entire file content (an empty string for an empty file).

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    UnicodeDecodeError
        If the file content is not valid in ``encoding``.
    """
    # An explicit encoding keeps the decoded text identical across machines.
    with open(f'{relative_path}/{file_name}.abc','r',encoding=encoding) as f:
        return f.read()

In [None]:
# Select which pickled dataset to load. The path is relative to the current
# working directory; a later cell calls os.chdir(), so this cell must run
# before that one.
relative_path ="notebooks/data/final_dataset"
filename_name = 'clean_augmented_data'
# Alternative source: uncomment both lines below to load the original dataset instead.
#filename_name = 'clean_original_training_data'
#relative_path ="notebooks/data/original_dataset"
# Reads <relative_path>/<filename_name>.pkl through load_dataframe.
training_data_df = load_dataframe(relative_path,filename_name)
# Last expression of the cell: displays the column index of the loaded frame.
training_data_df.columns


In [None]:
# Preview the last rows of the loaded frame.
training_data_df.tail()

In [None]:
# Largest `.str.len()` value found in the `clean_header` column.
training_data_df["clean_header"].str.len().max()

In [None]:
# Largest `.str.len()` value found in the `clean_body` column.
training_data_df["clean_body"].str.len().max()

In [None]:
# Character vocabulary of the `clean_body` column, plus the number of bodies
# that contain the character 'z' (`z` is the rest symbol in ABC notation,
# hence the name `silences`).
clean_bodies = training_data_df["clean_body"]

# Every body is followed by a newline, exactly as before; a single join avoids
# re-copying the growing string on each iteration.
bodies = "".join(body + "\n" for body in clean_bodies)
# Each `'z' in body` is a bool, so the sum is the count of bodies with a 'z'.
silences = sum('z' in body for body in clean_bodies)

# sorted() already returns a list, so no intermediate list() is needed.
chars = sorted(set(bodies))
vocab_size = len(chars)
print('vocab: ',''.join(chars))
print('vocab_size',vocab_size)
print("silences ",silences)

In [None]:
# Read the whole text of <relative_path>/<filename_name>.abc (read_file appends
# the `.abc` extension) and report how many characters it holds.
training_data_text = read_file(relative_path,filename_name)

print("number of chars:",len(training_data_text))

In [None]:
# Vocabulary of the raw .abc text: its distinct characters in sorted order.
# Note that this rebinds `chars` and `vocab_size` from the earlier body-only cell.
chars = sorted(set(training_data_text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

In [None]:
import torch

# Print the installed torch version.
print(torch.__version__)

In [None]:
import transformers
# Print the installed transformers version.
print(transformers.__version__)

In [None]:
import wandb
# NOTE(review): tiktoken is imported but not used in the visible cells; the
# import may only serve as an availability check — confirm it is needed.
import tiktoken

# Print the installed wandb version.
print(wandb.__version__)

In [1]:
!ls

Dockerfile  docker-compose.yaml  overrides.json
README.md   notebooks		 requirements.txt


In [2]:
from dotenv import load_dotenv
import os

# Directory of the nanoGPT checkout, relative to the directory the notebook starts in.
nano_path = 'notebooks/nanoGPT'

# os.chdir() changes process-wide state and `nano_path` is relative, so running
# this cell a second time in the same kernel would raise FileNotFoundError.
# Skip the chdir when the working directory already ends with `nano_path`.
if not os.path.normpath(os.getcwd()).endswith(os.path.normpath(nano_path)):
    os.chdir(nano_path)

# Kept as the last expression so the cell still displays load_dotenv()'s result.
# NOTE(review): presumably the .env file is found from the nanoGPT directory — confirm.
load_dotenv()

True

In [3]:
!ls

LICENSE      assets	      data	  out-abc-char	wandb
README.md    config	      model.py	  sample.py
__pycache__  configurator.py  older_ckpt  train.py


In [None]:
# Dataset with multiple voices present
#length of dataset in characters: 4,149,703
#all the unique characters: 
#"#'()+,-/123456789:=ABCDEFGKLM[]^_abcdefgmz|~
#vocab size: 46
#train has 3,734,732 tokens
#val has 414,971 tokens

In [4]:
!python3 data/abc_char/prepare.py

length of dataset in characters: 4,062,773
all the unique characters: 
"#'(),-/123456789:=ABCDEFGKLM[]^_abcdefgmz|~
vocab size: 45
train has 3,656,495 tokens
val has 406,278 tokens


In [5]:
!python3 train.py config/train_abc_char.py

Overriding config with config/train_abc_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-abc-char'
eval_interval = 10 # keep frequent because we'll overfit
eval_iters = 500
log_interval = 5 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'abc-char'
wandb_run_name = 'mini-char-gpt-hd-8-ly-12-bt-4-ctx-256'

dataset = 'abc_char'
batch_size = 4
block_size = 256 # context of up to 512 previous characters

# baby GPT model :)
n_layer = 12
n_head = 8
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters

## Test the key with the most occurrences: G

In [6]:
!python3 sample.py --out_dir=out-abc-char --start='M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]'

Overriding: out_dir = out-abc-char
Overriding: start = M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]
number of parameters: 21.26M
abc_char
Loading meta from data/abc_char/meta.pkl...
M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]
B/2c/2|"G"d3/2e/2d/2c/2|B/2G/2D/2G/2|"C"cde|"D"d3/2e/2f/2e/2|"D"d/2B/2A/2G/2|"G"G/2B/2d3/2e/2|"D"d3/2e/2f/2e/2|"G"B/2G/2D3/2G/2|"C"cde|"D"dc"G"c|B/2c/2|"G"d3/2g/2g3/2f/2|"C"e/2c/2e/2g3/2e/2|"D"d/2B/2A/2G/2F/2G/2A/2|"D"c/2A/2d/2f/2a3/2b/2|"C"a/2g/2f/2e/2"G"d/2B/2G/2B/2|"C"c/2e/2d/2e/2f/2g3/2B/2|"D"A/2G/2A/2B/2"G"G2|]

M:4/4
L:1/4
K:Gm
|"Gm"|"Dm"|"Dm"|"Gm"|"Gm""Dm"|"Gm"|"Gm"|"Dm"|"Dm"|"Gm"|"Dm"|"Gm"|"Gm""C7"|"F"|"F"|"B""F"|"F""Dm"|"Gm"|]
D|"Gm"GABBAG|"Dm"Adddef|"Dm"dcAAdd|"Gm"GABBAG|"Gm"dcd"Dm"d2
---------------
M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]
B/2c/2|"G"d3/2d/2e/2d/2|"C"e/2d/2c/2B/2"D"A/2B/2|"G"B/2c/2d/2e/2d/2|"C"e/2d/2c/2B/2c/2G/2|"D"AD/2D/2F/2A/2|"D"A/2G/2F/2E/2D/2E/2F/2|"G"GGG|d/2c/2|"G"B/2d/2g/2d/2B/2d/2g/2d/2|"C"e/2g/2c/2a/2e/2g/2c/2g/2|"

## Test a major key with few samples: C

In [7]:
!python3 sample.py --out_dir=out-abc-char --start='M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]'

Overriding: out_dir = out-abc-char
Overriding: start = M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]
number of parameters: 21.26M
abc_char
Loading meta from data/abc_char/meta.pkl...
M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]
A/2B/2|"C"c/2ee/2c/2e|c/2e/2g/2e/2ce|"F"f/2e/2d/2f/2d/2e/2d/2c/2|"G"B/2c/2d/2e/2f/2e/2d/2|"G"B/2e/2d/2c/2B/2G/2A/2B/2|"C"c/2ee/2c/2e|c/2e/2g/2e/2ce|"F"d/2e/2f/2g/2aa|c/2d/2e/2f/2d/2e/2d/2c/2|"G"B/2c/2d/2e/2f/2e/2d/2f/2|"G"e/2d/2c/2B/2"C"ce|"G"B/2c/2d/2e/2f/2e/2d/2f/2|"C"g/2e/2d/2e/2f/2e/2d/2f/2|"C"g/2e/2d/2c/2B/2c/2d/2|"D7"B/2G/2A/2B/2"G"G|]

M:4/4
L:1/4
K:D
|"D"|"D"|"G"|"A7"|"D"|"D"|"G"|"A7"|"D"|"D"|"G"|"A7"|"D"|]
AG|"D"FDFD|F/2A/2d/2c/2B/2A/2B/2A/2|"D"FDFD|"G"GBE3/2G/2|"A7"c/2B/2A/2B/2c/2B/2A
---------------
M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]
G|"C"e/2e/2e/2d/2e3/2c/2d/2|e/2e/2e/2d/2e/2d/2c/2d/2|"F"c/2c/2c/2B/2A/2G/2F/2|"G"e/2e/2d/2e3/2G/2|"G"d/2d/2d/2e/2f/2e/2d/2c/2|"G"B/2G/2A/2B/2G/2A/2B/2c/2|"G"d/2d/2c/2B/2G/2A/2B/2c/2|"G"d/2d/2c/2B/2"D7"A/

## Test a minor key with few samples: Am

In [8]:
!python3 sample.py --out_dir=out-abc-char --start='M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]'

Overriding: out_dir = out-abc-char
Overriding: start = M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]
number of parameters: 21.26M
abc_char
Loading meta from data/abc_char/meta.pkl...
M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]
A/2B/2|"F"c/2F/2F/2c/2F/2c/2F/2|"F"A/2G/2F/2d/2F/2c/2F/2|"G"=B/2G/2d/2G/2B/2G/2d/2G/2|"G"=B/2G/2d/2G/2e/2G/2d/2G/2|"Am"A/2B/2c/2d/2e/2f/2g/2e/2|"Dm"f/2e/2d/2c/2"A"AB/2c/2|"Dm"d/2^c/2d/2e/2f/2e/2d/2c/2|"E"=B/2G/2A^B|"Am"c/2A/2^B/2c/2"Dm"d/2c/2d|"C"=e/2g/2e/2cg/2e/2|"Dm"f/2e/2d/2c/2"G7"=B/2c/2d|"C"c/2d/2e/2c/2"G7"d/2e/2f/2d/2|"C"c/2d/2e/2c/2"G"=B/2c/2d|"G"d/2e/2f/2d/2"C"e/2f/2g/2e/2|"Dm"f/2e/2d/2c/2"A"Ad/2e/2|"Dm"f/2e/2d/2c/2"G7"=B/2c/2d/2B/2|"C"c/2d/2e/2c/2"G"=B/2c/2d|"F"A/2G/2A/2=B/2"C"c/2G/2E/
---------------
M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]
E|"A"AA/2B/2c/2B/2c/2d/2|"Am"e/2f/2e/2c/2e/2c/2d/2|"Dm"a/2d/2c/2d/2B/2"E"e2|"E"^G/2B/2e/2G/2B/2e/2G/2B/2|"E"^G/2B/2e/2c/2B/2c/2d/2|"E"^g/2e/2c/2B/2c/2d/2e/2c/2|"Am"A/2B/2c/2d/2e/2d/2c/2

## Test an older checkpoint

In [None]:
 !python3 sample.py --out_dir=older_ckpt/m_voices --path_meta=older_ckpt/m_voices --start='M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]'

In [None]:
cat older_ckpt/m_voices/ckpt.pkl

In [None]:
!ls -l older_ckpt/m_voices

In [None]:
!ls -l out-abc-char/

In [None]:
!id