In [None]:
import pandas as pd
import numpy as np
import re

def load_dataframe(relative_path,dataframe_name):
    """Load a pickled DataFrame from ``<relative_path>/<dataframe_name>.pkl``.

    Args:
        relative_path: Directory that contains the pickle file.
        dataframe_name: File name without the ``.pkl`` extension.

    Returns:
        Whatever object ``pd.read_pickle`` restores from that file.

    Note:
        Unpickling can execute arbitrary code; only use this on trusted files.
    """
    pickle_path = f'{relative_path}/{dataframe_name}.pkl'
    return pd.read_pickle(pickle_path)

def read_file(relative_path,file_name,encoding='utf-8'):
    """Return the full text of ``<relative_path>/<file_name>.abc``.

    Args:
        relative_path: Directory that contains the file.
        file_name: File name without the ``.abc`` extension.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        The file contents as a single string.

    Raises:
        FileNotFoundError: If the file does not exist.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(f'{relative_path}/{file_name}.abc','r',encoding=encoding) as f:
        return f.read()

In [None]:
# Directory and base file name of the dataset to inspect. The two commented-out
# lines below select an alternative directory/file pair instead.
relative_path ="notebooks/data/final_dataset"
filename_name = 'clean_augmented_data'
#filename_name = 'clean_original_training_data'
#relative_path ="notebooks/data/original_dataset"
# Load <relative_path>/<filename_name>.pkl and display its column index.
training_data_df = load_dataframe(relative_path,filename_name)
training_data_df.columns


In [None]:
# Peek at the last rows of the loaded frame.
training_data_df.tail()

In [None]:
# Mean length of the entries in the "clean_header" column.
training_data_df["clean_header"].str.len().mean()

In [None]:
# Mean length of the entries in the "clean_body" column.
training_data_df["clean_body"].str.len().mean()

In [None]:
# Scan every entry of "clean_body": count how many contain a 'z' (tracked as
# `silences`) and gather the distinct characters used across all bodies, with
# each body followed by a newline.
body_column = training_data_df["clean_body"]
silences = sum(1 for body in body_column if 'z' in body)
bodies = "".join(body + "\n" for body in body_column)
chars = sorted(set(bodies))
vocab_size = len(chars)
print('vocab: ',''.join(chars))
print('vocab_size',vocab_size)
print("silences ",silences)

In [None]:
# Read the text file <relative_path>/<filename_name>.abc into one string.
training_data_text = read_file(relative_path,filename_name)

# Total number of characters in that text.
print("number of chars:",len(training_data_text))

In [None]:
# Distinct characters of the raw text in sorted order, and how many there are.
chars = sorted(set(training_data_text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

In [None]:
# Environment check: import PyTorch and report the installed version.
import torch

print(torch.__version__)

In [None]:
# Environment check: import transformers and report the installed version.
import transformers
print(transformers.__version__)

In [None]:
# Environment check: make sure wandb and tiktoken import, and report the
# installed wandb version.
# NOTE(review): tiktoken is not referenced by any other visible cell — confirm
# it is still needed.
import wandb
import tiktoken

print(wandb.__version__)

In [None]:
# List the contents of the current working directory.
!ls

In [2]:
from dotenv import load_dotenv
import shlex
import os

# Work from inside the nanoGPT checkout so the relative paths used by the
# shell cells below (train.py, sample.py, data/, config/) resolve.
nano_path = 'notebooks/nanoGPT'
# The path is relative, so a second os.chdir() after a successful one would
# raise FileNotFoundError. Skip the change when we are already there, which
# makes the cell safe to re-run.
if not os.getcwd().endswith(os.path.normpath(nano_path)):
    os.chdir(nano_path)
# Load environment variables from a .env file (python-dotenv); the return
# value is displayed as the cell output.
load_dotenv()

True

In [3]:
# List the working directory after the chdir to confirm it is the nanoGPT checkout.
!ls

LICENSE      assets	      data	  out-abc-char	wandb
README.md    config	      model.py	  sample.py
__pycache__  configurator.py  older_ckpt  train.py


In [None]:
# Dataset with multiple voices present
#length of dataset in characters: 4,149,703
#all the unique characters: 
#"#'()+,-/123456789:=ABCDEFGKLM[]^_abcdefgmz|~
#vocab size: 46
#train has 3,734,732 tokens
#val has 414,971 tokens

## Train Normal Dataset

In [6]:
# Run the abc_char data preparation script; it prints dataset statistics.
!python3 data/abc_char/prepare.py

length of dataset in characters: 4,062,773
all the unique characters: 
"#'(),-/123456789:=ABCDEFGKLM[]^_abcdefgmz|~
vocab size: 45
train has 3,656,495 tokens
val has 406,278 tokens


In [7]:
# Launch training with the overrides from config/train_abc_char.py
# (train.py echoes the config file it applies).
!python3 train.py config/train_abc_char.py

Overriding config with config/train_abc_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-abc-char'
eval_interval = 10 # keep frequent because we'll overfit
eval_iters = 500
log_interval = 5 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'abc-char'
wandb_run_name = 'mini-char-gpt-hd-8-ly-12-bt4-ovrf'

dataset = 'abc_char'
batch_size = 4
block_size = 512 # context of up to 512 previous characters

# baby GPT model :)
n_layer = 12
n_head = 8
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 8000
lr_decay_iters = 8000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 5

## Train Roman Numeral Dataset

In [4]:
# Run the abc_roman_num_char data preparation script; it prints dataset statistics.
!python3 data/abc_roman_num_char/prepare.py

length of dataset in characters: 4,051,724
all the unique characters: 
"#'(),-/12345689:=ABCDEFGIKLMV[]^_abcdefgimvz|~
vocab size: 48
train has 3,646,551 tokens
val has 405,173 tokens


In [5]:
# Launch training for the Roman-numeral dataset.
# NOTE(review): this is the same config path used for the normal-dataset run.
# The config echoed in this cell's output has dataset = 'abc_roman_num_char',
# so the file was presumably edited between runs — consider a dedicated config
# file so both runs stay reproducible.
!python3 train.py config/train_abc_char.py

Overriding config with config/train_abc_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-abc-char'
eval_interval = 10 # keep frequent because we'll overfit
eval_iters = 500
log_interval = 5 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'abc-char'
wandb_run_name = 'mini-char-gpt-hd-8-ly-12-bt4-rn-data-ovrf'

dataset = 'abc_roman_num_char'
batch_size = 4
block_size = 512 # context of up to 512 previous characters

# baby GPT model :)
n_layer = 12
n_head = 8
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 8000
lr_decay_iters = 8000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

## Test

In [6]:
# Folder that collects the generated samples and the archived checkpoint
# artifacts for this run.
folder_name = 'hd-8-ly-12-bt4-rn-data-ovrf'
examples_folder = f'./older_ckpt/{folder_name}'
# Later cells redirect shell output into this folder and mv/cp files into it;
# all of those fail if the directory is missing, so create it up front.
os.makedirs(examples_folder, exist_ok=True)
examples_folder

'./older_ckpt/hd-8-ly-12-bt4-rn-data-ovrf'

In [7]:
# Prompts passed to sample.py via --start, keyed by musical key.
# `songs_start` writes the chords as letter names.
songs_start = dict(
    G='M:4/4L:1/4K:G|"G"|"C"|"D"|"D"|"G"|"C"|"D"|"G"|]',
    C='M:4/4L:1/4K:C|"C"|"F"|"G"|"G"|"C"|"F"|"G"|"C"|]',
    Am='M:4/4L:1/4K:Am|"Am"|"Dm"|"E"|"E"|"Am"|"Dm"|"E"|"Am"|]',
)

# `songs_roman_start` holds the same keys with the chords written as Roman numerals.
songs_roman_start = dict(
    G='M:4/4L:1/4K:G|"I"|"IV"|"V"|"V"|"I"|"IV"|"V"|"I"|]',
    C='M:4/4L:1/4K:C|"I"|"IV"|"V"|"V"|"I"|"IV"|"V"|"I"|]',
    Am='M:4/4L:1/4K:Am|"i"|"iv"|"V"|"V"|"i"|"iv"|"V"|"i"|]',
)

## Test key with most occurrences: G

In [8]:
# Prompt for key G. The Roman-numeral variant is active; the commented-out
# line switches to the chord-letter variant.
song_start = songs_roman_start['G']
#song_start = songs_start['G']
song_start

'M:4/4L:1/4K:G|"I"|"IV"|"V"|"V"|"I"|"IV"|"V"|"I"|]'

In [9]:
# Run sample.py with --out_dir=out-abc-char and the shell-quoted prompt as
# --start; stdout is redirected to examples_G.txt inside examples_folder.
!python3 sample.py --out_dir=out-abc-char --start={shlex.quote(song_start)} > {examples_folder}/examples_G.txt

## Test major key with few samples: C

In [10]:
# Prompt for key C. The Roman-numeral variant is active; the commented-out
# line switches to the chord-letter variant.
song_start = songs_roman_start['C']
#song_start = songs_start['C']
song_start

'M:4/4L:1/4K:C|"I"|"IV"|"V"|"V"|"I"|"IV"|"V"|"I"|]'

In [11]:
# Run sample.py with --out_dir=out-abc-char and the shell-quoted prompt as
# --start; stdout is redirected to examples_C.txt inside examples_folder.
!python3 sample.py --out_dir=out-abc-char --start={shlex.quote(song_start)} > {examples_folder}/examples_C.txt

## Test minor key with few samples: Am

In [12]:
# Prompt for key Am. The Roman-numeral variant is active; the commented-out
# line switches to the chord-letter variant.
song_start = songs_roman_start['Am']
#song_start = songs_start['Am']
song_start

'M:4/4L:1/4K:Am|"i"|"iv"|"V"|"V"|"i"|"iv"|"V"|"i"|]'

In [13]:
# Run sample.py with --out_dir=out-abc-char and the shell-quoted prompt as
# --start; stdout is redirected to examples_Am.txt inside examples_folder.
!python3 sample.py --out_dir=out-abc-char --start={shlex.quote(song_start)} > {examples_folder}/examples_Am.txt

## Move checkpoint files

In [14]:
# Archive the dataset's meta.pkl in the run folder. This is a move, not a copy,
# so the file no longer exists at `source` afterwards. The commented-out line
# selects the abc_char dataset's meta.pkl instead.
source = './data/abc_roman_num_char/meta.pkl'
#source = './data/abc_char/meta.pkl'
target_folder = examples_folder
!mv {source} {target_folder}/meta.pkl

In [15]:
# Move the checkpoint out of out-abc-char into the run folder
# (relies on `target_folder` set in the previous cell).
source = './out-abc-char/ckpt.pt'
!mv {source} {target_folder}/ckpt.pt

In [16]:
# Keep a copy of the training config next to the archived checkpoint, saved as config.txt.
source = './config/train_abc_char.py'
!cp {source} {target_folder}/config.txt

## Test older checkpoint

In [None]:
# Echo the shell-quoted chord-letter prompt for Am to check how it reaches the shell.
song_start = songs_start['Am']
!echo {shlex.quote(song_start)}

In [None]:
 song_start = songs_start['Am']
 !python3 sample.py --out_dir=older_ckpt/m_voices --path_meta=older_ckpt/m_voices --start={shlex.quote(current_start)}

In [None]:
# Show the user/group identity that the notebook's shell commands run as.
!id

In [None]:
# Report the version of the python3 executable used by the shell cells.
!python3 --version
