# Mount and import

In [31]:
from google.colab import drive
drive.mount('/content/drive')

# Base folder path
data_folder = '/content/drive/My Drive/Colab Notebooks/MARG/Deep Realbook/data/'
models_folder = '/content/drive/My Drive/Colab Notebooks/MARG/Deep Realbook/models/'
utils_folder = '/content/drive/My Drive/Colab Notebooks/MARG/Deep Realbook/utils/'

import sys
sys.path.append(utils_folder)

import importlib
import utils, config

importlib.reload(utils)  # Reload the module after making changes
importlib.reload(config)  # Reload the module after making changes

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<module 'config' from '/content/drive/My Drive/Colab Notebooks/MARG/Deep Realbook/utils/config.py'>

# Preprocess

## Chord sequences

In [2]:
from utils import extract_chords_from_json

file_name = 'playlist.json'
json_file_path = data_folder + file_name
chord_sequences = extract_chords_from_json(json_file_path)

for song, chords in chord_sequences.items():
    print(f"{song}: {chords}")

9.20 Special: ['D9', '|', 'F-6', '|', 'D9', '|', 'F-6', '|', 'C', '|', 'C7', 'B7', 'Bb7', 'A7', '|', 'D9', '|', 'G7', 'Ab7', 'G7', '|', 'D9', '|', 'F-6', '|', 'D9', '|', 'F-6', '|', 'C', '|', 'C7', 'B7', 'Bb7', 'A7', '|', 'D9', '|', 'G7', 'C6', '|', 'C7', '|', 'C7', '|', 'F6', '|', 'F6', '|', 'D7', '|', 'D7', '|', 'G7', '|', 'G7', '|', 'D9', '|', 'F-6', '|', 'D9', '|', 'F-6', '|', 'C', '|', 'C7', 'B7', 'Bb7', 'A7', '|', 'D9', '|', 'G7', 'C6']
26-2: ['F^7', 'Ab7', '|', 'Db^7', 'E7', '|', 'A^7', 'C7', '|', 'C-7', 'F7', '|', 'Bb^7', 'Db7', '|', 'Gb^7', 'A7', '|', 'D-7', 'G7', '|', 'G-7', 'C7', '|', 'F^7', 'Ab7', '|', 'Db^7', 'E7', '|', 'A^7', 'C7', '|', 'C-7', 'F7', '|', 'Bb^7', 'Ab7', '|', 'Db^7', 'E7', '|', 'A^7', 'C7', '|', 'F^7', '|', 'C-7', 'F7', '|', 'E-7', 'A7', '|', 'D^7', 'F7', '|', 'Bb^7', '|', 'Eb-7', '|', 'Ab7', '|', 'Db^7', '|', 'G-7', 'C7', '|', 'F^7', 'Ab7', '|', 'Db^7', 'E7', '|', 'A^7', 'C7', '|', 'C-7', 'F7', '|', 'Bb^7', 'Ab7', '|', 'Db^7', 'E7', '|', 'A^7', 'C7', '|', 

## Vector representations

In [3]:
from utils import create_vector_representation

vector_representations = create_vector_representation(chord_sequences)

for song, vectors in vector_representations.items():
    print(f"{song}: {vectors}")

Output hidden; open in https://colab.research.google.com to view.

## Circular representations

In [5]:
from utils import create_circular_representations

circular_representations = create_circular_representations(vector_representations)

for song, circulars in circular_representations.items():
    print(f"{song}: {circulars}")

9.20 Special: ['AADADABCBACBF', 'AADADACBCABCF', 'AADADABCBACBF', 'AAADABACBACBF', 'AAADAADAACADF', 'AACBACBACBCBE', 'AACBACBACBCBE', 'AACBACBACBCBE', 'AACBCADACBADF', 'ABACBADABCBCF', 'AABCABCABCBCE', 'AACBACBACBCBE', 'ABCADACBCBACF', 'AADADABCBACBF', 'AADADACBCABCF', 'AADADABCBACBF', 'AAADABACBACBF', 'AAADAADAACADF', 'AACBACBACBCBE', 'AACBACBACBCBE', 'AACBACBACBCBE', 'AACBCADACBADF', 'ABACBADABCBCF', 'ABACBADACABCE', 'AAADAADABCADF', 'AAADAADAADADF', 'ABACBADACABCF', 'AAADAADADAADF', 'AABCAADAADADF', 'AAADAADAADADF', 'AACBCABACBADF', 'AAADAADAADADF', 'ABCADACBCBACF', 'AADADABCBACBF', 'AADADACBCABCF', 'AADADABCBACBF', 'AAADABACBACBF', 'AAADAADAACADF', 'AACBACBACBCBE', 'AACBACBACBCBE', 'AACBACBACBCBE', 'AACBCADACBADF', 'ABACBADABCBCF', 'ABACBADACABCE']
26-2: ['AACBBCACBAADE', 'AAADCABACBADF', 'AACBBCACBAADE', 'AAADCABACBADF', 'AACBBCACBAADE', 'AACBAADAADADF', 'AADACABACBADE', 'AAADCABACBADF', 'AACBBCACBAADE', 'AAADCABACBADF', 'AACBBCACBAADE', 'AACBCABCABADF', 'AADACABACBADE', 'AACBAADA

## Mapped results

In [6]:
from config import tension_intervals_reduced
from utils import chord_to_vector, map_vectors_to_categories, get_lexicographically_smallest_rotation

from itertools import product

# Define the roots and tensions
first_root = 'C'
other_roots = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
tensions = list(tension_intervals_reduced.keys())

# Generate all combinations for chord-to-chord mapping with first root as 'C'
chord_to_chord_pairs = [(first_root + tension1, root2 + tension2) for tension1, root2, tension2 in product(tensions, other_roots, tensions)]

# Generate pairs involving 'NC', using 'C' as the root
nc_pairs = [(first_root + tension, 'NC') for tension in tensions] + \
           [('NC', first_root + tension) for tension in tensions] + \
           [('NC', 'NC')]

# Combine both sets of pairs
all_chord_pairs = chord_to_chord_pairs + nc_pairs

# Process each chord pair
mapped_results = []
for first_chord, next_chord in all_chord_pairs:
    vector1 = chord_to_vector(first_chord)
    vector2 = chord_to_vector(next_chord)
    categories = map_vectors_to_categories(vector1, vector2)
    smallest_rotation = get_lexicographically_smallest_rotation(categories)
    mapped_results.append((first_chord, next_chord, smallest_rotation))

# Print some examples from the results
print("First few mapped results:")
for example in mapped_results[:5]:
    print(example)


First few mapped results:
('C', 'C', ['A', 'A', 'A', 'A', 'D', 'A', 'A', 'A', 'D', 'A', 'A', 'D'])
('C', 'C7#11', ['A', 'A', 'A', 'D', 'A', 'C', 'D', 'A', 'A', 'C', 'A', 'D'])
('C', 'C-7', ['A', 'A', 'C', 'A', 'D', 'A', 'A', 'C', 'B', 'A', 'A', 'D'])
('C', 'C13#9', ['A', 'A', 'C', 'D', 'A', 'A', 'D', 'A', 'C', 'C', 'A', 'D'])
('C', 'C69', ['A', 'A', 'D', 'A', 'C', 'A', 'A', 'D', 'A', 'C', 'A', 'D'])


# Save and load preprocessed data

In [7]:
import json

# Save as JSON
with open(data_folder + 'chord_sequences.json', 'w') as file:
    json.dump(chord_sequences, file)

with open(data_folder + 'vector_representations.json', 'w') as file:
    json.dump(vector_representations, file)

with open(data_folder + 'circular_representations.json', 'w') as file:
    json.dump(circular_representations, file)

with open(data_folder + 'mapped_results.json', 'w') as file:
    json.dump(mapped_results, file)

In [8]:
import json

# Load the dictionary
with open(data_folder + 'chord_sequences.json', 'r') as file:
    chord_sequences = json.load(file)

with open(data_folder + 'vector_representations.json', 'r') as file:
    vector_representations = json.load(file)

with open(data_folder + 'circular_representations.json', 'r') as file:
    circular_representations = json.load(file)

with open(data_folder + 'mapped_results.json', 'r') as file:
    mapped_results = json.load(file)

# GPT2 fine_tune

## Import and device

In [9]:
!pip install accelerate -U

import torch
from tokenizers import Tokenizer, models, trainers
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m286.7/290.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m57.8 M

## Tokenizer

In [10]:
representation_to_use = circular_representations

# Extract unique sequences
unique_sequences = set()

for sequences in representation_to_use.values():
    unique_sequences.update(sequences)

# Write the sequences to a file
with open("unique_sequences.txt", "w") as file:
    for sequence in unique_sequences:
        file.write(sequence + "\n")

In [11]:
# Initialize a tokenizer with BPE
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Train the tokenizer
tokenizer.train(["unique_sequences.txt"], trainer)

# Save the tokenizer
tokenizer.save(models_folder + "custom_tokenizer")

## Config

In [12]:
from config import validation_set

# Split data into training and validation
train_sequences = {k: v for k, v in representation_to_use.items() if k not in validation_set}
validation_sequences = {k: representation_to_use[k] for k in validation_set}

# Write the training sequences to a file
with open("training_sequences.txt", "w") as file:
    for sequences in train_sequences.values():
        file.write(" ".join(sequences) + "\n")

# Write the validation sequences to a file
with open("validation_sequences.txt", "w") as file:
    for sequences in validation_sequences.values():
        file.write(" ".join(sequences) + "\n")

In [16]:
# Load your custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file=models_folder + "custom_tokenizer")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
model.resize_token_embeddings(len(tokenizer))

# Prepare datasets for training and validation
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="training_sequences.txt",
    block_size=512
)
validation_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="validation_sequences.txt",
    block_size=512
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Update TrainingArguments to include evaluation during training
training_args = TrainingArguments(
    output_dir=models_folder + "gpt2-circular-reps_custom_token",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=4,
    save_steps=1_000,
    save_total_limit=3,
    evaluation_strategy="epoch",  # Evaluate each epoch
)

# Instantiate Trainer with validation dataset
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:
from transformers import TrainerCallback

class SaveOnBestValidationLossCallback(TrainerCallback):
    def __init__(self):
        self.best_loss = float('inf')

    def on_evaluate(self, args, state, control, **kwargs):
        # Check if the current validation loss is better (lower) than the best loss
        if state.log_history:
            current_loss = state.log_history[-1].get('eval_loss')
            if current_loss and current_loss < self.best_loss:
                self.best_loss = current_loss
                # Save the model
                model.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)

# Add the callback to your trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    callbacks=[SaveOnBestValidationLossCallback()]
)

## Train

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.062236
2,No log,0.892454
3,No log,0.821731
4,No log,0.769424
5,1.235400,0.740422
6,1.235400,0.709799
7,1.235400,0.691685
8,1.235400,0.6863
9,0.837300,0.669823
10,0.837300,0.657022


Checkpoint destination directory /content/drive/My Drive/Colab Notebooks/MARG/Deep Realbook/models/gpt2-circular-reps_custom_token/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/drive/My Drive/Colab Notebooks/MARG/Deep Realbook/models/gpt2-circular-reps_custom_token/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/drive/My Drive/Colab Notebooks/MARG/Deep Realbook/models/gpt2-circular-reps_custom_token/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 

## Inference and decoding

In [19]:
# Load the best model saved by the callback
model_path = training_args.output_dir  # The directory where the best model is saved
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

# Ensure the device is set correctly for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(5201, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=5201, bias=False)
)

In [20]:
# Create a text generation pipeline using the best model
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)

In [34]:
# Define a starting prompt
prompt = "AADACABACBADF"  # Example starting sequence

# The length you want for the generated part
desired_length = 400

# Calculate the total length including the prompt
total_length = len(tokenizer.encode(prompt)) + desired_length

# Generate text with the specified total length
generated_sequences = [sequence['generated_text'] for sequence in generator(prompt, max_length=total_length, num_return_sequences=3)]

print(generated_sequences)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['AADACABACBADF AAADC ABACBAD F AAADAAD AAADD F ABACBAC ADB AC F AADAC ABACBAD F AAAD AADACBAD F AADACCB ACBAD F ABADB ACBC ABC F AADAC ABACBAD F ABACBAD ACABC F ABADACBAC ABC F AACBAD AADACD F AABCABC ABADB F AAACDAAD AAAD F AAAD AACBD AAD F AAADACBAD AAD F AAAD AB AADAAD F AAADACB ACBAD F AACBAD AADACD E ABADB ACBC ABC F AADAC ABACBAD F ABACBAD ACABC F ABACBACABCADF\n AAADACBAD AAD F AAADABCAD AAD F AAADACBAD AAD F AADACBAC ABAD F AAADAC AADAAD F AACBAD AADACD F AAAD AB AADAAD F AAADC ABACABD F AAADBAC ABACD F AACBAADAC ABD F AADACABC ABAD F AADAC ABACBAD E AAADC ABACBAD F AACBB AC ABCAD E AACBACBCB AAD E ABACBACBCB AC F AACBACBCB AAD E AAADC ABACBAD F AAADAAD AAADD F AACBACBAC ADB F AACBACB ACBCB E AAADC ABACBAD F AADAADAC ABBC F AACBACB ACBCB E AAAD AADACBAD F AADACCB ACBAD E ABADB ACBC ABC F AAADACBAD AAD F AAADABCAD AAD F AAADACBAD AAD F AADACBAC ABAD F AAADAC AADAAD F AACBAD AADACD F AAAD AB AADAAD F AAADC ABACABD F AAADBAC ABACD F AACBAADAC ABD F AADACABC ABAD F AADAC ABACBAD E

In [35]:
from utils import process_and_transpose_sequences

rearranged_sequences = []

for sequence in generated_sequences:
    # Remove all existing spaces
    sequence_no_spaces = sequence.replace(" ", "")

    new_sequence = ''
    for char in sequence_no_spaces:
        new_sequence += char
        if char in ['E', 'F']:
            new_sequence += ' '

    # Strip trailing space and add to list
    rearranged_sequences.append(new_sequence.strip())

# rearranged_sequences now contains the modified sequences

# Process and transpose the generated sequences
all_transposed_sequences, all_final_sequences = process_and_transpose_sequences(rearranged_sequences, mapped_results)

# Print the final transposed chord stream for each generated sequence
for sequence_index, transposed_chords in enumerate(all_transposed_sequences):
    print(f"Transposed Sequence {sequence_index + 1}:")
    for first_chord, second_chord in transposed_chords:
        print(f"{first_chord} {second_chord}")
    print()  # Print a new line for separation between sequences

# Print the final sequence as a stream of first chords with bar tokens
for sequence_index, final_sequence in enumerate(all_final_sequences):
    print(f"Final Chord Stream {sequence_index + 1}: {' '.join(final_sequence)}")
    print()  # Print a new line for separation between sequences

Generated Sequence: AADACABACBADF AAADCABACBADF AAADAADAAADDF ABACBACADBACF AADACABACBADF AAADAADACBADF AADACCBACBADF ABADBACBCABCF AADACABACBADF ABACBADACABCF ABADACBACABCF AACBADAADACDF AABCABCABADBF AAACDAADAAADF AAADAACBDAADF AAADACBADAADF AAADABAADAADF AAADACBACBADF AACBADAADACDE ABADBACBCABCF AADACABACBADF ABACBADACABCF ABACBACABCADF 
AAADACBADAADF AAADABCADAADF AAADACBADAADF AADACBACABADF AAADACAADAADF AACBADAADACDF AAADABAADAADF AAADCABACABDF AAADBACABACDF AACBAADACABDF AADACABCABADF AADACABACBADE AAADCABACBADF AACBBACABCADE AACBACBCBAADE ABACBACBCBACF AACBACBCBAADE AAADCABACBADF AAADAADAAADDF AACBACBACADBF AACBACBACBCBE AAADCABACBADF AADAADACABBCF AACBACBACBCBE AAADAADACBADF AADACCBACBADE ABADBACBCABCF AAADACBADAADF AAADABCADAADF AAADACBADAADF AADACBACABADF AAADACAADAADF AACBADAADACDF AAADABAADAADF AAADCABACABDF AAADBACABACDF AACBAADACABDF AADACABCABADF AADACABACBADE ABACBADACABCF ABACBACABCADF 
AAAADAADAAADF AAADACBACABCE AACBADAADACBE AACBCABCABADF AAADAADADAADF AADACBABCABC

In [36]:
formatted_chord_sequences = []

for transposed_chords in all_transposed_sequences:
    # Join the first chord of each pair with the specified format
    formatted_sequence = "|".join([f"{chord[0]}." for chord in transposed_chords]) + "|"
    formatted_chord_sequences.append(formatted_sequence)

# Print the formatted chord sequences
for sequence_index, sequence in enumerate(formatted_chord_sequences):
    print(f"Formatted Sequence {sequence_index + 1}: {sequence}")


Formatted Sequence 1: C-7.|F7.|Bb^7.|Bb^7.|A-7.|D7.|B-7.|E7b9.|A-7.|D7.|E-7.|Gbh7.|B7b9.|E-.|E-^7.|E-7.|Eh7.|G7.|Dbh7.|Gb7b9.|B-7.|E7.|Gb-7.|E-7.|Eh7.|G-7.|Gh7.|C-.|Ch7.|Fh7.|Ab^7.|Db^7.|Ab^7.|Ab-7.|Db-7.|Gb7.|B^7.|Gb-7.|F7.|E-7.|Eb7.|Ab^7.|Ab^7.|G7.|Gb7.|B^7.|Ab7.|G7.|E-7.|A7b9.|D-7.|Dh7.|F-7.|Fh7.|Bb-.|Bbh7.|Ebh7.|Gb^7.|B^7.|Gb^7.|Gb-7.|B-7.|E7.|Gb-7.|E-.|E-.|Gbh7.|B7.|E-7.|E-7.|Db7b9.|Gb-.|Gb-.|Abh7.|Db7.|Gb-7.|Gb-7.|Eb7.|D7b9.|G-.|G-.|Ah7.|D7.|G-7.|Ab-7.|A-7.|D7.|Gbh7.|B7.|E7.|A7.|D7.|Ab-7.|Db7.|Gb-7.|B7.|E^7.|Gbh7.|B7.|E-7.|
Formatted Sequence 2: C-7.|F7.|F7.|F-7.|Bb7.|Eb^7.|Eb^7.|Eb^7.|D-7.|G7.|C-7.|F7.|D7.|G-7.|C-7.|C7.|F-7.|Bb7.|G-7.|C7.|F-7.|Bb7.|Ab-.|Ab7b9#5.|Db-7.|Db7.|Gb-7.|B7.|Ab-7.|Db7.|Gb-7.|B7.|A.|D.|Ebh7.|Gb-7.|B7.|E^7.|Db-7.|Gb7.|B^7.|E-7.|A7.|D^7.|A^7.|Ah7.|B-7.|E7.|A^7.|Ab7.|G7b9.|C^7.|F^7.|B7.|Bb^7.|Bb^7.|Ab7#11.|Ab7#9.|Db-7.|Db7.|Gb-7.|B7.|Ab-7.|Db7.|Gb-7.|B7.|A-.|A7b9#5.|D-7.|D7.|G-7.|C7.|A-7.|D7.|G-7.|C7.|Bb.|Eb.|Eh7.|G-7.|C7.|F^7.|D-7.|G7.|C^7.|F-7.|Bb7.|Eb^7.|