#### This file is responsible for building the token vocabulary 

1. Scan all the saved token sequences
2. Build a list of unique tokens
3. Map each to an integer
4. Save the mapping to disk as `vocab.json` (or `vocab.pkl`)

Load the REMI tokens

In [6]:
import json 
from pathlib import Path   
from collections import Counter

In [7]:
# The segments file lives in the project root, taken to be the parent of the
# notebook's current working directory.
project_root = Path.cwd().parent
remi_segments_path = project_root / "remi_segments.jsonl"

# JSON Lines: one JSON object per line. Read explicitly as UTF-8 so the result
# does not depend on the platform's default encoding, and skip blank lines
# (e.g. a trailing empty line) instead of letting json.loads raise on them.
remi_segments = []
with open(remi_segments_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            remi_segments.append(json.loads(line))

# Count how many times each token occurs across all loaded segments.
token_counter = Counter()
for segment in remi_segments:
    token_counter.update(segment["tokens"])

Build the vocab mapping with `<PAD>` and `<UNK>` reserved as the first two entries

In [8]:
# Reserved entries are listed first so that "<PAD>" gets index 0 and "<UNK>"
# gets index 1.
special_tokens = ["<PAD>", "<UNK>"]

# Data tokens follow in sorted order so the mapping is deterministic. Any data
# token that happens to share a reserved name is dropped here: a duplicate in
# all_tokens would make the dict below overwrite the reserved index with a
# later one, leaving a gap in the index range.
all_tokens = special_tokens + sorted(
    tok for tok in token_counter if tok not in special_tokens
)

# token -> index, and the inverse index -> token lookup.
vocab = {tok: idx for idx, tok in enumerate(all_tokens)}
reverse_vocab = {idx: tok for tok, idx in vocab.items()}

Save the vocab

In [9]:
# Write the token -> index mapping next to the segments file as indented JSON,
# then report where it went and how many entries it holds.
vocab_path = project_root.joinpath("vocab.json")
with vocab_path.open("w") as f:
    json.dump(vocab, f, indent=2)
print(f"Vocab saved to {vocab_path} ({len(vocab)} tokens)")

Vocab saved to d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\vocab.json (1769 tokens)


Encode/decode functions

In [10]:
def encode(tokens, token_to_id=None):
    """Convert a sequence of tokens into vocabulary indices.

    Args:
        tokens: Iterable of tokens to look up.
        token_to_id: Optional token -> index mapping. When omitted, the
            notebook-level ``vocab`` is used. Must contain ``"<UNK>"`` if any
            token may be missing from it.

    Returns:
        A tuple ``(indices, oov_count)``: the list of indices (unknown tokens
        are replaced by the index of ``"<UNK>"``) and the number of tokens
        that were not found in the mapping.

    Raises:
        KeyError: If an unknown token is encountered and the mapping has no
            ``"<UNK>"`` entry.
    """
    mapping = vocab if token_to_id is None else token_to_id

    indices = []
    oov_count = 0
    for token in tokens:
        if token in mapping:
            indices.append(mapping[token])
        else:
            # Out-of-vocabulary: fall back to the reserved unknown index.
            indices.append(mapping["<UNK>"])
            oov_count += 1
    return indices, oov_count

def decode(indices, id_to_token=None):
    """Convert a sequence of vocabulary indices back into tokens.

    Args:
        indices: Iterable of indices to look up.
        id_to_token: Optional index -> token mapping. When omitted, the
            notebook-level ``reverse_vocab`` is used.

    Returns:
        A tuple ``(tokens, oov_count)``: the list of tokens (indices missing
        from the mapping become ``"<UNK>"``) and the number of ``"<UNK>"``
        tokens in the result. Note that this count includes indices that
        legitimately map to ``"<UNK>"``, not only indices that were missing.
    """
    mapping = reverse_vocab if id_to_token is None else id_to_token

    # Single pass over the input so one-shot iterators are handled correctly.
    tokens = [mapping.get(idx, "<UNK>") for idx in indices]
    oov_count = tokens.count("<UNK>")
    return tokens, oov_count