In [None]:
import torch
import pytorch_lightning as pl
from importlib import reload
from omegaconf import OmegaConf
import sys

sys.path.append(r"../../")
from circe.models.LightningClassifier import LightningClassifier

# Model configuration used to construct the classifier.
cfg_model = OmegaConf.load('../training/conf/model/hf-gpt-starttoken.yaml')

# Placeholder: set this to the checkpoint file to restore before running the cell.
ckpt_path = "???"
model = LightningClassifier(cfg=cfg_model)
# Called before the weights are loaded; presumably this builds the wrapped
# network so the state-dict keys exist — verify against LightningClassifier.
model.configure_sharded_model()
# map_location="cpu" lets a checkpoint written during a GPU run be read on any
# machine (and avoids a second copy of the weights on the GPU);
# load_state_dict then copies the tensors into the model's own parameters.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True;
# a checkpoint that pickles non-tensor objects may then need weights_only=False
# — confirm against the installed version.
model.load_state_dict(torch.load(ckpt_path, map_location="cpu")["state_dict"])
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

In [None]:
# Report the model size in millions of parameters: every parameter, and the
# subset that has requires_grad set.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
pytorch_total_params = sum(size for size, _ in param_sizes) / 1e6
pytorch_trainable_params = sum(size for size, trainable in param_sizes if trainable) / 1e6
print(f"Total params: {pytorch_total_params:.2f}M")
print(f"Trainable params: {pytorch_trainable_params:.2f}M")

In [None]:
# Sampling settings for the unconditional generation cells below.
# NOTE(review): out_dir is not referenced by any visible cell — confirm whether it is still needed.
out_dir = "out"
# How many independent samples to draw.
num_samples = 5
# How many new tokens to generate for each sample.
max_new_tokens = 3000
# Sampling temperature: 1.0 leaves predictions unchanged, lower is less random, higher is more random.
temperature = 0.5
# Keep only the top_k most likely tokens; all others get zero probability.
top_k = 10
seed = 1337
device = "cuda"
# Device family handed to torch.autocast later on.
device_type = "cpu" if "cuda" not in device else "cuda"

In [None]:
# Seed PyTorch's default random number generator and the CUDA generator with
# the configured seed before sampling.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [None]:
from contextlib import nullcontext
from tqdm.notebook import tqdm

# Unconditional sampling: the prompt is empty, so x has shape (1, 0).
num_codebooks = 4
start_ids = []
generated_chunks = []
model_device = next(model.parameters()).device  # keep the prompt on the model's device
x = torch.tensor(start_ids, dtype=torch.long, device=model_device).unsqueeze(0)
# run generation
# Mixed precision (float16 autocast) is used unless the configured device is the CPU.
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=torch.float16)
with torch.no_grad(), ctx:
    for k in tqdm(range(num_samples)):
        # presumably emits tokens in groups of `multiple` (one per codebook) — verify in generate_multiple
        y = model.generate_multiple(x, max_new_tokens, multiple=num_codebooks, temperature=temperature, top_k=top_k)
        # Keep the first (only) batch element as a plain Python list.
        chunk = y[0].tolist()
        generated_chunks.append(chunk)
        print(chunk)
        print('---------------')

In [None]:
from encodec import EncodecModel
from encodec.utils import convert_audio
from einops import rearrange
import torchaudio
import torch
from IPython.display import Audio

# Instantiate a pretrained EnCodec model to turn generated codes back into audio
encodec = EncodecModel.encodec_model_24khz()
encodec.eval()
# NOTE(review): EnCodec's README pairs 1.5 kbps with 2 codebooks and 3 kbps with
# 4 for the 24 kHz model, while num_codebooks is 4 here. Decoding appears to use
# however many codebooks are passed in, so this looks harmless — confirm.
encodec.set_target_bandwidth(1.5)
# Decode the sample at index 3. The flat token list holds the num_codebooks
# tokens of each time step next to each other, so un-interleave it first.
codes = torch.tensor(generated_chunks[3])
codes = rearrange(codes, "(s nq) -> 1 nq s", nq=num_codebooks)  # [B, n_q, T]
# Inference only: skip autograd bookkeeping while decoding.
with torch.no_grad():
    decoded_wav = encodec.decode([(codes, None)])
Audio(decoded_wav.squeeze().detach(), rate=encodec.sample_rate)

## Conditioning with other sad piano songs

In [None]:
# Placeholder: set this to the saved array of codes that the next cell reads with np.load.
path_to_song = "???"

In [None]:
import numpy as np
# Load the conditioning song's codes. The generation cell below rearranges them
# with the pattern "1 q n", so a shape of (1, num_codebooks, n) is expected.
codes = np.load(path_to_song)
# Display the shape as the cell output.
codes.shape

In [None]:
# Sampling settings for the conditioned generation cells below.
# NOTE(review): out_dir is not referenced by any visible cell — confirm whether it is still needed.
out_dir = "out"
# How many independent samples to draw.
num_samples = 5
# How many new tokens to generate for each sample.
max_new_tokens = 3000
# Sampling temperature: 1.0 leaves predictions unchanged, lower is less random, higher is more random.
temperature = 1.25
# Keep only the top_k most likely tokens; all others get zero probability.
top_k = 20
seed = 1337
device = "cuda"
# Device family handed to torch.autocast later on.
device_type = "cpu" if "cuda" not in device else "cuda"

In [None]:
# Re-seed PyTorch's default random number generator and the CUDA generator with
# the configured seed before the conditioned sampling run.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [None]:
from contextlib import nullcontext
from tqdm.notebook import tqdm
from einops import rearrange

num_codebooks = 4
generated_chunks = []
# Flatten the loaded codes time step by time step, so the num_codebooks tokens
# of one frame sit next to each other in the sequence.
start_ids = rearrange(codes, "1 q n -> (n q)", q=num_codebooks)
# Start the prompt at the middle frame because there is probably some active
# music there. Counting in whole frames keeps the start on a frame boundary for
# any num_codebooks (for 4 codebooks this equals the former (len // 8) * 4).
num_frames = start_ids.shape[-1] // num_codebooks
first_id = (num_frames // 2) * num_codebooks
print(first_id)
# Use at most block_size tokens from that point as the conditioning prompt.
start_ids = start_ids[first_id:first_id + cfg_model.block_size]
# Add a batch dimension and keep the prompt on the model's device.
x = torch.tensor(start_ids, dtype=torch.long, device=next(model.parameters()).device)[None, ...]
# run generation
# Mixed precision (float16 autocast) is used unless the configured device is the CPU.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=torch.float16)
with torch.no_grad(), ctx:
    for k in tqdm(range(num_samples)):
        # presumably emits tokens in groups of `multiple` (one per codebook) — verify in generate_multiple
        y = model.generate_multiple(x, max_new_tokens, multiple=num_codebooks, temperature=temperature, top_k=top_k)
        # Keep the first (only) batch element as a plain Python list.
        generated_chunks.append(y[0].tolist())
        print(generated_chunks[-1])
        print('---------------')

In [None]:
from encodec import EncodecModel
from encodec.utils import convert_audio
from einops import rearrange
import torchaudio
import torch
from IPython.display import Audio

# Instantiate a pretrained EnCodec model to turn generated codes back into audio
encodec = EncodecModel.encodec_model_24khz()
encodec.eval()
# NOTE(review): EnCodec's README pairs 1.5 kbps with 2 codebooks and 3 kbps with
# 4 for the 24 kHz model, while num_codebooks is 4 here. Decoding appears to use
# however many codebooks are passed in, so this looks harmless — confirm.
encodec.set_target_bandwidth(1.5)
# Decode the sample at index 4. The flat token list holds the num_codebooks
# tokens of each time step next to each other, so un-interleave it first.
# NOTE(review): this overwrites `codes` (the loaded conditioning song); reload
# the song before re-running the generation cell above.
codes = torch.tensor(generated_chunks[4])
codes = rearrange(codes, "(s nq) -> 1 nq s", nq=num_codebooks)  # [B, n_q, T]
# Inference only: skip autograd bookkeeping while decoding.
with torch.no_grad():
    decoded_wav = encodec.decode([(codes, None)])
Audio(decoded_wav.squeeze().detach(), rate=encodec.sample_rate)