In [None]:
# Seed passed to `random.seed` before the corpus is sub-sampled.
RANDOM_SEED = 42

# Hugging Face Hub identifiers for the tokenizer and the model weights.
TOKENIZER = "nicholasKluge/TeenyTinyLlama-460m"
MODEL = "nicholasKluge/TeenyTinyLlama-460m"

# Hard-coded because the dataset does not provide this metadata;
# used as the progress-bar total while streaming.
TOTAL_SAMPLES = 2_108_999

In [None]:
# Install the notebook's runtime dependencies into the kernel's environment
# (`%pip` targets the running kernel, unlike `!pip`).
%pip install huggingface_hub
# `datasets` is the only package pinned to an exact version.
# NOTE(review): presumably pinned because the corpus is loaded with
# `trust_remote_code=True`, which newer major releases of `datasets` may not
# support — confirm before bumping. Consider pinning the other packages too.
%pip install datasets==3.6.0
%pip install transformers
%pip install torch

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import torch
import random
import math

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the tokenizer and the causal-LM weights from the `main` revision.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER, revision='main')
model = AutoModelForCausalLM.from_pretrained(MODEL, revision='main')

# Inference only: switch to evaluation mode and move the model to `device`.
model = model.eval().to(device)

print(f"Connected to {device}.")

In [None]:
# Open the "corpus" split as a stream so records are read lazily instead of
# being downloaded up front. `trust_remote_code=True` allows the dataset's own
# loading code to run.
ds_stream = load_dataset(
    "carolina-c4ai/corpus-carolina",
    split="corpus",
    streaming=True,
    trust_remote_code=True,
)

In [None]:
# Sanity check: tokenize one short Portuguese sentence and print every token
# id next to its decoded text, followed by the total token count.
# The sentence previously contained mojibake ("Ã©" instead of "é"), which
# would have tokenized differently from real Portuguese text.
test_sentence = "Isto é um teste."
test_input = tokenizer(test_sentence, return_tensors='pt').to(device)

# `.tolist()` yields plain ints, so ids print as numbers rather than tensor reprs.
for token_id in test_input.input_ids[0].tolist():
    print(f"{token_id} -> {tokenizer.decode(token_id)}")

print(f"Number of tokens is {test_input.input_ids.shape[1]}")

In [None]:
# Seed Python's RNG so the random sub-sampling of the stream is repeatable.
random.seed(RANDOM_SEED)

# Batched tokenization pads sequences, which requires a pad token;
# reuse the EOS token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Number of texts scored per forward pass.
BATCH_SIZE = 32

# Running totals: negative log-likelihood (in nats) and character count.
cum_nll_nats, cum_chars = 0, 0

# Buffers of texts awaiting scoring.
batch_texts, big_batch_texts = [], []

# Counter that the streaming loop decrements after each scored batch.
cnt = 1

def process_batch(texts, cum_nll, cum_chr, max_length=4096):
    """Score a batch of texts and add its totals to the running accumulators.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        texts: List of strings to score.
        cum_nll: Running sum of negative log-likelihood, in nats.
        cum_chr: Running count of scored characters.
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated. NOTE(review): confirm this does not exceed the model's
            supported context length.

    Returns:
        Tuple `(cum_nll, cum_chr)` with this batch's contribution added.
        `cum_nll` is accumulated as a plain Python float.
    """
    if not texts:
        return cum_nll, cum_chr

    # Character offsets are only available from "fast" tokenizers; they let us
    # count just the characters that survived truncation.
    use_offsets = bool(getattr(tokenizer, "is_fast", False))

    inputs = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
        return_offsets_mapping=use_offsets,
    ).to(device)

    # Create labels: -100 is ignored by the loss function.
    # Mask by attention mask rather than by pad-token id: when the pad token is
    # aliased to EOS, matching on the id would also drop genuine EOS tokens.
    labels = inputs.input_ids.clone()
    labels[inputs.attention_mask == 0] = -100

    # The model shifts labels internally (labels[..., 1:]), so valid targets
    # are counted on the shifted labels.
    n_valid_tokens = int((labels[..., 1:] != -100).sum().item())
    if n_valid_tokens == 0:
        # Nothing to score: the mean loss would be NaN and poison the total.
        return cum_nll, cum_chr

    with torch.no_grad():
        outputs = model(inputs.input_ids, attention_mask=inputs.attention_mask, labels=labels)

    # Count characters. A text that filled the whole token budget may have been
    # truncated, so only the characters covered by its kept tokens were scored.
    seq_lens = inputs.attention_mask.sum(dim=1).tolist()
    if use_offsets:
        end_offsets = inputs["offset_mapping"][..., 1].max(dim=1).values.tolist()
    else:
        end_offsets = [len(t) for t in texts]

    n_chars = 0
    for text, seq_len, end_offset in zip(texts, seq_lens, end_offsets):
        if use_offsets and seq_len >= max_length:
            n_chars += min(len(text), int(end_offset))
        else:
            n_chars += len(text)

    # outputs.loss is the average loss per valid token in the batch, so it is
    # scaled back to a sum. `.item()` keeps the accumulator off the device.
    cum_nll += outputs.loss.item() * n_valid_tokens
    cum_chr += n_chars
    return cum_nll, cum_chr

# Fraction of streamed records kept for evaluation.
SAMPLE_RATE = 0.1
# Optional cap on the number of full batches scored (handy for a quick smoke
# run). `None` scores the whole sampled stream. The previous `cnt`/`break`
# limiter only left a single-element inner loop, so it never stopped streaming.
MAX_BATCHES = None

n_batches = 0
for record in tqdm(ds_stream, total=TOTAL_SAMPLES):
    # Randomly keep roughly SAMPLE_RATE of the records.
    if random.random() > SAMPLE_RATE:
        continue

    batch_texts.append(record['text'])
    if len(batch_texts) < BATCH_SIZE:
        continue

    cum_nll_nats, cum_chars = process_batch(batch_texts, cum_nll_nats, cum_chars)
    batch_texts.clear()

    n_batches += 1
    if MAX_BATCHES is not None and n_batches >= MAX_BATCHES:
        break

# Process any remaining samples: the final partial batch holds fewer than
# BATCH_SIZE texts, so it is flushed whenever it is non-empty.
if batch_texts:
    cum_nll_nats, cum_chars = process_batch(batch_texts, cum_nll_nats, cum_chars)
    batch_texts.clear()

In [None]:
# Fail with a clear message instead of a bare ZeroDivisionError when no text
# was scored (e.g. the stream was empty).
if cum_chars == 0:
    raise ValueError("No characters were scored; cannot compute bits per character.")

# Convert the accumulated negative log-likelihood from nats to bits
# (divide by ln 2). `float()` accepts both a Python number and a 0-dim tensor,
# so the result prints as a plain number.
cum_nll_bits = float(cum_nll_nats) / math.log(2)
print(f"Bits per character: {cum_nll_bits / cum_chars}")