In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import SequentialLR, LinearLR, CosineAnnealingLR
import re

  from .autonotebook import tqdm as notebook_tqdm


# Load a pre-trained model

## Load from Hugging Face

In [None]:
# Fetch the TinyLlama chat checkpoint by its Hub identifier, move the model to
# the selected device, and store tokenizer + weights in the local "tinyllama"
# directory.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
model = model.to(device)

# Tokenizer first, then model, both into the same directory.
for artifact in (tokenizer, model):
    artifact.save_pretrained("tinyllama")

## Load from local repo

In [None]:
# Load tokenizer and float32 weights from the local ./tinyllama directory and
# place the model on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

local_dir = "./tinyllama"
tokenizer = AutoTokenizer.from_pretrained(local_dir)
model = AutoModelForCausalLM.from_pretrained(local_dir, torch_dtype=torch.float32)
model = model.to(device)

In [None]:
# Sample a continuation of the prompt from whichever model is currently loaded
# and print the decoded text (prompt included, special tokens stripped).
prompt = "Il était tard lorsque  K. arriva. Une neige épaisse couvrait le village. La colline était cachée par la brume et par la nuit, nul rayon de lumière n’indiquait le grand Château."
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Nucleus sampling settings, gathered in one place.
sampling_kwargs = dict(
    max_new_tokens=128,
    temperature=0.8,
    top_p=0.9,
    do_sample=True,
)

with torch.no_grad():
    output = model.generate(**inputs, **sampling_kwargs)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

## Prepare the dataset

In [4]:
# Turn the raw book text into one continuous line of prose and write it to
# clean.txt.
with open("chateau.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# Applied strictly in this order: the page-number pattern relies on the en
# dashes and line breaks that the later rules rewrite.
substitutions = [
    (r"–\s*\d+\s*–\n", ""),  # page-number lines such as "– 12 –"
    (r"-\n", ""),            # words hyphenated across a line break
    (r"\n", " "),            # every remaining line break becomes a space
    (r"–", "-"),             # keep a single kind of dash
]
for pattern, replacement in substitutions:
    txt = re.sub(pattern, replacement, txt)

with open("clean.txt", "w", encoding="utf-8") as f:
    f.write(txt)

## Fine-tune on Le Château by Kafka

In [None]:
# Fine-tune the loaded causal language model on the cleaned text.
# Uses `model`, `tokenizer` and `device` created by an earlier cell.
with open("clean.txt", encoding="utf-8") as f:
    text = f.read()
tokens = tokenizer.encode(text)

seq_len = 128
stride = seq_len // 2  # consecutive windows overlap by half their length

# Every window holds seq_len + 1 tokens, i.e. seq_len (token -> next token)
# pairs. The SAME window is passed as input_ids and as labels: Hugging Face
# causal-LM heads shift the labels by one position internally, so pre-shifting
# the targets by hand would train the model to predict the token two steps
# ahead instead of the next one.
windows = [
    tokens[i : i + seq_len + 1]
    for i in range(0, len(tokens) - seq_len - 1, stride)
]
if not windows:
    raise ValueError(
        f"clean.txt yields {len(tokens)} tokens, too few for a window of {seq_len + 1}"
    )

X = torch.tensor(windows, dtype=torch.long)
loader = DataLoader(TensorDataset(X), batch_size=2, shuffle=True)

# One optimizer/scheduler step per batch, so the schedule horizon is the number
# of batches (not the number of windows).
total_steps = len(loader)
print(total_steps)

optimizer = AdamW(model.parameters(), lr=1e-6)

# Linear warm-up over the first 5% of the steps, cosine decay afterwards.
# The max(1, ...) guards keep both schedulers valid on very small corpora.
warmup_steps = max(1, int(total_steps * 0.05))
scheduler = SequentialLR(
    optimizer,
    schedulers=[
        LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps),
        CosineAnnealingLR(optimizer, T_max=max(1, total_steps - warmup_steps), eta_min=1e-7),
    ],
    milestones=[warmup_steps],
)

# Mixed precision and loss scaling are CUDA features; switch them off on CPU.
use_amp = device == "cuda"
scaler = GradScaler(enabled=use_amp)
model.train()

total_loss = 0.0
log_path = "tinyllama_kafka.log"

# The log is a CSV file: the rows follow the header's column layout.
# The "loss" column is the running mean of the loss over the steps so far.
with open(log_path, "w", encoding="utf-8") as log:
    log.write("step,loss,lr\n")

    for step, (Xb,) in enumerate(loader, start=1):
        Xb = Xb.to(device)
        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            outputs = model(input_ids=Xb, labels=Xb)
            loss = outputs.loss

        scaler.scale(loss).backward()
        # Gradients must be unscaled before clipping so the threshold applies
        # to their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()
        avg_loss = total_loss / step
        lr_now = scheduler.get_last_lr()[0]
        log.write(f"{step},{avg_loss:.10f},{lr_now:.2e}\n")
        log.flush()  # keep the file current if the run is interrupted

torch.save(model.state_dict(), "tinyllama_kafka.pt")

In [None]:
# Reload the base checkpoint from ./tinyllama and overwrite its weights with
# the state dict stored in tinyllama_kafka.pt.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision only on the GPU: float16 inference on CPU can be unsupported
# or very slow depending on the PyTorch version, so fall back to float32 there.
dtype = torch.float16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained("./tinyllama")
model = AutoModelForCausalLM.from_pretrained("./tinyllama", torch_dtype=dtype).to(device)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while loading.
state_dict = torch.load("tinyllama_kafka.pt", map_location=device, weights_only=True)
model.load_state_dict(state_dict)

In [None]:
# Sample up to 200 new tokens continuing the prompt from the model currently in
# scope, then print the decoded text (prompt included, special tokens stripped).
prompt = "Il était tard lorsque  K. arriva. Une neige épaisse couvrait le village. La colline était cachée par la brume et par la nuit, nul rayon de lumière n’indiquait le grand Château."
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Nucleus sampling settings, gathered in one place.
sampling_kwargs = dict(
    max_new_tokens=200,
    temperature=0.8,
    top_p=0.9,
    do_sample=True,
)

with torch.no_grad():
    output = model.generate(**inputs, **sampling_kwargs)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)