In [1]:
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import torch
import json
import gc

## Load the model in 4-bit precision

In [2]:
model_name = "openllama_f16"

# Tokenizer: reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization settings: NF4 quant type, double quantization enabled,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the causal LM from local files only, quantized with the config above
# and mapped onto the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    local_files_only=True,
    trust_remote_code=True,
    device_map="cuda",
    quantization_config=bnb_config,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Preparation for QLoRA

In [3]:
# Prepare the quantized model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# Names of the projection layers that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "v_proj", "k_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # next-token prediction
    r=64,  # adapter rank
    lora_alpha=16,  # scaling factor
    lora_dropout=0.05,  # 5% dropout on the adapter path during training
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 101,703,680 || all params: 3,528,177,280 || trainable%: 2.8826


In [4]:
def load_jsonl_dataset(path, max_length=None):
    """Build a tokenized ``Dataset`` from a JSON Lines file.

    Every non-blank line of the file must be a JSON object with a ``"text"``
    key. The texts are tokenized with the notebook-level ``tokenizer`` and a
    ``labels`` column is added as a copy of ``input_ids``.

    Args:
        path: Path to the UTF-8 encoded ``.jsonl`` file.
        max_length: Optional maximum number of tokens per example. When set,
            longer examples are truncated to this length. When ``None``
            (the default) no truncation is requested.

    Returns:
        A ``Dataset`` holding the tokenizer output columns plus ``labels``.

    Raises:
        ValueError: If the file contains no examples (``json.JSONDecodeError``
            for a malformed line is also a ``ValueError``).
        KeyError: If a JSON object has no ``"text"`` key.
    """
    texts = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file); json.loads would raise on them.
            if line.strip():
                texts.append(json.loads(line)["text"])

    if not texts:
        raise ValueError(f"No examples found in {path!r}")

    if max_length is None:
        tokens = tokenizer(texts)
    else:
        # Cap the sequence length so one very long example cannot blow up
        # memory use during training.
        tokens = tokenizer(texts, truncation=True, max_length=max_length)

    # NOTE(review): the labels are unpadded, variable-length lists; this is
    # presumably only safe to collate with a batch size of 1 — confirm before
    # raising per_device_train_batch_size.
    tokens["labels"] = [ids.copy() for ids in tokens["input_ids"]]

    return Dataset.from_dict(tokens)

# Collator for causal language modeling (mlm=False disables masked-LM).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="kafka_openllama",
    # No experiment tracker (use "tensorboard" to enable TensorBoard logging).
    report_to="none",

    # --- Batching: 1 sample per device x 32 accumulation steps = effective batch of 32
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,

    # --- Duration: a single epoch; max_steps=-1 defers to num_train_epochs
    num_train_epochs=1,
    max_steps=-1,

    # --- Optimization
    # 8-bit paged AdamW to reduce optimizer memory.
    optim="paged_adamw_8bit",
    # Peak learning rate; the scheduler modulates it over training.
    learning_rate=1e-5,
    # Cosine decay after a warmup covering 3% of the steps.
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Gradient-norm clipping threshold.
    max_grad_norm=0.3,

    # --- Precision: bfloat16 on, float16 off
    bf16=True,
    fp16=False,

    # --- Logging and checkpointing
    # Log every 50 steps; checkpoint every 500 steps, keeping the 3 most recent.
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,

    # --- Data loading
    # Pinned host memory and 2 data-loading workers.
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    # Keep every dataset column instead of letting the Trainer drop unused ones.
    remove_unused_columns=False,
)

In [5]:
train_dataset = load_jsonl_dataset("train.jsonl")

# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument to `processing_class`; kept as-is to match the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

trainer.train()

# Save the LoRA adapter on its own BEFORE merging. The merge below folds the
# adapter into the 4-bit quantized base weights and cannot be undone, so
# without this copy the trained adapter itself would be lost.
# NOTE(review): merging into 4-bit weights is reported to introduce rounding
# differences; with the adapter saved, it can later be merged into a
# higher-precision copy of the base model if needed.
adapter_path = "kafkallama_adapter"
model.save_pretrained(adapter_path)

# Fold the adapter into the base model and save the standalone result
# together with its tokenizer.
model = model.merge_and_unload()
final_path = "kafkallama"
model.save_pretrained(final_path)
tokenizer.save_pretrained(final_path)

  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
50,2.1392
100,2.1013
150,2.0801




('kafkallama/tokenizer_config.json',
 'kafkallama/special_tokens_map.json',
 'kafkallama/tokenizer.model',
 'kafkallama/added_tokens.json',
 'kafkallama/tokenizer.json')

In [11]:
# Release the training-time objects before the model is reloaded for testing.
# `trainer` was built with `model=model`, so it keeps the model reachable;
# dropping only `model` would leave the weights (and their GPU memory) alive.
# pop(..., None) instead of `del` keeps this cell re-runnable: it does not
# raise NameError when the names are already gone.
globals().pop("trainer", None)
globals().pop("model", None)

# Collect the now-unreferenced objects, then return cached CUDA blocks.
gc.collect()
torch.cuda.empty_cache()

# Test

In [10]:
# Reload the merged model and its tokenizer from the same directory.
final_path = "kafkallama"
model = AutoModelForCausalLM.from_pretrained(
    final_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(final_path)

# NOTE(review): the prompt ends with a space, which is tokenized as part of
# the prompt — consider dropping it if the continuations start oddly.
prompt = "K. ouvrit la porte. "
num_generations = 10

# Seed the sampler so the printed generations can be reproduced on re-run.
torch.manual_seed(0)

# The prompt is identical for every sample, so tokenize it once up front.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

for i in range(num_generations):
    # Sampling only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,
            temperature=0.8,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"== generation {i} ==")
    print(generated)
    print("\n")

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


== generation 0 ==
K. ouvrit la porte. 12:45, le dimanche
Je vais au cinéma avec les jeunes. Il y aura des snacks! (Pour ma famille)


== generation 1 ==
K. ouvrit la porte. 16503, The Gnostic Bible (2 vols), London : SPCK
SPECIAL OFFER: Paperback £9.47 + FREE P&P worldwide on our webstore!


== generation 2 ==
K. ouvrit la porte. 01:52 ------------------------------- #PokemonSwordAndShield#MewtwoIsMyFavoriteHero Pikachu and Squirtle are now on YouTube! Don't forget to SUBSCRIBE, like this video, share it with your friends or leave a comment below :D


== generation 3 ==
K. ouvrit la porte. À quelques pas, en face du foyer d’Hervé et de son épouse, il se dirigea vers le sien.
“Il y a des choses qui nous changent, poursuivit-il avec un air de dépit. “Tout à coup on ne peut plus rien faire pour les autres.” Et puis: “Il faut leur parler bien clairement de toutes les choses que tu es prêt à faire pour eux”… Il se leva de nouveau et alla s’asseoir dans une armoire dont l’extrémité se trouv