In [1]:
!pip uninstall -y transformers peft accelerate bitsandbytes protobuf tensorflow tensorboard
!pip install protobuf==5.29.2
!pip install transformers peft accelerate bitsandbytes

#import os
#os.kill(os.getpid(), 9)

Found existing installation: transformers 4.53.3
Uninstalling transformers-4.53.3:
  Successfully uninstalled transformers-4.53.3
Found existing installation: peft 0.16.0
Uninstalling peft-0.16.0:
  Successfully uninstalled peft-0.16.0
Found existing installation: accelerate 1.9.0
Uninstalling accelerate-1.9.0:
  Successfully uninstalled accelerate-1.9.0
[0mFound existing installation: protobuf 6.33.0
Uninstalling protobuf-6.33.0:
  Successfully uninstalled protobuf-6.33.0
Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
Found existing installation: tensorboard 2.18.0
Uninstalling tensorboard-2.18.0:
  Successfully uninstalled tensorboard-2.18.0
Collecting protobuf==5.29.2
  Downloading protobuf-5.29.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading protobuf-5.29.2-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3

In [2]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from transformers import Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch


In [3]:
# Load dataset
data_files = {
    "train": "/kaggle/input/ensate-llm-fine-tuning-dataset/train.jsonl",
    "validation": "/kaggle/input/ensate-llm-fine-tuning-dataset/val.jsonl",
    "test": "/kaggle/input/ensate-llm-fine-tuning-dataset/test.jsonl"
}
dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
# Load base model + tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [6]:
# Prepare model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)



In [7]:
# Format examples into text
def format_example(example):
    if example["input"]:
        prompt = f"Instruction: {example['instruction']}\nInput: {example['input']}\nResponse:"
    else:
        prompt = f"Instruction: {example['instruction']}\nResponse:"
    return {"text": prompt + " " + example["output"]}

dataset = dataset.map(format_example)


Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/391 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [8]:
# Format examples into text
def format_example(example):
    if example["input"]:
        prompt = f"Instruction: {example['instruction']}\nInput: {example['input']}\nResponse:"
    else:
        prompt = f"Instruction: {example['instruction']}\nResponse:"
    return {"text": prompt + " " + example["output"]}

dataset = dataset.map(format_example)

# Tokenize with labels
def tokenize(batch):
    tokenized = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
    # Add labels for causal language modeling
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized = dataset.map(tokenize, batched=True)
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/391 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/391 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [9]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./mistral-school-finetuned",
    num_train_epochs=4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=10,
    learning_rate=2e-4,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    report_to="none",
)


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer
)

trainer.train()

# Evaluate on test set
metrics = trainer.evaluate(tokenized["test"])
print(metrics)



  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,5.984,5.844446
2,5.9064,5.822745
3,5.9008,5.81712
4,5.9131,5.81796


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


{'eval_loss': 6.040086269378662, 'eval_runtime': 18.6408, 'eval_samples_per_second': 2.146, 'eval_steps_per_second': 0.268, 'epoch': 4.0}


In [11]:
# Save final model
trainer.save_model("./mistral-school-finetuned")
tokenizer.save_pretrained("./mistral-school-finetuned")

('./mistral-school-finetuned/tokenizer_config.json',
 './mistral-school-finetuned/special_tokens_map.json',
 './mistral-school-finetuned/chat_template.jinja',
 './mistral-school-finetuned/tokenizer.model',
 './mistral-school-finetuned/added_tokens.json',
 './mistral-school-finetuned/tokenizer.json')

In [12]:
# Clear GPU memory
import gc
del model
del trainer
gc.collect()
torch.cuda.empty_cache()

# Now test with the saved model
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="/kaggle/working/mistral-school-finetuned",
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16
)

# Test on examples
test_samples = dataset["test"].select(range(5))

print("\n=== MODEL EVALUATION ===\n")
for example in test_samples:
    if example["input"]:
        prompt = f"Instruction: {example['instruction']}\nInput: {example['input']}\nResponse:"
    else:
        prompt = f"Instruction: {example['instruction']}\nResponse:"
    
    generated = generator(prompt, max_new_tokens=200, do_sample=False, pad_token_id=tokenizer.eos_token_id)[0]["generated_text"]
    model_answer = generated.split("Response:")[-1].strip()
    
    print(f"Question: {example['instruction']}")
    print(f"Ground Truth: {example['output']}")
    print(f"Model Answer: {model_answer}")
    print("-" * 80 + "\n")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== MODEL EVALUATION ===

Question: Quand se termine le semestre de printemps 2025 avant les examens finaux ?
Ground Truth: Le semestre de printemps 2025 se termine le 17 mars 2025, dernier jour d'enseignement avant la pause.
Model Answer: Le semestre de printemps 2025 se termine le 28 juin 2025, avant les examens finaux qui commencent le 5 juillet 2025.
--------------------------------------------------------------------------------

Question: Combien de jours fériés publics y a-t-il pendant le semestre printemps 2025 ?
Ground Truth: Il y a trois jours fériés publics pendant le semestre printemps 2025 : le 1er mai (Fête du Travail), le 27 juin (Premier Moharram) et le 30 juillet (Fête du Trône).
Model Answer: Il y a trois jours fériés publics : 1er mai, 27 juin et 19 juillet.
--------------------------------------------------------------------------------

Question: Quelle est la mission principale du club CETEC concernant les étudiants sub-sahariens ?
Ground Truth: Le club CETEC off