# Finetuning Notebook
## LLM finetuning for AES

Code to finetune models served via Ollama for AES tasks

### Create Slim Version of test/train set

Remove metadata and just leave scores with texts

In [1]:
import pandas as pd
import os

name_dir = "data"
file_name_train = "persuade_corpus_2.0_train.csv"
file_name_test  = "persuade_corpus_2.0_test.csv"
slim_columns = ["essay_id_comp", "full_text", "holistic_essay_score"]

for split, file_name in [("train", file_name_train), ("test", file_name_test)]:
    fullpath = os.path.join(name_dir, file_name)
    print(f"loading {split} from {fullpath} …")
    df = pd.read_csv(fullpath)

    # pick only the three
    slim = df[slim_columns].copy()

    outpath = os.path.join(name_dir, f"slim_persuade_{split}.csv")
    print(f"writing slim {split} -> {outpath}")
    slim.to_csv(outpath, index=False)


loading train from data\persuade_corpus_2.0_train.csv …


  df = pd.read_csv(fullpath)


writing slim train -> data\slim_persuade_train.csv
loading test from data\persuade_corpus_2.0_test.csv …


  df = pd.read_csv(fullpath)


writing slim test -> data\slim_persuade_test.csv


Load datasets

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

llm_name  = "mistralai/Mistral-7B-v0.3"
tokenizer = AutoTokenizer.from_pretrained(llm_name, use_fast=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

raw_ds = load_dataset("csv", data_files={"train":"data/slim_persuade_train.csv",
                                     "valid":"data/slim_persuade_test.csv"})

# prompt for essay scoring
def build_prompt(text:str, score:int) -> str:
    return (
        "<s>[INST] You are an essay rater specializing in the evaluation of essays written by students from 6th to 12th grade. "
        "Read and evaluate the essay. Assign it a score from 1 to 6, in increments of 1. "
        "Your response should be only a numeric value representing the score you gave. [/INST]\n"
        f"{text.strip()}\n"
        f"</s> {score}"
    )

# tokenize and format for pytorch
def preprocess(ex):
    prompt = build_prompt(ex["full_text"], ex["holistic_essay_score"])
    tok    = tokenizer(prompt,
                       truncation=True,
                       max_length=2048,
                       padding="max_length")
    tok["labels"] = tok["input_ids"].copy()
    return tok

ds = raw_ds.map(preprocess, batched=False, remove_columns=raw_ds["train"].column_names)
ds.set_format("torch")

In [2]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


# 4-bit quantization config
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit            = True,
    bnb_4bit_quant_type     = "nf4",
    bnb_4bit_compute_dtype  = "float16",
    bnb_4bit_use_double_quant=True
)

# load model & tokenizer
model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    quantization_config=bnb_cfg,
    device_map="auto",
    # trust_remote_code=True  # Gemma needs this
)

model.config.use_cache = False
model.gradient_checkpointing_enable()

model = prepare_model_for_kbit_training(model)

# prepare PEFT (LoRA) adapter
peft_cfg = LoraConfig(
    r           = 8,
    lora_alpha  = 16,
    lora_dropout= 0.05,
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, peft_cfg)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 3,407,872 || all params: 7,251,431,424 || trainable%: 0.0470


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

import os, torch
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir            = "checkpoints",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    eval_accumulation_steps     = 1,
    learning_rate         = 3e-4,
    fp16                  = True,
    num_train_epochs      = 1,
    max_steps             = 2000,
    logging_steps         = 50,
    eval_steps            = 500,
    save_steps            = 500,
    save_total_limit      = 2,
    report_to             = "none"
)

trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = ds["train"],
    eval_dataset  = ds["valid"],
    data_collator = data_collator,
    tokenizer     = tokenizer
)

import time
from transformers import TrainerCallback

class StepLogger(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 100 == 0:
            print(f"[{time.strftime('%H:%M:%S')}] Complete step {state.global_step}")

trainer.add_callback(StepLogger())

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
50,2.0102
100,1.9603
150,1.955
200,1.9612
250,1.9338
300,1.9406
350,1.8774
400,1.9238
450,1.914
500,1.9078


[14:18:39] Complete step 100
[14:41:08] Complete step 200
[15:03:33] Complete step 300
[15:26:05] Complete step 400
[15:48:33] Complete step 500


  return fn(*args, **kwargs)


[16:10:59] Complete step 600
[16:33:26] Complete step 700
[16:55:50] Complete step 800
[17:18:16] Complete step 900
[17:40:40] Complete step 1000


  return fn(*args, **kwargs)


[18:02:57] Complete step 1100
[18:25:18] Complete step 1200
[18:47:35] Complete step 1300
[19:09:50] Complete step 1400
