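# Fine-tuning Llama 3 8B for publication-date extraction

This notebook fine-tunes `unsloth/llama-3-8b-bnb-4bit` with LoRA adapters to predict a document's publication date (DD/MM/YYYY) from its text, then evaluates the fine-tuned model on the held-out test split.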

## Load data

In [None]:
from pathlib import Path

import pandas as pd
import requests
from datasets import Dataset, DatasetDict
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
import torch

from preprocessing_finetune import get_dataset_text_format

In [None]:
# Build the train/test split from the raw CSV only when no cached copy exists on
# disk, so a fresh "Restart & Run All" works without hand-editing this cell.
# Delete the 'unsloth_train_test' directory to force regeneration.
# NOTE(review): if get_dataset_text_format samples randomly, a regenerated split
# may differ from the one behind the recorded outputs — confirm.
if not Path('unsloth_train_test').exists():
    # NOTE(review): the meaning of the second argument (3000) is defined by
    # get_dataset_text_format in preprocessing_finetune — confirm there.
    dataset_train_test = get_dataset_text_format('NLP_in_industry-original_data.csv',3000)
    dataset_train_test.save_to_disk('unsloth_train_test')

In [3]:
# Reload the cached train/test split that was written to 'unsloth_train_test'.
dataset_train_test = DatasetDict.load_from_disk('unsloth_train_test')
# The 'train' split feeds fine-tuning; the 'test' split is used in the
# inference section at the end of the notebook.
data_train = dataset_train_test['train']
# Bare last expression: shows the dataset summary (features and row count).
data_train

Dataset({
    features: ['Gold published date', 'url', 'text version', 'text', '__index_level_0__'],
    num_rows: 328
})

## Load model

In [None]:

# Upper bound on tokens per example; passed to both the model loader and the trainer.
max_seq_length = 2048
# None leaves the dtype choice to Unsloth (presumably auto-detected — confirm in the Unsloth docs).
dtype = None
# Load weights 4-bit quantised, matching the pre-quantised "bnb-4bit" checkpoint below.
load_in_4bit = True
# Base checkpoint to fine-tune.
checkpoint = "unsloth/llama-3-8b-bnb-4bit"

In [None]:
# Load the base checkpoint and its tokenizer using the settings from the
# configuration cell above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.11: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.713 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [None]:
# Wrap the base model with trainable LoRA adapters on the attention
# (q/k/v/o) and MLP (gate/up/down) projections.
# NOTE: this rebinds `model`, so the cell is not idempotent — re-running it
# without reloading the base model passes an already-wrapped model back in.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    # Fixed seed for reproducibility; the same value is used as the trainer seed.
    random_state = 3407,
)

Unsloth 2024.11.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Prepare prompts

In [29]:
# Attach the "llama-3.1" chat template to the tokenizer so conversations are
# rendered with the <|start_header_id|>/<|eot_id|> markers seen in the sample
# prompt printed further down.
# NOTE: rebinds `tokenizer`; format_prompts below reads this global.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def format_conversation(row):
    """Turn one dataset row into a two-turn chat conversation.

    Args:
        row: mapping with a 'text' entry (document text) and a
            'Gold published date' entry (reference date).

    Returns:
        A dict with a single 'conversations' key holding a user turn
        (document text, newline, question) and an assistant turn (the
        gold date wrapped in a dict-like string).
    """
    question = 'What is the publication date of the document? Output as a structured JSON object with a format DD/MM/YYYY.'
    user_turn = {'role': 'user', 'content': f"{row['text']}\n{question}"}

    # NOTE(review): the target uses single quotes, so it is not strict JSON even
    # though the question asks for JSON. Left unchanged because the inference
    # parser and already-trained adapters presumably rely on this exact format.
    answer = "{'predicted_date' : '" + f"{row['Gold published date']}" + "'}"
    assistant_turn = {'role': 'assistant', 'content': answer}

    return {'conversations': [user_turn, assistant_turn]}

def format_prompts(examples):
    """Render a row's conversation into one training string.

    Used with a non-batched ``Dataset.map``, so ``examples`` is a single row
    whose 'conversations' entry is one list of chat turns. Relies on the
    module-level ``tokenizer`` and its chat template.

    Returns:
        A dict with a 'prompt' key holding the rendered (untokenized) text.
    """
    rendered = tokenizer.apply_chat_template(
        examples["conversations"],
        tokenize = False,
        # The assistant answer is already part of the conversation, so no
        # trailing generation header is appended.
        add_generation_prompt = False,
    )
    return {"prompt": rendered}

In [31]:
# Two row-wise passes: first add the 'conversations' column, then render it
# into the 'prompt' column consumed by the trainer.
data_train = (
    data_train
    .map(format_conversation)
    .map(format_prompts)
)

Map: 100%|██████████| 328/328 [00:00<00:00, 5979.93 examples/s]
Map: 100%|██████████| 328/328 [00:00<00:00, 7362.05 examples/s]


In [32]:
# Show the dataset summary to confirm the 'conversations' and 'prompt' columns were added.
data_train

Dataset({
    features: ['Gold published date', 'url', 'text version', 'text', '__index_level_0__', 'conversations', 'prompt'],
    num_rows: 328
})

In [33]:
# Inspect the first rendered prompt; indexing the row first avoids
# materialising the whole 'prompt' column just to read one entry.
print(data_train[0]['prompt'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

RÉPUBLIQUE FRANÇAISE
Département de
SEINE ET MARNE
Arrondissement de
TORCY

EXTRAIT DU
REGISTRE DES
DÉLIBÉRATIONS
DU CONSEIL
MUNICIPAL
SÉANCE DU 9 JUIN 2020
Le mardi 9 juin 2020 à 18 h 30, les Membres du Conseil municipal, régulièrement convoqués en
séance le 3 juin 2020, se sont réunis au Centre culturel de Chelles, salle Tristan et Iseult, sous la
présidence de Monsieur RABASTE, Maire.
Étaient présents :
M. Brice Rabaste, Mme Colette Boissot, M. Philippe Maury, Mme Céline Netthavongs, M. Jacques
Philippon, Mme Audrey Duchesne, M. Benoît Breysse, Mme Annie Ferri, M. Guillaume Ségala,
Mme Angela Avond, M. Frank Billard, Mme Ingrid Caillis-Brandl, M. Christian Couturier,
Mme Laëtitia Millet, Mme Michèle Dengreville, Mme Nicole Saunier, Mme Martine Broyon (à partir
du point 3), M. Alain Coudray, M. Gildas Cosson, 

## Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer over the rendered 'prompt' column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = data_train,
    # Column holding the chat-template-rendered training text.
    dataset_text_field = "prompt",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    # One example per sequence; examples are not packed together.
    packing = False,
    args = TrainingArguments(
        # 2 per device x 4 accumulation steps = total batch size 8 on one GPU
        # (matches the "Total batch size = 8" line in the training log).
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # "none" disables experiment trackers; set to e.g. "wandb" to enable one
    ),
)

Map (num_proc=2): 100%|██████████| 328/328 [00:01<00:00, 279.32 examples/s]


In [None]:
from unsloth.chat_templates import train_on_responses_only
# Restrict training to the assistant replies: the two marker strings tell
# Unsloth where the user turn and the assistant turn start in each rendered
# prompt. The second keyword must be `response_part`; `r` is not a parameter
# of train_on_responses_only and raises a TypeError.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map: 100%|██████████| 328/328 [00:00<00:00, 1174.38 examples/s]


In [36]:
# Run fine-tuning (about 33 minutes for 123 steps in the recorded run below);
# the returned object carries the summary metrics.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 328 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 123
 "-____-"     Number of trainable parameters = 41,943,040
                                       
  0%|          | 0/123 [05:09<?, ?it/s]         

{'loss': 2.1201, 'grad_norm': 1.3766183853149414, 'learning_rate': 0.00019152542372881357, 'epoch': 0.24}


                                       
  0%|          | 0/123 [07:52<?, ?it/s]         

{'loss': 0.6887, 'grad_norm': 0.8390240669250488, 'learning_rate': 0.0001745762711864407, 'epoch': 0.49}


                                       
  0%|          | 0/123 [10:33<?, ?it/s]         

{'loss': 0.6027, 'grad_norm': 1.4763462543487549, 'learning_rate': 0.0001576271186440678, 'epoch': 0.73}


                                       
  0%|          | 0/123 [13:13<?, ?it/s]         

{'loss': 0.5126, 'grad_norm': 0.3450949192047119, 'learning_rate': 0.00014067796610169492, 'epoch': 0.98}


                                       
  0%|          | 0/123 [15:56<?, ?it/s]         

{'loss': 0.51, 'grad_norm': 0.7156978249549866, 'learning_rate': 0.00012372881355932205, 'epoch': 1.22}


                                       
  0%|          | 0/123 [18:38<?, ?it/s]         

{'loss': 0.536, 'grad_norm': 0.4333380162715912, 'learning_rate': 0.00010677966101694916, 'epoch': 1.46}


                                       
  0%|          | 0/123 [21:20<?, ?it/s]         

{'loss': 0.4896, 'grad_norm': 0.6360849142074585, 'learning_rate': 8.983050847457629e-05, 'epoch': 1.71}


                                       
  0%|          | 0/123 [24:01<?, ?it/s]         

{'loss': 0.5628, 'grad_norm': 0.54438316822052, 'learning_rate': 7.288135593220338e-05, 'epoch': 1.95}


                                       
  0%|          | 0/123 [26:43<?, ?it/s]         

{'loss': 0.4556, 'grad_norm': 0.3130186200141907, 'learning_rate': 5.593220338983051e-05, 'epoch': 2.2}


                                       
  0%|          | 0/123 [29:25<?, ?it/s]          

{'loss': 0.4979, 'grad_norm': 0.3184017241001129, 'learning_rate': 3.898305084745763e-05, 'epoch': 2.44}


                                       
  0%|          | 0/123 [32:06<?, ?it/s]          

{'loss': 0.4715, 'grad_norm': 0.18385499715805054, 'learning_rate': 2.2033898305084748e-05, 'epoch': 2.68}


                                       
  0%|          | 0/123 [34:49<?, ?it/s]          

{'loss': 0.4769, 'grad_norm': 0.037042152136564255, 'learning_rate': 5.084745762711865e-06, 'epoch': 2.93}


                                       
100%|██████████| 123/123 [33:12<00:00, 16.20s/it]

{'train_runtime': 1992.3762, 'train_samples_per_second': 0.494, 'train_steps_per_second': 0.062, 'train_loss': 0.655511701010107, 'epoch': 3.0}





In [None]:
# Save the fine-tuned adapters together with the tokenizer. The tokenizer was
# given the "llama-3.1" chat template earlier, so persisting it next to the
# adapters keeps local inference consistent with training (and mirrors the
# hub push below, which also uploads the tokenizer).
ADAPTER_DIR = "llama3_8b-date_prediction-lora_adapters-3_epochs-clean"
model.save_pretrained(ADAPTER_DIR) # saving locally
tokenizer.save_pretrained(ADAPTER_DIR) # saving locally
# To publish, uncomment the lines below and read the token from the environment
# rather than hardcoding it.
# model.push_to_hub("zmilczarek/llama3_8b-finetuned-nlp_industry-adapters", token = "...") # pushing to hub
# tokenizer.push_to_hub("zmilczarek/llama3_8b-finetuned-nlp_industry-adapters", token = "...") # pushing to hub

## Loading the trained model for inference

In [1]:
from finetuned_llama_inference import load_lora_model_inference, label_dataframe

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# Held-out split for evaluation. Requires `dataset_train_test` from the
# "Load data" section to be loaded in the current kernel.
data_test = dataset_train_test['test']

In [5]:
# Convert to a DataFrame, the input type passed to label_dataframe below.
df_test = data_test.to_pandas()

In [7]:
# Run the fine-tuned model over the test documents; the result is expected to
# carry a 'predicted_date' column (used by the accuracy cells below).
# NOTE(review): this loads adapters by Hub id, whereas the training section
# saves them to a local directory and leaves push_to_hub commented out —
# confirm the Hub repo holds the adapters produced by this run.
# NOTE: rebinds `df_test`, so re-running this cell re-labels an already
# labelled frame.
df_test = label_dataframe(df_test, "zmilczarek/llama3_8b-finetuned-nlp_industry-adapters")

==((====))==  Unsloth 2024.11.11: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.713 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [51]:
# Persist the labelled test set so the metrics can be recomputed without re-running inference.
df_test.to_csv('llama_finetuned_predictions_fulltext_clean.csv')

In [None]:
# Share of test rows whose predicted date string equals the gold date exactly.
acc_strict = df_test['predicted_date'].eq(df_test['Gold published date']).mean()
print(f"Exact-match accuracy : {acc_strict*100:.2f}%")

Exact-match accuracy : 71.63%


In [11]:
# Compare only the last 7 characters, i.e. the "MM/YYYY" tail of a DD/MM/YYYY string.
acc_year_month = (
    df_test['predicted_date'].str.slice(-7)
    .eq(df_test['Gold published date'].str.slice(-7))
    .mean()
)
print(f"Accuracy of predicting the correct month and year : {acc_year_month*100:.2f}%")

Accuracy of predicting the correct month and year : 80.85%


In [2]:
# Compare only the last 4 characters, i.e. the "YYYY" tail of a DD/MM/YYYY string.
acc_year = (
    df_test['predicted_date'].str.slice(-4)
    .eq(df_test['Gold published date'].str.slice(-4))
    .mean()
)
print(f"Accuracy of predicting the correct year : {acc_year*100:.2f}%")

Accuracy of predicting the correct year : 92.91%
