In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
from preprocessing_finetune import get_dataset_text_format

## Load data

In [2]:
# Build the train/test dataset from the annotated CSV using the project helper
# imported from `preprocessing_finetune`. Judging by the recorded output, the
# helper downloads each document's text over HTTP (some URLs fail with 403) and
# returns a DatasetDict with `train` and `test` splits.
# NOTE(review): the meaning of the second argument (4_000) is not visible here;
# presumably a size limit on the text kept per document - confirm in the helper.
dataset_train_test = get_dataset_text_format('NLP_in_industry-original_data.csv',4_000)

Fetching link https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORIA/txt/6357/71845_1698228833-PV---Conseil-Municipal-16-01-2023.pdf.txt
Fetching link https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORIA/txt/2515/213c7_proces-verbal-25-01-2023.pdf.txt
Fetching link https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORIA/txt/1086/ee2ec_2023_1_1.pdf.txt
Fetching link https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORIA/txt/3020/68132_cms_viewFile.php.txt
Fetching link https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORIA/txt/3132/6df22_cms_viewFile.php.txt
Fetching link https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORIA/txt/2785/384c7_D%C3%A9lib%C3%A9rations_Conseil_Communautaire_27_f%C3%A9vrier_2023.pdf.txt
Failed to fetch https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORIA/txt/2785/384c7_D%C3%A9lib%C3%A9rations_Conseil_Communautaire_27_f%C3%A9vrier_2023.pdf.txt: 403 Client Error: Forbidden for url: https://datapolitics-public.s3.gra.io.cloud.ovh.net/LORI

In [34]:
# Persist the prepared DatasetDict to the local directory
# 'unsloth_train_test_clean' so the download step does not have to be repeated.
# NOTE(review): the recorded execution count of this cell is out of sequence
# with its neighbours, i.e. it was run later than its position suggests.
dataset_train_test.save_to_disk('unsloth_train_test_clean')

Saving the dataset (0/1 shards):   0%|          | 0/328 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/141 [00:00<?, ? examples/s]

In [3]:
# Display the DatasetDict summary (splits, column names and row counts).
dataset_train_test

DatasetDict({
    train: Dataset({
        features: ['Gold published date', 'url', 'text version', 'text', '__index_level_0__'],
        num_rows: 328
    })
    test: Dataset({
        features: ['Gold published date', 'url', 'text version', 'text', '__index_level_0__'],
        num_rows: 141
    })
})

## Load model

In [None]:
from unsloth import FastLanguageModel
import torch

# Sequence length used both when loading the model and later by the trainer.
max_seq_length = 2048
# None is passed through to Unsloth unchanged (Unsloth documents this as
# "auto-detect the compute dtype").
dtype = None
# Load the weights in 4-bit quantized form.
load_in_4bit = True

# Pre-quantized (bitsandbytes 4-bit) Llama-3 8B base checkpoint.
checkpoint = "unsloth/llama-3-8b-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Attach trainable LoRA adapters to the quantized base model.
# A purely 4-bit quantized model has no trainable weights, so the trainer
# refuses to fine-tune it unless adapters are attached first. The notebook
# later saves the result as "lora_model", which also implies this step.
# The hyper-parameters below are Unsloth's standard defaults; the random state
# matches the seed used in the training arguments.
# NOTE(review): tune r / lora_alpha / target_modules for this task if needed.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

## Format prompts

In [None]:
from unsloth.chat_templates import get_chat_template

# Replace the tokenizer with one configured for the "llama-3.1" chat template,
# so that `tokenizer.apply_chat_template` renders conversations in that format.
# Note: this rebinds `tokenizer`, so re-running the cell applies the template
# to the already-wrapped tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.1")

def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    Args:
        examples: mapping with a "conversations" entry holding an iterable of
            conversations (each one is passed as-is to the tokenizer).

    Returns:
        dict with a single key "text" mapping to the list of rendered strings,
        one per conversation, in the same order.

    Relies on the notebook-level `tokenizer` defined in an earlier cell.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False -> return the formatted string instead of token ids;
        # no generation prompt is appended after the last message.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}

In [None]:
def format_conversation(row):
    """Turn one dataset row into a two-message user/assistant conversation.

    Args:
        row: mapping with the keys 'text' (document text placed in the prompt)
            and 'Gold published date' (the expected answer).

    Returns:
        A list of two message dicts: the user message containing the document
        text followed by the question, and the assistant message containing
        the gold date rendered as a string.
    """
    question = 'What is the publication date of the document?'
    document_text = row['text']
    answer = row['Gold published date']

    # The prompt wording (including its spelling) is part of the training data
    # format and is intentionally left untouched.
    user_message = {
        'role': 'user',
        'content': f'Beggining and end of the document :\n{document_text}\n{question}',
    }
    assistant_message = {'role': 'assistant', 'content': f'{answer}'}
    return [user_message, assistant_message]

## Train

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning dataset from the train split.
# The original cell passed an undefined name (`dataset`) to the trainer, and
# the two formatting helpers defined above were never applied. Here:
#   1. each row becomes a user/assistant conversation (`format_conversation`),
#   2. each conversation is rendered with the chat template
#      (`formatting_prompts_func`), which overwrites the raw "text" column with
#      the templated prompt that the trainer reads via `dataset_text_field`.
sft_train_dataset = (
    dataset_train_test["train"]
    .map(lambda row: {"conversations": format_conversation(row)})
    .map(formatting_prompts_func, batched = True)
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = sft_train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size: 2 * 4 = 8 per device
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Wrap the trainer with Unsloth's `train_on_responses_only`, passing the
# header strings that open a user turn and an assistant turn in the chat
# template.
# Note: this rebinds `trainer`, so re-running the cell wraps it again.
trainer = train_on_responses_only(
    trainer,
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
)

In [None]:
# Run the fine-tuning loop and keep the value returned by `train()` in
# `trainer_stats` for later inspection.
trainer_stats = trainer.train()

In [None]:
# Write the fine-tuned model to the local directory "lora_model".
# NOTE(review): only the model is saved here; the tokenizer (with its chat
# template) is not persisted by this cell - confirm whether that is intended.
model.save_pretrained("lora_model") # Local saving

In [12]:
# Re-label previously stored results with the labels from the Hub dataset.
#  - `dataset`: train split of "maribr/publication_dates_fr".
#  - `df`: contents of 'llamacpp.pkl' without its 'Gold published date' column
#    (presumably so the labels come from the Hub dataset instead - confirm).
#  - `df_hf`: Hub rows (without the 'Text' column) joined to `df` on 'url'.
#    The merge uses pandas' default join type, so rows whose url is missing on
#    either side do not appear in the result.
# NOTE(review): `read_pickle` executes arbitrary code from the file; only use
# it on trusted, locally produced pickles.
dataset = load_dataset("maribr/publication_dates_fr")['train']
df = pd.read_pickle('llamacpp.pkl').drop(columns='Gold published date')
df_hf = (
    dataset
    .to_pandas()
    .drop(columns='Text')
    .merge(df, on='url')
)
df_hf.to_pickle('llamacpp_newlabels.pkl')