# Fine tuning

## Load data

In [1]:
# !pip install -q datasets
# !pip install -q unsloth
# !pip install -q requests
# !pip install -q trl

In [2]:
import pandas as pd
import requests
from datasets import Dataset, DatasetDict
from unsloth.chat_templates import get_chat_template

  from .autonotebook import tqdm as notebook_tqdm


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [3]:
# df = pd.read_csv('dataset_text_url.csv')

In [26]:
def get_text_doc(link, length = 3_000):
    # print(f'Fetching link {link}')
    def fetch_text(link):
        try:
            response = requests.get(link)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {link}: {e}")
            return None

    text = fetch_text(link)
    if text is None: return None
    text_start = text[:length]
    text_end = text[-length:]
    text = text_start + " " + text_end
    return text

Loading the labeled data from huggingface datasets.

In [5]:
# df['text'] = df['text version'].apply(get_text_doc)

In [6]:
# df.to_csv('dataset_text_url_with_text.csv', index = False)

# # download to disk
# from google.colab import files
# files.download('dataset_text_url_with_text.csv')

In [7]:
# df['text'].apply(len).describe()

In [8]:
# df = pd.read_csv('dataset_text_url_with_text.csv')

In [9]:
# dataset = Dataset.from_pandas(df)

In [10]:
# dataset_train_test = dataset.train_test_split(test_size = 0.3, seed=42)

In [11]:
# dataset_train_test.save_to_disk('unsloth_train_test_clean')

In [None]:
dataset_train_test = DatasetDict.load_from_disk('unsloth_train_test_clean')
# data_train = dataset_train_test['train']

In [27]:
dataset_train_test = dataset_train_test.map(lambda x : {'text': get_text_doc(x['text version'])})

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 328/328 [00:50<00:00,  6.46 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 141/141 [00:22<00:00,  6.26 examples/s]


In [28]:
data_train = dataset_train_test['train']
data_train

Dataset({
    features: ['Gold published date', 'url', 'text version', 'text', '__index_level_0__'],
    num_rows: 328
})

## Load model

In [15]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True
checkpoint = "unsloth/llama-3-8b-bnb-4bit"

In [16]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...",
)

==((====))==  Unsloth 2024.11.11: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.713 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [17]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.11.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Prepare prompts

In [29]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def format_conversation(row):
    context = row['text']
    prompt = 'What is the publication date of the document? Output as a structured JSON object with a format DD/MM/YYYY.'
    gold_date = row['Gold published date']
    gold_date = f"{{'predicted_date' : '{gold_date}'}}"
    return {'conversations': [{'role': 'user', 'content': f'{context}\n{prompt}'}, {'role': 'assistant', 'content':f'{gold_date}'}]}

def format_prompts(examples):
    convo = examples["conversations"]
    texts = tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
    return { "prompt" : texts, }

In [30]:
ex = {'conversations' :[[{"role": "system", "content": "You are an assistant"}, {"role": "user", "content": "What is 2+2?"}]]}
format_prompts(ex)

{'prompt': ['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\nYou are an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is 2+2?<|eot_id|>']}

In [31]:
data_train = data_train.map(format_conversation)
data_train = data_train.map(format_prompts)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 328/328 [00:00<00:00, 5979.93 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 328/328 [00:00<00:00, 7362.05 examples/s]


In [32]:
data_train

Dataset({
    features: ['Gold published date', 'url', 'text version', 'text', '__index_level_0__', 'conversations', 'prompt'],
    num_rows: 328
})

In [33]:
print(data_train['prompt'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

R√âPUBLIQUE FRAN√áAISE
D√©partement de
SEINE ET MARNE
Arrondissement de
TORCY

EXTRAIT DU
REGISTRE DES
D√âLIB√âRATIONS
DU CONSEIL
MUNICIPAL
S√âANCE DU 9 JUIN 2020
Le mardi 9 juin 2020 √† 18 h 30, les Membres du Conseil municipal, r√©guli√®rement convoqu√©s en
s√©ance le 3 juin 2020, se sont r√©unis au Centre culturel de Chelles, salle Tristan et Iseult, sous la
pr√©sidence de Monsieur RABASTE, Maire.
√âtaient pr√©sents :
M. Brice Rabaste, Mme Colette Boissot, M. Philippe Maury, Mme C√©line Netthavongs, M. Jacques
Philippon, Mme Audrey Duchesne, M. Beno√Æt Breysse, Mme Annie Ferri, M. Guillaume S√©gala,
Mme Angela Avond, M. Frank Billard, Mme Ingrid Caillis-Brandl, M. Christian Couturier,
Mme La√´titia Millet, Mme Mich√®le Dengreville, Mme Nicole Saunier, Mme Martine Broyon (√† partir
du point 3), M. Alain Coudra

## Training

In [34]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = data_train,
    dataset_text_field = "prompt",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 328/328 [00:01<00:00, 279.32 examples/s]


In [35]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 328/328 [00:00<00:00, 1174.38 examples/s]


In [36]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 328 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 123
 "-____-"     Number of trainable parameters = 41,943,040
                                       
  0%|          | 0/123 [05:09<?, ?it/s]         

{'loss': 2.1201, 'grad_norm': 1.3766183853149414, 'learning_rate': 0.00019152542372881357, 'epoch': 0.24}


                                       
  0%|          | 0/123 [07:52<?, ?it/s]         

{'loss': 0.6887, 'grad_norm': 0.8390240669250488, 'learning_rate': 0.0001745762711864407, 'epoch': 0.49}


                                       
  0%|          | 0/123 [10:33<?, ?it/s]         

{'loss': 0.6027, 'grad_norm': 1.4763462543487549, 'learning_rate': 0.0001576271186440678, 'epoch': 0.73}


                                       
  0%|          | 0/123 [13:13<?, ?it/s]         

{'loss': 0.5126, 'grad_norm': 0.3450949192047119, 'learning_rate': 0.00014067796610169492, 'epoch': 0.98}


                                       
  0%|          | 0/123 [15:56<?, ?it/s]         

{'loss': 0.51, 'grad_norm': 0.7156978249549866, 'learning_rate': 0.00012372881355932205, 'epoch': 1.22}


                                       
  0%|          | 0/123 [18:38<?, ?it/s]         

{'loss': 0.536, 'grad_norm': 0.4333380162715912, 'learning_rate': 0.00010677966101694916, 'epoch': 1.46}


                                       
  0%|          | 0/123 [21:20<?, ?it/s]         

{'loss': 0.4896, 'grad_norm': 0.6360849142074585, 'learning_rate': 8.983050847457629e-05, 'epoch': 1.71}


                                       
  0%|          | 0/123 [24:01<?, ?it/s]         

{'loss': 0.5628, 'grad_norm': 0.54438316822052, 'learning_rate': 7.288135593220338e-05, 'epoch': 1.95}


                                       
  0%|          | 0/123 [26:43<?, ?it/s]         

{'loss': 0.4556, 'grad_norm': 0.3130186200141907, 'learning_rate': 5.593220338983051e-05, 'epoch': 2.2}


                                       
  0%|          | 0/123 [29:25<?, ?it/s]          

{'loss': 0.4979, 'grad_norm': 0.3184017241001129, 'learning_rate': 3.898305084745763e-05, 'epoch': 2.44}


                                       
  0%|          | 0/123 [32:06<?, ?it/s]          

{'loss': 0.4715, 'grad_norm': 0.18385499715805054, 'learning_rate': 2.2033898305084748e-05, 'epoch': 2.68}


                                       
  0%|          | 0/123 [34:49<?, ?it/s]          

{'loss': 0.4769, 'grad_norm': 0.037042152136564255, 'learning_rate': 5.084745762711865e-06, 'epoch': 2.93}


                                       
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 123/123 [33:12<00:00, 16.20s/it]

{'train_runtime': 1992.3762, 'train_samples_per_second': 0.494, 'train_steps_per_second': 0.062, 'train_loss': 0.655511701010107, 'epoch': 3.0}





In [37]:
model.save_pretrained("llama3_8b-date_prediction-lora_adapters-3_epochs-clean") # Local saving

## Loading the trained model for inference

In [38]:
# df = pd.read_csv('dataset_text_url_with_text.csv')
data_test = dataset_train_test['test']
# data_test = data_test.map(lambda x : {'text': get_text_doc(x['text version'])})

In [7]:
# loading lora adapters and merging

model, tokenizer = FastLanguageModel.from_pretrained('llama3_8b-date_prediction-lora_adapters-3_epochs',
                                                      max_seq_length = max_seq_length,
                                                      dtype = dtype,
                                                      load_in_4bit = load_in_4bit)

# model, tokenizer = FastLanguageModel.from_pretrained(checkpoint,
#                                                       max_seq_length = max_seq_length,
#                                                       dtype = dtype,
#                                                       load_in_4bit = load_in_4bit)

==((====))==  Unsloth 2024.11.11: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.713 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [39]:
FastLanguageModel.for_inference(model)
tokenizer = get_chat_template(
        tokenizer,
        chat_template = "llama-3.1",
    )

In [40]:
def format_question(context):
    prompt = 'What is the publication date of the document? Output as a structured JSON object with a format DD/MM/YYYY.'
    return [{'role': 'user', 'content': f'Beggining and end of the document :\n{context}\n{prompt}'}]

In [41]:
def predict_date(message:list[dict]):
    inputs = tokenizer.apply_chat_template(
        message,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 13, use_cache = True,)
    outputs = outputs[:,len(inputs[0]):]
    answer_only = tokenizer.batch_decode(outputs, skip_special_tokens=False)
    return answer_only[0]

In [42]:
df_test = data_test.to_pandas()

In [43]:
df_test

Unnamed: 0,Gold published date,url,text version,text,__index_level_0__
0,31/01/2024,https://www.saintcyr78.fr/wp-content/uploads/2...,https://datapolitics-public.s3.gra.io.cloud.ov...,REPUBLIQUE FRANCAISE\nLibert√© - Egalit√© - Frat...,493
1,01/09/2019,https://www.manche.gouv.fr/contenu/telechargem...,https://datapolitics-public.s3.gra.io.cloud.ov...,PREFET DE LA MANCHE\n\nRECUEIL DES ACTES\nADMI...,241
2,02/02/2023,https://www.suresnes.fr/wp-content/uploads/202...,https://datapolitics-public.s3.gra.io.cloud.ov...,VILLE DE SURESNES\nConseil Municipal\nS√©ance d...,55
3,17/12/2020,https://www.hautesavoie.fr/sites/default/files...,https://datapolitics-public.s3.gra.io.cloud.ov...,Recueil des Actes Administratifs\n\nRegistre d...,453
4,06/07/2020,https://www.lombez-gers.com/uploads/documents/...,https://datapolitics-public.s3.gra.io.cloud.ov...,Proc√®s-verbal de la s√©ance du Conseil Municipa...,471
...,...,...,...,...,...
136,23/02/2023,https://www.villeneuve-yonne.fr/wp-content/upl...,https://datapolitics-public.s3.gra.io.cloud.ov...,R√©publique Fran√ßaise N √º \ A D√©partement de l'...,88
137,07/03/2023,https://www.ville-noisiel.fr/wp-content/upload...,https://datapolitics-public.s3.gra.io.cloud.ov...,Envoy√© en pr√©fecture le 07/03/2023\nRe√ßu en pr...,92
138,02/03/2022,https://www.sud-retz-atlantique.fr/wp-content/...,https://datapolitics-public.s3.gra.io.cloud.ov...,PUBLICATION D√âLIB√âRATIONS COMMUNAUTAIRES_ANN√âE...,164
139,2022,https://www.alpes-de-haute-provence.gouv.fr/co...,https://datapolitics-public.s3.gra.io.cloud.ov...,Ez\nPR√âFETE\n\nDES ALPES-\nDE-HAUTE-\nPROVENCE...,270


In [44]:
msg = format_question(df_test.loc[0,'text'])

tokenizer.apply_chat_template(
        msg,
        tokenize = False,
        add_generation_prompt = True,
        # return_tensors = "pt",
    )

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nBeggining and end of the document :\nREPUBLIQUE FRANCAISE\nLibert√© - Egalit√© - Fraternit√©\n------------------------------------COMMUNE DE SAINT-CYR-L‚Äô√âCOLE\n------------------------------------PROCES-VERBAL DE LA SEANCE\nDU CONSEIL MUNICIPAL\nEN DATE DU 06 F√âVRIER 2024\n(Ex√©cution de l‚Äôarticle L.2121-15 du Code g√©n√©ral des collectivit√©s territoriales)\nDate de la convocation : 31 janvier 2024\nDate de son affichage : 31 janvier 2024\nPr√©sidence : Madame Sonia BRAU, Maire.\nPr√©sents : Mme Sonia BRAU, M. Yves JOURDAN, Mme Lydie DUCHON, M. Henri LANCELIN,\nMme Marie-Laure CAILLON, M. Fr√©d√©ric BUONO-BLONDEL, Mme Sophie MARVIN, Mme Isabelle\nGENEVELLE, M. J√©r√¥me de NAZELLE, M. Joseph SAMAMA, Mme Brigitte AUBONNET, Mme Christine\nGOSSELIN, M. Ahmed BELKACEM, Mme Olga KHALDI, M. Kamel HAMZA, 

In [45]:
print(predict_date(format_question(df_test.loc[136,'text'])))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


{'predicted_date' : '23/02/2023'}


In [46]:
df_test

Unnamed: 0,Gold published date,url,text version,text,__index_level_0__
0,31/01/2024,https://www.saintcyr78.fr/wp-content/uploads/2...,https://datapolitics-public.s3.gra.io.cloud.ov...,REPUBLIQUE FRANCAISE\nLibert√© - Egalit√© - Frat...,493
1,01/09/2019,https://www.manche.gouv.fr/contenu/telechargem...,https://datapolitics-public.s3.gra.io.cloud.ov...,PREFET DE LA MANCHE\n\nRECUEIL DES ACTES\nADMI...,241
2,02/02/2023,https://www.suresnes.fr/wp-content/uploads/202...,https://datapolitics-public.s3.gra.io.cloud.ov...,VILLE DE SURESNES\nConseil Municipal\nS√©ance d...,55
3,17/12/2020,https://www.hautesavoie.fr/sites/default/files...,https://datapolitics-public.s3.gra.io.cloud.ov...,Recueil des Actes Administratifs\n\nRegistre d...,453
4,06/07/2020,https://www.lombez-gers.com/uploads/documents/...,https://datapolitics-public.s3.gra.io.cloud.ov...,Proc√®s-verbal de la s√©ance du Conseil Municipa...,471
...,...,...,...,...,...
136,23/02/2023,https://www.villeneuve-yonne.fr/wp-content/upl...,https://datapolitics-public.s3.gra.io.cloud.ov...,R√©publique Fran√ßaise N √º \ A D√©partement de l'...,88
137,07/03/2023,https://www.ville-noisiel.fr/wp-content/upload...,https://datapolitics-public.s3.gra.io.cloud.ov...,Envoy√© en pr√©fecture le 07/03/2023\nRe√ßu en pr...,92
138,02/03/2022,https://www.sud-retz-atlantique.fr/wp-content/...,https://datapolitics-public.s3.gra.io.cloud.ov...,PUBLICATION D√âLIB√âRATIONS COMMUNAUTAIRES_ANN√âE...,164
139,2022,https://www.alpes-de-haute-provence.gouv.fr/co...,https://datapolitics-public.s3.gra.io.cloud.ov...,Ez\nPR√âFETE\n\nDES ALPES-\nDE-HAUTE-\nPROVENCE...,270


In [47]:
df_test['predicted_date'] = df_test['text'].apply(lambda x : predict_date(format_question(x)))

In [48]:
df_test

Unnamed: 0,Gold published date,url,text version,text,__index_level_0__,predicted_date
0,31/01/2024,https://www.saintcyr78.fr/wp-content/uploads/2...,https://datapolitics-public.s3.gra.io.cloud.ov...,REPUBLIQUE FRANCAISE\nLibert√© - Egalit√© - Frat...,493,{'predicted_date' : '06/02/2024'}
1,01/09/2019,https://www.manche.gouv.fr/contenu/telechargem...,https://datapolitics-public.s3.gra.io.cloud.ov...,PREFET DE LA MANCHE\n\nRECUEIL DES ACTES\nADMI...,241,{'predicted_date' : '02/09/2019'}
2,02/02/2023,https://www.suresnes.fr/wp-content/uploads/202...,https://datapolitics-public.s3.gra.io.cloud.ov...,VILLE DE SURESNES\nConseil Municipal\nS√©ance d...,55,{'predicted_date' : '02/02/2023'}
3,17/12/2020,https://www.hautesavoie.fr/sites/default/files...,https://datapolitics-public.s3.gra.io.cloud.ov...,Recueil des Actes Administratifs\n\nRegistre d...,453,{'predicted_date' : '07/12/2020'}
4,06/07/2020,https://www.lombez-gers.com/uploads/documents/...,https://datapolitics-public.s3.gra.io.cloud.ov...,Proc√®s-verbal de la s√©ance du Conseil Municipa...,471,{'predicted_date' : '06/07/2020'}
...,...,...,...,...,...,...
136,23/02/2023,https://www.villeneuve-yonne.fr/wp-content/upl...,https://datapolitics-public.s3.gra.io.cloud.ov...,R√©publique Fran√ßaise N √º \ A D√©partement de l'...,88,{'predicted_date' : '23/02/2023'}
137,07/03/2023,https://www.ville-noisiel.fr/wp-content/upload...,https://datapolitics-public.s3.gra.io.cloud.ov...,Envoy√© en pr√©fecture le 07/03/2023\nRe√ßu en pr...,92,{'predicted_date' : '07/03/2023'}
138,02/03/2022,https://www.sud-retz-atlantique.fr/wp-content/...,https://datapolitics-public.s3.gra.io.cloud.ov...,PUBLICATION D√âLIB√âRATIONS COMMUNAUTAIRES_ANN√âE...,164,{'predicted_date' : '30/03/2022'}
139,2022,https://www.alpes-de-haute-provence.gouv.fr/co...,https://datapolitics-public.s3.gra.io.cloud.ov...,Ez\nPR√âFETE\n\nDES ALPES-\nDE-HAUTE-\nPROVENCE...,270,{'predicted_date' : '17/03/2022'}


In [62]:
import ast
def format_predicted_date(date_str):
    try:
        return ast.literal_eval(date_str)['predicted_date']
    except SyntaxError as e:
        try:
            return ast.literal_eval(date_str[:27])['predicted_date']
        except:
            print(e)
            print(date_str)
            return date_str

In [63]:
df_test['predicted_date_clean'] = df_test['predicted_date'].apply(format_predicted_date)

In [51]:
df_test.to_csv('llama_finetuned_predictions_fulltext_clean.csv')

In [74]:
# df_test['hit']=df_test['predicted_date_clean'] == df_test['Gold published date']
acc_strict = (df_test['predicted_date_clean'] == df_test['Gold published date']).mean()
print(f"Exact-match accuracy : {acc_strict*100:.2f}%")

Exact-match accuracy : 70.21%


In [73]:
acc_year_month = (df_test['predicted_date_clean'].str[3:] == df_test['Gold published date'].str[3:]).mean()
print(f"Accuracy of predicting the correct month and year : {acc_year_month*100:.2f}%")

Accuracy of predicting the correct month and year : 80.14%


In [67]:
df_test[df_test['Gold published date'].str.len() < 6][['Gold published date', 'predicted_date_clean']]

Unnamed: 0,Gold published date,predicted_date_clean
44,2019,01/01/2020
54,2023,2023
75,2023,12/01/2023
123,2022,2022
139,2022,17/03/2022


In [58]:
df_train = data_train.to_pandas()
df_train[df_train['Gold published date'].str.len() < 6]

Unnamed: 0,Gold published date,url,text version,text,__index_level_0__,conversations,prompt
7,2020,https://www.cc-molsheim-mutzig.fr/reglement-as...,https://datapolitics-public.s3.gra.io.cloud.ov...,R√àGLEMENT DU SERVICE D‚ÄôASSAINISSEMENT\n\nPR√âAM...,199,[{'content': 'R√àGLEMENT DU SERVICE D‚ÄôASSAINISS...,<|begin_of_text|><|start_header_id|>system<|en...
8,2020,https://www.ville-thiais.fr/wp-content/uploads...,https://datapolitics-public.s3.gra.io.cloud.ov...,DISPOSITIONS R√âGLEMENTAIRES\nAPPLICABLES DANS ...,193,[{'content': 'DISPOSITIONS R√âGLEMENTAIRES APPL...,<|begin_of_text|><|start_header_id|>system<|en...
72,2017,http://sudestavenir.fr/wp-content/uploads/2021...,https://datapolitics-public.s3.gra.io.cloud.ov...,CONSEIL DU\n\nN¬∞ DE LA\nDELIBERATION\n\n1er f√©...,393,[{'content': 'CONSEIL DU N¬∞ DE LA DELIBERATIO...,<|begin_of_text|><|start_header_id|>system<|en...
105,2020,https://www.ville-thiais.fr/wp-content/uploads...,https://datapolitics-public.s3.gra.io.cloud.ov...,DISPOSITIONS R√âGLEMENTAIRES\nAPPLICABLES DANS ...,194,[{'content': 'DISPOSITIONS R√âGLEMENTAIRES APPL...,<|begin_of_text|><|start_header_id|>system<|en...
118,2024,https://www.alpes-de-haute-provence.gouv.fr/co...,https://datapolitics-public.s3.gra.io.cloud.ov...,EE L DELEGATION DEPARTEMENTALE\n; DE L'AGENCE ...,271,[{'content': 'EE L DELEGATION DEPARTEMENTALE ;...,<|begin_of_text|><|start_header_id|>system<|en...
163,2023,http://www.agglo-niort.fr/fileadmin/CAN/agglo/...,https://datapolitics-public.s3.gra.io.cloud.ov...,CONVENTION CADRE VALANT\nOPERATION DE REVITALI...,72,[{'content': 'CONVENTION CADRE VALANT OPERATIO...,<|begin_of_text|><|start_header_id|>system<|en...
173,2021,https://www.cc-paysfouesnantais.fr/medias/2021...,https://datapolitics-public.s3.gra.io.cloud.ov...,\n\nFouesnantais\n\nCommunaut√© de Communes...,267,[{'content': ' Fouesnantais Communaut√© d...,<|begin_of_text|><|start_header_id|>system<|en...
186,2023,https://paysdelaserre.fr/wp-content/uploads/20...,https://datapolitics-public.s3.gra.io.cloud.ov...,R√©vision All√©g√©e du Plan Local d‚ÄôUrbanisme de\...,30,[{'content': 'R√©vision All√©g√©e du Plan Local d...,<|begin_of_text|><|start_header_id|>system<|en...
218,2023,https://www.ville-saintgratien.fr/medias/2023/...,https://datapolitics-public.s3.gra.io.cloud.ov...,\n \n\nZONE UA\n\n \n\n \n\n \n\nZone de centr...,131,[{'content': '  ZONE UA  Zone de ce...,<|begin_of_text|><|start_header_id|>system<|en...
257,2022,https://www.olemps.fr/uploads/sites/95/2023/03...,https://datapolitics-public.s3.gra.io.cloud.ov...,PRINCIPAUX CHIFFRES\nCOMPTE ADMINISTRATIF 2022...,44,[{'content': 'PRINCIPAUX CHIFFRES COMPTE ADMIN...,<|begin_of_text|><|start_header_id|>system<|en...
