In [1]:
from jinja2 import Template
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import re

In [17]:
prompt_template = Template(
    """
    You are an expert in structured data extraction.
    Given a document in French, extract and output only the **publication date** of the document in the format DD/MM/YYYY. Do not include any additional text or context — just the date.

    Document: "{{ document }}"
    Publication date:
    """
)

In [18]:
unsloth_checkpoint = "unsloth/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(unsloth_checkpoint,
                                             torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(unsloth_checkpoint)

def llm_complete(prompt, max_tokens=2048, device = 'cuda', temperature=0.5):
    # Fill it
    model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs.to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_tokens, temperature = temperature, pad_token_id=tokenizer.eos_token_id)
    outputs_ans_only = outputs[:,len(inputs['input_ids'][0]):]
    answer_only = tokenizer.batch_decode(outputs_ans_only, skip_special_tokens=True)
    return answer_only

In [19]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#llm_complete(prompt, max_tokens = 30)

TODO:
- use regex to generate text chunks, then join them into one long context


In [20]:
df = pd.read_pickle('llamacpp.pkl')
df.sample(5)

Unnamed: 0,doc_id,url,cache,text version,nature,published,entity,entity_type,Text,Gold published date,regex_chunks,regex_chunks_strict,prediction
460,1482/08b01_DEL-2b_PV_CM_16_12_2020.pdf,https://www.villederueil.fr/sites/default/file...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.full,16/12/2020,Rueil-Malmaison,Commune,COMMUNE DE RUEIL-MALMAISON\n(HAUTS-DE-SEINE)\n...,16/12/2020,"[SÉANCE DU 16 DÉCEMBRE 2020, L’AN DEUX MILLE V...","[SÉANCE DU 16 DÉCEMBRE 2020, convoqué le 10 DÉ...",2020-12-19
367,2389/31352_DEL2020_128%20-%20Annexe%207.pdf,https://www.lecotentin.fr/system/files/2022-06...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,acte.delib,16/10/2020,CA du Cotentin,Intercommunalité,,,,,2022-01-01
52,1843/3ebe32039c9e85a684615399f53cb9be2c8b961b_...,https://ville-figeac.fr/download/6599/2023/529...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.full,13/03/2023,Figeac,Commune,CONSEIL MUNICIPAL DU 13 MARS 2023\n18H00\nOrdr...,13/03/2023,"[CONSEIL MUNICIPAL DU 13 MARS 2023, Approbatio...","[CONSEIL MUNICIPAL DU 13 MARS 2023, Approbatio...",2023-03-07
147,1364/c68e7_conseil_municipal_2022-12-14_liste.pdf,https://www.villejuif.fr/fileadmin/www.ville-v...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,14/12/2022,Villejuif,Commune,Date de publication : 25 octobre 2022,25/10/2022,[ 2121-25 du code général des collectivités te...,"[HR ESAlE ""FTP o Compte-rendu de la séance du ...",2022-12-14
415,1291/a9c54_COMPTE_RENDU_SYNTHETIQUE_CM_23-05-2...,http://www.lhaylesroses.fr/images/2-Ma-ville/c...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,23/05/2020,L'Ha¤-les-Roses,Commune,LES ROSES\n\nSEANCE DU CONSEIL MUNICIPAL\nDU 2...,23/05/2020,"[DU 23 MAI 2020, 41 rue Jean Jaurès, 94246 L'H...",[DU 23 MAI 2020],2020-05-23


In [23]:
def predict_date(document, temperature=0.5):
    input = {'document': document}
    prompt = prompt_template.render(**input)
    output = llm_complete(prompt, max_tokens = 10, temperature=temperature)
    date = re.findall(r"\d{2}/\d{2}/\d{4}\b", str(output))
    if date: return date[0]
    else: return str(output)

In [24]:
test = df.sample(10)
print(test['published'])
test['prediction'] = test.apply(lambda x: predict_date(str(x['regex_chunks'])+x['doc_id']), axis=1)
print(test['prediction'])

435    03/04/2024
132    01/01/2023
248    12/12/2022
142    14/12/2022
165    01/12/2023
292    04/04/2022
185    13/02/2020
189    13/02/2020
322    14/12/2023
482    15/07/2020
Name: published, dtype: object
435             03/04/2024
132             02/07/2007
248             12/12/2022
142             14/12/2022
165    [' 04 74 32 21 25']
292             04/04/2022
185             13/02/2020
189             13/02/2020
322             26/12/2023
482             15/07/1482
Name: prediction, dtype: object


In [25]:
df['prediction'] = df.apply(lambda x: predict_date(str(x['regex_chunks'])+x['doc_id']), axis=1)

In [26]:
df[['published','prediction']]

Unnamed: 0,published,prediction
0,16/01/2023,16/01/2023
1,25/01/2023,25/01/2023
2,31/01/2023,23/01/2023
3,26/01/2023,26/01/2023
4,16/01/2023,16/01/2023
...,...,...
495,24/01/2024,14/01/2024
496,09/01/2024,01/01/2024
497,22/11/2022,22/11/2022
498,21/12/2023,21/12/2023


In [27]:
pattern = r"^\d{4}-\d{2}-\d{2}$"
test = df[df["prediction"].str.match(pattern)]

In [28]:
from datetime import datetime
accuracy = (df['published']==df['prediction']).mean() * 100
print(accuracy)
# accuracy = (pd.to_datetime(test["published"], format="%d/%m/%Y") == pd.to_datetime(test["prediction"], format="%Y-%m-%d")).mean() * 100
# print(accuracy)

53.6


In [None]:
df.to_pickle('llamacpp.pkl')