In [1]:
from jinja2 import Template
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Jinja2 prompt: ``{{ document }}`` is replaced by the text handed to
# ``render`` and the model is asked to reply with one YYYY-MM-DD date only.
prompt_template = Template(
    source="""
    You are an expert in structured data extraction.
    The document is: "{{ document }}"

    Extract and output only the **publication date** of the document in the format YYYY-MM-DD. 
    Do not include any additional text or context—just the date. The answer has to be 10 characters long.
    Publication date:
    """
)

In [3]:
# Hugging Face checkpoint id shared by the tokenizer and the model.
unsloth_checkpoint = "unsloth/Llama-3.2-1B-Instruct"

# The two loads do not depend on each other; weights are requested in bfloat16.
tokenizer = AutoTokenizer.from_pretrained(unsloth_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    unsloth_checkpoint,
    torch_dtype=torch.bfloat16,
)

def llm_complete(prompt, max_tokens=2048, device = 'cuda', temperature=0.5):
    """Generate a completion for ``prompt`` with the module-level ``model``.

    Args:
        prompt: Text (or batch of texts) handed to the module-level ``tokenizer``.
        max_tokens: Upper bound on newly generated tokens (``max_new_tokens``).
        device: Device the model and inputs are moved to. A CUDA device is
            replaced by ``'cpu'`` when PyTorch reports no CUDA support, so the
            default keeps working on CPU-only machines.
        temperature: Sampling temperature. A positive value enables sampling
            explicitly; ``0`` or ``None`` selects greedy decoding instead of
            passing an invalid temperature to ``generate``.

    Returns:
        list[str]: Decoded continuations (prompt tokens stripped, special
        tokens skipped), one entry per row returned by ``generate``.
    """
    # Fall back gracefully instead of crashing in ``model.to('cuda')``.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        device = 'cpu'

    model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # ``temperature`` only has an effect in sampling mode, so request sampling
    # explicitly rather than relying on the checkpoint's generation config.
    sampling = temperature is not None and temperature > 0
    generation_kwargs = {
        'max_new_tokens': max_tokens,
        'do_sample': sampling,
        'pad_token_id': tokenizer.eos_token_id,
    }
    if sampling:
        generation_kwargs['temperature'] = temperature

    outputs = model.generate(**inputs, **generation_kwargs)

    # ``generate`` returns prompt + continuation; keep the continuation only.
    prompt_length = inputs['input_ids'].shape[1]
    outputs_ans_only = outputs[:, prompt_length:]
    return tokenizer.batch_decode(outputs_ans_only, skip_special_tokens=True)

In [4]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Manual smoke test, left disabled:
#llm_complete(prompt, max_tokens = 30)

TODO:
- use regex to generate text chunks, then join them into one long context


In [5]:
# Load the pickled frame from the working directory and preview five random rows.
# NOTE: unpickling executes arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(filepath_or_buffer='df_with_regex_chunks.pkl')
df.sample(n=5)

Unnamed: 0,doc_id,url,cache,text version,nature,published,entity,entity_type,Text,Gold published date,regex_chunks,regex_chunks_strict
482,1482/bc2fa_Proc%C3%A8s%20verbal%20du%20CM%20du...,https://www.villederueil.fr/sites/default/file...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,acte.delib,15/07/2020,Rueil-Malmaison,Commune,COMMUNE DE RUEIL-MALMAISON\n(HAUTS-DE-SEINE)\n...,15/07/2020,,
382,94/d32fa446d741892848b9c6cb24b689ede37b8413_re...,https://www.bouches-du-rhone.gouv.fr/contenu/t...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,acte.arrete,04/09/2020,Bouches-du-Rhône,Conseil départemental,Centre de détention de Salon de Provence – BP ...,04/09/2020,"[N°13-2020-219, PUBLIÉ LE 4 SEPTEMBRE 2020, 13...","[PUBLIÉ LE 4 SEPTEMBRE 2020, 13-2020-09-01-018..."
184,2928/35ddb_cms_viewFile.php,https://www.valdedrome.com/cms_viewFile.php?id...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,acte.delib,27/02/2020,CC du Val de Drôme en Biovallée,Intercommunalité,Visa de la Préfecture : 026-242600252-20200225...,27/02/2020,[Visa de la Préfecture : 026-242600252-2020022...,"[Date de convocation : 11 février 2020, Vu la ..."
153,1542/5587f_pv_cm_du_12-12-2022_signe.pdf,https://www.villedegarges.fr/sites/default/fil...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,12/12/2022,Garges-lès-Gonesse,Commune,Acte affiché du 27/12/2022 au 28/02/2023 (not ...,27/12/2022,"[DU 12 DÉCEMBRE, 2022, assembkâ à la Mairie, S...","[du 24 juin 2015,]"
94,3245/f50038dafee07af671e46f8b6d58cc28cd0191a8_...,https://www.iledefrance.fr/actes/proces-verbau...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,29/03/2023,Île-de-France,Conseil régional,,29/03/2023,"[Commission permanente du 29 mars 2023, 2 rue ...","[Commission permanente du 29 mars 2023, Approb..."


In [6]:
def predict_date(document, temperature=0.5):
    """Ask the LLM for the publication date of ``document``.

    The document is rendered into ``prompt_template`` and completed with
    ``llm_complete`` (at most 10 new tokens).

    Args:
        document: Value substituted for ``{{ document }}`` in the prompt.
        temperature: Forwarded unchanged to ``llm_complete``.

    Returns:
        str: The first ``YYYY-MM-DD`` substring found in the stringified model
        output, or the whole stringified output when no such substring exists.
    """
    prompt = prompt_template.render(document=document)
    raw_answer = str(llm_complete(prompt, max_tokens=10, temperature=temperature))
    found = re.search(r"\d{4}-\d{2}-\d{2}", raw_answer)
    if found is None:
        # No date-shaped text: hand back the raw answer so callers can inspect it.
        return raw_answer
    return found.group(0)

In [10]:
# Sanity check on ten random rows before running the whole frame:
# print the `published` column, predict from `regex_chunks`, print the predictions.
test = df.sample(n=10)
print(test['published'])
test['prediction'] = test['regex_chunks'].apply(predict_date)
print(test['prediction'])

100    30/03/2023
253    09/03/2024
450    14/12/2020
106    01/02/2023
47     13/03/2023
81     31/03/2023
290    16/03/2022
174    09/12/2022
72     27/03/2023
351    01/01/2022
Name: published, dtype: object
100    2017-03-15
253    2024-02-26
450    2020-12-14
106    2022-09-01
47     2023-03-13
81     2023-04-12
290    2022-02-19
174    2022-12-07
72     2023-03-27
351    2019-01-01
Name: prediction, dtype: object


In [11]:
# Run the predictor over every row's `regex_chunks` value (one LLM call per row).
df['prediction'] = df['regex_chunks'].apply(predict_date)

In [12]:
# Show the `published` and `prediction` columns side by side.
df.loc[:, ['published', 'prediction']]

Unnamed: 0,published,prediction
0,16/01/2023,2023-01-16
1,25/01/2023,2023-01-25
2,31/01/2023,2023-02-02
3,26/01/2023,2023-01-26
4,16/01/2023,2023-01-09
...,...,...
495,24/01/2024,2024-01-24
496,09/01/2024,2024-01-10
497,22/11/2022,2022-11-21
498,21/12/2023,2023-12-21


In [13]:
# Keep only the rows whose prediction is shaped like YYYY-MM-DD.
pattern = r"^\d{4}-\d{2}-\d{2}$"
test = df.loc[df["prediction"].str.match(pat=pattern)]

In [14]:
from datetime import datetime

# Exact-match accuracy (in percent) between `published` and `prediction` dates.
# NOTE(review): `test` is expected to hold only rows with a YYYY-MM-DD shaped
# prediction, so malformed model outputs are outside the denominator — confirm
# this is the intended metric.
gold_dates = pd.to_datetime(test["published"], format="%d/%m/%Y")
# errors="coerce": a prediction that matches the regex but is not a real
# calendar date (e.g. "2023-02-30") becomes NaT and counts as a miss instead
# of raising ValueError and aborting the evaluation.
predicted_dates = pd.to_datetime(test["prediction"], format="%Y-%m-%d", errors="coerce")
accuracy = (gold_dates == predicted_dates).mean() * 100
print(accuracy)

46.50205761316872


In [15]:
# Persist the frame, including the new `prediction` column, for later analysis.
df.to_pickle(path='llamacpp.pkl')