In [1]:
from jinja2 import Template
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import re

In [2]:
prompt_template = Template(
    """
    You are an expert in structured data extraction.
    Given a document in French, extract and output only the **publication date** of the document in the format DD/MM/YYYY. Do not include any additional text or context — just the date.

    Document: "{{ document }}"
    Publication date:
    """
)

In [3]:
!pip install -U bitsandbytes



In [9]:
unsloth_checkpoint = "unsloth/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(unsloth_checkpoint,
                                             load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(unsloth_checkpoint)

def llm_complete(prompt, max_tokens=2048, device = 'cuda', temperature=0.5):
    # Fill it
    # model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs.to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_tokens, temperature = temperature, pad_token_id=tokenizer.eos_token_id)
    outputs_ans_only = outputs[:,len(inputs['input_ids'][0]):]
    answer_only = tokenizer.batch_decode(outputs_ans_only, skip_special_tokens=True)
    return answer_only

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#llm_complete(prompt, max_tokens = 30)

TODO:
- use regex to generate text chunks, then join them into one long context


In [15]:
df = pd.read_pickle('df_2.pkl')
df.sample(5)

Unnamed: 0,doc_id,url,cache,text version,nature,published_datapolitics,entity_datapolitics,entity_type_datapolitics,Text,Gold published date,regex_chunks,regex_chunks_strict
129,1500/8e894fa17d3c18ca9f22dbe579ac12624474e26f_...,https://www.chelles.fr/wp-content/uploads/2023...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,acte.raa,15/01/2023,Chelles,Commune,,13/01/2023,[Direction des espaces publics\nNo A 2023-15\n...,"[\nVu le Code de la voirie routière,\nVu l’arr..."
189,2465/dbfb6cec02f7c2692f79e5b12b22650b5b81acff_...,http://www.agglo-pvm.fr/fileadmin/medias/Publi...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,06/02/2020,CA Paris - Vallée de la Marne,Intercommunalité,DOSSIER DE PRESSE\r\nConseil communautaire\r\n...,06/02/2020,[DOSSIER DE PRESSE\nConseil communautaire\nJeu...,[DOSSIER DE PRESSE\nConseil communautaire\nJeu...
457,4850/e7b57_Compte-rendu-04-mars-2020-annexe.pdf,https://www.neuillysurmarne.fr/wp-content/uplo...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,bdj.annexes,04/03/2020,Neuilly-sur-Marne,Commune,,04/03/2020,[AA\nPublication faite en conformité de l'arti...,[ ouvre la séance du Conseil Municipal du merc...
78,2609/14f39579b9846c6a37d16d42151d0b52fb439d1b_...,https://ccpp06.fr/wp-content/uploads/2022/10/2...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,acte.delib,16/03/2023,CC du Pays des Paillons,Intercommunalité,CONSEIL COMMUNAUTAIRE\nSéance du jeudi 16 mars...,16/03/2023,[Conseil Communautaire du jeudi 16 mars 2023\n...,[Conseil Communautaire du jeudi 16 mars 2023\n...
179,635/c7b0d_3-1-reglement-plu-modifie-en-2020.pdf,http://www.creutzwald.fr/UserFiles/File/plu/3-...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,dlao.autres,24/02/2020,Creutzwald,Commune,P.L.U. approuvé par DCM n° 14168 du 28/11/2005...,24/02/2020,[P.L.U. approuvé par DCM n° 14168 du 28/11/200...,[re communal :\n\nL’article R.111-2\n(D. n° 76...


In [7]:
def predict_date(document, temperature=0.5):
    input = {'document': document}
    prompt = prompt_template.render(**input)
    output = llm_complete(prompt, max_tokens = 10, temperature=temperature)
    date = re.findall(r"\d{2}/\d{2}/\d{4}\b", str(output))
    if date: return date[0]
    else: return str(output)

In [16]:
test = df.sample(10)
print(test['Gold published date'])
test['prediction'] = test.apply(lambda x: predict_date(str(x['regex_chunks'])+x['doc_id']), axis=1)
print(test['prediction'])

70     27/02/2023
403    29/09/2020
316    13/04/2022
408    18/10/2023
392    27/07/2020
451    14/12/2020
152    08/12/2022
311    09/05/2022
390    03/11/2020
149    25/10/2022
Name: Gold published date, dtype: object
70                                 27/02/2023
403                                29/09/2020
316                                13/04/2022
408                                18/10/2023
392                                01/07/2018
451                                14/12/2020
152                [' 8/12/2022\n\n    Here']
311           [" '09-05-2022'\n\n    Output"]
390             [' 3/11/2020\n\n    """\n\n']
149    [' "24 octobre 2022"\n\n    Expected']
Name: prediction, dtype: object


In [17]:
df['prediction'] = df.apply(lambda x: predict_date(str(x['regex_chunks'])+x['doc_id']), axis=1)

In [18]:
df[['Gold published date','prediction']]

Unnamed: 0,Gold published date,prediction
0,21/12/2023,21/12/2023
1,16/01/2023,"[' ""Lundi 16 January 2023""\n\n']"
2,25/01/2023,"[' ""25 January 2023""\n\n Output:\n']"
3,02/02/2023,"[' ""02/ 02/2023""\n\n ']"
4,26/01/2023,26/01/2023
...,...,...
495,15/02/2024,24/02/2024
496,24/01/2024,24/01/2024
497,09/01/2024,01/01/2024
498,22/11/2022,22/11/2022


In [19]:
pattern = r"^\d{4}-\d{2}-\d{2}$"
test = df[df["prediction"].str.match(pattern)]

In [20]:
from datetime import datetime
accuracy = (df['Gold published date']==df['prediction']).mean() * 100
print(accuracy)
# accuracy = (pd.to_datetime(test["published"], format="%d/%m/%Y") == pd.to_datetime(test["prediction"], format="%Y-%m-%d")).mean() * 100
# print(accuracy)

38.0


In [None]:
df.to_pickle('llamacpp.pkl')