In [None]:
from jinja2 import Template
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import re

In [46]:
# Jinja2 template for the date-extraction prompt; `{{document}}` is substituted at render time.
# The text is written with Llama-3 style special tokens (<|begin_of_text|>, header ids,
# <|eot_id|>) and ends on an opening `{'publication_date':`, presumably so that the model
# only has to complete the date value.
# NOTE(review): every line inside the literal keeps the 4-space source indentation, and no
# assistant header follows <|eot_id|> — confirm both are intentional before editing the text,
# since any whitespace change here changes what the model sees.
prompt_template = Template(
    """
    <|begin_of_text|>

    <|start_header_id|>user<|end_header_id|>

    Document : {{document}}

    You are an expert in structured data extraction. What is the publication date?
    Output as a structured json with format YYYY-MM-DD.
    The output should only be the date.
    <|eot_id|>

    {'publication_date':
    """
)

In [31]:
# Checkpoint identifier shared by the tokenizer and the model loaders below.
unsloth_checkpoint = "unsloth/Llama-3.2-1B-Instruct"

# Tokenizer and causal-LM weights come from the same checkpoint; the weights are
# requested in bfloat16.
tokenizer = AutoTokenizer.from_pretrained(unsloth_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    unsloth_checkpoint,
    torch_dtype=torch.bfloat16,
)

def llm_complete(prompt, max_tokens=2048, device=None, temperature=0.5):
    """Generate a completion for ``prompt`` with the module-level ``model``/``tokenizer``.

    Args:
        prompt: Text handed to the tokenizer.
        max_tokens: Upper bound on the number of newly generated tokens.
        device: Torch device to run on. ``None`` (the default) selects ``'cuda'``
            when CUDA is available and ``'cpu'`` otherwise, so the function also
            works on machines without a GPU. Passing ``'cuda'`` explicitly keeps
            the previous behaviour.
        temperature: Temperature forwarded to ``model.generate``.

    Returns:
        The list produced by ``tokenizer.batch_decode`` for the generated tokens
        only (the prompt tokens are sliced off), with special tokens skipped.
    """
    if device is None:
        # Do not assume a GPU: the old hard-coded 'cuda' default failed on CPU-only hosts.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + continuation; drop the prompt part along the sequence axis.
    prompt_length = inputs["input_ids"].shape[1]
    outputs_ans_only = outputs[:, prompt_length:]
    return tokenizer.batch_decode(outputs_ans_only, skip_special_tokens=True)

In [None]:
# Prefer the GPU when torch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Example call, left disabled:
#llm_complete(prompt, max_tokens = 30)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[" '2023-02-09'}\n    }"]

TODO:
- use regex to generate text chunks, then join them into one long context


In [6]:
# Load the frame that already carries the `regex_chunks` columns.
# NOTE(review): unpickling can execute arbitrary code — only load a pickle you trust.
df = pd.read_pickle(filepath_or_buffer='df_with_regex_chunks.pkl')
# Peek at five rows drawn at random (no seed is set, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,doc_id,url,cache,text version,nature,published,entity,entity_type,Text,Gold published date,regex_chunks,regex_chunks_strict
387,1500/b87f792e0d954ca7a84491726071a720422eb354_...,https://www.chelles.fr/wp-content/uploads/2022...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.full,17/11/2020,Chelles,Commune,RÉPUBLIQUE FRANÇAISE\nDépartement de SEINE-ET-...,17/11/2020,"[SÉANCE DU 17 NOVEMBRE 2020, Le mardi 17 novem...","[SÉANCE DU 17 NOVEMBRE 2020, Le mardi 17 novem..."
217,693/726fe_CM-2022-06-28_deliberations.pdf,https://www.bagneux92.fr/images/1-Decouvrir/ac...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,acte.delib,28/06/2022,Bagneux,Commune,,,"[Envoyé en préfecture le 06/07/2022, Reçu en p...","[légalement convoqué le 17 juin 2022, s'est as..."
218,693/fb9de_CM-2022-06-28_liste-deliberations.pdf,https://www.bagneux92.fr/images/1-Decouvrir/ac...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,28/06/2022,Bagneux,Commune,,,"[mardi 28 juin 2022, en date du mardi 24 mai 2...","[mardi 28 juin 2022, en date du mardi 24 mai 2..."
420,3897/0b973a568067af5b81599f352ab598002aa31397_...,https://www.gennesvaldeloire.fr/medias/2020/07...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,06/07/2020,Gennes-Val-de-Loire,Commune,,,"[Séance du 06 juillet 2020, 06 juillet, 20 heu...","[Séance du 06 juillet 2020, 30 juin 2020, 10 j..."
17,669/9b3d1_7-Procs-verbal_20230220.pdf,https://www.puygouzon.fr/wp-content/uploads/20...,https://datapolitics-public.s3.gra.io.cloud.ov...,https://datapolitics-public.s3.gra.io.cloud.ov...,pv.cr,20/02/2023,Puygouzon,Commune,CONSEIL MUNICIPAL DU 20 FÉVRIER 2023.\n\nL'an ...,20/02/2023,"[CONSEIL MUNICIPAL DU 20 FÉVRIER 2023, Adopti...","[CONSEIL MUNICIPAL DU 20 FÉVRIER 2023, Adopti..."


In [56]:
def predict_date(document, temperature=0.5):
    """Ask the LLM for the publication date of ``document``.

    ``document`` is rendered into ``prompt_template`` and the completion is capped
    at 10 new tokens.

    Args:
        document: Value substituted for ``{{document}}`` in the prompt.
        temperature: Temperature forwarded to ``llm_complete``.

    Returns:
        The first ``YYYY-MM-DD``-shaped substring of the stringified completion,
        or the stringified completion itself when no such substring exists.
    """
    rendered_prompt = prompt_template.render(document=document)
    completion = str(llm_complete(rendered_prompt, max_tokens=10, temperature=temperature))
    found = re.search(r"\d{4}-\d{2}-\d{2}", completion)
    if found is None:
        # No date-shaped text: hand back the raw completion so the caller can inspect it.
        return completion
    return found.group(0)

In [47]:
# Smoke test on 10 randomly drawn rows: print the reference dates, predict from each
# row's regex chunks, then print the predictions for a side-by-side look.
test = df.sample(n=10)
print(test['published'])
test['prediction'] = test['regex_chunks'].apply(predict_date)
print(test['prediction'])

261    30/06/2022
462    08/12/2020
450    14/12/2020
130    13/01/2023
469    11/04/2024
337    27/03/2022
354    25/09/2020
244    22/03/2023
438    29/06/2023
97     26/02/2024
Name: published, dtype: object
261    2022-06-30
462    2018-02-14
450    2020-12-14
130    2023-01-13
469    2022-07-25
337    2022-02-14
354    2020-09-25
244    2023-03-22
438    2023-06-29
97     2024-02-20
Name: prediction, dtype: object


In [64]:
# Predict a date for every row from its regex chunks and store it alongside the inputs.
df['prediction'] = df['regex_chunks'].apply(predict_date)

In [65]:
# Show the reference date next to the predicted date for every row.
df.loc[:, ['published', 'prediction']]

Unnamed: 0,published,prediction
0,16/01/2023,2023-01-16
1,25/01/2023,2023-01-18
2,31/01/2023,2023-01-26
3,26/01/2023,2023-01-26
4,16/01/2023,2023-01-09
...,...,...
495,24/01/2024,2024-01-09
496,09/01/2024,2024-01-10
497,22/11/2022,2022-11-22
498,21/12/2023,2023-12-21


In [66]:
# Keep only the rows whose prediction is a bare YYYY-MM-DD string; anything else
# (the raw-completion fallback) is left out of `test`.
pattern = r"^\d{4}-\d{2}-\d{2}$"
test = df.loc[df["prediction"].str.match(pat=pattern)]

In [67]:
from datetime import datetime

# Accuracy (in %) = share of rows in `test` whose predicted date equals the reference date.
# The reference column is parsed strictly (DD/MM/YYYY). The prediction column is parsed
# with errors="coerce": a prediction that has the YYYY-MM-DD shape but is not a real
# calendar date (e.g. 2023-02-31) becomes NaT and is scored as a miss instead of raising
# and aborting the whole evaluation.
# NOTE(review): `test` only holds rows whose prediction matched the date pattern, so rows
# with malformed model output are not part of the denominator.
accuracy = (
    pd.to_datetime(test["published"], format="%d/%m/%Y")
    == pd.to_datetime(test["prediction"], format="%Y-%m-%d", errors="coerce")
).mean() * 100
print(accuracy)

45.65217391304348
