In [1]:
from jinja2 import Template
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
prompt_template = Template(
    """
    You are an expert in structured data extraction.
    Given a document in French, extract and output only the **publication date** of the document in the format DD/MM/YYYY. Do not include any additional text or context — just the date.

    Document: "{{ document }}"
    Publication date:
    """
)

In [3]:
!pip install -U bitsandbytes




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
unsloth_checkpoint = "unsloth/Llama-3.2-3B"
model = AutoModelForCausalLM.from_pretrained(unsloth_checkpoint,
                                             load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(unsloth_checkpoint)

def llm_complete(prompt, max_tokens=2048, device = 'cuda', temperature=0.5):
    # Fill it
    # model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs.to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_tokens, temperature = temperature, pad_token_id=tokenizer.eos_token_id)
    outputs_ans_only = outputs[:,len(inputs['input_ids'][0]):]
    answer_only = tokenizer.batch_decode(outputs_ans_only, skip_special_tokens=True)
    return answer_only

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#llm_complete(prompt, max_tokens = 30)

TODO:
- use regex to generate text chunks, then join them into one long context


In [8]:
df = pd.read_pickle('df_with_regex_chunks.pkl')

In [9]:
def predict_date(document, temperature=0.5):
    input = {'document': document}
    prompt = prompt_template.render(**input)
    output = llm_complete(prompt, max_tokens = 10, temperature=temperature)
    date = re.findall(r"\d{2}/\d{2}/\d{4}\b", str(output))
    if date: return date[0]
    else: return str(output)

In [10]:
test = df.sample(10)
print(test['Gold published date'])
test['prediction'] = test.apply(lambda x: predict_date(str(x['regex_chunks'])+x['doc_id']), axis=1)
print(test['prediction'])

175    31/01/2023
248    12/12/2022
3      26/01/2023
231    20/03/2024
396    30/09/2020
208    03/10/2022
70     06/02/2023
41     03/11/2022
360          None
417    25/03/2013
Name: Gold published date, dtype: object




175                                     08/02/2023
248                                     12/12/2022
3                                       26/01/2023
231                                     06/03/2024
396    [' 30 septembre 2020\n    Expected output']
208                              [' 2022-06-27\n']
70                 [' 2015-12-17\n\n    Document']
41                                      04/09/2019
360                                     26/11/2020
417                                     25/03/2013
Name: prediction, dtype: object


In [11]:
df['prediction'] = df.apply(lambda x: predict_date(str(x['regex_chunks'])+x['doc_id']), axis=1)

In [12]:
df[['Gold published date','prediction']]

Unnamed: 0,Gold published date,prediction
0,16/01/2023,[' * 16 January 2023\n Expected']
1,25/01/2023,25/01/2023
2,02/02/2023,02/02/2023
3,26/01/2023,26/01/2023
4,16/01/2023,16/01/2023
...,...,...
495,02/04/2024,24/01/2024
496,09/01/2024,[' - 2024-01-10\n ']
497,22/11/2022,22/11/2022
498,21/12/2023,15/12/2023


In [13]:
pattern = r"^\d{4}-\d{2}-\d{2}$"
test = df[df["prediction"].str.match(pattern)]

In [14]:
from datetime import datetime
accuracy = (df['Gold published date']==df['prediction']).mean() * 100
print(accuracy)
# accuracy = (pd.to_datetime(test["published"], format="%d/%m/%Y") == pd.to_datetime(test["prediction"], format="%Y-%m-%d")).mean() * 100
# print(accuracy)

49.0


In [15]:
df.to_pickle('huggingface_llama.pkl')