In [1]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import pandas as pd
from datasets import load_dataset
import os

In [2]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [3]:
# Single source of truth for the checkpoint so the processor and the model
# can never be loaded from two different ids by accident.
MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"

processor = AutoProcessor.from_pretrained(MODEL_ID)

# FlashAttention-2 kernels only run in half precision (fp16/bf16), so the dtype
# is chosen together with the attention backend:
#   GPU -> bfloat16 weights + flash_attention_2
#   CPU -> float32 weights  + eager attention
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)


Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


In [4]:
from PIL import Image
from transformers.image_utils import load_image


# Two sample pages from the extracted images; only image1 is fed to the model below.
image1 = load_image("extracted_images/428.png")
image2 = load_image("extracted_images/1.png")

# One user turn made of an image placeholder followed by the question text.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image"),
            dict(type="text", text="What is the publication date of this document? Answer in a numerical date format YYYY-MM-DD"),
        ],
    ),
]

# Render the chat template into a prompt string, then tokenize the prompt and
# preprocess the image together and move the resulting tensors to DEVICE.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image1], return_tensors="pt").to(DEVICE)

In [5]:
# Generate at most 50 new tokens, then turn the token ids back into text
# with the special tokens removed.
generated_ids = model.generate(**inputs, max_new_tokens=50)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

# The batch holds a single prompt, so show its decoded text.
print(generated_texts[0])

User:<image>What is the publication date of this document? Answer in a numerical date format YYYY-MM-DD
Assistant: 2023-12-20


In [21]:
# Keep only the model's reply: everything after the last 'Assistant: ' marker
# (the whole text is kept when the marker is absent).
generated_texts[0].rpartition('Assistant: ')[2]

'2023-12-20'

In [2]:
# Lookup table with one row per document: source url, cache link and image path.
df = pd.read_csv("data_VLM.csv")

In [3]:
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,url,cache,path
0,0,http://www.grandchambery.fr/fileadmin/mediathe...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/0.png
1,1,http://www.ville-saint-ay.fr/userfile/fichier-...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/1.png
2,2,https://www.gatine-racan.fr/wp-content/uploads...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/2.png
3,3,https://www.ville-mazeres.fr/IMG/pdf/2023_1_1.pdf,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/3.png
4,4,https://www.fier-et-usses.com/cms_viewFile.php...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/4.png
...,...,...,...,...
495,495,https://www.estuaire-sillon.fr/fileadmin/media...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/495.png
496,496,https://plombieres-les-dijon.fr/wp-content/upl...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/496.png
497,497,https://www.orne.gouv.fr/contenu/telechargemen...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/497.png
498,498,https://www.vosges.gouv.fr/contenu/telechargem...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/498.png


In [22]:
# Read the original-data CSV into its own frame, separate from `df`.
df_org = pd.read_csv('NLP_in_industry-original_data.csv')

In [23]:
# Fetch the dataset and keep only its train split, converted to a pandas DataFrame.
dataset = load_dataset("maribr/publication_dates_fr")["train"].to_pandas()

In [26]:
# Re-read the image lookup table (url, cache link, image path per document)
# and display it as the cell's output.
df = pd.read_csv("data_VLM.csv")
df

Unnamed: 0.1,Unnamed: 0,url,cache,path
0,0,http://www.grandchambery.fr/fileadmin/mediathe...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/0.png
1,1,http://www.ville-saint-ay.fr/userfile/fichier-...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/1.png
2,2,https://www.gatine-racan.fr/wp-content/uploads...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/2.png
3,3,https://www.ville-mazeres.fr/IMG/pdf/2023_1_1.pdf,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/3.png
4,4,https://www.fier-et-usses.com/cms_viewFile.php...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/4.png
...,...,...,...,...
495,495,https://www.estuaire-sillon.fr/fileadmin/media...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/495.png
496,496,https://plombieres-les-dijon.fr/wp-content/upl...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/496.png
497,497,https://www.orne.gouv.fr/contenu/telechargemen...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/497.png
498,498,https://www.vosges.gouv.fr/contenu/telechargem...,https://datapolitics-public.s3.gra.io.cloud.ov...,extracted_images/498.png


In [27]:
# Copy the image path and the cache link onto the dataset frame, one column at
# a time; pandas matches the values to rows by index label, not by position.
dataset['path'] = df['path']
dataset['cache'] = df['cache']

In [None]:
def label_data(data, device):
    """Ask the vision-language model for the publication date of every document image.

    Relies on the notebook-level ``processor``, ``model`` and ``load_image``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``path`` column holding the location of each page image.
    device : str or torch.device
        Device the model inputs are moved to before generation.
        NOTE(review): this should be the device ``model`` was moved to.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with a new
        ``date_hypothesis`` column. Each entry is the text the model produced
        after the ``Assistant:`` marker (surrounding whitespace removed), or
        ``None`` when the row has no image file on disk.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What is the publication date of this document? Answer in a numerical date format YYYY-MM-DD"},
            ],
        },
    ]
    # The prompt is identical for every document, so render it once up front.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    hypothesis_dates = []
    for path in data['path']:
        # Rows without a usable image (missing file, NaN path, ...) get a None
        # placeholder so the list stays aligned with the frame's rows.
        if not isinstance(path, (str, os.PathLike)) or not os.path.isfile(path):
            hypothesis_dates.append(None)
            continue

        img = load_image(path)
        # Feed THIS row's image to the model (not a leftover notebook global).
        inputs = processor(text=prompt, images=[img], return_tensors="pt")
        inputs = inputs.to(device)
        generated_ids = model.generate(**inputs, max_new_tokens=50)
        generated_texts = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )
        # The decoded text echoes the prompt; keep only what follows the last
        # "Assistant:" marker and drop the space/newlines around the answer.
        hyp_text = generated_texts[0].split('Assistant:')[-1].strip()
        hypothesis_dates.append(hyp_text)

    data['date_hypothesis'] = hypothesis_dates
    return data
        

Unnamed: 0,Text,Gold published date,url,path,cache
0,Procès-Verbal\nBureau du jeudi 21 décembre 202...,21/12/2023,http://www.grandchambery.fr/fileadmin/mediathe...,extracted_images/0.png,https://datapolitics-public.s3.gra.io.cloud.ov...
1,PROCES-VERBAL DE LA REUNION PUBLIQUE\nDU CONSE...,16/01/2023,http://www.ville-saint-ay.fr/userfile/fichier-...,extracted_images/1.png,https://datapolitics-public.s3.gra.io.cloud.ov...
2,CONSEIL COMMUNAUTAIRE DU\n25 JANVIER 2023\nPRO...,25/01/2023,https://www.gatine-racan.fr/wp-content/uploads...,extracted_images/2.png,https://datapolitics-public.s3.gra.io.cloud.ov...
3,Date de mise en ligne de\nl’acte : 02/ 02/2023...,23/01/2023,https://www.ville-mazeres.fr/IMG/pdf/2023_1_1.pdf,extracted_images/3.png,https://datapolitics-public.s3.gra.io.cloud.ov...
4,Envoyé en préfecture le 26/01/2023\nReçu en pr...,26/01/2023,https://www.fier-et-usses.com/cms_viewFile.php...,extracted_images/4.png,https://datapolitics-public.s3.gra.io.cloud.ov...
...,...,...,...,...,...
495,PROJET DE RAPPORT\r\nD’ORIENTATIONS BUDGETAIRE...,15/02/2024,https://www.estuaire-sillon.fr/fileadmin/media...,extracted_images/495.png,https://datapolitics-public.s3.gra.io.cloud.ov...
496,Dépârtement de la COTE-D'OR\r\nCanton de TALAN...,24/01/2024,https://plombieres-les-dijon.fr/wp-content/upl...,extracted_images/496.png,https://datapolitics-public.s3.gra.io.cloud.ov...
497,Spécial n° 10 de janvier 2024\nn° 2024 01 10\n...,09/01/2024,https://www.orne.gouv.fr/contenu/telechargemen...,extracted_images/497.png,https://datapolitics-public.s3.gra.io.cloud.ov...
498,RECUEIL DES ACTES\r\nADMINISTRATIFS SPÉCIAL\r\...,22/11/2022,https://www.vosges.gouv.fr/contenu/telechargem...,extracted_images/498.png,https://datapolitics-public.s3.gra.io.cloud.ov...
