Lets start with the fine tuning

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForQuestionAnswering,
    AutoModelForSeq2SeqLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
from pathlib import Path
import os 
from natsort import natsorted

2024-09-11 00:25:15.273726: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

Loading the data

In [3]:
data_path = Path("../data/")
pdf_files = [data_path / "train/preprocess" / f for f in os.listdir(data_path/ "train/preprocess") if f.endswith(".pdf")]

In [4]:
pdf_files = natsorted(pdf_files)

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores.utils import filter_complex_metadata

In [6]:
import re
pdf_data = []
for f in pdf_files:
    try:
        WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
        loader = PyPDFLoader(f)
        data = loader.load()
        pdf_data.append(WHITESPACE_HANDLER(filter_complex_metadata(data)[0].page_content))
    except Exception as e:
        pdf_data.append("")

In [7]:
import pandas as pd

In [8]:
df = pd.read_excel("../data/train/Data.xlsx")

In [9]:
true_data = []
for index, row in df.iterrows():
    true_data.append(f"""Empresa: {row["Empresa"]}
Nit: {row["Nit"]}
Factura: {row["Numero_Factura"]}
Base: {row["Base"]}
IVA: {row["IVA"]}
Total: {row["Total"]}""")

From models

In [34]:
model_name = "google/flan-t5-base"

#from transformers import pipeline

#question_answerer = pipeline("question-answering", model=model_name, device=device)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [28]:
def tokenizer_func(example: str):
    text = example["text"]
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text, #We pass the text of the dataset
        return_tensors="pt", #The datatype that we want the output data
        truncation=True,
        padding=True,
        max_length=4065 #This is intrinsic to the len of the model
    )
    labels = tokenizer(example["output"],
        return_tensors="pt", #The datatype that we want the output data
        truncation=True,
        padding=True,
        max_length=4065 #This is intrinsic to the len of the model
    )
                       
    tokenized_inputs["labels"]= labels["input_ids"]
    return tokenized_inputs

In [25]:
dataset_dic = {"text": pdf_data, "output": true_data}
dataset_dic = Dataset.from_dict(dataset_dic)
dataset_dic = dataset_dic.train_test_split(0.1)
dataset_dic

DatasetDict({
    train: Dataset({
        features: ['text', 'output'],
        num_rows: 87
    })
    test: Dataset({
        features: ['text', 'output'],
        num_rows: 10
    })
})

In [26]:
if tokenizer.pad_token is None:
    #Se debe adicionar el token de pad
    tokenizer.add_special_tokens({"pad_token": "[PAD]"}) 
    #También se debe actualizar en el modelo los nuevos largos del token
    model.resize_token_embeddings(len(tokenizer))

In [29]:
tokenizer_dataset = dataset_dic.map(tokenizer_func, batched=True)

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [36]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [31]:
tokenizer_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 87
    })
    test: Dataset({
        features: ['text', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})

In [30]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [35]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [18]:
peft_config = LoraConfig(
                        task_type="SEQ_2_SEQ_LM", # FLAN-T5
                        r=32,
                        lora_alpha=32,
                        lora_dropout=0.05,
                        target_modules = ['q',"v"])

In [19]:
model = get_peft_model(model, peft_config).to(device)
model.print_trainable_parameters()

Could not load bitsandbytes native library: /home/kaiki/anaconda3/envs/tensorflow/lib/python3.11/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /home/kaiki/anaconda3/envs/tensorflow/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_rocm61.so)
Traceback (most recent call last):
  File "/home/kaiki/anaconda3/envs/tensorflow/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 126, in <module>
    lib = get_native_library()
          ^^^^^^^^^^^^^^^^^^^^
  File "/home/kaiki/anaconda3/envs/tensorflow/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 104, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kaiki/anaconda3/envs/tensorflow/lib/python3.11/ctypes/__init__.py", line 454, in LoadLibrary
    return self._dlltype(name)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/kaiki/anaconda3/envs/tensorflow/lib/python3.

g++ (GCC) 14.2.1 20240910
Copyright (C) 2024 Free Software Foundation, Inc.
Esto es software libre; vea el código para las condiciones de copia.  NO hay
garantía; ni siquiera para MERCANTIBILIDAD o IDONEIDAD PARA UN PROPÓSITO EN
PARTICULAR

trainable params: 1,376,256 || all params: 78,337,408 || trainable%: 1.7568


In [20]:
lr = 1e-3
batch_size = 4
num_epochs = 20

In [21]:
training_args = TrainingArguments(
    output_dir= "../models/" + model_name + "-lora-data-extraction",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    bf16=True
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_dataset["train"],
    eval_dataset=tokenizer_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
)

In [23]:
trainer.train()

  0%|          | 0/440 [00:00<?, ?it/s]

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [40]:
tokenizer_dataset["train"]["input_ids"][0]

[3,
 5211,
 6392,
 10,
 3,
 3651,
 4327,
 4906,
 22772,
 15,
 6348,
 2294,
 115,
 2469,
 4060,
 591,
 26,
 357,
 15,
 927,
 115,
 519,
 4225,
 115,
 3647,
 3647,
 15,
 2773,
 89,
 75,
 89,
 519,
 3341,
 15,
 75,
 2294,
 4198,
 15,
 357,
 15,
 4448,
 3264,
 115,
 89,
 1298,
 9,
 89,
 536,
 15442,
 9,
 5865,
 26,
 9,
 15,
 3539,
 4122,
 9,
 9,
 75,
 4608,
 89,
 591,
 3951,
 3628,
 26,
 2128,
 2128,
 3710,
 519,
 26,
 3710,
 377,
 16375,
 3597,
 11151,
 283,
 5905,
 10781,
 667,
 180,
 5,
 188,
 5,
 134,
 5,
 445,
 3177,
 5,
 3,
 3914,
 23758,
 4853,
 5,
 4440,
 20445,
 71,
 1744,
 9,
 2138,
 138,
 9,
 10,
 480,
 448,
 943,
 205,
 335,
 180,
 5905,
 898,
 536,
 3,
 18,
 10285,
 5,
 1844,
 3,
 632,
 2560,
 2773,
 17402,
 32,
 10,
 480,
 448,
 9065,
 12210,
 6897,
 3,
 18,
 10285,
 5,
 1844,
 3,
 632,
 2560,
 1714,
 10636,
 154,
 89,
 106,
 32,
 3,
 2,
 2532,
 32,
 10,
 41,
 3436,
 7256,
 1640,
 591,
 1630,
 3,
 4508,
 3,
 18,
 3147,
 668,
 24636,
 23360,
 8067,
 3820,
 2,
 29,
 3,
 18,
 18