In [None]:
# Instale as bibliotecas com o requirements.txt ------------------------------------
import pandas as pd
import numpy as np
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, BertForSequenceClassification
import torch
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay
from collections import Counter

In [None]:
# Execute esse bloco apenas se estiver usando o Google Colab para execução --------------------------
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate with the Hugging Face Hub (needed for gated models such as Llama and Gemma) ---
# The token is taken from the HF_TOKEN environment variable, falling back to a hidden
# interactive prompt, so the secret is never stored in the notebook source.
import os
from getpass import getpass

from huggingface_hub import login

login(token=os.environ.get('HF_TOKEN') or getpass('Hugging Face token: '))

In [None]:
# Load the dataset ----------------------------------
# `pasta` is the folder (on the mounted Google Drive) that holds the parquet file.
pasta = '/content/drive/My Drive/Mestrado/Public_Contracts/'

df = pd.read_parquet(f'{pasta}Notas_Fiscais_Itens_2023.parquet')
df.head()

Unnamed: 0,natureza_operacao,data_emissao,cnpj_cpf_emitente,razao_social_emitente,uf_emitente,mun_emitente,nome_destinatario,descricao_produto,ncm_produto,quantidade,valor_unitario,valor_total,classe,text
0,Outra saida merc./prest.serv. nao especif.,2023-01-01,7432517001847,SIMPRESS COMERCIO LOCACAO E SERVICOS LTDA,SC,ITAJAI,MINISTERIO DA JUSTICA E SEGURANCA PUBLICA,MLTD201LXAZ CARTUCHO DE TONER PRETO 20K PAGINAS,Cartuchos de revelador (toners),1.0,169.58,169.58,0,[CLS] Natureza da operacao: Outra saida merc./...
1,Remessa de bem p/conta contrato de comodato ou...,2023-01-01,7432517001847,SIMPRESS COMERCIO LOCACAO E SERVICOS LTDA,SC,ITAJAI,UNIVERSIDADE FEDERAL RURAL DO RIO DE JANEIRO,HP GABINETE METALICO,Outras partes e acessórios para aparelhos de f...,1.0,560.88,560.88,0,[CLS] Natureza da operacao: Remessa de bem p/c...
2,VENDA DE MARCADORIA ADQUIRIDA DE TERCEIROS,2023-01-01,34849096000189,"SEEK COMERCIO DE LIVROS, JORNAIS E REVISTAS LTDA",RJ,RIO DE JANEIRO,UNIVERSIDADE FEDERAL DO DELTA DO PARNAIBA UFDPAR,METODOS QUANTITATIVOS APLICADOS A CONTABILIDADE,"Outros livros, brochuras e impressos semelhantes",12.0,82.6,991.2,0,[CLS] Natureza da operacao: VENDA DE MARCADORI...
3,VENDA DE MARCADORIA ADQUIRIDA DE TERCEIROS,2023-01-01,34849096000189,"SEEK COMERCIO DE LIVROS, JORNAIS E REVISTAS LTDA",RJ,RIO DE JANEIRO,UNIVERSIDADE FEDERAL DO DELTA DO PARNAIBA UFDPAR,GESTAO ESTRATEGICA DE ARMAZENAMENTO,"Outros livros, brochuras e impressos semelhantes",12.0,78.4,940.8,0,[CLS] Natureza da operacao: VENDA DE MARCADORI...
4,VENDA DE MARCADORIA ADQUIRIDA DE TERCEIROS,2023-01-01,34849096000189,"SEEK COMERCIO DE LIVROS, JORNAIS E REVISTAS LTDA",RJ,RIO DE JANEIRO,UNIVERSIDADE FEDERAL DO DELTA DO PARNAIBA UFDPAR,RELACOES INTERNACIONAIS DA ASIA E DA AFRICA,"Outros livros, brochuras e impressos semelhantes",12.0,88.2,1058.4,0,[CLS] Natureza da operacao: VENDA DE MARCADORI...


In [None]:
# Drop rows containing NA's, then keep only the model input and target ---------------------------------
# (`classe` is renamed to `label`; the NA filter runs over every original column.)
df = (
    df.dropna()
      .reset_index(drop=True)
      .loc[:, ['text', 'classe']]
      .rename(columns={'classe': 'label'})
)
len(df)

5980558

In [None]:
# Row count per label in `df`.
# NOTE(review): the printed header says "Dados de treino" (training data), but at this
# point `df` has not been sampled or split yet — confirm the wording.
print('Dados de treino:\n',df.groupby('label', as_index = False).size())

Dados de treino:
    label     size
0      0  5731120
1      1   249438


In [None]:
# Build a balanced dataset with N randomly selected rows from each class ----------------------
samples_per_class = 20000

# Count the rows available in each class before sampling.
count_label_0 = df[df['label'] == 0].shape[0]
count_label_1 = df[df['label'] == 1].shape[0]

# Fail fast: only printing a message here would leave `df_amostra` undefined (or stale
# from an earlier run), so the following cells would crash or silently reuse old data.
if min(count_label_0, count_label_1) < samples_per_class:
    raise ValueError("Não há exemplos suficientes em uma das classes para realizar a amostragem desejada.")

# A fixed random_state keeps both draws reproducible.
df_label_0 = df[df['label'] == 0].sample(n=samples_per_class, random_state=42)
df_label_1 = df[df['label'] == 1].sample(n=samples_per_class, random_state=42)

# Stack the two class samples into a single frame with a fresh 0..N-1 index.
df_amostra = pd.concat([df_label_0, df_label_1]).reset_index(drop=True)

In [None]:
# Row count per label in the sampled frame `df_amostra`.
print('Dados de treino:\n',df_amostra.groupby('label', as_index = False).size())

Dados de treino:
    label   size
0      0  20000
1      1  20000


In [None]:
# Convert the dataframe to the Hugging Face Datasets format ---------------
dataset = Dataset.from_pandas(df_amostra)

# Stratified splitting needs a ClassLabel column, so encode `label` first.
dataset = dataset.class_encode_column("label")

# Split into train and test (70/30), stratified by label so both splits keep the
# same class proportions as the sampled data ---------------
dataset = dataset.train_test_split(test_size = 0.3, seed = 42, stratify_by_column='label')

In [None]:
# Check how many 0 and 1 labels ended up in the train and test splits ----------------------------
for titulo, nome_split in (("Treino: ", 'train'), ("Teste: ", 'test')):
    print(titulo, Counter(dataset[nome_split]['label']))

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU -------------------------
# (to force CPU execution, assign device = 'cpu' instead)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Usando o dispositivo: {device}')

Usando o dispositivo: cuda


In [None]:
# Models tested (uncomment exactly one) -----------------------------------
#model_name = 'meta-llama/Llama-2-7b-hf'
#model_name = 'meta-llama/Llama-3.2-3B'
#model_name = 'google/gemma-2-2b'
#model_name = 'openai-community/roberta-large-openai-detector'
#model_name = 'openai-community/gpt2'
#model_name = 'openai-community/gpt2-large'
#model_name = 'neuralmind/bert-base-portuguese-cased'
#model_name = 'neuralmind/bert-large-portuguese-cased'
#model_name = 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B'
model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'


tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fall back to EOS only when the tokenizer has no padding token of its own.
# Assigning unconditionally would replace an existing pad token, and would set it
# to None for a tokenizer that has no EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pads each batch dynamically to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# 8-bit loading configuration.
# NOTE(review): this object is not passed to `from_pretrained` in the model-loading cell,
# so as written it has no effect — confirm whether 8-bit loading was intended.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load the model with a 2-class classification head and move it to `device` ---------------------------------
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Decoder-style classification heads locate the last non-padding token through
# config.pad_token_id; mirror the tokenizer's value so padded batches
# (batch size > 1) can be handled.
if tokenizer.pad_token_id is not None:
    model.config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA settings for sequence classification ---------------------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.SEQ_CLS,
)

# Wrap the base model with the LoRA adapters ---------------------------------
model = get_peft_model(model=model, peft_config=lora_config)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` entry of ``examples`` without truncation.

    Args:
        examples: mapping with a ``'text'`` key holding the text(s) to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    textos = examples['text']
    # Truncation is left off on purpose; switch to truncation=True with a
    # max_length (e.g. 512) only if tokenization uses too many resources.
    encoded = tokenizer(textos, truncation=False)
    return encoded

# Tokenize both splits in batches, then rename the target column to "labels".
tokenized_datasets = (
    dataset
    .map(preprocess_function, batched=True)
    .rename_column("label", "labels")
)

Map:   0%|          | 0/28000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [None]:
# Display the splits with their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 28000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 12000
    })
})

In [None]:
# Load the evaluation metrics -----------------------------
# NOTE(review): `metric` is not referenced by compute_metrics or by any other visible
# cell (scikit-learn is used instead) — confirm whether this load is still needed.
metric = evaluate.load('confusion_matrix')

def compute_metrics(eval_pred):
    """Compute binary-classification metrics from two-column logits.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is indexed as
            ``[:, 0]`` / ``[:, 1]`` (one column per class); ``labels`` holds the
            true class ids.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.
    """
    logits, labels = eval_pred

    # Defensive: if the predictions arrive as a tuple, the first item is used as the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    predictions = np.argmax(logits, axis=-1)

    # AUC must rank examples by the score the classifier actually decides on.
    # The class-1 logit alone ignores the class-0 logit; the margin (l1 - l0) is
    # monotonic with the softmax probability of class 1, so it ranks correctly.
    positive_score = logits[:, 1] - logits[:, 0]

    # zero_division=0 keeps the same 0.0 result sklearn falls back to when a
    # class is never predicted, without emitting a warning on every evaluation.
    return {'accuracy': accuracy_score(labels, predictions),
            'precision': precision_score(labels, predictions, zero_division=0),
            'recall': recall_score(labels, predictions, zero_division=0),
            'f1': f1_score(labels, predictions, zero_division=0),
            'auc': roc_auc_score(labels, positive_score)}

Downloading builder script:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

In [None]:
# Training arguments -----------------------------
training_args = TrainingArguments(
    output_dir="model_trained",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Precision / device (bf16=True is the alternative that was left disabled)
    fp16=False,
    no_cuda=False,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Build the Trainer -----------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)


In [None]:
# Clear any cached Torch allocations to free up GPU memory ------------------
torch.cuda.empty_cache()

In [None]:
# Usar só quando for com BERT ---------------------------------
#if tokenizer.pad_token is None:
#    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#    model.resize_token_embeddings(len(tokenizer))

In [None]:
# Start training ------------------------------
# Note: if the code asks for it, create an account at https://wandb.ai/site/, generate an API key and paste it in.
# Wandb is used to store the training metrics.

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcleitonotavio058[0m ([33monlyme058[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.8662,0.728157,0.871667,0.834888,0.929903,0.879838,0.940236
2,0.4221,0.486926,0.911083,0.9046,0.921161,0.912805,0.964368
3,0.43,0.473774,0.922917,0.907778,0.943262,0.92518,0.97072
4,0.1167,0.480658,0.930833,0.928303,0.935346,0.931811,0.973339
5,0.0267,0.456003,0.933667,0.932359,0.936665,0.934507,0.974594


TrainOutput(global_step=140000, training_loss=0.3851294105444636, metrics={'train_runtime': 59565.2422, 'train_samples_per_second': 2.35, 'train_steps_per_second': 2.35, 'total_flos': 9.706819002330778e+17, 'train_loss': 0.3851294105444636, 'epoch': 5.0})

In [None]:
# Clear any cached Torch allocations to free up GPU memory ------------------
torch.cuda.empty_cache()

In [None]:
# Evaluate the model ------------------------------------------
# Runs the Trainer's evaluation and displays the returned metrics.
evaluation = trainer.evaluate()
evaluation

{'eval_loss': 0.4560030996799469,
 'eval_accuracy': 0.9336666666666666,
 'eval_precision': 0.9323592185191266,
 'eval_recall': 0.9366650173181593,
 'eval_f1': 0.9345071581372387,
 'eval_auc': 0.9745940045445566,
 'eval_runtime': 2052.028,
 'eval_samples_per_second': 5.848,
 'eval_steps_per_second': 5.848,
 'epoch': 5.0}

In [None]:
# Build the confusion matrix -----------------------------------
predictions = trainer.predict(tokenized_datasets['test'])
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=-1)

# Pin the label order so the matrix is always 2x2, laid out as
# [[TN, FP],
#  [FN, TP]]
# even if one class happens to be missing from the predictions.
cm = confusion_matrix(labels, preds, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])

print(cm)

# Render the chart; printing the display object only shows its repr.
disp.plot();

[[5525  412]
 [ 384 5679]]
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay object at 0x7eb431999190>


In [None]:
# Cost matrix: here a false negative costs five times a false positive -----------------------------------
cost_FP, cost_FN = 1, 5  # false positive / false negative
cost_TP, cost_TN = 0, 0  # true positive / true negative

# Flatten the confusion matrix in row order into its four cells.
tn, fp, fn, tp = cm.ravel()

# Total cost: each outcome count weighted by its cost ------------------------------------
weighted_terms = [fp * cost_FP, fn * cost_FN, tp * cost_TP, tn * cost_TN]
total_cost = sum(weighted_terms)
print(f'Custo Total: {total_cost}')

Custo Total: 2332


In [None]:
# Cost matrix: here a false positive costs five times a false negative -----------------------------------
cost_FP = 5  # false positive
cost_FN = 1  # false negative
cost_TP = 0  # true positive
cost_TN = 0  # true negative

# Flatten the confusion matrix in row order into its four cells.
tn, fp, fn, tp = cm.ravel()

# Total cost: pair every outcome count with its cost and add the products ------------------------------------
outcome_counts = (fp, fn, tp, tn)
outcome_costs = (cost_FP, cost_FN, cost_TP, cost_TN)
total_cost = sum(qtd * custo for qtd, custo in zip(outcome_counts, outcome_costs))
print(f'Custo Total: {total_cost}')

Custo Total: 2444
