### Teste Transformers

In [6]:
import json
from transformers import pipeline
import os

In [15]:
# Project layout, resolved relative to the directory the kernel runs in:
# the data folder sits next to (not inside) the working directory.
BASE_DIR = os.getcwd()
DIR_PAI = os.path.dirname(BASE_DIR)
DIR_DATA = os.path.join(DIR_PAI, "data")

# Every stage of the workflow gets its own sub-folder under DIR_DATA.
(
    DIR_DATA_RAW,
    DIR_DATA_PROCESSED,
    DIR_DATA_OUTPUTS,
    DIR_DATA_OUTPUTS_TRANSFORMERS,
    DIR_DATA_PROCESSED_CLEAN,
) = (
    os.path.join(DIR_DATA, subfolder)
    for subfolder in (
        "raw",
        "processed",
        "outputs",
        "outputs_transformers",
        "processed_clean",
    )
)

In [10]:
def load_transformer_pipeline(task, model_name):
    """
    Build a HuggingFace pipeline for the requested task.

    Args:
        task (str): Task identifier passed as the first argument to
            ``pipeline`` (e.g. "summarization", "question-answering").
        model_name (str): Name or path of the pretrained model, forwarded
            to ``pipeline`` as its ``model`` keyword.

    Returns:
        pipeline: The object returned by ``pipeline(task, model=model_name)``.
    """
    configured_pipeline = pipeline(task, model=model_name)
    return configured_pipeline

In [11]:
def process_summarization(texts, transformer_pipeline, max_length=200, min_length=30):
    """
    Summarize every text in ``texts`` with the given pipeline.

    Args:
        texts (list): Texts to summarize.
        transformer_pipeline (pipeline): Callable invoked as
            ``transformer_pipeline(text, truncation=True, max_length=...,
            min_length=...)``. Its result is expected to be a sequence whose
            first item is a mapping with a ``"summary_text"`` key.
        max_length (int): Forwarded to the pipeline as ``max_length``.
            Defaults to 200, the value that used to be hard-coded. The
            pipeline emits a warning when this exceeds the input length, so
            callers with short texts may want to pass a smaller value.
        min_length (int): Forwarded to the pipeline as ``min_length``.
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        list: One summary string per input, in input order. Inputs that are
        ``None`` or blank, and inputs for which the pipeline fails, yield
        an empty string.
    """
    summaries = []
    for text in texts:
        # There is nothing to summarize in a missing or whitespace-only text,
        # so skip the model call instead of asking it to generate from nothing.
        if text is None or (isinstance(text, str) and not text.strip()):
            summaries.append("")
            continue

        try:
            output = transformer_pipeline(
                text,
                truncation=True,
                max_length=max_length,
                min_length=min_length,
            )
            summaries.append(output[0]["summary_text"])
        except Exception as e:
            # Deliberately best-effort: one failing text must not abort the
            # batch, and the result list has to stay aligned with ``texts``.
            print(f"Erro ao processar texto para sumarização: {e}")
            summaries.append("")
    return summaries

In [12]:
def process_question_answering(texts, transformer_pipeline, question):
    """
    Ask the same question against every text and collect the answers.

    Args:
        texts (list): Texts used, one at a time, as the QA context.
        transformer_pipeline (pipeline): Callable invoked as
            ``transformer_pipeline(question=..., context=...)``; the
            ``"answer"`` item of its result is kept.
        question (str): Question asked for each text.

    Returns:
        list: One answer per input text, in input order. A text whose
        processing raises contributes an empty string.
    """

    def _answer_for(context):
        # Failures are reported and replaced by "" so the output stays
        # aligned with the input list.
        try:
            return transformer_pipeline(question=question, context=context)["answer"]
        except Exception as err:
            print(f"Erro ao processar texto para QA: {err}")
            return ""

    return [_answer_for(context) for context in texts]

In [13]:
def process_json_files(input_dir, output_dir, transformer_pipeline, task, question=None):
    """
    Run a Transformer task over the texts stored in a folder of JSON files.

    Each ``*.json`` file in ``input_dir`` is loaded, the ``"text"`` value of
    every entry is processed, the result is stored back on the entry under
    the key ``task``, and the enriched data is written to a file of the same
    name in ``output_dir``.

    Args:
        input_dir (str): Folder containing the input JSON files.
        output_dir (str): Folder where results are written (created if
            missing).
        transformer_pipeline (pipeline): Configured HuggingFace pipeline,
            passed through to the task-specific processing function.
        task (str): "summarization" or "question-answering".
        question (str): Question for QA; required when
            ``task="question-answering"``.

    Raises:
        ValueError: If ``task`` is not supported, or if QA is requested
            without a question. Raised before any file is read or written.
    """
    # Validate the request up front so a bad call fails immediately instead
    # of after some files have already been processed.
    if task == "summarization":
        def run_task(texts):
            return process_summarization(texts, transformer_pipeline)
    elif task == "question-answering":
        if not question:
            raise ValueError("Uma pergunta deve ser fornecida para QA.")

        def run_task(texts):
            return process_question_answering(texts, transformer_pipeline, question)
    else:
        raise ValueError(f"Tarefa '{task}' não suportada.")

    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns names in arbitrary order; sort for reproducible runs.
    for json_file in sorted(os.listdir(input_dir)):
        input_path = os.path.join(input_dir, json_file)
        # Only regular files with a .json suffix are inputs; a directory that
        # happens to be named "*.json" would otherwise crash open().
        if not json_file.endswith(".json") or not os.path.isfile(input_path):
            continue
        output_path = os.path.join(output_dir, json_file)

        with open(input_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # An entry without a "text" key is treated as empty text so a single
        # malformed record does not abort the whole batch.
        texts = [entry.get("text", "") for entry in data]
        results = run_task(texts)

        # Attach each result to its originating entry under the task name.
        for entry, result in zip(data, results):
            entry[task] = result

        # ensure_ascii=False keeps accented characters readable in the output.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        print(f"Processado e salvo: {output_path}")

In [None]:
if __name__ == "__main__":
    # What to run: the task and the checkpoint that performs it.
    task = "summarization"  # Either "summarization" or "question-answering"
    model_name = "google/flan-t5-small"  # Transformer model to load
    question = "Qual é a informação mais importante deste texto?"  # Required for QA

    # Where to read from and write to.
    input_dir = DIR_DATA_PROCESSED_CLEAN  # Folder with the extracted JSON files
    output_dir = DIR_DATA_OUTPUTS_TRANSFORMERS  # Folder that receives the results

    # Build the pipeline once and reuse it for every file.
    print(f"Carregando modelo {model_name} para a tarefa '{task}'...")
    transformer_pipeline = load_transformer_pipeline(task=task, model_name=model_name)

    # Apply the task to every JSON file in the input folder.
    process_json_files(
        input_dir=input_dir,
        output_dir=output_dir,
        transformer_pipeline=transformer_pipeline,
        task=task,
        question=question,
    )

Carregando modelo google/flan-t5-small para a tarefa 'summarization'...


Your max_length is set to 200, but your input_length is only 129. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 200, but your input_length is only 183. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=91)
Your max_length is set to 200, but your input_length is only 126. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)


Processado e salvo: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\outputs_transformers\fluidos_11484.json


Your max_length is set to 200, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 200, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Y

Processado e salvo: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\outputs_transformers\fluidos_11640.json


Your max_length is set to 200, but your input_length is only 148. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=74)
Your max_length is set to 200, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 200, but your input_length is only 130. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)
Your max_length is set to 200, but your input_length is only 128. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)


Processado e salvo: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\outputs_transformers\fluidos_13271.json


Your max_length is set to 200, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Your max_length is set to 200, but your input_length is only 135. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)
Your max_length is set to 200, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 200, but your input_length is only 174. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Y

Processado e salvo: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\outputs_transformers\fluidos_13417.json


Your max_length is set to 200, but your input_length is only 136. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 200, but your input_length is only 135. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)
Your max_length is set to 200, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 200, but your input_length is only 163. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=81)


Processado e salvo: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\outputs_transformers\fluidos_13472.json
Processado e salvo: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\outputs_transformers\fluidos_13572.json


Your max_length is set to 200, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


Processado e salvo: c:\Users\axel.chepanski\doutor-ia\1 - extract-pdfs-transformer\data\outputs_transformers\fluidos_13852.json


Your max_length is set to 200, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 200, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 200, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
Your max_length is set to 200, but your input_length is only 193. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
