In [1]:
from datasets import load_dataset
import torch
import evaluate
import time
import pandas as pd
import numpy as np

In [2]:
# Importações da biblioteca transformers para processamento de linguagem natural

# AutoModelForSeq2SeqLM:
# - Classe para modelos sequence-to-sequence (seq2seq)
# - Usado quando entrada e saída são textos (ex: tradução, resumo)
# - "Auto" carrega automaticamente a arquitetura correta do modelo
from transformers import AutoModelForSeq2SeqLM

# AutoTokenizer:
# - Converte texto em tokens (números) para processamento do modelo
# - Divide texto em unidades menores (tokenização)
# - Faz a conversão inversa: tokens -> texto
# - Carrega automaticamente o tokenizador específico do modelo
from transformers import AutoTokenizer

# GenerationConfig:
# - Configura os parâmetros de geração de texto do modelo
# - Controla:
#   * Comprimento máximo do texto gerado
#   * Temperatura da geração
#   * Estratégias de amostragem
#   * Outros parâmetros de geração
from transformers import GenerationConfig

# Trainer:
# - Gerencia todo o processo de treinamento
# - Funcionalidades:
#   * Executa loop de treinamento
#   * Otimização do modelo
#   * Salvamento de checkpoints
#   * Avaliação do modelo
#   * Sistema de logging
from transformers import Trainer

# TrainingArguments:
# - Define configurações do treinamento
# - Parâmetros incluem:
#   * Learning rate
#   * Número de épocas
#   * Tamanho do batch
#   * Diretório de saída
#   * Frequência de logging/salvamento
from transformers import TrainingArguments

# Importação combinada mais concisa:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer, 
    GenerationConfig,
    Trainer,
    TrainingArguments
)

#### 1.2 Load Dataset and LLM

In [3]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
model_name = "google/flan-t5-base"

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, 
                                                       torch_dtype=torch.bfloat16, # 16-bit precision for faster processing and less memory usage
                                                       )

tokenizer = AutoTokenizer.from_pretrained(model_name)

#Its possible to pull out the number of model parameters and the number of trainable parameters. The following function does this.

In [5]:
def print_number_of_trainable_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"Trainable model parameters: {trainable_model_params}\nAll model parameters: {all_model_params}\nPercentage of trainable parameters: {100 * (trainable_model_params / all_model_params):.2f}%"

print(print_number_of_trainable_parameters(original_model)) 


Trainable model parameters: 247577856
All model parameters: 247577856
Percentage of trainable parameters: 100.00%


### 1.3 Zero Shot Summary

In [6]:
from transformers.cache_utils import EncoderDecoderCache

index = 200

dialogue = dataset['test'][index]['dialogue'] # This is the dialogue from the dataset
summary = dataset['test'][index]['summary'] # This is the summary from the dataset

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

print(prompt) 

inputs = tokenizer(prompt, return_tensors='pt') # This is the input for the model

outputs_decoded = tokenizer.decode(
    original_model.generate(
        inputs['input_ids'],
        max_new_tokens=100,
        past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values) if 'past_key_values' in locals() else None
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))

print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{outputs_decoded}')


Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you conside

## 2. Fine-tuning the Model

2.1 Preprocessing the Dialogues-Summary Dataset



In [7]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary:' 
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids
    example['labels'] = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids
    
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])

In [8]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True) #

Check the shapes of all three parts of the dataset:

In [9]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)   

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


#### 2.2 Fine-Tune the Model with the Preprocessed Dataset

In [48]:
# Define o diretório de saída onde os arquivos gerados durante o treinamento serão salvos.
# O nome do diretório inclui um timestamp único gerado com base no horário atual, para evitar conflitos.

output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

# Cria um conjunto de argumentos de configuração para o treinamento do modelo, definindo parâmetros básicos.
training_args = TrainingArguments(
    output_dir=output_dir,  # Especifica o diretório de saída para salvar os checkpoints do modelo.
    learning_rate=1e-5,  # Define a taxa de aprendizado (learning rate), que controla o quão rápido o modelo ajusta os pesos.
    num_train_epochs=3,  # Define o número de épocas de treinamento (uma época = passar por todos os dados uma vez). Aqui está configurado para apenas 1 época para economizar recursos.
    weight_decay=0.01,  # Aplica um decaimento de peso (weight decay) para evitar overfitting, incentivando pesos menores no modelo.
    logging_steps=1,  # Registra métricas a cada 1 passo de treinamento para monitorar o progresso.
    max_steps=1,  # Limita o treinamento a apenas 1 passo total, usado para reduzir esforço computacional neste exemplo.
    report_to="none"  # Desabilita o uso de ferramentas de monitoramento como wandb (Weights and Biases).
)

# Instancia o objeto 'Trainer', que gerencia o processo de treinamento, avaliação e salvamento do modelo.
trainer = Trainer(
    model=original_model,  # Especifica o modelo que será treinado (neste caso, 'original_model').
    args=training_args,  # Passa os argumentos de treinamento configurados anteriormente.
    train_dataset=tokenized_datasets['train'],  # Especifica o conjunto de dados tokenizados para treinamento.
    eval_dataset=tokenized_datasets['validation']  # Especifica o conjunto de dados tokenizados para validação (avaliação).
)

In [49]:
trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

{'loss': 47.5, 'grad_norm': 398.0, 'learning_rate': 0.0, 'epoch': 0.06}
{'train_runtime': 487.434, 'train_samples_per_second': 0.016, 'train_steps_per_second': 0.002, 'train_loss': 47.5, 'epoch': 0.06}


TrainOutput(global_step=1, training_loss=47.5, metrics={'train_runtime': 487.434, 'train_samples_per_second': 0.016, 'train_steps_per_second': 0.002, 'total_flos': 5478058819584.0, 'train_loss': 47.5, 'epoch': 0.0625})

In [50]:
# Carregar o modelo do checkpoint local

# O caminho e a pasta que estamos falando correspondem ao diretório onde o modelo foi salvo no treinamento anterior.
# e.g: output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

# checkpoint_dir = "C:/Users/danrl/Documents/Projects/genAI_with_LLM_course/dialogue-summary-training-1735761052/checkpoint-1"

checkpoint_dir = "C:/Users/danrl/Documents/Projects/genAI_with_LLM_course/dialogue-summary-training-1735768418/checkpoint-1"


instruct_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint_dir,                # Caminho para o checkpoint
    torch_dtype=torch.bfloat16     # Tipo de dado para otimizar memória e cálculo
)

# Verificar se o modelo foi carregado corretamente
print("Modelo carregado com sucesso!")

Modelo carregado com sucesso!


### 2.3 Evaluate the Model Qualitatively (Human Evaluation)

In [52]:
index = 10

dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')


---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# attends Brian's birthday party. Brian thinks #Person1# looks great and charming.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
People at the party are all happy to see Brian.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1#: Happy birthday, Brian. #Person2#: I'm so happy you're having a good time. #Person1#: Thank you, I'm sure you're having a good time. #Person2#: Thank you, I'm sure you're having a good time. #Person1#: Thank you, I'm sure you're having a good time. #Person2#: Thank you, I'm sure you're having a good time. #Person1#: Thank you, I'm sure you're having a good time.


### 2.4 Evaluate the Model Quantitativaly (with ROUGE Metric)

In [30]:
%pip install rouge_score

Note: you may need to restart the kernel to use updated packages.


#### Exemplo básico com Rouge_Score

In [31]:
from rouge_score import rouge_scorer

# Textos de exemplo
referencia = "O cachorro brincava alegremente no parque."
gerado = "O cão estava brincando no parque."

# Criar o objeto ROUGE Scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calcular a pontuação
pontuacoes = scorer.score(referencia, gerado)

# Mostrar os resultados
for rouge, valores in pontuacoes.items():
    print(f"{rouge}:")
    print(f"  Precisão: {valores.precision:.4f}")
    print(f"  Recall: {valores.recall:.4f}")
    print(f"  F1: {valores.fmeasure:.4f}")

rouge1:
  Precisão: 0.4286
  Recall: 0.5000
  F1: 0.4615
rouge2:
  Precisão: 0.1667
  Recall: 0.2000
  F1: 0.1818
rougeL:
  Precisão: 0.4286
  Recall: 0.5000
  F1: 0.4615


In [32]:
rouge = evaluate.load('rouge')

In [43]:
import torch

# Verificar se CUDA está disponível
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando a GPU: {torch.cuda.is_available()}")

Usando a GPU: False


In [54]:
import torch
import pandas as pd

# Definir dispositivo para CPU
device = torch.device("cpu")

# Carregar os modelos e o tokenizer
original_model = original_model.to(device)
instruct_model = instruct_model.to(device)

dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

# Iterar sobre os diálogos
for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""
    
    # Preparar entradas e mover para a CPU
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

    # Gerar resumo com o modelo original
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    # Gerar resumo com o modelo de instrução
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

# Criar DataFrame com os resultados
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))
df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])

# Mostrar os primeiros 10 resultados
df.head(10)

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: I need to change my email and I nee...,#Person1#: I need to take a dictation for you.
1,In order to prevent employees from wasting tim...,Employees will be given a choice to choose the...,#Person1#: I need to take a dictation for you.
2,Ms. Dawson takes a dictation for #Person1# abo...,Employees are to be given notice of the new in...,#Person1#: I need to take a dictation for you.
3,#Person2# arrives late because of traffic jam....,Taking public transport to work is a good idea...,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,The person is a reporter.,The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,Person1: I'm a traffic jam victim.,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,People at a party are having a party.,"#Person1#: Happy birthday, Brian. #Person2#: I..."


#### Evaluate the model with Rouge

In [55]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.2482408546025567), 'rouge2': np.float64(0.09865625024171779), 'rougeL': np.float64(0.2130742256167788), 'rougeLsum': np.float64(0.21706858577071342)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.23884559093833285), 'rouge2': np.float64(0.11535720375106562), 'rougeL': np.float64(0.21714203657752046), 'rougeLsum': np.float64(0.2175800707655546)}


#### 3 - Perform Parameter Efficient Fine-Tunning (PEFT)

##### 3.1 Setup the PEFT/LoRA model for Fine-Tuning

In [56]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r = 32, #Rank
    lora_alpha= 32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM" #FLAN-T5    
)

Add LoRA adapter layers/parameters to the original LLM to be trained.

In [58]:
peft_model = get_peft_model(original_model,
                            lora_config)

print(print_number_of_trainable_parameters(peft_model))

Trainable model parameters: 3538944
All model parameters: 251116800
Percentage of trainable parameters: 1.41%


### Train PEFT Adapter

In [64]:
output_dir = f'./peft-dialogue-summary-training-str{int(time.time())}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, #Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1,
    report_to="none"  # Adicione esta linha para desabilitar o wandb    
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [65]:
peft_trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

{'loss': 48.0, 'grad_norm': 9.306382179260254, 'learning_rate': 0.0, 'epoch': 0.06}
{'train_runtime': 430.5198, 'train_samples_per_second': 0.019, 'train_steps_per_second': 0.002, 'train_loss': 48.0, 'epoch': 0.06}


TrainOutput(global_step=1, training_loss=48.0, metrics={'train_runtime': 430.5198, 'train_samples_per_second': 0.019, 'train_steps_per_second': 0.002, 'total_flos': 5565031907328.0, 'train_loss': 48.0, 'epoch': 0.0625})

In [67]:
peft_model_path = "C:/Users/danrl/Documents/Projects/genAI_with_LLM_course/peft-dialogue-summary-training-str1735770458/checkpoint-1"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('C:/Users/danrl/Documents/Projects/genAI_with_LLM_course/peft-dialogue-summary-training-str1735770458/checkpoint-1\\tokenizer_config.json',
 'C:/Users/danrl/Documents/Projects/genAI_with_LLM_course/peft-dialogue-summary-training-str1735770458/checkpoint-1\\special_tokens_map.json',
 'C:/Users/danrl/Documents/Projects/genAI_with_LLM_course/peft-dialogue-summary-training-str1735770458/checkpoint-1\\tokenizer.json')

In [69]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False) # We not interested in training the LoRA adapter layers.

In [70]:
print(print_number_of_trainable_parameters(peft_model))

Trainable model parameters: 0
All model parameters: 251116800
Percentage of trainable parameters: 0.00%


#### 3.3 Evaluate the Model Qualitatively (Human Evaluation)

In [71]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_outputs = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL:\n{peft_model_text_outputs}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# attends Brian's birthday party. Brian thinks #Person1# looks great and charming.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
You're welcome.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1#: I'm thinking of upgrading my computer.
---------------------------------------------------------------------------------------------------
PEFT MODEL:
#Person1#: I'm thinking of upgrading my computer.


#### 3.4 Evaluate the Model Quantitatively (with ROUGE Metric)

In [72]:
import torch
import pandas as pd

# Definir dispositivo para CPU
device = torch.device("cpu")

# Carregar os modelos e o tokenizer
original_model = original_model.to(device)
instruct_model = instruct_model.to(device)

dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

# Iterar sobre os diálogos
for _, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""
    
    # Preparar entradas e mover para a CPU
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

    # Gerar resumo com o modelo original
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    # Gerar resumo com o modelo de instrução
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)
    
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)
    

# Criar DataFrame com os resultados
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))
df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])

# Mostrar os primeiros 10 resultados
df.head(10)

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,The memo will be sent out to all employees by ...,#Person1#: I need to take a dictation for you.,The memo is to be distributed to all employees...
1,In order to prevent employees from wasting tim...,This memo is to be distributed to all employee...,#Person1#: I need to take a dictation for you.,The memo is to be distributed to all employees...
2,Ms. Dawson takes a dictation for #Person1# abo...,Employees will be required to keep their offic...,#Person1#: I need to take a dictation for you.,The memo is to be distributed to all employees...
3,#Person2# arrives late because of traffic jam....,#Person1#: I got stuck in traffic. #Person2: I...,The traffic jam at the Carrefour intersection ...,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,Person1 is looking at the public transport sys...,The traffic jam at the Carrefour intersection ...,The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,The traffic is congested and there's a traffic...,The traffic jam at the Carrefour intersection ...,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are having a separation for 2 m...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,"#Person1: Happy birthday, Brian. #Person2: Hap...","#Person1#: Happy birthday, Brian. #Person2#: I...",Brian's birthday is coming up.


In [73]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries, # Predictions
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.2719873403063058), 'rouge2': np.float64(0.10181289283673575), 'rougeL': np.float64(0.2225432852332807), 'rougeLsum': np.float64(0.22601143409616642)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.23884559093833285), 'rouge2': np.float64(0.11535720375106562), 'rougeL': np.float64(0.21714203657752046), 'rougeLsum': np.float64(0.2175800707655546)}
PEFT MODEL:
{'rouge1': np.float64(0.26109650997150996), 'rouge2': np.float64(0.11055072463768116), 'rougeL': np.float64(0.2302777777777778), 'rougeLsum': np.float64(0.2339245014245014)}
