In [None]:

# Optional and currently disabled: install pinned torch/torchdata versions.
# %pip install --disable-pip-version-check \
#     torch==1.13.1 \
#     torchdata==0.5.1 --quiet

# Install the third-party packages for this notebook into the kernel's
# environment. No versions are pinned, so a fresh run installs whatever is
# current at that time.
# NOTE(review): loralib is not imported in the visible cells — confirm it is
# still required.
%pip install \
    transformers\
    datasets\
    evaluate\
    rouge_score\
    loralib\
    peft --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import torch
import evaluate
import pandas as pd
import numpy as np
from datasets import load_dataset
from peft import AutoPeftModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

In [None]:
# Sanity check: display whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [None]:
# Target device for both models and all input tensors: CUDA when PyTorch
# reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:

# Hub identifier of the dataset to evaluate on.
huggingface_dataset_name = "knkarthick/dialogsum"

# Fetch every available split and display the resulting DatasetDict.
dataset = load_dataset(path=huggingface_dataset_name)
dataset

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [None]:
# Checkpoint used as the un-tuned baseline.
model_name = 'google/flan-t5-base'

# Tokenizer for the baseline checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the baseline weights in bfloat16, then move the model to DEVICE.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
original_model = original_model.to(DEVICE)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Load the fine-tuned PEFT checkpoint in bfloat16 with is_trainable=False
# (evaluation only), then move it to DEVICE.
peft_model = AutoPeftModelForSeq2SeqLM.from_pretrained(
    'BhushanM25/fine-tuned-flan-t5-base-v4',
    is_trainable=False,
    torch_dtype=torch.bfloat16,
)
peft_model = peft_model.to(DEVICE)

adapter_config.json:   0%|          | 0.00/334 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

In [None]:
# Inputs and reference summaries come from the test split.
dialogues, human_baseline_summaries = (
    dataset['test']['dialogue'],
    dataset['test']['summary'],
)

# Accumulators for the text generated by each model.
original_model_summaries, peft_model_summaries = [], []

In [None]:
# Number of test dialogues that the generation loop below will iterate over.
len(dialogues)

1500

In [None]:
# Generate a summary for every test dialogue with both the base model and the
# PEFT-adapted model, collecting the decoded text of each.

# Start from empty lists so that re-running this cell does not append a second
# copy of every summary to results left over from a previous run.
original_model_summaries = []
peft_model_summaries = []

# Loop-invariant: both models use the same generation settings for every
# prompt, so the config is built once instead of twice per dialogue.
generation_config = GenerationConfig(max_new_tokens=200)

# NOTE(review): the tokenizer warns that some prompts exceed its 512-token
# maximum. Inputs are intentionally left untruncated here so results are
# unchanged — confirm that is the desired behaviour.
for dialogue in dialogues:
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    # Tokenize a single prompt and place the ids on the same device as the models.
    input_ids = tokenizer(prompt, return_tensors="pt").to(DEVICE).input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=generation_config)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)


Token indices sequence length is longer than the specified maximum sequence length for this model (1028 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Pair each human reference with the two model outputs, row by row.
zipped_summaries = [
    *zip(human_baseline_summaries, original_model_summaries, peft_model_summaries)
]

# One row per dialogue, one column per summary source.
df = pd.DataFrame(
    data=zipped_summaries,
    columns=[
        'human_baseline_summaries',
        'original_model_summaries',
        'peft_model_summaries',
    ],
)
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: I need to take a dictation for you.,Ms. Dawson asks Ms. Dawson to take a dictation...
1,In order to prevent employees from wasting tim...,#Person1#: I need to take a dictation for you.,Ms. Dawson asks Ms. Dawson to take a dictation...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1#: I need to take a dictation for you.,Ms. Dawson asks Ms. Dawson to take a dictation...
3,#Person2# arrives late because of traffic jam....,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and got stuck i...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and got stuck i...
...,...,...,...
1495,Matthew and Steve meet after a long time. Stev...,#Person1#: Hi! #Person2#: Hi! How's your life?...,Matthew hasn't seen Steve in a year. He's been...
1496,Steve has been looking for a place to live. Ma...,#Person1#: Hi! #Person2#: Hi! How's your life?...,Matthew hasn't seen Steve in a year. He's been...
1497,Frank invites Besty to the party to celebrate ...,Person1 is going to throw a party for all of h...,Frank wants to throw a party for all of his fr...
1498,Frank invites Betsy to the big promotion party...,Person1 is going to throw a party for all of h...,Frank wants to throw a party for all of his fr...


In [None]:
# Persist the generated summaries. The file name matches the v4 adapter loaded
# above and the file read back by the next cell, so a fresh top-to-bottom run
# evaluates what was just generated. index=False avoids writing the row index
# as an extra "Unnamed: 0" column.
df.to_csv('evaluation_v4.csv', index=False)

In [None]:
# Reload the saved summaries from disk and preview the first five rows.
results = pd.read_csv(filepath_or_buffer='evaluation_v4.csv')
results.head(5)

Unnamed: 0.1,Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: I need to take a dictation for you.,Ms. Dawson asks Ms. Dawson to take a dictation...
1,1,In order to prevent employees from wasting tim...,#Person1#: I need to take a dictation for you.,Ms. Dawson asks Ms. Dawson to take a dictation...
2,2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1#: I need to take a dictation for you.,Ms. Dawson asks Ms. Dawson to take a dictation...
3,3,#Person2# arrives late because of traffic jam....,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and got stuck i...
4,4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam at the Carrefour intersection ...,#Person2# got stuck in traffic and got stuck i...


In [None]:
# Replace the in-memory summaries with the columns loaded from the CSV.
human_baseline_summaries, original_model_summaries, peft_model_summaries = (
    results['human_baseline_summaries'].values,
    results['original_model_summaries'].values,
    results['peft_model_summaries'].values,
)

In [None]:
# v3
# Score the baseline and PEFT summaries against the human references with ROUGE.
# NOTE(review): this cell scores whatever is currently held in
# `original_model_summaries` / `peft_model_summaries`; the "V3" label is only
# accurate if the v3 results were the ones loaded — confirm.
rouge = evaluate.load('rouge')

# References are sliced to the number of predictions being scored.
original_model_results = rouge.compute(
    use_stemmer=True,
    use_aggregator=True,
    references=human_baseline_summaries[:len(original_model_summaries)],
    predictions=original_model_summaries,
)

peft_model_v3_results = rouge.compute(
    use_stemmer=True,
    use_aggregator=True,
    references=human_baseline_summaries[:len(peft_model_summaries)],
    predictions=peft_model_summaries,
)

print('ORIGINAL MODEL:', original_model_results, sep='\n')
print('PEFT MODEL V3:', peft_model_v3_results, sep='\n')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ORIGINAL MODEL:
{'rouge1': 0.2334834589355092, 'rouge2': 0.07174289413285206, 'rougeL': 0.20204761402307664, 'rougeLsum': 0.2019997371215526}
PEFT MODEL V3:
{'rouge1': 0.39993410728708917, 'rouge2': 0.1525889162856004, 'rougeL': 0.3174106368884103, 'rougeLsum': 0.3174970640088307}


In [None]:
print("Absolute percentage improvement of PEFT MODEL v3 over ORIGINAL MODEL")

# Element-wise PEFT-minus-baseline difference; values are paired by position,
# so both result dicts must list their metrics in the same order.
improvement = np.subtract(
    list(peft_model_v3_results.values()),
    list(original_model_results.values()),
)
for metric, delta in zip(peft_model_v3_results, improvement):
    print(f'{metric}: {delta*100:.2f}%')


Absolute percentage improvement of PEFT MODEL v3 over ORIGINAL MODEL
rouge1: 16.65%
rouge2: 8.08%
rougeL: 11.54%
rougeLsum: 11.55%


In [None]:
# v4
# Score the baseline and PEFT summaries against the human references with ROUGE.
rouge = evaluate.load('rouge')

# References are sliced to the number of predictions being scored.
original_model_results = rouge.compute(
    use_stemmer=True,
    use_aggregator=True,
    references=human_baseline_summaries[:len(original_model_summaries)],
    predictions=original_model_summaries,
)

peft_model_results = rouge.compute(
    use_stemmer=True,
    use_aggregator=True,
    references=human_baseline_summaries[:len(peft_model_summaries)],
    predictions=peft_model_summaries,
)

print('ORIGINAL MODEL:', original_model_results, sep='\n')
print('PEFT MODEL:', peft_model_results, sep='\n')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ORIGINAL MODEL:
{'rouge1': 0.23368358348378998, 'rouge2': 0.07168063413685666, 'rougeL': 0.20190610611415338, 'rougeLsum': 0.20210961728543164}
PEFT MODEL:
{'rouge1': 0.41366276517971545, 'rouge2': 0.16836766346564247, 'rougeL': 0.3315246111016755, 'rougeLsum': 0.331723364728825}


In [None]:
print("Absolute percentage improvement of PEFT MODEL v4 over ORIGINAL MODEL")

# Element-wise PEFT-minus-baseline difference; values are paired by position,
# so both result dicts must list their metrics in the same order.
improvement = np.subtract(
    list(peft_model_results.values()),
    list(original_model_results.values()),
)
for metric, delta in zip(peft_model_results, improvement):
    print(f'{metric}: {delta*100:.2f}%')


Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: 18.00%
rouge2: 9.67%
rougeL: 12.96%
rougeLsum: 12.96%
