In [1]:
# Model selection.
# TOKENIZER_NAME is the name passed to AutoTokenizer.from_pretrained further down.
# MODEL_NAME is a relative path derived from it; note that the model-loading cell
# assigns MODEL_NAME again before the model is actually loaded.
# Alternative tokenizer tried earlier: TOKENIZER_NAME = "gpt2-medium"
TOKENIZER_NAME = "openai-community/gpt2"
MODEL_NAME = "./" + TOKENIZER_NAME + "-fine-tuned-model"
# Alternative model path tried earlier: MODEL_NAME = "./checkpoing/checkpoint-300"

In [2]:
# Formatting libraries
import black
import jupyter_black

# Activate jupyter_black for this notebook with lab=True and a 170-character line length.
# NOTE(review): lab=True presumably selects the JupyterLab front end — confirm against the jupyter_black docs.
jupyter_black.load(
    lab=True,
    line_length=170,
)

## Load and prepare data

In [3]:
from datasets import load_dataset

In [4]:
# Load configuration "3.0.0" of the "cnn_dailymail" dataset (train / validation / test splits).

dataset = load_dataset("cnn_dailymail", "3.0.0")

In [5]:
# Tokenizer for TOKENIZER_NAME. It is used by create_prompt below to count the number of
# tokens in each prompt and each reference summary.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, padding_side="right")

In [6]:
# Show the splits, columns and row counts of the loaded dataset.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [7]:
# Prompt layout: an instruction line, the source text under "### Conversation:",
# and an empty "### Summary:" section at the end.
prompt_template = (
    "\n"
    "Summarize the following conversation.\n"
    "\n"
    "### Conversation:\n"
    "\n"
    "{dialogue}\n"
    "\n"
    "### Summary:\n"
    "\n"
)


def _count_tokens(text):
    """Return the number of tokens `tokenizer` yields for `text`, special tokens excluded."""
    return len(tokenizer.encode(text, add_special_tokens=False))


def create_prompt(data):
    """Build the prompt, target and token counts for one dataset example.

    Args:
        data: Mapping with an "article" entry (source text) and a
            "highlights" entry (reference summary).

    Returns:
        dict with:
            "input": `prompt_template` filled in with the article,
            "output": the reference summary, unchanged,
            "n_tokens_input" / "n_tokens_output": token counts of the two
            strings above, measured with the notebook-level `tokenizer`.
    """
    article = data["article"]
    reference = data["highlights"]
    filled_prompt = prompt_template.format(dialogue=article)

    # The summary is counted first, then the prompt; each is a separate encode call.
    output_len = _count_tokens(reference)
    input_len = _count_tokens(filled_prompt)

    return {
        "input": filled_prompt,
        "output": reference,
        "n_tokens_input": input_len,
        "n_tokens_output": output_len,
    }

In [8]:
# Apply create_prompt to every example of every split; this adds the "input", "output",
# "n_tokens_input" and "n_tokens_output" columns.
dataset = dataset.map(create_prompt)
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 11490
    })
})

In [9]:
# 0.66 quantile of the prompt length (n_tokens_input) in the training split
dataset["train"].to_pandas().n_tokens_input.quantile(0.66)

991.0

In [10]:
# 0.66 quantile of the summary length (n_tokens_output) in the training split
dataset["train"].to_pandas().n_tokens_output.quantile(0.66)

70.0

In [11]:
# Summary statistics of the token-count columns in the training split, before filtering.
dataset["train"].to_pandas().describe()

Unnamed: 0,n_tokens_input,n_tokens_output
count,287113.0,287113.0
mean,889.020476,65.692079
std,423.328305,27.458405
min,34.0,5.0
25%,576.0,49.0
50%,814.0,62.0
75%,1119.0,76.0
max,4696.0,2426.0


In [12]:
def _within_length_limits(example):
    """Return True when the prompt has fewer than 942 tokens and the summary fewer than 70."""
    if example["n_tokens_input"] >= 942:
        return False
    return example["n_tokens_output"] < 70


# Drop examples whose prompt or reference summary is too long, in every split.
# The same 942 / 70 values are passed as source_max_len / max_new_tokens in the evaluation call.
dataset = dataset.filter(_within_length_limits)

In [13]:
# Summary statistics of the token-count columns in the training split, after filtering.
dataset["train"].to_pandas().describe()

Unnamed: 0,n_tokens_input,n_tokens_output
count,128837.0,128837.0
mean,599.438182,50.412894
std,193.010698,11.966592
min,46.0,6.0
25%,451.0,43.0
50%,604.0,51.0
75%,757.0,60.0
max,941.0,69.0


In [14]:
# NOTE(review): this repeats the statement of the previous cell and shows the same table.
dataset["train"].to_pandas().describe()

Unnamed: 0,n_tokens_input,n_tokens_output
count,128837.0,128837.0
mean,599.438182,50.412894
std,193.010698,11.966592
min,46.0,6.0
25%,451.0,43.0
50%,604.0,51.0
75%,757.0,60.0
max,941.0,69.0


In [15]:
# Show the prompt of the first training example. Indexing the row first and the column
# second reads one example instead of loading the entire "input" column.
print(dataset["train"][0]["input"])


Summarize the following conversation.

### Conversation:

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his 

In [16]:
# Show the reference summary of the first training example. Indexing the row first and the
# column second reads one example instead of loading the entire "output" column.
print(dataset["train"][0]["output"])

Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


## Load fine-tuned model and prepare Tokenizer

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import LLMInference

In [18]:
# Reload the tokenizer for inference.
# `add_special_tokens` is an argument of the encode / __call__ API, not a loading option,
# so it is no longer passed to from_pretrained. Newer transformers releases may reject it
# there because the name collides with the tokenizer's add_special_tokens method.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Show which special tokens the tokenizer defines.
print(f"Special Tokens: \n{tokenizer.special_tokens_map}")

# No padding token is defined: reuse the end-of-sequence token so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

Special Tokens: 
{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [19]:
# Load the fine-tuned model from its local directory with device_map="cuda".
# The path uses forward slashes, which work on every OS. The earlier backslash literal
# relied on "\o" and "\g" being kept as-is despite being invalid escape sequences (a
# warning on recent Python versions) and only resolved as a path on Windows.
MODEL_NAME = "./openai-community/gpt2-fine-tuned-model"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cuda")
# Alternative tried earlier (base checkpoint instead of the fine-tuned one):
# model = AutoModelForCausalLM.from_pretrained("gpt2-medium", device_map="cuda")

In [20]:
from typing import Tuple, List, Union
from transformers import GenerationConfig
from tqdm.notebook import tqdm
import torch
from torch.nn.utils.rnn import pad_sequence
import os
import transformers
import pandas as pd

In [21]:
# Wrap the loaded model and tokenizer in the LLMInference helper imported from utils.
llm_inference = LLMInference(model, tokenizer)

In [22]:
from evaluate import load

# Load the BLEU metric
# NOTE(review): bleu_metric is not referenced by any other cell visible here — confirm it is still needed.
bleu_metric = load("bleu")

In [23]:
# Generate predictions for the test split and compute metrics through the LLMInference helper.
# source_max_len=942 and max_new_tokens=70 are the same limits used when filtering the dataset.
# NOTE(review): do_sample=False is passed together with temperature / top_p / top_k. If the
# helper forwards these to transformers' generate(), the sampling parameters presumably have
# no effect under greedy decoding — confirm against utils.LLMInference.
llm_inference.make_predictions_and_compute_metrics(
    dataset=dataset["test"],
    batch_size=15,
    source_max_len=942,
    padding_side="left",
    max_new_tokens=70,
    do_sample=False,
    temperature=0.2,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.0,
    # generation_config_kwargs={"num_beams": 5},
)

  0%|          | 0/315 [00:00<?, ?it/s]

Computing BLEU scores
Computing ROUGE scores


In [24]:
# Display the metrics table held by llm_inference (BLEU, n-gram precisions and ROUGE scores);
# presumably populated by the evaluation call above — verify in utils.LLMInference.
llm_inference.metrics

Unnamed: 0,bleu_score,1-gram precision,2-gram precision,3-gram precision,4-gram precision,rouge1,rouge2,rougeL,rougeLsum
0,0.109,0.314,0.13,0.073,0.047,0.346,0.149,0.252,0.323


In [25]:
# Load the predictions table from disk. A plain string is used because the former
# f-string contained no placeholders.
# NOTE(review): the file is presumably written by the evaluation call — confirm in utils.LLMInference.
predictions = pd.read_csv("./predictions_and_metric/predictions.csv")

In [26]:
# Index of the predictions row inspected in the following cells.
idx = 0

In [27]:
# Prompt ("input" column) of the selected row.
print(predictions.input[idx])


Summarize the following conversation.

### Conversation:

'Cool' credentials: This week Samantha Cameron revealed her love of alternative group Poliça . She rarely misses a chance to demonstrate her ‘cool’ credentials. And this week Samantha Cameron was at it again, revealing her love of alternative group Poliça. But their brand of psychedelic rock conjures up a world a far cry from her life in Downing Street and the Cotswolds. The American band – whose name roughly translates as ‘policy’ in Polish – are inspired by a radical feminist who described pregnancy as ‘barbaric’, and their songs feature violent imagery. The video for the first single on the group’s most recent album depicts androgynous-looking singer Channy Leaneagh subjecting her trussed-up alter ego to a violent assault. Blood spurts in all directions as she smashes her hands with a hammer, punches her in the face and finally waterboards her. Another unsettling song by the four-piece band from Minnesota, entitled Leading T

In [28]:
# Reference summary ("output" column) of the selected row.
print(predictions.output[idx])

Samantha Cameron recently spoke of her love of alternative group Poliça .
Rock band's latest album is inspired by feminist Shulamith Firestone .
Mrs Cameron even joined crowd at a recent gig in Shoreditch, East London .


In [29]:
# Model-generated summary ("prediction" column) of the selected row.
print(predictions.prediction[idx])

Samantha Cameron, 43, is from an aristocratic background and was educated at Marlborough College.
She is friends with Bristol trip-hop artist Tricky during her time at university.
She recently name-checked US indie group The War On Drugs.Theresa May has said she will not rule out a return to the European
