In [1]:
# define model
MODEL_NAME = "openai-community/gpt2-medium"

In [2]:
# !pip install accelerate -U
# !pip install black
# !pip install jupyter_black

In [3]:
# !pip install datasets
# !pip install transformers

In [4]:
# Formatting libraries
import black
import jupyter_black

# Load jupyter_black settings
jupyter_black.load(
    lab=True,
    line_length=170,
)

## Load and prepare data

In [5]:
from datasets import load_dataset

In [6]:
# import pytorch
import torch
import random

In [7]:
# dataset = load_dataset("knkarthick/dialogsum")
dataset = load_dataset("cnn_dailymail", "3.0.0")

In [8]:
# define tokenizer. We will use the tokenizer to count the number of tokens per instance
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="right")



In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [10]:
# define prompt template
prompt_template = """
Summarize the following conversation.

### Conversation:

{dialogue}

### Summary:

"""


# create prompt
def create_prompt(data):
    dialogue = data["article"]
    summary = data["highlights"]
    prompt = prompt_template.format(dialogue=dialogue, summary=summary)

    n_tokens_output = len(tokenizer.encode(summary, add_special_tokens=False))
    n_tokens_input = len(tokenizer.encode(prompt, add_special_tokens=False))

    return {"input": prompt, "output": summary, "n_tokens_input": n_tokens_input, "n_tokens_output": n_tokens_output}

In [11]:
dataset = dataset.map(create_prompt)
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 11490
    })
})

In [12]:
# get 0.95 percentile of dialogue length in training set
dataset["train"].to_pandas().n_tokens_input.quantile(0.95)

1734.0

In [13]:
dataset["train"].to_pandas().n_tokens_input.quantile(0.62)

942.0

In [14]:
# get 0.95 quantile of n_tokens_summary in train dataset`
dataset["train"].to_pandas().n_tokens_output.quantile(0.65)

70.0

In [15]:
# filter very long dialogs and summaries
dataset = dataset.filter(lambda x: x["n_tokens_input"] < 942 and x["n_tokens_output"] < 70)

In [16]:
dataset["train"].to_pandas().n_tokens_input.quantile(1)

941.0

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 128837
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 5024
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 4724
    })
})

In [18]:
random.seed(42)
dataset = dataset.filter(lambda x, index: random.random() < 0.5, with_indices=True)

In [19]:
# dataset["train"].to_pandas().max()

In [20]:
dataset["train"].to_pandas().n_tokens_input.quantile(1)

941.0

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 64510
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 2509
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 2336
    })
})

In [22]:
print(dataset["train"]["input"][0])


Summarize the following conversation.

### Conversation:

MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. "The whole bridge from one side of the Mississippi to the other just completely gave way, fell all the way down," survivor Gary Babineau told CNN. "I probably had a 30-, 35-foot free fall. And there's cars in the water, there's cars on fire. The whole bridge is down." He said his back was injured but he determined he could move around. "I realized there was a school bus right next to me, and me and a couple of other guys went over and started lifting the kids off the bridge. They were yelling, screaming, bleeding. I think there were some broken bones."  Watch a driver describe his narrow escape » . At home when he heard about the disaster, Dr. John Hink, an emergency room physician, jumped into his car and rushed to the scene in 15 minutes. He arrived at the south side of the bridge, stood on the riverb

In [23]:
print(dataset["train"]["output"][0])

NEW: "I thought I was going to die," driver says .
Man says pickup truck was folded in half; he just has cut on face .
Driver: "I probably had a 30-, 35-foot free fall"
Minnesota bridge collapsed during rush hour Wednesday .


## Load and prepare Tokenizer

In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [25]:
# load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_special_tokens=False)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# verify the existing special tokens
print(f"Special Tokens: \n{tokenizer.special_tokens_map}")

# if no padding token set eos_token as padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

Special Tokens: 
{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}




In [26]:
# load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cuda")

## Define DataLoader

In [27]:
# import pad_sequence from torch
from torch.nn.utils.rnn import pad_sequence
import torch
import copy

In [28]:
class DataCollatorForCausalLM:
    def __init__(self, tokenizer, source_max_len, target_max_len, training_on_source, padding_side):
        self.tokenizer = tokenizer
        self.source_max_len = source_max_len
        self.target_max_len = target_max_len
        self.training_on_source = training_on_source
        self.padding_side = padding_side

    def __call__(self, instances):
        source = [tokenizer.bos_token + text["input"] for text in instances]
        target = [text["output"] + tokenizer.eos_token for text in instances]

        source = tokenizer(
            source,
            padding=False,
            truncation=True,
            max_length=self.source_max_len,
        )

        target = tokenizer(
            target,
            padding=False,
            truncation=True,
            max_length=self.target_max_len,
        )

        input_ids = []
        labels = []
        for source_tokens, target_token in zip(source["input_ids"], target["input_ids"]):
            if tokenizer.pad_token_id == tokenizer.eos_token_id:
                target_token[-1] = -50

            input_ids.append(torch.LongTensor(source_tokens + target_token))

            if self.training_on_source:
                labels.append(torch.LongTensor(copy.deepcopy(source_tokens + target_token)))
            else:
                labels.append(torch.LongTensor([-100] * len(source_tokens) + copy.deepcopy(target_token)))

        # Pad sequences to the longest one in the batch
        if self.padding_side == "right":
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            labels = pad_sequence(labels, batch_first=True, padding_value=-100)

        elif self.padding_side == "left":
            # Find the maximum sequence length in the batch
            max_length = max(len(seq) for seq in input_ids)

            # Perform left padding manually
            input_ids = [torch.cat([torch.full((max_length - len(seq),), self.tokenizer.pad_token_id, dtype=torch.long), seq]) for seq in input_ids]
            labels = [torch.cat([torch.full((max_length - len(seq),), -100, dtype=torch.long), seq]) for seq in labels]

            # Convert list of tensors to a single tensor
            input_ids = torch.stack(input_ids)
            labels = torch.stack(labels)

        else:
            raise ValueError("padding_direction must be 'right' or 'left'")

        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)

        if tokenizer.pad_token_id == tokenizer.eos_token_id:
            input_ids[input_ids == -50] = self.tokenizer.pad_token_id
            labels[labels == -50] = self.tokenizer.pad_token_id

        return {"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask}

## Train model

In [29]:
# Define collator
data_collator = DataCollatorForCausalLM(
    tokenizer=tokenizer,
    source_max_len=942,
    target_max_len=70,
    training_on_source=False,
    padding_side="left",
)

In [30]:
from transformers import TrainingArguments, Trainer

In [31]:
# Define the training arguments for the Trainer
training_arguments = TrainingArguments(
    report_to="tensorboard",
    logging_dir=f"./{MODEL_NAME}-tensorboard",
    output_dir=f"./{MODEL_NAME}-checkpoing",
    num_train_epochs=4,
    # num_train_epochs=1,
    optim="adamw_torch",
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    gradient_accumulation_steps=5,
    max_steps=-1,
    weight_decay=0.01,
    learning_rate=5e-5,
    remove_unused_columns=False,
    max_grad_norm=1.0,
    gradient_checkpointing=True,
    do_train=True,
    do_eval=True,
    lr_scheduler_type="constant",
    warmup_ratio=0.005,
    logging_strategy="epoch",
    logging_steps=100,
    evaluation_strategy="epoch",
    eval_steps=100,  # We use same value for logging_steps (train metrics) and  eval_steps (eval metrics).
    group_by_length=False,
    save_strategy="epoch",
    save_steps=100,
    save_total_limit=100,
    load_best_model_at_end=True,
)



In [32]:
# RuntimeError: TensorBoardCallback requires tensorboard to be installed. Either update your PyTorch version or install tensorboardX.
# !pip install tensorboardX

In [33]:
print("Num GPUs Available: ", torch.cuda.device_count())

Num GPUs Available:  1


In [34]:
# Initialize the Trainer with the model, tokenizer, training arguments, datasets, and data collator
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"].shuffle(seed=42),  # .select(range(5000)),
    # train_dataset=dataset["train"].shuffle(seed=42).select(range(32000)),
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)

# Start the training process
trainer.train()

# Save the trained model to the specified path
trainer.model.save_pretrained(f"./{MODEL_NAME}-fine-tuned-model")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
0,2.1837,1.781578
1,1.8524,1.750449
2,1.7844,1.724408
3,1.7192,1.717643


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [35]:
# !pip install tensorboard

In [36]:
trainer

<transformers.trainer.Trainer at 0x7f7fe8a8d540>

In [37]:
# save used args on json file
# args_json_path = os.path.join(self.new_model_name_or_path, 'training_args.json')
# with open(args_json_path, 'w') as json_file:
#     json.dump(args_dict, json_file, indent=4)