In [15]:
import torch

In [16]:
# define model
MODEL_NAME = "openai-community/gpt2-large"
use_peft = False
torch_dtype = torch.float16

In [4]:
# Formatting libraries
import black
import jupyter_black

# Load jupyter_black settings
jupyter_black.load(
    lab=True,
    line_length=170,
)

## Load and prepare data

In [5]:
from datasets import load_dataset

In [6]:
dataset = load_dataset("knkarthick/dialogsum")

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [7]:
# define tokenizer. We will use the tokenizer to count the number of tokens per instance
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="right")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [8]:
# define prompt template
prompt_template = """
Summarize the following conversation.

### Conversation:

{dialogue}

### Summary:

"""


# create prompt
def create_prompt(data):
    dialogue = data["dialogue"]
    summary = data["summary"]
    prompt = prompt_template.format(dialogue=dialogue, summary=summary)

    n_tokens_output = len(tokenizer.encode(summary, add_special_tokens=False))
    n_tokens_input = len(tokenizer.encode(prompt, add_special_tokens=False))

    return {"input": prompt, "output": summary, "n_tokens_input": n_tokens_input, "n_tokens_output": n_tokens_output}

In [9]:
dataset = dataset.map(create_prompt)
dataset

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input', 'output', 'n_tokens_input', 'n_tokens_output'],
        num_rows: 1500
    })
})

In [10]:
# get 0.95 percentile of dialogue length in training set
dataset["train"].to_pandas().n_tokens_input.quantile(0.95)

431.0

In [11]:
# get 0.95 quantile of n_tokens_summary in train dataset`
dataset["train"].to_pandas().n_tokens_output.quantile(0.95)

72.0

In [12]:
# filter very long dialogs and summaries
dataset = dataset.filter(lambda x: x["n_tokens_input"] < 470 and x["n_tokens_output"] < 70)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [13]:
print(dataset["train"]["input"][0])


Summarize the following conversation.

### Conversation:

#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.
#Person2#: Ok, thanks doctor.

### Summ

In [14]:
print(dataset["train"]["output"][0])

Mr. Smith's getting a check-up, and Doctor Hawkins advises him to have one every year. Hawkins'll give some information about their classes and medications to help Mr. Smith quit smoking.


## Load and prepare Tokenizer

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [14]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_special_tokens=False)

# verify the existing special tokens
print(f"Special Tokens: \n{tokenizer.special_tokens_map}")

# if no padding token set eos_token as padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

Special Tokens: 
{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [15]:
# load model and tokenizer
import torch

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cuda", torch_dtype=torch_dtype)  # torch_dtype=torch.float16

In [16]:
if use_peft:
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

    config = LoraConfig(
        r=16,
        lora_alpha=64,
        target_modules=["c_attn", "c_proj", "c_fc", "c_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )

    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:

        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)

        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    model.config.use_cache = False
else:
    model.config.use_cache = False

## Define DataLoader

In [17]:
# import pad_sequence from torch
from torch.nn.utils.rnn import pad_sequence
import torch
import copy

In [18]:
class DataCollatorForCausalLM:
    def __init__(self, tokenizer, source_max_len, target_max_len, training_on_source, padding_side):
        self.tokenizer = tokenizer
        self.source_max_len = source_max_len
        self.target_max_len = target_max_len
        self.training_on_source = training_on_source
        self.padding_side = padding_side

    def __call__(self, instances):
        source = [tokenizer.bos_token + text["input"] for text in instances]
        target = [text["output"] + tokenizer.eos_token for text in instances]

        source = tokenizer(
            source,
            padding=False,
            truncation=True,
            max_length=self.source_max_len,
        )

        target = tokenizer(
            target,
            padding=False,
            truncation=True,
            max_length=self.target_max_len,
        )

        input_ids = []
        labels = []
        for source_tokens, target_token in zip(source["input_ids"], target["input_ids"]):
            if tokenizer.pad_token_id == tokenizer.eos_token_id:
                target_token[-1] = -50

            input_ids.append(torch.LongTensor(source_tokens + target_token))

            if self.training_on_source:
                labels.append(torch.LongTensor(copy.deepcopy(source_tokens + target_token)))
            else:
                labels.append(torch.LongTensor([-100] * len(source_tokens) + copy.deepcopy(target_token)))

        # Pad sequences to the longest one in the batch
        if self.padding_side == "right":
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            labels = pad_sequence(labels, batch_first=True, padding_value=-100)

        elif self.padding_side == "left":
            # Find the maximum sequence length in the batch
            max_length = max(len(seq) for seq in input_ids)

            # Perform left padding manually
            input_ids = [torch.cat([torch.full((max_length - len(seq),), self.tokenizer.pad_token_id, dtype=torch.long), seq]) for seq in input_ids]
            labels = [torch.cat([torch.full((max_length - len(seq),), -100, dtype=torch.long), seq]) for seq in labels]

            # Convert list of tensors to a single tensor
            input_ids = torch.stack(input_ids)
            labels = torch.stack(labels)

        else:
            raise ValueError("padding_direction must be 'right' or 'left'")

        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)

        if tokenizer.pad_token_id == tokenizer.eos_token_id:
            input_ids[input_ids == -50] = self.tokenizer.pad_token_id
            labels[labels == -50] = self.tokenizer.pad_token_id

        return {"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask}

## Train model

In [19]:
# Define collator
data_collator = DataCollatorForCausalLM(
    tokenizer=tokenizer,
    source_max_len=470,
    target_max_len=70,
    training_on_source=False,
    padding_side="left",
)

In [20]:
from transformers import TrainingArguments, Trainer

In [21]:
# Define the training arguments for the Trainer
training_arguments = TrainingArguments(
    report_to="none",
    # logging_dir=None,  # f"./{MODEL_NAME}-tensorboard",
    output_dir=f"./{MODEL_NAME}-checkpoing",
    num_train_epochs=10,
    optim="paged_adamw_32bit",
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    gradient_accumulation_steps=4,
    max_steps=-1,
    weight_decay=0.01,
    learning_rate=5e-5,
    remove_unused_columns=False,
    max_grad_norm=1.0,
    gradient_checkpointing=True,
    do_train=True,
    do_eval=True,
    lr_scheduler_type="constant",
    warmup_ratio=0.005,
    logging_strategy="epoch",
    logging_steps=100,
    evaluation_strategy="epoch",
    eval_steps=100,  # We use same value for logging_steps (train metrics) and  eval_steps (eval metrics).
    group_by_length=False,
    save_strategy="epoch",
    save_steps=100,
    save_total_limit=100,
    load_best_model_at_end=True,
)

In [None]:
# Initialize the Trainer with the model, tokenizer, training arguments, datasets, and data collator
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"].shuffle(seed=42).select(range(7000)),
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)

# Start the training process
trainer.train()

# Save the trained model to the specified path
trainer.model.save_pretrained(f"./{MODEL_NAME}-peft={use_peft}-fine-tuned-model")

Epoch,Training Loss,Validation Loss
1,1.5155,1.137695
2,0.9545,1.130859


In [None]:
# save used args on json file
# args_json_path = os.path.join(self.new_model_name_or_path, 'training_args.json')
# with open(args_json_path, 'w') as json_file:
#     json.dump(args_dict, json_file, indent=4)