In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig


In [38]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [2]:
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False)

In [3]:
dataset = load_dataset("pszemraj/booksum-short")

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 5912
    })
    validation: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 1012
    })
    test: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 988
    })
})

In [4]:
quantization_config = BitsAndBytesConfig(
        # load_in_8bit=True,
        load_in_4bit=True,
        # llm_int8_enable_fp32_cpu_offload=True,
        # llm_int8_has_fp16_weight=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16"
)

In [5]:
device = 'cuda'

In [6]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=quantization_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.eos_token


In [8]:
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

In [10]:
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)
# model.to(device)

generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=10000)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


<s> [INST] What is your favourite condiment? [/INST]Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> [INST] Do you have mayonnaise recipes? [/INST] I don't have the ability to create mayonnaise recipes myself, as I don't have the capability to cook or prepare foods. But I can certainly provide you with a simple and classic mayonnaise recipe if you'd like! Here's a traditional one using just a few essential ingredients:

Ingredients:
- 1 egg yolk (room temperature)
- 1 tablespoon of Dijon mustard
- 1 cup of vegetable oil (canola, soybean or grapeseed oil are good choices)
- 1 tablespoon of white wine vinegar or lemon juice
- Salt, to taste

Instructions:
1. In a medium-sized bowl, whisk together egg yolk and mustard.
2. Gradually add oil, in a slow, thin stream, whisking constantly to emulsify the mixture. If the mixture starts to thicken too much, you can add a splash of water t

In [13]:
train_dataset = dataset["train"].select_columns(['chapter', 'summary']).select(range(10))
val_dataset = dataset["validation"].select_columns(['chapter', 'summary']).select(range(10))

In [None]:
print(train_dataset)

In [None]:
print(dataset)

In [30]:
def tokenize_function(example):
    start_prompt = 'Summarize the following chapter of a book.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + chapter + end_prompt for chapter in example["chapter"]]
    example['input_ids'] = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding=True,truncation=True, return_tensors="pt").input_ids
    
    return example

In [31]:
# def encode(dataset):
#     return tokenizer(dataset["chapter"], dataset["summary"], truncation=True, padding="max_length")


In [32]:
encoded_train_dataset = train_dataset.map(tokenize_function, batched=True)
encoded_val_dataset = val_dataset.map(tokenize_function, batched=True)

# encoded_train_dataset = train_dataset.map(encode)
# encoded_val_dataset = val_dataset.map(encode)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [33]:
encoded_train_dataset = encoded_train_dataset.remove_columns(['chapter', 'summary'])
encoded_val_dataset = encoded_val_dataset.remove_columns(['chapter', 'summary'])

In [34]:
encoded_train_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 10
})

In [35]:
len(encoded_train_dataset['input_ids'][3])

8550

In [37]:
print(tokenizer.batch_decode(encoded_train_dataset['input_ids'][1]))

['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>',

In [36]:
encoded_train_dataset['input_ids'][0]

[1,
 28705,
 13,
 28705,
 345,
 11273,
 1167,
 5080,
 654,
 480,
 1334,
 304,
 261,
 2636,
 28725,
 13,
 2287,
 11439,
 298,
 272,
 1170,
 321,
 813,
 24582,
 4699,
 286,
 28745,
 13,
 28705,
 415,
 8970,
 1174,
 302,
 15507,
 6774,
 13,
 2287,
 415,
 6138,
 304,
 3585,
 1503,
 4768,
 28745,
 13,
 28705,
 1015,
 5063,
 1114,
 28713,
 26108,
 28725,
 304,
 8191,
 28718,
 7835,
 4226,
 28725,
 13,
 2287,
 1015,
 285,
 696,
 1606,
 668,
 406,
 286,
 297,
 272,
 19759,
 611,
 13,
 13,
 28705,
 19721,
 28802,
 12738,
 28723,
 13,
 13,
 13,
 3514,
 1652,
 272,
 10214,
 381,
 822,
 288,
 17162,
 1050,
 304,
 516,
 1885,
 3269,
 24804,
 298,
 13,
 2748,
 299,
 6036,
 1309,
 13551,
 778,
 264,
 8613,
 369,
 10932,
 1259,
 2655,
 10333,
 607,
 13,
 262,
 15123,
 28725,
 478,
 1580,
 938,
 396,
 3227,
 28742,
 28713,
 23037,
 28725,
 304,
 6139,
 272,
 6337,
 264,
 1664,
 13,
 28719,
 3429,
 298,
 272,
 7635,
 1050,
 302,
 272,
 1633,
 970,
 478,
 506,
 1432,
 2598,
 706,
 28723,
 13,
 13,
 2486,

In [37]:
len(encoded_train_dataset['input_ids'])

10

In [38]:
encoded_val_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [39]:
encoded_train_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [40]:
# encoded_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=4)

In [41]:
# training_args = TrainingArguments(output_dir="test_trainer",
#                                   per_device_train_batch_size=1,
#                                   per_device_eval_batch_size=1,
#                                   num_train_epochs=5,
#                                   learning_rate=0.001,
#                                   optim='adamw_torch'
#                                   )

In [40]:
training_args = TrainingArguments(output_dir="test_trainer",
                                  auto_find_batch_size=True,
                                  num_train_epochs=1,
                                  learning_rate=0.001,
                                  logging_steps=1,
                                  max_steps=1
                                  )

In [42]:
import numpy as np
import evaluate

rouge = evaluate.load("rouge")

In [42]:
model.add_adapter(peft_config)

How to Use Rouge
At minimum, this metric takes as input a list of predictions and a list of references:

>>> rouge = evaluate.load('rouge')
>>> predictions = ["hello there", "general kenobi"]
>>> references = ["hello there", "general kenobi"]
>>> results = rouge.compute(predictions=predictions,
...                         references=references)
>>> print(results)
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
One can also pass a custom tokenizer which is especially useful for non-latin languages.

>>> results = rouge.compute(predictions=predictions,
...                         references=references,
                            tokenizer=lambda x: x.split())
>>> print(results)
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
It can also deal with lists of references for each predictions:

>>> rouge = evaluate.load('rouge')
>>> predictions = ["hello there", "general kenobi"]
>>> references = [["hello", "there"], ["general kenobi", "general yoda"]]
>>> results = rouge.compute(predictions=predictions,
...                         references=references)
>>> print(results)
{'rouge1': 0.8333, 'rouge2': 0.5, 'rougeL': 0.8333, 'rougeLsum': 0.8333}```

In [43]:
print_number_of_trainable_model_parameters(model)

'trainable model parameters: 20971520\nall model parameters: 3773042688\npercentage of trainable model parameters: 0.56%'

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    # compute_metrics=rouge
)

In [None]:
trainer.train()

In [None]:
print(len(encoded_train_dataset['attention_mask'][0]))

In [None]:
print(len(encoded_train_dataset['attention_mask'][78]))


In [72]:
for i in range(len(encoded_train_dataset['input_ids'])):
    if len(encoded_train_dataset['input_ids'][i]) == 11593:
        print(i)

KeyboardInterrupt: 