In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig


In [2]:
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False)

In [3]:
dataset = load_dataset("pszemraj/booksum-short")

In [4]:
quantization_config = BitsAndBytesConfig(
        # load_in_8bit=True,
        load_in_4bit=True,
        # llm_int8_enable_fp32_cpu_offload=True,
        # llm_int8_has_fp16_weight=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16"
)

In [5]:
device = 'cuda'

In [6]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=quantization_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", model_max_length=128)
tokenizer.pad_token = tokenizer.eos_token


In [8]:
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

In [9]:
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)
# model.to(device)

generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


<s> [INST] What is your favourite condiment? [/INST]Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> [INST] Do you have mayonnaise recipes? [/INST] While I don't have the ability to create or prepare recipes myself, I'd be happy to help you make a classic mayonnaise using simple ingredients. Here's a simple Mayonnaise recipe you can try at home:

Ingredients:
1. 1 cup (240 ml) vegetable oil
2. 1 egg yolk
3. 1 tablespoon water
4. 1 tablespoon distilled white vinegar or lemon juice
5. 1 teaspoon Dijon mustard
6. 1 teaspoon salt

Instructions:
1. In a clean and dry bowl, whisk together the egg yolk, water, vinegar or lemon juice, Dijon mustard, and salt.
2. Very slowly, drop by drop, start adding the oil to the mixture, while continuously whisking the ingredients together.
3. Once half of the oil has been incorporated, you can start adding the oil in a thin, steady stream while con

In [10]:
train_dataset = dataset["train"].select_columns(['chapter', 'summary']).select(range(10))
val_dataset = dataset["validation"].select_columns(['chapter', 'summary']).select(range(10))

In [11]:
print(train_dataset)

Dataset({
    features: ['chapter', 'summary'],
    num_rows: 10
})


In [12]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 5912
    })
    validation: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 1012
    })
    test: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 988
    })
})


In [13]:
def encode(dataset):
    return tokenizer(dataset["chapter"], dataset["summary"], truncation=True, padding="max_length", max_length=128)


In [14]:
encoded_train_dataset = train_dataset.map(encode, batched=True)
encoded_val_dataset = val_dataset.map(encode, batched=True)

# encoded_train_dataset = train_dataset.map(encode)
# encoded_val_dataset = val_dataset.map(encode)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [15]:
encoded_train_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [16]:
len(encoded_train_dataset['input_ids'][1])

128

In [17]:
print(encoded_train_dataset['attention_mask'][1][:5])

[1, 1, 1, 1, 1]


In [18]:
print(tokenizer.batch_decode(encoded_train_dataset['input_ids'][1]))

['<s>', '', '\n', '', '"', 'Well', ',', 'go', 'thy', 'way', ':', 'thou', 'sh', 'alt', 'not', 'from', 'this', 'gro', 've', '\n', '', 'T', 'ill', 'I', 'tor', 'ment', 'the', 'e', 'for', 'this', 'injury', '."', '\n', '\n', '', '_', 'M', 'id', 'sum', 'mer', 'Night', "'", 's', 'Dream', '._', '\n', '\n', '\n', 'The', 'words', 'were', 'still', 'in', 'the', 'mouth', 'of', 'the', 'sc', 'out', ',', 'when', 'the', 'leader', 'of', '<s>', '{"', 'name', '":', '"', 'Ch', 'apter', '', '4', '",', '"', 'url', '":', '"', 'https', '://', 'web', '.', 'archive', '.', 'org', '/', 'web', '/', '2', '0', '2', '0', '1', '1', '0', '1', '0', '5', '3', '2', '0', '5', '/', 'https', '://', 'www', '.', 'cl', 'iffs', 'notes', '.', 'com', '/', 'liter', 'ature', '/', 'l', '/', 'the', '-', 'last', '-', 'of', '-', 'the', '-', 'm', 'oh']


In [19]:
encoded_train_dataset['input_ids'][0]

[1,
 28705,
 13,
 28705,
 345,
 11273,
 1167,
 5080,
 654,
 480,
 1334,
 304,
 261,
 2636,
 28725,
 13,
 2287,
 11439,
 298,
 272,
 1170,
 321,
 813,
 24582,
 4699,
 286,
 28745,
 13,
 28705,
 415,
 8970,
 1174,
 302,
 15507,
 6774,
 13,
 2287,
 415,
 6138,
 304,
 3585,
 1503,
 4768,
 28745,
 13,
 28705,
 1015,
 5063,
 1114,
 28713,
 26108,
 28725,
 304,
 8191,
 28718,
 7835,
 4226,
 28725,
 13,
 2287,
 1015,
 285,
 696,
 1606,
 1,
 9830,
 861,
 1264,
 345,
 1209,
 2902,
 28705,
 28770,
 548,
 345,
 2179,
 1264,
 345,
 3887,
 1508,
 4311,
 28723,
 23682,
 28723,
 1909,
 28748,
 4311,
 28748,
 28750,
 28734,
 28750,
 28734,
 28740,
 28740,
 28734,
 28740,
 28734,
 28782,
 28770,
 28750,
 28734,
 28782,
 28748,
 3887,
 1508,
 2849,
 28723,
 512,
 17820,
 18787,
 28723,
 675,
 28748,
 24477,
 1373,
 28748,
 28714,
 28748,
 1237,
 28733,
 4081,
 28733,
 1009,
 28733,
 1237,
 28733,
 28719,
 1371]

In [20]:
len(encoded_train_dataset['input_ids'])

10

In [21]:
encoded_val_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [22]:
encoded_train_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [23]:
# encoded_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=4)

In [24]:
training_args = TrainingArguments(output_dir="test_trainer",
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  num_train_epochs=5,
                                  learning_rate=0.001
                                  )

In [25]:
import numpy as np
import evaluate

rouge = evaluate.load("rouge")

In [26]:
model.add_adapter(peft_config)

How to Use Rouge
At minimum, this metric takes as input a list of predictions and a list of references:

>>> rouge = evaluate.load('rouge')
>>> predictions = ["hello there", "general kenobi"]
>>> references = ["hello there", "general kenobi"]
>>> results = rouge.compute(predictions=predictions,
...                         references=references)
>>> print(results)
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
One can also pass a custom tokenizer which is especially useful for non-latin languages.

>>> results = rouge.compute(predictions=predictions,
...                         references=references,
                            tokenizer=lambda x: x.split())
>>> print(results)
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
It can also deal with lists of references for each predictions:

>>> rouge = evaluate.load('rouge')
>>> predictions = ["hello there", "general kenobi"]
>>> references = [["hello", "there"], ["general kenobi", "general yoda"]]
>>> results = rouge.compute(predictions=predictions,
...                         references=references)
>>> print(results)
{'rouge1': 0.8333, 'rouge2': 0.5, 'rougeL': 0.8333, 'rougeLsum': 0.8333}```

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    compute_metrics=rouge
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
print(len(encoded_train_dataset['attention_mask'][0]))

In [None]:
print(len(encoded_train_dataset['attention_mask'][78]))


In [72]:
for i in range(len(encoded_train_dataset['input_ids'])):
    if len(encoded_train_dataset['input_ids'][i]) == 11593:
        print(i)

KeyboardInterrupt: 