In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig


In [3]:
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False)

In [4]:
dataset = load_dataset("pszemraj/booksum-short")

In [5]:
quantization_config = BitsAndBytesConfig(
        # load_in_8bit=True,
        load_in_4bit=True,
        # llm_int8_enable_fp32_cpu_offload=True,
        # llm_int8_has_fp16_weight=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16"
)

In [6]:
device = 'cuda'

In [7]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=quantization_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", model_max_length=128)
tokenizer.pad_token = tokenizer.eos_token


In [9]:
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]

In [10]:
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)
# model.to(device)

generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


<s> [INST] What is your favourite condiment? [/INST]Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> [INST] Do you have mayonnaise recipes? [/INST] I'm glad you asked, here's a simple and classic Homemade Mayonnaise recipe for you:

Ingredients:
- 1 cup vegetable oil (canola, safflower, or any neutral-tasting oil)
- 1 large egg yolk
- 1 tbsp white wine vinegar or apple cider vinegar
- 1 tbsp lemon juice
- 1 tsp Dijon mustard
- 1 tsp salt
- optional: 1/2 tsp paprika (for a nice color)

Instructions:
1. In a blender or food processor, add the egg yolk, vinegar, lemon juice, mustard, and salt. Blend for about 5 seconds to combine.
2. With the machine running, very slowly drizzle in the oil in a thin, steady stream, near the top of the blade. Be patient and keep the flow steady until all the oil has been incorporated, about 1-2 minutes.
3. Taste the mayonnaise and add additional sal

In [11]:
train_dataset = dataset["train"].select_columns(['chapter', 'summary']).select(range(10))
val_dataset = dataset["validation"].select_columns(['chapter', 'summary']).select(range(10))

In [12]:
print(train_dataset)

Dataset({
    features: ['chapter', 'summary'],
    num_rows: 10
})


In [13]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 5912
    })
    validation: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 1012
    })
    test: Dataset({
        features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
        num_rows: 988
    })
})


In [29]:
def encode(dataset):
    return tokenizer(dataset["chapter"], dataset["summary"], truncation=False, padding="max_length", max_length=128)


In [30]:
encoded_train_dataset = train_dataset.map(encode, batched=True)
encoded_val_dataset = val_dataset.map(encode, batched=True)

# encoded_train_dataset = train_dataset.map(encode)
# encoded_val_dataset = val_dataset.map(encode)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [31]:
encoded_train_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [32]:
len(encoded_train_dataset['input_ids'][1])

5604

In [34]:
print(encoded_train_dataset['attention_mask'][1][:5])

[1, 1, 1, 1, 1]


In [35]:
print(tokenizer.batch_decode(encoded_train_dataset['input_ids'][1]))

['<s>', '', '\n', '', '"', 'Well', ',', 'go', 'thy', 'way', ':', 'thou', 'sh', 'alt', 'not', 'from', 'this', 'gro', 've', '\n', '', 'T', 'ill', 'I', 'tor', 'ment', 'the', 'e', 'for', 'this', 'injury', '."', '\n', '\n', '', '_', 'M', 'id', 'sum', 'mer', 'Night', "'", 's', 'Dream', '._', '\n', '\n', '\n', 'The', 'words', 'were', 'still', 'in', 'the', 'mouth', 'of', 'the', 'sc', 'out', ',', 'when', 'the', 'leader', 'of', 'the', '\n', 'party', ',', 'whose', 'approaching', 'foot', 'steps', 'had', 'caught', 'the', 'vig', 'il', 'ant', 'ear', 'of', 'the', '\n', 'Ind', 'ian', ',', 'came', 'openly', 'into', 'view', '.', 'A', 'beaten', 'path', ',', 'such', 'as', 'those', 'made', 'by', 'the', '\n', 'period', 'ical', 'passage', 'of', 'the', 'deer', ',', 'wound', 'through', 'a', 'little', 'gl', 'en', 'at', 'no', 'great', '\n', 'distance', ',', 'and', 'struck', 'the', 'river', 'at', 'the', 'point', 'where', 'the', 'white', 'man', 'and', 'his', '\n', 'red', 'companions', 'had', 'posted', 'themselves',

In [36]:
encoded_train_dataset['input_ids'][0]

[1,
 28705,
 13,
 28705,
 345,
 11273,
 1167,
 5080,
 654,
 480,
 1334,
 304,
 261,
 2636,
 28725,
 13,
 2287,
 11439,
 298,
 272,
 1170,
 321,
 813,
 24582,
 4699,
 286,
 28745,
 13,
 28705,
 415,
 8970,
 1174,
 302,
 15507,
 6774,
 13,
 2287,
 415,
 6138,
 304,
 3585,
 1503,
 4768,
 28745,
 13,
 28705,
 1015,
 5063,
 1114,
 28713,
 26108,
 28725,
 304,
 8191,
 28718,
 7835,
 4226,
 28725,
 13,
 2287,
 1015,
 285,
 696,
 1606,
 668,
 406,
 286,
 297,
 272,
 19759,
 611,
 13,
 13,
 28705,
 19721,
 28802,
 12738,
 28723,
 13,
 13,
 13,
 3514,
 1652,
 272,
 10214,
 381,
 822,
 288,
 17162,
 1050,
 304,
 516,
 1885,
 3269,
 24804,
 298,
 13,
 2748,
 299,
 6036,
 1309,
 13551,
 778,
 264,
 8613,
 369,
 10932,
 1259,
 2655,
 10333,
 607,
 13,
 262,
 15123,
 28725,
 478,
 1580,
 938,
 396,
 3227,
 28742,
 28713,
 23037,
 28725,
 304,
 6139,
 272,
 6337,
 264,
 1664,
 13,
 28719,
 3429,
 298,
 272,
 7635,
 1050,
 302,
 272,
 1633,
 970,
 478,
 506,
 1432,
 2598,
 706,
 28723,
 13,
 13,
 2486,

In [37]:
len(encoded_train_dataset['input_ids'])

10

In [38]:
encoded_val_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [39]:
encoded_train_dataset

Dataset({
    features: ['chapter', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [40]:
# encoded_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=4)

In [41]:
training_args = TrainingArguments(output_dir="test_trainer",
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  num_train_epochs=5,
                                  learning_rate=0.001,
                                  optim='adamw_torch'
                                  )

In [42]:
import numpy as np
import evaluate

rouge = evaluate.load("rouge")

In [43]:
model.add_adapter(peft_config)

ValueError: Adapter with name default already exists. Please use a different name.

How to Use Rouge
At minimum, this metric takes as input a list of predictions and a list of references:

>>> rouge = evaluate.load('rouge')
>>> predictions = ["hello there", "general kenobi"]
>>> references = ["hello there", "general kenobi"]
>>> results = rouge.compute(predictions=predictions,
...                         references=references)
>>> print(results)
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
One can also pass a custom tokenizer which is especially useful for non-latin languages.

>>> results = rouge.compute(predictions=predictions,
...                         references=references,
                            tokenizer=lambda x: x.split())
>>> print(results)
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
It can also deal with lists of references for each predictions:

>>> rouge = evaluate.load('rouge')
>>> predictions = ["hello there", "general kenobi"]
>>> references = [["hello", "there"], ["general kenobi", "general yoda"]]
>>> results = rouge.compute(predictions=predictions,
...                         references=references)
>>> print(results)
{'rouge1': 0.8333, 'rouge2': 0.5, 'rougeL': 0.8333, 'rougeLsum': 0.8333}```

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    compute_metrics=rouge
)

In [45]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 168.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 13.86 GiB is allocated by PyTorch, and 358.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(len(encoded_train_dataset['attention_mask'][0]))

In [None]:
print(len(encoded_train_dataset['attention_mask'][78]))


In [72]:
for i in range(len(encoded_train_dataset['input_ids'])):
    if len(encoded_train_dataset['input_ids'][i]) == 11593:
        print(i)

KeyboardInterrupt: 