In [1]:
import torch
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_int8_training
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    TextDataset,
    Trainer,
    TrainingArguments,
)


In [5]:
pip install peft

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install accelerate




In [3]:
pip install -i https://test.pypi.org/simple/ bitsandbytes

Looking in indexes: https://test.pypi.org/simple/
Collecting bitsandbytes
  Downloading https://test-files.pythonhosted.org/packages/5c/e0/597d593ec3b6cf5ea7eb4894a545045bd95611de8a316a2a1eaa838a2459/bitsandbytes-0.39.0-py3-none-any.whl (95.8 MB)
                                              0.0/95.8 MB ? eta -:--:--
                                              0.0/95.8 MB ? eta -:--:--
                                             0.1/95.8 MB 656.4 kB/s eta 0:02:26
                                             0.1/95.8 MB 901.1 kB/s eta 0:01:47
                                              0.2/95.8 MB 1.2 MB/s eta 0:01:22
                                              0.2/95.8 MB 1.1 MB/s eta 0:01:29
                                              0.3/95.8 MB 1.0 MB/s eta 0:01:32
                                             0.3/95.8 MB 930.9 kB/s eta 0:01:43
                                              0.4/95.8 MB 1.1 MB/s eta 0:01:30
                                              0.5/95.

In [4]:
# Checkpoint name shared by the model and its tokenizer.
model_id = "t5-small"

# Keyword arguments forwarded to from_pretrained: request 8-bit weights and
# automatic device placement. Per the library's ImportError text, 8-bit loading
# needs both accelerate and bitsandbytes to be importable in this kernel.
quantized_load_kwargs = {"load_in_8bit": True, "device_map": "auto"}

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, **quantized_load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_id)



ImportError: Using `load_in_8bit=True` requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes `pip install -i https://test.pypi.org/simple/ bitsandbytes` or pip install bitsandbytes` 

In [None]:
# LoRA hyperparameters, collected in one place so they are easy to tune.
lora_settings = dict(
    task_type=TaskType.SEQ_2_SEQ_LM,
    # Names of the sub-modules that receive adapters
    # (presumably the attention query/value projections of T5 -- confirm).
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Configuration object consumed by get_peft_model in the next cell.
lora_config = LoraConfig(**lora_settings)


In [None]:
# Prepare the 8-bit model for training, then attach the LoRA adapters described
# by `lora_config`. NOTE: this rebinds `model` to a wrapped version of itself,
# so re-running the cell wraps again -- reload the model cell first.
model = get_peft_model(prepare_model_for_int8_training(model), lora_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
from datasets import load_dataset

# cnn_dailymail is published in several configurations, so load_dataset needs
# an explicit config name; without it the call fails with
# "Config name is missing". "3.0.0" is one of the published configurations.
CNN_DM_CONFIG = "3.0.0"

# Each split exposes (at least) the "article" and "highlights" text columns
# used by the tokenization cell.
traindata = load_dataset("cnn_dailymail", CNN_DM_CONFIG, split="train")
valdata = load_dataset("cnn_dailymail", CNN_DM_CONFIG, split="validation")
testdata = load_dataset("cnn_dailymail", CNN_DM_CONFIG, split="test")

In [None]:
# Concatenate text content for each example in the dataset
train_texts = [" ".join(example["article"].split() + example["highlights"].split()) for example in traindata]
val_texts = [" ".join(example["article"].split() + example["highlights"].split()) for example in valdata]

# Tokenize the concatenated texts
train_encodings = tokenizer(train_texts, return_tensors="pt", truncation=True, padding=True)
val_encodings = tokenizer(val_texts, return_tensors="pt", truncation=True, padding=True)

In [None]:
# Prepare the dataset for fine-tuning
train_dataset = TextDataset(
    tokenizer=tokenizer,
    tokenized_datasets=train_encodings,
    block_size=128
)
val_dataset = TextDataset(
    tokenizer=tokenizer,
    tokenized_datasets=val_encodings,
    block_size=128
)


In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)



In [None]:
# Set up the training arguments and trainer
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    save_steps=10_000,
    save_total_limit=2,
    # TrainingArguments has no `no_grad_checkpoint` field (passing it raises
    # TypeError); the supported switch is `gradient_checkpointing`.
    gradient_checkpointing=False,
    # `prediction_loss_only` is a TrainingArguments field, not a Trainer
    # constructor argument, so it is set here.
    prediction_loss_only=True,
)

# Trainer ties together the model, the arguments above, the train/eval
# datasets and the batch collator defined in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)



In [None]:
# Fine-tune the model
# Runs the training loop of the `trainer` object built in the previous cell.
# As the last expression of the cell, the value returned by train() is shown
# as the cell output.
trainer.train()

In [None]:

# Read one held-out article and its reference summary from disk.
with open("./evaluation-article.txt", "r", encoding="utf-8") as article_file:
    input_text = article_file.read()

with open("./evaluation-summary.txt", "r", encoding="utf-8") as summary_file:
    target_text = summary_file.read()

# Tokenize the input and target texts and move them to the device that holds
# the model weights, so the forward pass does not mix devices.
# truncation=True caps both at the tokenizer's model maximum length.
device = next(model.parameters()).device
inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(device)
labels = tokenizer.encode(target_text, return_tensors="pt", truncation=True).to(device)

# Evaluate the model
# Pass the tensors by keyword: the second positional parameter of forward() is
# the attention mask, so `model(inputs, labels)` would not treat `labels` as
# targets. eval() turns dropout off and no_grad() skips gradient bookkeeping.
model.eval()
with torch.no_grad():
    eval_pred = model(input_ids=inputs, labels=labels)

# Most likely token at every target position. These are teacher-forced
# predictions (conditioned on the reference summary), not a generated summary.
predictions = torch.argmax(eval_pred.logits, dim=-1)
