In [None]:
!pip install -U transformers==4.40.0 peft==0.10.0 accelerate==0.27.2 datasets evaluate rouge_score bert_score --quiet

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from transformers import (
    BartTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset, DatasetDict
import torch
from google.colab import drive
from datasets import load_dataset, Dataset
from evaluate import load
import pandas as pd
import os
import warnings
from tqdm import tqdm


# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous CUDA
# kernel launches) and is expected to slow training — confirm it is still wanted here.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Mount Google Drive; the dataset pickle and the backup folder are addressed under this mount.
drive.mount('/content/drive/')
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
def backup_colab_content_to_drive(folder_name='Colab Notebooks', src='/content',
                                  drive_root='/content/drive/MyDrive'):
  """Copy every top-level entry of `src` into `<drive_root>/<folder_name>`.

  The function is safe to call repeatedly: directories that already exist in the
  backup are merged into (files are overwritten) instead of raising
  `FileExistsError`.

  Args:
    folder_name: Destination folder, relative to `drive_root`.
    src: Directory whose contents are backed up.
    drive_root: Root directory under which the backup folder is created.

  The entry named 'drive' (the Drive mount point) and any entry that contains
  the destination itself are skipped so the backup is never copied into itself.
  """
  import shutil
  import os

  dest = f'{drive_root}/{folder_name}'
  os.makedirs(dest, exist_ok=True)
  dest_abs = os.path.abspath(dest)

  for item in os.listdir(src):
    s = os.path.join(src, item)
    s_abs = os.path.abspath(s)
    # Skip the Drive mount and anything that is (or contains) the destination.
    if item == 'drive' or dest_abs == s_abs or dest_abs.startswith(s_abs + os.sep):
      continue
    d = os.path.join(dest, item)
    if os.path.isdir(s):
      # dirs_exist_ok lets a second backup run merge into the existing copy.
      shutil.copytree(s, d, dirs_exist_ok=True)
    else:
      shutil.copy2(s, d)

  print(f'📁 Backup complete. Files saved to: {dest}')

# Load data:

In [None]:
# Seed shared by the shuffle and both splits so the train/val/test membership
# is identical on every run.
SPLIT_SEED = 42

file_path = "/content/drive/MyDrive/robot_dreams/final/summ_data.pickle"
# NOTE: unpickling can execute arbitrary code — only load pickle files you trust.
df = pd.read_pickle(file_path)
print(f'Raw dataset shape: {df.shape}')

# An unseeded shuffle would make the seeded splits below differ between runs.
dataset = Dataset.from_pandas(df).shuffle(seed=SPLIT_SEED)
# Hold out 10% of the rows, then halve the hold-out into validation and test.
split_dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
test_valid = split_dataset['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_dataset = split_dataset['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']

# Preprocess data:

In [None]:
# Checkpoint to fine-tune, plus the token budgets used when truncating/padding
# source texts and target summaries.
MODEL_NAME = "facebook/bart-large-cnn"
MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 1024, 128

# Tokenizer matching the checkpoint; shared by preprocessing, training and evaluation.
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)

def preprocess(example):
  """Tokenize texts and summaries and attach loss-masked labels.

  Works for a single example (string fields) and for a batch as passed by
  `Dataset.map(..., batched=True)` (list-of-string fields).

  Args:
    example: Mapping with a "text" field (model input) and a "summary" field
      (target), each either a string or a list of strings.

  Returns:
    The tokenizer output for the text, padded/truncated to MAX_INPUT_LENGTH,
    with an added "labels" entry: the summary token ids padded/truncated to
    MAX_TARGET_LENGTH, where every padding id is replaced by -100 (the label
    value conventionally excluded from the loss).
  """
  inputs = tokenizer(
    example["text"], truncation=True, padding="max_length", max_length=MAX_INPUT_LENGTH
  )
  labels = tokenizer(
    example["summary"], truncation=True, padding="max_length", max_length=MAX_TARGET_LENGTH
  )

  pad_id = tokenizer.pad_token_id

  def mask_padding(token_ids):
    # Replace padding positions so they do not contribute to the loss.
    return [tok if tok != pad_id else -100 for tok in token_ids]

  label_ids = labels["input_ids"]
  if label_ids and isinstance(label_ids[0], (list, tuple)):
    # Batched call: one id sequence per example. Comparing a whole sequence to
    # pad_id (as a flat comprehension would) never matches, so mask per token.
    inputs["labels"] = [mask_padding(seq) for seq in label_ids]
  else:
    inputs["labels"] = mask_padding(label_ids)
  return inputs

# Tokenize each split in batches, dropping the raw columns so only the
# fields produced by `preprocess` remain.
tokenized_train, tokenized_val, tokenized_test = (
  split.map(preprocess, batched=True, remove_columns=split.column_names)
  for split in (train_dataset, val_dataset, test_dataset)
)

# Train model:

In [None]:
# LoRA adapter settings: rank-8 adapters on the attention query/value projections.
lora_config = LoraConfig(
  task_type=TaskType.SEQ_2_SEQ_LM,
  target_modules=["q_proj", "v_proj"],
  r=8,
  lora_alpha=16,
  lora_dropout=0.1,
  bias="none",
)

# Load the pretrained checkpoint and wrap it with the LoRA adapters.
model = get_peft_model(AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME), lora_config)

# Collates tokenized examples into batches for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Evaluation, checkpointing and logging all run on the same 100-step cadence.
step_schedule = dict(
  evaluation_strategy="steps",
  save_strategy="steps",
  eval_steps=100,
  save_steps=100,
  logging_steps=100,
)

training_args = Seq2SeqTrainingArguments(
  output_dir="./bart-lora",
  logging_dir="./logs",
  per_device_train_batch_size=2,
  per_device_eval_batch_size=2,
  learning_rate=1e-4,
  num_train_epochs=1,
  predict_with_generate=True,
  report_to="none",
  **step_schedule,
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  tokenizer=tokenizer,
  data_collator=data_collator,
  train_dataset=tokenized_train,
  eval_dataset=tokenized_val,
)
trainer.train()

In [None]:
# Write the fine-tuned model and the tokenizer into the same local directory.
for artifact in (model, tokenizer):
  artifact.save_pretrained("./bart-lora-tuned")

# Copy the Colab working directory (including the saved model) to Drive.
backup_colab_content_to_drive('robot_dreams/backup/')

# Evaluate:

In [None]:
# Evaluate the fine-tuned model that is still in memory on the held-out test split.
model_path = "./bart-lora-tuned"
model.eval()

# The tokenizer returns CPU tensors; generation needs them on the same device
# as the model weights (the GPU after training), otherwise generate() fails.
device = next(model.parameters()).device

# Load evaluation metrics
rouge = load("rouge")
bert = load("bertscore")

predictions = []
references = []

for example in tqdm(test_dataset):
  input_text = example["text"]
  reference = example["summary"]

  # Tokenize input and move it to the model's device
  inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=MAX_INPUT_LENGTH
  ).to(device)

  # Generate summaries (beam search, no gradient tracking)
  with torch.no_grad():
    output_ids = model.generate(
      **inputs,
      max_new_tokens=MAX_TARGET_LENGTH,
      num_beams=4,
      early_stopping=True
    )

  summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  predictions.append(summary)
  references.append(reference)

rouge_results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
bertscore_results = bert.compute(predictions=predictions, references=references, lang="en")

print("ROUGE Scores:")
for key, value in rouge_results.items():
  print(f"{key}: {value:.4f}")

# BERTScore returns one value per example; report the mean of each component.
print("BERT Score:")
for label, key in (("Precision", "precision"), ("Recall", "recall"), ("F1", "f1")):
  scores = bertscore_results[key]
  print(f"{label}: {sum(scores)/len(scores):.4f}")