In [None]:
!pip install fsspec==2024.9.0



In [None]:
!pip install transformers datasets peft accelerate torch



In [None]:
import torch
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

In [None]:
# Attach Google Drive; the dataset and the output directory used later live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive directory that holds the previously saved, tokenized NarrativeQA dataset
saved_path = "/content/drive/MyDrive/Creativity/Data/processed_narrativeqa_dataset_Starling-tokenizer"

# Restore the dataset object from that directory
tokenized_dataset = load_from_disk(dataset_path=saved_path)

Loading dataset from disk:   0%|          | 0/27 [00:00<?, ?it/s]

In [None]:
# Hub checkpoint shared by the tokenizer and the base model
model_name = "berkeley-nest/Starling-LM-7B-alpha"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are loaded as float16; device placement is delegated to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Debug aid: uncomment the loop below to print every module name and find the correct LoRA target module names
# for name, module in model.named_modules():
#     print(name, module)

In [None]:
# Reuse the EOS token as the padding token, and keep the matching id at hand.
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.eos_token_id

In [None]:
# LoRA setup: adapters are attached to the four attention projections
# (q/k/v/o) of decoder layers 20 through 31 only.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,              # rank of the low-rank update matrices
    lora_alpha=32,     # scaling factor applied to the LoRA update
    lora_dropout=0.1,  # dropout on the LoRA branch
    bias="none",
    # Same names, in the same order, as listing each projection separately:
    # all q_proj entries first, then k_proj, v_proj and o_proj.
    target_modules=[
        f"layers.{layer_idx}.self_attn.{projection}"
        for projection in ("q_proj", "k_proj", "v_proj", "o_proj")
        for layer_idx in range(20, 32)
    ],
)

In [None]:
# Wrap the base model with the LoRA adapters described by `lora_config`.
# The isinstance guard keeps this cell idempotent: re-running it will not
# wrap a model that has already been wrapped.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

In [None]:
# Report trainable vs. total parameter counts to confirm that only the LoRA adapter weights are trainable.
model.print_trainable_parameters()

trainable params: 5,111,808 || all params: 7,246,860,288 || trainable%: 0.0705


In [None]:
!pip install bitsandbytes



In [None]:
# Hyperparameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./Starling-7b-finetuned-narrativeqa",
    run_name="Starling-7b-run-narrativeqa",
    eval_strategy="no",
    eval_steps=500,  # has no effect while eval_strategy="no" (no eval dataset is passed below)
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # 1 x 16 = 16 sequences per optimizer step per device
    num_train_epochs=1,
    weight_decay=0.01,
    warmup_ratio=0.01,
    # The plain "constant" schedule ignores warmup_ratio; "constant_with_warmup"
    # ramps the learning rate up first and then holds it constant.
    lr_scheduler_type="constant_with_warmup",
    save_strategy="steps",
    optim="paged_adamw_8bit",  # needs the bitsandbytes package
    max_grad_norm=1.0,
    save_steps=500,
    save_total_limit=5,
    logging_dir="./logs-narrativeqa",
    fp16=True,
    # remove_unused_columns=False,
    dataloader_num_workers=2
)

# Only the training split is used in this run.
tokenized_narrativeqa_train = tokenized_dataset["train"]

# Collator that pads each batch dynamically (padding=True).
# DataCollatorForSeq2Seq is imported in the notebook's top import cell.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_narrativeqa_train,
    data_collator=data_collator
)

# Fine-tune the model with LoRA
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m18217291136[0m ([33m18217291136-nanyang-technological-university-singapore[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,187.1151
200,110.0395
300,107.6422
400,106.3749
500,105.5665
600,104.7424
700,104.4499
800,102.7827
900,102.0481
1000,100.4864


TrainOutput(global_step=2046, training_loss=106.1787684525446, metrics={'train_runtime': 24377.285, 'train_samples_per_second': 1.343, 'train_steps_per_second': 0.084, 'total_flos': 5.724786947423994e+18, 'train_loss': 106.1787684525446, 'epoch': 0.9996640913671482})

In [None]:
# Drive directory that receives the fine-tuned model files and the tokenizer files
model_save_path = "/content/drive/MyDrive/Creativity/Starling_7b_fine_tuned_model_narrativeqa_long"

# Write the model first, then the tokenizer, into the same directory
model.save_pretrained(save_directory=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)


('/content/drive/MyDrive/Creativity/Starling_7b_fine_tuned_model_narrativeqa_long/tokenizer_config.json',
 '/content/drive/MyDrive/Creativity/Starling_7b_fine_tuned_model_narrativeqa_long/special_tokens_map.json',
 '/content/drive/MyDrive/Creativity/Starling_7b_fine_tuned_model_narrativeqa_long/tokenizer.model',
 '/content/drive/MyDrive/Creativity/Starling_7b_fine_tuned_model_narrativeqa_long/added_tokens.json',
 '/content/drive/MyDrive/Creativity/Starling_7b_fine_tuned_model_narrativeqa_long/tokenizer.json')