In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig
from dotenv import load_dotenv

# Populate the process environment from the local dotenv file.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# configurable before sharing the notebook.
env_file = "/mnt/c/projects/game/token.env"
load_dotenv(env_file)

# Look the token up by its environment key; the result is None when the key is absent.
api_token = os.environ.get("api_token")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

# Report which accelerators PyTorch can see before any model is loaded.
if not torch.cuda.is_available():
    # No CUDA device visible to PyTorch.
    print('No GPUs available, running on CPU.')
else:
    # Count the visible CUDA devices ...
    num_gpus = torch.cuda.device_count()
    print(f'Number of GPUs available: {num_gpus}')

    # ... and list each one by index and device name.
    for i in range(num_gpus):
        print(f'GPU {i}: {torch.cuda.get_device_name(i)}')

Number of GPUs available: 1
GPU 0: NVIDIA GeForce RTX 3060


### Dataset Prep

In [3]:
from datasets import Dataset, DatasetDict

# Single source of truth for the input folder (used for listing and for opening).
data_directory = "data"

# Sort the listing: os.listdir() returns entries in arbitrary order, which would
# make the row order of the resulting dataset differ between runs or machines.
# Only regular files are kept so that a stray sub-directory inside the folder
# does not make open() fail.
file_names = sorted(
    name
    for name in os.listdir(data_directory)
    if os.path.isfile(os.path.join(data_directory, name))
)

# Build one single-row Dataset per file, keyed by the file's position in the
# sorted listing. Each row stores the whole file content under the 'text' column.
datasets = DatasetDict()
for i, file in enumerate(file_names):
    # Explicit encoding: without it the decoder depends on the platform locale.
    with open(os.path.join(data_directory, file), 'r', encoding='utf-8') as f:
        data = f.read()
    datasets[i] = Dataset.from_dict({'text': [data]})

In [4]:
# Sanity check: print how many per-file entries the `datasets` mapping holds.
print(len(datasets))

114


In [5]:
from datasets import concatenate_datasets

# Unpack the DatasetDict values into a plain list of per-file datasets ...
datasets_list = [*datasets.values()]

# ... and join them into the single Dataset used for training.
dataset = concatenate_datasets(datasets_list)

In [6]:
# Display the combined dataset's summary (its features and row count).
dataset

Dataset({
    features: ['text'],
    num_rows: 114
})

### Model

In [7]:
# Hub identifier of the base checkpoint that is fine-tuned in this notebook.
model_name = "google/gemma-2b"

# Load the weights in 4-bit using the "nf4" quantization type, with float16 as
# the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# `trust_remote_code` is deliberately left at its default (False): the Gemma
# architecture is implemented inside transformers, so there is no need to allow
# execution of Python code hosted in the model repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
# Turn the cache flag off for training — presumably because gradient
# checkpointing is enabled in the training arguments; confirm if that changes.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Left disabled: `eos` is not defined in the visible cells, so re-enabling the
# line below as written would raise a NameError.
# tokenizer.pad_token = eos

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.39s/it]


In [8]:
# LoRA hyperparameters, kept as module-level names: rank, alpha and dropout.
lora_r = 16
lora_alpha = 16
lora_dropout = 0.1

# Adapter configuration for the causal-LM task, with bias set to "none".
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

# Note: an earlier, now-disabled variant of this cell used lora_r = 64 with the
# same alpha, dropout, bias and task type.

In [9]:
from transformers import TrainingArguments

# --- Output and bookkeeping ---
output_dir = "./results"
save_steps = 10
logging_steps = 10

# --- Optimisation ---
optim = "paged_adamw_32bit"
learning_rate = 2e-4
lr_scheduler_type = "constant"
weight_decay = 0.01
max_grad_norm = 0.3

# --- Schedule ---
num_train_epochs = 10
gradient_accumulation_steps = 1

# Bundle everything into the TrainingArguments object given to the trainer.
training_args = TrainingArguments(
    # where and how often to write checkpoints and logs
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    # optimiser settings
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # training length
    num_train_epochs=num_train_epochs,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # precision, batching and memory options
    fp16=True,
    group_by_length=True,
    gradient_checkpointing=True,
)

In [10]:
from trl import SFTTrainer

# Maximum sequence length passed to the trainer.
# NOTE(review): each dataset row holds one whole file; text beyond this many
# tokens is presumably truncated — confirm this is intended.
max_seq_length = 500

# Wire the quantized model, tokenizer, LoRA config and data into the SFT trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
)

Map: 100%|██████████| 114/114 [00:00<00:00, 2182.71 examples/s]


In [11]:
# Run the fine-tuning loop on the trainer built from the model, dataset and
# training arguments defined in the earlier cells.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maditobito[0m ([33madit_ahmedabadi[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


KeyboardInterrupt: 