In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import sys

from dotenv import load_dotenv
env = load_dotenv('token.env')
api_token = os.getenv('api_token')

In [25]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of available GPUs
    num_gpus = torch.cuda.device_count()
    print(f'Number of GPUs available: {num_gpus}')

    # Print the name of each GPU
    for i in range(num_gpus):
        print(f'GPU {i}: {torch.cuda.get_device_name(i)}')
else:
    print('No GPUs available, running on CPU.')

Number of GPUs available: 1
GPU 0: NVIDIA GeForce RTX 3060


In [30]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token = api_token)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config,device_map = 'cuda', token = api_token)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.00it/s]


In [31]:
file_names = os.listdir("data")
from datasets import Dataset, DatasetDict

data_directory = "data"

datasets = DatasetDict()
for i,file in enumerate(file_names):
    with open(f"{data_directory}/{file}", 'r') as f:
        data = f.read()
    datasets[i] = Dataset.from_dict({'text': [data]})

In [32]:
print(len(datasets))

39


In [33]:
from datasets import concatenate_datasets

# Get the list of datasets from the DatasetDict
datasets_list = list(datasets.values())

# Concatenate the datasets into a single Dataset
dataset = concatenate_datasets(datasets_list)

In [35]:
print(dataset['text'][5])


[Narrator]: Within the heart of the enchanted forest, where ancient trees stand sentinel and the air is thick with the scent of magic, a fearsome creature prowls, its eyes burning with primal rage. With a deafening roar, it emerges from the shadows, its massive form casting a menacing shadow over the forest floor.

[Player]: With a steady hand, I draw my weapon, prepared to confront the beast and defend myself.

[Narrator]: The clash of steel fills the air as you engage in a fierce battle with the creature, each strike ringing out like thunder amidst the tranquil forest. Despite its ferocity, you stand your ground, determined to emerge victorious against this formidable adversary.

[Player]: I fight with all my strength, seeking out weaknesses in the beast's defenses as I strive to gain the upper hand.

[Narrator]: With each exchange of blows, you feel the weight of the creature's relentless assault bearing down upon you, testing your resolve and pushing you to the brink of exhaustion

In [36]:
def tokenize_function(example):
    start_prompt = '[Game Start]\n\n'
    example['text'] = start_prompt + example['text']
    example['text'] = tokenizer(example['text'], padding = 'max_length',max_length=2500,truncation= True , return_tensors="pt").to('cuda')
    
    return example

# def tokenize_function(example):
#     start_prompt = '[Game Start]\n\n'
#     example['text'] = tokenizer(start_prompt + example['text'])
#     return example

In [37]:
tk_dataset = dataset.map(tokenize_function)

Map: 100%|██████████| 39/39 [00:00<00:00, 496.58 examples/s]


In [10]:
# for i,value in enumerate(tk_dataset['text'][7]['input_ids'][0]):
#     if value != 0:
#         print(True)

In [11]:
# tk_dataset = tk_dataset.remove_columns(['text'])
# tk_dataset = tk_dataset.rename_column('token', 'input_ids')

In [38]:
tk_dataset

Dataset({
    features: ['text'],
    num_rows: 39
})

In [39]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    # gradient_accumulation_steps=gradient_accumulation_steps,
    # save_steps=save_steps,
    # logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    # max_grad_norm=max_grad_norm,
    # max_steps=max_steps,
    # warmup_ratio=warmup_ratio,
    # group_by_length=True,
    # gradient_checkpointing=True,
)

In [1]:
import transformers
from trl import SFTTrainer
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

train_args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    )


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import transformers
from trl import SFTTrainer
def formatting_func(example):
    text = f"<start_of_turn>user\n{example['INSTRUCTION'][0]}<end_of_turn> <start_of_turn>model\n{example['RESPONSE'][0]}<end_of_turn>"
    return [text]

import transformers
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=tk_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=150,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)

NameError: name 'model' is not defined

In [15]:
len(tk_dataset['text'][0])

2

In [16]:
tk_dataset['text'][0]['input_ids']

[[2,
  235309,
  6242,
  7248,
  235307,
  109,
  235309,
  154506,
  8254,
  1877,
  692,
  27418,
  22583,
  1280,
  573,
  115885,
  9309,
  235269,
  573,
  25619,
  55868,
  3131,
  78608,
  41471,
  12136,
  576,
  2611,
  578,
  12425,
  3054,
  573,
  9309,
  6784,
  235265,
  1646,
  798,
  2375,
  573,
  87434,
  4134,
  57873,
  1384,
  2449,
  692,
  235269,
  216571,
  696,
  573,
  8566,
  576,
  22172,
  1996,
  27290,
  120076,
  71557,
  573,
  8195,
  235265,
  62122,
  235269,
  692,
  2063,
  4492,
  476,
  37003,
  1570,
  476,
  75045,
  27988,
  12353,
  1794,
  692,
  235269,
  1277,
  4628,
  134775,
  675,
  671,
  1156,
  192464,
  17273,
  235265,
  109,
  235309,
  7324,
  8254,
  590,
  102079,
  5688,
  573,
  27988,
  578,
  20483,
  1277,
  157682,
  235265,
  109,
  235309,
  154506,
  8254,
  714,
  27988,
  235269,
  78248,
  476,
  70318,
  43314,
  675,
  476,
  166962,
  13618,
  576,
  8343,
  235269,
  13807,
  692,
  675,
  39987,
  235265,
  9

In [17]:
trainer.train()

  0%|          | 0/30 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


ValueError: Attempting to unscale FP16 gradients.