In [11]:
import os
import torch
from datasets import load_dataset
import tensor_parallel as tp
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer

)
from datasets import Dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd
import json
import gc
import argparse
from peft import prepare_model_for_kbit_training, get_peft_model
import datetime
import wandb

In [5]:
wandb.login()
wandb_project = "Llam-2-7b-draper"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project
wandb.init(project=wandb_project)

# Assuming you're using argparse to parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--num_train_epochs", type=int, default=3)
parser.add_argument("--per_device_train_batch_size", type=int, default=8)
parser.add_argument("--learning_rate", type=float, default=5e-5)
parser.add_argument("--optim", type=str, default="adamw_bnb_8bit")
parser.add_argument("--weight_decay", type=float, default=0.01)
args = parser.parse_args()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbansijivani[0m ([33mbansijivaniteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


usage: ipykernel_launcher.py [-h] [--num_train_epochs NUM_TRAIN_EPOCHS]
                             [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
                             [--learning_rate LEARNING_RATE]
ipykernel_launcher.py: error: unrecognized arguments: --f=/home/service1/.local/share/jupyter/runtime/kernel-v2-6932238NgcijDaitX1.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
gc.collect()

#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:40000"

In [12]:
#base_model = r"/home/service1/TinyMistral-248M-v2.5"
base_model = r"/home/service1/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/c1b0db933684edbfe29a06fa47eb19cc48025e93"
#draper_dataset = r"C:\coding\LLM\draper_val_mistral.txt"
#new_model = r"/home/service1/llama-2-7b-drapertrain" ## path for a new model to save

### for wandb
run_name = "llama-2-7b"
new_model = r"/home/service1/llama-2-7b-hyperparametertrain" + run_name

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             quantization_config=bnb_config,
                                             device_map="auto"
                                            )

model = accelerator.prepare_model(model)
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

    
# optional
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights

# model.half()
print(next(model.parameters()).device)  # Print model's device
# print(tokenized_inputs["input_ids"].device)  # Print tensor's device

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.config.pretraining_tp = 1
# Reshape the tensor to the desired size
# model.config.pretraining_tp = model.config.pretraining_tp.view(-1, 1)



In [None]:
def tokenize_function(examples):
    if "question" in examples and "answer" in examples:
      text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
      text = examples["input"][0] + examples["output"][0]
    else:
      text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [None]:
######## Train Dataset ########
    
updated_dataset = load_dataset("json", data_files="/home/service1/data/finetune/draper/draper_train_inputids.json", split="train")

tokenized_dataset = updated_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])
print(tokenized_dataset)


In [None]:
#### validation or Evaluation Dataset ####

validation_dataset = load_dataset("json", data_files="/home/service1/data/finetune/draper/draper_val_inputids.json", split="train")
tokenized_val_dataset = validation_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
tokenized_val_dataset = tokenized_val_dataset.add_column("labels", tokenized_val_dataset["input_ids"])
print(tokenized_val_dataset)

In [None]:
import matplotlib.pyplot as plt

def plot_data_lengths(tokenized_train_dataset):
    lengths = [len(x['input_ids']) for x in tokenized_train_dataset]
    lengths += [len(x['input_ids']) for x in tokenized_val_dataset]
    print(len(lengths))

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=20, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    plt.show()

plot_data_lengths(tokenized_dataset)

In [None]:

model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)

In [None]:
######### alternate the process to get the intituiton of the performance of the model ########
#If only targeting attention blocks of the model
#target_modules = ["q_proj", "v_proj"]

#If targeting all linear layers
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']


In [None]:
peft_params = LoraConfig(lora_alpha=16, 
                         lora_dropout=0.1, 
                         r=64, 
                         bias="none", 
                         target_modules=target_modules,
                         task_type="CAUSAL_LM")

In [None]:
#model= get_peft_model(model, peft_params)
model = accelerator.prepare_model(model)

In [13]:
from datetime import datetime
training_args = TrainingArguments(
    output_dir = new_model,
    overwrite_output_dir=True,
    num_train_epochs=args.num_train_epochs,
    gradient_accumulation_steps=8,
    fp16=False,
    bf16=True,
    per_device_train_batch_size=args.per_device_train_batch_size,
    optim=args.optim,
    max_steps=-1,
    group_by_length=True,
    learning_rate=args.learning_rate,
    weight_decay=args.weight_decay,
    warmup_steps=10,
    lr_scheduler_type="linear",
    save_steps= 200,
    save_strategy="steps",
    logging_steps= 25,
    logging_dir=f"{new_model}/logs",
    evaluation_strategy="steps",
    eval_steps=1000,
    do_eval=True, 
    report_to="wandb",
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
)

ValueError: adam is not a valid OptimizerNames, please select one of ['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_torch_npu_fused', 'adamw_apex_fused', 'adafactor', 'adamw_anyprecision', 'sgd', 'adagrad', 'adamw_bnb_8bit', 'adamw_8bit', 'lion_8bit', 'lion_32bit', 'paged_adamw_32bit', 'paged_adamw_8bit', 'paged_lion_32bit', 'paged_lion_8bit', 'rmsprop']

In [None]:

#4-Bit Quantization( use this if you want to quantize the model using PEFT) 
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:40000"
trainer = SFTTrainer(model=model, 
                     train_dataset=tokenized_dataset,
                     eval_dataset=tokenized_val_dataset, 
                     peft_config=peft_params, 
                     tokenizer=tokenizer, 
                     args=training_args, 
                     dataset_text_field="text",
                     data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False))

In [None]:
trainer.train()
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)
gc.collect()

torch.cuda.empty_cache()