In [1]:
import os
# Set CUDA_VISIBLE_DEVICES to "0" here, before torch is imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, EvalPrediction, BitsAndBytesConfig
from transformers.trainer_pt_utils import nested_concat
from datasets import load_dataset, load_metric
import torch
from pynvml import *
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from tqdm import tqdm
from time import time
import wandb
from peft import LoraConfig, get_peft_model, IA3Config, PromptEncoderConfig, PrefixTuningConfig, AdaLoraConfig, prepare_model_for_kbit_training
import numpy as np
from torch.nn.functional import cross_entropy
import transformers
from torch.nn.utils.rnn import pad_sequence

# Switch the transformers library logger to INFO verbosity.
transformers.logging.set_verbosity_info()
# Module-level CUDA device handle; `train` later moves the model onto it.
device = torch.device("cuda")

2024-06-15 09:10:02.914932: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Weights & Biases settings, exported as environment variables before login.
# The API key is blank in this notebook; supply it through the environment
# (or `wandb login`) rather than committing a real key.
os.environ["WANDB_API_KEY"]=""
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]=""
# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"
# turn off watch to log faster ("false" is the documented value that disables
# gradient/parameter watching; the previous "true" is not a recognised setting)
os.environ["WANDB_WATCH"]="false"
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mknowledge-ai-online[0m ([33mfuck-my-diploma[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
def print_gpu_utilization():
    """Print the memory currently in use on GPU index 0, in MB, as reported by NVML.

    NVML is initialised for the duration of the query and shut down again
    afterwards, so repeated calls do not accumulate open NVML sessions.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        # info.used is in bytes; integer-divide down to mebibytes for display.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with nvmlShutdown(), even if the query raises.
        nvmlShutdown()


def print_summary(result):
    """Print runtime and throughput from `result.metrics`, then the GPU memory in use.

    Args:
        result: object exposing a `metrics` mapping that contains the keys
            'train_runtime' and 'train_samples_per_second'.
    """
    stats = result.metrics
    print(f"Time: {stats['train_runtime']:.2f}")
    print(f"Samples/second: {stats['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [6]:
# `torch` is already imported in the imports cell above; this re-import is redundant.
import torch
# Clear PyTorch's CUDA cache, then report how much GPU memory is occupied.
torch.cuda.empty_cache()
print_gpu_utilization()

GPU memory occupied: 4634 MB.


In [7]:
def download_and_save_dataset(dataset_name='wikitext', dataset_config='wikitext-2-raw-v1', save_path='wikitext-2'):
    """Load a dataset and dump its train/validation/test text to three files.

    Each split's 'text' column is written one entry per line (a newline is
    appended to every entry) to `<save_path>-train.txt`, `<save_path>-val.txt`
    and `<save_path>-test.txt`, encoded as UTF-8.

    Args:
        dataset_name: dataset identifier passed to `load_dataset`.
        dataset_config: configuration name passed to `load_dataset`.
        save_path: filename prefix for the three output files.
    """
    corpus = load_dataset(dataset_name,  dataset_config)

    # (split key in the dataset, suffix of the file it is written to)
    split_to_suffix = (
        ('train', '-train.txt'),
        ('validation', '-val.txt'),
        ('test', '-test.txt'),
    )
    for split_key, suffix in split_to_suffix:
        with open(save_path + suffix, 'w', encoding='utf-8') as out_file:
            out_file.writelines(entry + '\n' for entry in corpus[split_key]['text'])
 

# Download WikiText-2 and save it
# download_and_save_dataset()

In [8]:
def freeze_parameters(model, unfreeze_last_n=2):
    """Freeze the whole model, then re-enable gradients for selected parts.

    After the call only these parameters have `requires_grad=True`:
    the last `unfreeze_last_n` entries of `model.transformer.h`, the token
    embeddings `model.transformer.wte`, the positional embeddings
    `model.transformer.wpe`, and `model.lm_head`.

    Args:
        model: module exposing `transformer.h`, `transformer.wte`,
            `transformer.wpe` and `lm_head`.
        unfreeze_last_n: how many trailing blocks of `transformer.h` to unfreeze.
    """
    def _set_trainable(module):
        # Turn gradients back on for every parameter owned by `module`.
        for weight in module.parameters():
            weight.requires_grad = True

    # Start from a fully frozen model.
    for weight in model.parameters():
        weight.requires_grad = False

    backbone = model.transformer

    # Walk backwards from the final block: h[-1], h[-2], ...
    for offset in range(1, unfreeze_last_n + 1):
        _set_trainable(backbone.h[-offset])

    _set_trainable(backbone.wte)   # word token embeddings
    _set_trainable(backbone.wpe)   # positional embeddings
    _set_trainable(model.lm_head)  # LM head

def count_parameters(model):
    """Print a per-parameter table of trainable sizes and return their total.

    Only parameters with `requires_grad=True` are listed and counted. The table
    is rendered with plain string formatting, so no table-rendering package
    needs to be imported for this function to run.

    Args:
        model: object exposing `named_parameters()` yielding (name, parameter)
            pairs whose parameters provide `requires_grad` and `numel()`.

    Returns:
        int: total number of elements across all trainable parameters.
    """
    rows = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    total_params = sum(count for _, count in rows)

    headers = ("Modules", "Parameters")
    # Column widths fit the longest cell, never narrower than the header.
    name_width = max([len(headers[0])] + [len(name) for name, _ in rows])
    count_width = max([len(headers[1])] + [len(str(count)) for _, count in rows])

    rule = f"+{'-' * (name_width + 2)}+{'-' * (count_width + 2)}+"
    lines = [rule, f"| {headers[0]:<{name_width}} | {headers[1]:>{count_width}} |", rule]
    lines.extend(f"| {name:<{name_width}} | {count:>{count_width}} |" for name, count in rows)
    lines.append(rule)

    print("\n".join(lines))
    print(f"Total Trainable Params: {total_params}")
    return total_params


class CustomTrainer(Trainer):
    """Trainer subclass whose training step also calls `update_and_allocate`.

    The overridden `training_step` performs the forward/backward pass and then
    calls `model.base_model.update_and_allocate(...)` with the current global
    step (labelled "AdaLoRA" in the code below).
    """

    def __init__(self, *args, **kwargs):
        # Pure pass-through to the parent constructor; no extra state is added.
        super().__init__(*args, **kwargs)

    def training_step(self, model: torch.nn.Module, inputs: dict) -> torch.Tensor:
        """Run one forward/backward pass and trigger the AdaLoRA update.

        Args:
            model: the module being trained; it must expose
                `base_model.update_and_allocate`.
            inputs: batch dictionary, passed through `self._prepare_inputs`
                before the loss is computed.

        Returns:
            The detached loss divided by `self.args.gradient_accumulation_steps`.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)
        
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        # Drop the batch reference and clear the CUDA cache on every step.
        del inputs
        torch.cuda.empty_cache()

        # Always empty here; forwarded unchanged to accelerator.backward below.
        kwargs = {}

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if self.use_apex:
            # NOTE(review): `amp` is not imported in any visible cell of this notebook,
            # so this branch would raise NameError if use_apex is ever true — confirm.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss, **kwargs)

        # Update and allocate with AdaLoRA
        # Runs after backward on every call of this method, using the trainer's global step.
        model.base_model.update_and_allocate(self.state.global_step)

        return loss.detach() / self.args.gradient_accumulation_steps

In [9]:
def compute_metrics(eval_pred: "EvalPrediction"):
    """Compute next-token perplexity from causal-LM logits and labels.

    The logits at position t are scored against the label at position t + 1
    (the usual causal-LM shift). Positions whose shifted label is -100 are
    ignored. The loss is computed with a log-sum-exp log-softmax, so no
    epsilon is added inside the logarithm.

    Args:
        eval_pred: an object that unpacks into `(logits, labels)`. Iterating
            the two in parallel must yield NumPy-compatible arrays whose
            logits have shape (..., seq_len, vocab) and whose labels have
            shape (..., seq_len).

    Returns:
        dict: `{"perplexity": float}`; the value is `inf` when no label is
        left to score.
    """
    logits_list, labels_list = eval_pred
    total_loss = 0.0
    total_count = 0

    for logits, labels in zip(logits_list, labels_list):
        logits = np.asarray(logits)
        labels = np.asarray(labels)

        # Causal-LM alignment: drop the last prediction and the first label so
        # that the logits at position t line up with the token at t + 1.
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]

        # Apply mask to exclude padding (or other ignore indices marked by -100)
        mask = shift_labels != -100
        valid_labels = shift_labels[mask]
        if valid_labels.size == 0:
            continue

        # Boolean indexing flattens the leading axes: result is (n_valid, vocab).
        valid_logits = shift_logits[mask]
        # Work in at least float32 so half-precision logits do not lose accuracy.
        work_dtype = np.promote_types(valid_logits.dtype, np.float32)
        valid_logits = valid_logits.astype(work_dtype, copy=False)

        # Stable log-softmax: subtract the row max, then log-sum-exp.
        centered = valid_logits - valid_logits.max(axis=-1, keepdims=True)
        log_norm = np.log(np.exp(centered).sum(axis=-1, dtype=np.float64))
        target_scores = centered[np.arange(valid_labels.size), valid_labels]
        token_nll = log_norm - target_scores

        total_loss += float(token_nll.sum(dtype=np.float64))
        total_count += int(valid_labels.size)

    # Calculate perplexity from total accumulated loss and count
    if total_count > 0:
        perplexity = float(np.exp(total_loss / total_count))
    else:
        perplexity = float('inf')  # Handle cases with no valid data

    return {"perplexity": perplexity}

def _load_base_model(config, model_name):
    """Load GPT-2, with 4-bit NF4 quantization when the method name contains "qlora"."""
    if "qlora" in config.peft_method:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=False,
        )
        return GPT2LMHeadModel.from_pretrained(
            model_name,
            quantization_config=bnb_config
        )
    return GPT2LMHeadModel.from_pretrained(model_name)


def _build_lora_family_config(config):
    """Build the LoraConfig shared by the lora / rslora / dora / qlora methods."""
    method = config.peft_method
    variant_flags = {}
    if "rslora" in method:
        variant_flags["use_rslora"] = config.use_rslora
    if "dora" in method:
        variant_flags["use_dora"] = config.use_dora
    return LoraConfig(
        r=config.r,
        lora_alpha=config.r * config.alpha_multiplier,
        lora_dropout=config.lora_dropout,
        fan_in_fan_out=config.fan_in_fan_out,
        bias=config.bias,
        task_type="CAUSAL_LM",
        **variant_flags,
    )


def _apply_adapter(model, config):
    """Wrap `model` with at most one AdaLoRA / IA3 / LoRA-family adapter."""
    method = config.peft_method

    # Most specific names first: "adalora" also contains "lora", and the
    # branches are mutually exclusive so the model is never wrapped twice.
    if "adalora" in method:
        peft_config = AdaLoraConfig(
            init_r=config.init_r,
            target_r=config.target_r,
            lora_alpha=config.target_r * 2,
            beta1=config.beta1,
            beta2=config.beta2,
            tinit=config.tinit,
            total_step=config.total_step,
            lora_dropout=0.01,
            task_type="CAUSAL_LM",
            inference_mode=False,
        )
    elif "IA3" in method:
        # NOTE(review): reads a nested `config.config[...]` mapping rather than
        # top-level sweep parameters — confirm this matches the sweep definition.
        peft_config = IA3Config(
            task_type="CAUSAL_LM",
            target_modules=config.config['target_modules'],
            feedforward_modules=config.config['feedforward_modules']
        )
    elif "lora" in method or "dora" in method:
        if "qlora" in method:
            # Prepare the quantized base model before the adapter is attached.
            model.gradient_checkpointing_enable()
            model = prepare_model_for_kbit_training(model)
        peft_config = _build_lora_family_config(config)
    else:
        return model

    return get_peft_model(model, peft_config)


def _apply_prompt_method(model, config):
    """Wrap `model` for prefix tuning or p-tuning when the method name asks for it."""
    method = config.peft_method
    if "prefix tuning" in method:
        peft_config = PrefixTuningConfig(
            num_virtual_tokens=config.num_virtual_tokens,
            prefix_projection=config.prefix_projection,
            task_type="CAUSAL_LM"
        )
    elif "p-tuning" in method:
        # P-tuning is configured through PromptEncoderConfig, not PrefixTuningConfig.
        peft_config = PromptEncoderConfig(
            num_virtual_tokens=config.num_virtual_tokens,
            encoder_hidden_size=config.encoder_hidden_size,
            task_type="CAUSAL_LM"
        )
    else:
        return model

    return get_peft_model(model, peft_config)


def train(config=None):
    """Run one wandb-tracked fine-tuning run of GPT-2 on the WikiText-2 text files.

    The PEFT method is chosen by substring tests on `config.peft_method`
    ("adalora", "IA3", "qlora", "rslora", "dora", "lora", "prefix tuning",
    "p-tuning"). At most one adapter from the LoRA/IA3 family is applied, then
    optionally one prompt-based method. The model is evaluated once before
    training and then trained for 5 epochs.

    Args:
        config: initial wandb config; when launched by `wandb.agent` the
            values come from the sweep controller via `wandb.config`.
    """
    # The `with` block finishes the wandb run on exit, so no explicit
    # wandb.finish() call is needed at the end.
    with wandb.init(config=config):
        # If called by wandb.agent, this config will be set by Sweep Controller
        config = wandb.config

        model_name = 'gpt2'
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)

        model = _load_base_model(config, model_name)
        model = _apply_adapter(model, config)
        model = _apply_prompt_method(model, config)

        # Make sure every adapter / prompt parameter is trainable.
        for name, param in model.named_parameters():
            if 'prompt' in name or 'lora' in name:
                param.requires_grad = True

        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path='wikitext-2-train.txt',
            block_size=512)
        val_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path='wikitext-2-val.txt',
            block_size=512)

        model = model.to(device)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=False
        )

        # Set training arguments
        training_args = TrainingArguments(
            output_dir='adalora_sweap',
            report_to="wandb",
            num_train_epochs=5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            save_steps=10_000,
            save_strategy='epoch',
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            include_tokens_per_second=True,
            include_num_input_tokens_seen=True,
        )

        # AdaLoRA needs the custom training step that calls update_and_allocate;
        # use the same substring rule that selected the adapter above.
        trainer_cls = CustomTrainer if "adalora" in config.peft_method else Trainer
        trainer = trainer_cls(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            # compute_metrics is intentionally not passed here.
        )

        num_trainable = trainer.get_num_trainable_parameters()
        print(f"Training with config = {config}, \n Number of trainable parametrs = {num_trainable}")
        wandb.log({"number_of_params": num_trainable})

        # Baseline evaluation before any training, then the training run itself.
        trainer.evaluate()
        trainer.train()


In [None]:
# Launch a sweep agent that invokes `train` for its runs.
# NOTE(review): 'your_sweep_id' looks like a placeholder — replace with a real sweep id.
wandb.agent('your_sweep_id', train)