# Download all the Python Libraries

In [None]:
# Bootstrap cell: install the notebook's dependencies when running inside Google Colab,
# then set up logging and print the versions of the loaded packages.
import sys
# The `google.colab` module is only present in sys.modules inside a Colab runtime.
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    !pip install rich
    # Install PyTorch >= 2.0 from the CUDA 11.7 wheel index
    !pip install "torch>=2.0" --extra-index-url https://download.pytorch.org/whl/cu117 --upgrade --quiet
    !pip install -q -U bitsandbytes
    # transformers, peft and accelerate are installed from their GitHub main branches (unpinned).
    !pip install -q -U git+https://github.com/huggingface/transformers.git 
    !pip install -q -U git+https://github.com/huggingface/peft.git
    !pip install -q -U git+https://github.com/huggingface/accelerate.git
    !pip install datasets
    !pip install wandb
    #!pip install ray[tune]
    !pip install langchain
    !pip install session-info
    !pip install tensorboard
else:
    # Outside Colab nothing is installed; the imports below must already be available.
    print("Not running on Google Colab")
# From here on `print` is rich's drop-in replacement, which shadows the built-in.
from rich import print
import logging
from pathlib import Path
# Module-level logger reused by the helper functions defined in later cells.
logger = logging.getLogger(__name__)
#ROOT_PATH = Path(__file__).parent.parent
import session_info
# Display the versions of the imported packages for reproducibility.
session_info.show()

# Check the GPU env
1. You can check which GPU Google Colab assigned to the runtime, and how efficient it is, with the cell below.
2. Check whether the GPU supports bfloat16: it is the most effective dtype to use because most models are pre-trained in bfloat16.

In [None]:
import torch
from rich import print
# Report the CUDA device only when PyTorch can see one; on CPU-only machines this cell prints nothing.
if torch.cuda.is_available():
    # Driver-level summary of the GPU printed by the NVIDIA command-line tool.
    !nvidia-smi
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print("Cuda capability: ", torch.cuda.get_device_capability(0))
    '''
    On pre-ampere hardware bf16 works, but doesn't provide speed-ups compared to fp32 matmul operations, and some matmul operations are failing outright, so this check is more like "guaranteed to work and be performant" than "works somehow".  https://github.com/pytorch/pytorch/issues/75427
    '''
    # The result of this check also selects bf16 vs fp16 in the quantization and training cells below.
    print(f"bfloat16 support: { torch.cuda.is_bf16_supported()}") 

# Set the Notebook Seed to Ensure Reproducibility

In [None]:
from transformers import set_seed

# Single seed shared by the whole notebook (dataset shuffling, train/test split, Trainer).
DEFAULT_SEED = 42

# transformers helper that seeds the random number generators it knows about in one call.
set_seed( DEFAULT_SEED )

# Constant Variable

In [2]:
#@title Default title text
variable_name = "paged_lion_8bit" #@param ["paged_lion_8bit"]
# Quantization width used when loading the base model; read by the LoRA helper functions below.
BITS  = 4
# Optimizer name handed to TrainingArguments(optim=...) and to determine_batch_size().
OPTIMIZER = "paged_lion_8bit"
# Allow TF32 for float32 matrix multiplications on CUDA.
# NOTE(review): this flag concerns TF32, not bf16; bf16 is selected separately via
# torch.cuda.is_bf16_supported() in later cells.
import torch
torch.backends.cuda.matmul.allow_tf32 = True

# Setup Weight And Bias 💡 Configuration tips

W&B integration with Hugging Face can be configured to add extra functionalities:

* auto-logging of models as artifacts: just set the environment variable `WANDB_LOG_MODEL` to `true`
* log histograms of gradients and parameters: by default gradients are logged, you can also log parameters by setting environment variable `WANDB_WATCH` to `all`
* set custom run names with `run_name` arg present in scripts or as part of `TrainingArguments`
* organize runs by project with the `WANDB_PROJECT` environment variable

For more details refer to [W&B + HF integration documentation](https://docs.wandb.ai/integrations/huggingface).

In [None]:
import wandb
import os

# Authenticate against Weights & Biases before any run is created.
wandb.login()

# Upload the trained model checkpoint to W&B as an artifact.
%env WANDB_LOG_MODEL=true

# Set the W&B project where this run will be logged.
os.environ["WANDB_PROJECT"]="prompt_generator"

# If you do not want the checkpoint uploaded to W&B, set WANDB_LOG_MODEL to false instead:
# the model has about 3 billion parameters, so the artifact is large.
#os.environ["WANDB_LOG_MODEL"]

# WANDB_WATCH="all" would additionally log parameter histograms; it is left disabled
# because it raised an error here and slows logging down.
#os.environ["WANDB_WATCH"]= "all" # I am getting an error with this


# Set Up Local Training Root

# Download the Dataset from the Hugging Face Hub

In [None]:
from datasets import load_dataset

# Hub identifier of the instruction dataset. It is also the default argument of
# load_training_dataset() and preprocess_dataset() defined in later cells.
DEFAULT_TRAINING_DATASET = "Rami/prompts"

# Use the constant rather than repeating the literal so that changing the dataset in one
# place changes it everywhere.
training_dataset = load_dataset(DEFAULT_TRAINING_DATASET)
print(training_dataset)


# Download the Tokenizers
1. We are using the Dolly model, which was trained on top of the Pythia model. Instead of reusing the Dolly tokenizer directly, we recreate it from the Pythia tokenizer.

In [1]:
from transformers import AutoTokenizer

# Special Tokens
# Marker strings that delimit the sections of a training/inference prompt.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
# Response marker followed by a newline; the data collator encodes this to locate the response.
RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n"
DEFAULT_SEED = 42

# Base model that is fine-tuned; also used by model_init() in a later cell.
PRETRAINED_MODEL_NAME_OR_PATH = "databricks/dolly-v2-3b"#"databricks/dolly-v2-3b"
# Hub names of the Pythia checkpoints (the variables say "python" but hold Pythia model ids).
eleutherai_python_3b = "EleutherAI/pythia-2.8b"
eleutherai_python_7b = "EleutherAI/pythia-6.9b"
# The Dolly tokenizer is loaded only to print it as a reference for the comparison below.
dolly_v2_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
print(dolly_v2_tokenizer)
# The tokenizer actually used for training starts from the Pythia 6.9b tokenizer.
pythia_tokenizer = AutoTokenizer.from_pretrained(eleutherai_python_7b ,
                                                 padding_side  = "right")

print(pythia_tokenizer)

# Reuse the end-of-sequence token as the padding token, since no pad token is set
# on the freshly loaded tokenizer (see the printed special tokens).
pythia_tokenizer.pad_token = pythia_tokenizer.eos_token

# Register the End, Instruction and Response markers as additional special tokens
pythia_tokenizer.add_special_tokens({
    "additional_special_tokens": [
        END_KEY,
        INSTRUCTION_KEY,
        RESPONSE_KEY,
    ]
})

# Printed again so the special tokens can be compared with the Dolly tokenizer above.
print(pythia_tokenizer)

GPTNeoXTokenizerFast(name_or_path='databricks/dolly-v2-3b', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['### End', '### Instruction:', '### Response:']}, clean_up_tokenization_spaces=True)
GPTNeoXTokenizerFast(name_or_path='EleutherAI/pythia-6.9b', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)
GPTNeoXTokenizerFast(name_or_path='EleutherAI/pythia-6.9b', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftex

# Process the dataset
1. Convert the dataset into instruction

## Instruct Format String and Constants

In [None]:
# Fixed first line of every prompt.
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# Section markers.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"
DEFAULT_SEED = 42

# Each template below is assembled line by line; an empty string produces a blank line.
# The "{instruction}", "{input}" and "{response}" placeholders are left in place so the
# templates can be completed later with str.format().

# Training prompt without an input string: the instruction alone carries enough
# information to respond (e.g. asking for the year a historic figure was born).
PROMPT_NO_INPUT_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        "",
        INSTRUCTION_KEY,
        "{instruction}",
        "",
        RESPONSE_KEY,
        "{response}",
        "",
        END_KEY,
    ]
)

# Training prompt with an input string that gives context for the instruction
# (e.g. a Wikipedia passage from which some information must be extracted).
PROMPT_WITH_INPUT_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        "",
        INSTRUCTION_KEY,
        "{instruction}",
        "",
        INPUT_KEY,
        "{input}",
        "",
        RESPONSE_KEY,
        "{response}",
        "",
        END_KEY,
    ]
)

# Prompt used for generation with an already trained model. It stops right after the
# response key (plus a newline) so that the model supplies the response itself.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        "",
        INSTRUCTION_KEY,
        "{instruction}",
        "",
        RESPONSE_KEY,
        "",
    ]
)

In [None]:
from datasets import Dataset, load_dataset

def load_training_dataset(path_or_dataset: str = DEFAULT_TRAINING_DATASET) -> Dataset:
    """Load the "train" split of a dataset and add a formatted ``text`` column to every record.

    Args:
        path_or_dataset (str): Identifier passed to ``load_dataset``.

    Returns:
        Dataset: The "train" split with a ``text`` field holding the full training prompt.

    Raises:
        ValueError: If a record has an empty ``instruction`` or ``response``.
    """
    logger.info(f"Loading dataset from {path_or_dataset}")
    train_split = load_dataset(path_or_dataset)["train"]
    logger.info("Found %d rows", train_split.num_rows)

    def _add_text(rec):
        # Read all three fields up front; "context" is optional, the other two are required keys.
        instruction, response = rec["instruction"], rec["response"]
        context = rec.get("context")

        if not instruction:
            raise ValueError(f"Expected an instruction in: {rec}")
        if not response:
            raise ValueError(f"Expected a response in: {rec}")

        # Some instructions come with an input that provides context (for example a passage
        # to extract information from); others are self-contained (for example open QA).
        # The template is chosen accordingly.
        rec["text"] = (
            PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, response=response, input=context)
            if context
            else PROMPT_NO_INPUT_FORMAT.format(instruction=instruction, response=response)
        )
        return rec

    return train_split.map(_add_text)

## Preprocess the dataset methods

In [None]:
import logging
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)
def preprocess_batch(batch: Dict[str, List], tokenizer: AutoTokenizer, max_length: int) -> dict:
    """Tokenize the ``text`` column of a batch, truncating every entry to ``max_length`` tokens."""
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed=DEFAULT_SEED, training_dataset: str = DEFAULT_TRAINING_DATASET) -> Dataset:
    """Loads the training dataset and tokenizes it so it is ready for training.

    Args:
        tokenizer (AutoTokenizer): Tokenizer tied to the model.
        max_length (int): Maximum number of tokens to emit from tokenizer.
        seed (int): Seed used to shuffle the tokenized dataset.
        training_dataset (str): Dataset identifier forwarded to ``load_training_dataset``.

    Returns:
        Dataset: HuggingFace dataset
    """

    dataset = load_training_dataset(training_dataset)

    # Drop the raw text columns once they are tokenized. Only columns that actually exist are
    # listed, because load_training_dataset treats "context" as optional and a dataset without
    # it (or without "category") would otherwise make map() fail on an unknown column.
    raw_columns = ["instruction", "context", "response", "text", "category"]
    columns_to_drop = [column for column in raw_columns if column in dataset.column_names]

    logger.info("Preprocessing dataset")
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=columns_to_drop,
    )

    # Make sure we don't have any truncated records, as this would mean the end keyword is missing.
    # A record that reaches max_length may have been truncated, so it is dropped as well.
    logger.info("Processed dataset has %d rows", dataset.num_rows)
    dataset = dataset.filter(lambda rec: len(rec["input_ids"]) < max_length)
    logger.info("Processed dataset has %d rows after filtering for truncated records", dataset.num_rows)

    logger.info("Shuffling dataset")
    dataset = dataset.shuffle(seed=seed)

    logger.info("Done preprocessing")

    return dataset

## Data Collator For Completion Only LM

In [None]:
import numpy as np
# The author use the Data Collator for Adding special tokens, such as the beginning-of-sequence and end-of-sequence tokens.
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that masks the prompt so the loss covers only the response."""

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and set the labels up to and including the response key to -100.

        Raises:
            RuntimeError: If the first token of the encoded response key is absent from a row.
        """
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline. It is encoded once and its
        # first token id is then searched for in every row of labels.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)

        source_labels = batch["labels"]
        masked_labels = source_labels.clone()

        for row in range(len(examples)):
            # Positions in this row holding the first token of the response key.
            hits = np.where(source_labels[row] == response_token_ids[0])[0]

            if len(hits) == 0:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {source_labels[row]}'
                )

            # -100 makes the PyTorch loss ignore everything up to and including the first match.
            masked_labels[row, : hits[0] + 1] = -100

        batch["labels"] = masked_labels

        return batch

# Download the Model
1. Torch Datat

## Set Up the bitsandbytes Config

## 4 Bit Configuration
1. 4-bit NormalFloat (NF4) quantization
2. Double quantization saves even more memory

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# NOTE(review): model_id is not referenced by any other cell visible in this notebook;
# the model that is loaded is PRETRAINED_MODEL_NAME_OR_PATH — confirm before removing.
model_id = "EleutherAI/gpt-neox-20b"
# Quantization settings consumed by model_init() through `quantization_config`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit = False,
    bnb_4bit_use_double_quant=True , # double quantization
    bnb_4bit_quant_type="nf4",  # 4-bit NormalFloat, as listed in the section heading above
    # Compute in bf16 when the GPU reports support for it, otherwise fall back to fp16.
    bnb_4bit_compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
)


## Download the LM Models
Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [None]:
from transformers import AutoModelForCausalLM
assert torch.cuda.is_available(), "You need to have a GPU to run this notebook."
n_gpus = torch.cuda.device_count()
def model_init():
    """Load the quantized base model onto the available GPUs.

    The quantization mode comes from the module-level ``bnb_config``; the model id comes from
    ``PRETRAINED_MODEL_NAME_OR_PATH``.

    Returns:
        The model returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    n_gpus = torch.cuda.device_count()
    print(f"Number of GPUs: {n_gpus}")

    # Per-device budget: the free memory of that device, in whole GB, minus 2 GB of headroom.
    # Each device is queried individually instead of reusing device 0's figure for all of them.
    max_memory = {
        device: f"{int(torch.cuda.mem_get_info(device)[0] / 1024**3) - 2}GB"
        for device in range(n_gpus)
    }
    print(f"Max memory: {max_memory}")

    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    # load_in_4bit / load_in_8bit are deliberately NOT passed here: bnb_config already carries
    # them, and transformers rejects these kwargs when quantization_config is given as well.
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=PRETRAINED_MODEL_NAME_OR_PATH,
        trust_remote_code=True,
        use_cache=False,
        torch_dtype=compute_dtype,
        device_map="auto",
        low_cpu_mem_usage=True,  # low cpu memory usage is to be true when the device map is auto
        max_memory=max_memory,
        quantization_config=bnb_config,
    )
    return model

model = model_init()

## Determine the Max Length of the Model

In [None]:
# Use the same max length that the model supports.  Fall back to 1024 if the setting can't be found.
# The configuraton for the length can be stored under different names depending on the model.  Here we attempt
# a few possible names we've encountered.
conf = model.config
max_length = None
# The first truthy setting wins; the for/else falls through to the default when none is found.
for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
    max_length = getattr(conf, length_setting, None)
    if max_length:
        logger.info(f"Found max lenth: {max_length}")
        break
else:
    max_length = 1024
    logger.info(f"Using default max length: {max_length}")
print(max_length)

In [None]:
# https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing#scrollTo=jq0nX33BmfaC
from peft import prepare_model_for_kbit_training
def print_trainable_parameters( model):
    """
    Prints the trainable parameter count, the total parameter count and their ratio in percent.

    When the module-level BITS is 4 the trainable count is halved before it is reported.
    """
    parameters = [param for _, param in model.named_parameters()]
    all_param = sum(param.numel() for param in parameters)
    trainable_params = sum(param.numel() for param in parameters if param.requires_grad)
    if BITS == 4:
        trainable_params /= 2
    percentage = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || "
        f"all params: {all_param} || "
        f"trainable: {percentage}"
    )
# Reference implementation of the helper used below:
# https://github.com/huggingface/peft/blob/main/src/peft/utils/other.py#LL62C1-L97C17
# Prepare the quantized model for training, requesting gradient checkpointing.
model = prepare_model_for_kbit_training(model , use_gradient_checkpointing = True)
# The QLoRA code enables gradient checkpointing a second time, so this notebook does the same.
model.gradient_checkpointing_enable()

## Set up the Q LoRa Models
1. QLoRA adds adapters at every network layer and thereby avoids almost all of the accuracy trade-offs seen in prior work.

In [None]:
from peft import LoraConfig, get_peft_model
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
import bitsandbytes as bnb
def find_all_linear_names( model):
    """Return the leaf names of all linear modules of the active precision, without ``lm_head``.

    The module class looked for depends on the module-level BITS: the bitsandbytes 4-bit or
    8-bit linear layer, or ``torch.nn.Linear`` for any other value.
    """
    if BITS == 4:
        cls = bnb.nn.Linear4bit
    elif BITS == 8:
        cls = bnb.nn.Linear8bitLt
    else:
        cls = torch.nn.Linear

    # Only the last component of the dotted module path is kept.
    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, cls)
    }

    # needed for 16-bit
    lora_module_names.discard('lm_head')
    return list(lora_module_names)

print(find_all_linear_names(model)) # e.g. ['query_key_value', 'dense_h_to_4h', 'dense_4h_to_h', 'dense']: the QLoRA paper adds an adapter to each of these layers

# LoRA adapter settings; the adapters target every linear layer found above.
lora_config = LoraConfig(
    r=8, # author's note: no clear relation was observed between r and model performance
    lora_alpha = 16 , # QLoRA sets alpha to 16
    target_modules = find_all_linear_names(model), 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

# Wrap the model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)
# Adapted from https://github.com/artidoro/qlora/blob/main/qlora.py
# Only the float32 cast of modules whose name contains "norm" is active; the remaining
# dtype casts of the reference script are kept commented out.
for name, module in model.named_modules():
    #if isinstance(module, LoraLayer):
        #if args.bf16:
        #    module = module.to(torch.bfloat16)
    if 'norm' in name:
        module = module.to(torch.float32)
    #if 'lm_head' in name or 'embed_tokens' in name:
    #    if hasattr(module, 'weight'):
    #        if args.bf16 and module.weight.dtype == torch.float32:
    #            module = module.to(torch.bfloat16)

## Save the PEFT Model

In [None]:
import os
from os.path import exists, join, isdir

import transformers
# Import the real checkpoint-folder prefix in this cell. The previous `PREFIX_CHECKPOINT_DIR = None`
# override produced folders literally named "None-<step>".
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


class SavePeftModelCallback(transformers.TrainerCallback):
    """Trainer callback that stores only the PEFT adapter weights at every save and at the end."""

    def save_model(self, args, state, kwargs):
        """Save the adapter of ``kwargs["model"]`` and delete a full ``pytorch_model.bin`` if present.

        Args:
            args: Training arguments; ``args.output_dir`` is used.
            state: Trainer state; ``best_model_checkpoint`` and ``global_step`` are used.
            kwargs (dict): Callback keyword arguments; must contain ``"model"``.
        """
        print('Saving PEFT checkpoint...')
        if state.best_model_checkpoint is not None:
            checkpoint_folder = os.path.join(state.best_model_checkpoint, "adapter_model")
        else:
            checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        kwargs["model"].save_pretrained(peft_model_path)

        # Remove the full model weights if they were written next to the adapter.
        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)

    def on_save(self, args, state, control, **kwargs):
        """Save the adapter whenever the Trainer saves a checkpoint."""
        self.save_model(args, state, kwargs)
        return control

    def on_train_end(self, args, state, control, **kwargs):
        """Create a ``completed`` marker file in the output directory and save the adapter."""
        def touch(fname, times=None):
            with open(fname, 'a'):
                os.utime(fname, times)

        # The output directory may not exist yet if no checkpoint was written during training.
        os.makedirs(args.output_dir, exist_ok=True)
        touch(join(args.output_dir, 'completed'))
        self.save_model(args, state, kwargs)

# Preprocess the Dataset using the Tokenizer

In [None]:
# Tokenize, filter and shuffle the training dataset with the tokenizer prepared above.
processed_dataset = preprocess_dataset(
    tokenizer = pythia_tokenizer,
    max_length = max_length,
    seed = DEFAULT_SEED,
)

# Split the dataset: 80% for training, 20% held out for evaluation, with a fixed seed.

split_dataset = processed_dataset.train_test_split(test_size = 0.2, seed=DEFAULT_SEED, shuffle = True)

# Convenience handles; the Trainer cell below reads split_dataset directly.
train_datset = split_dataset["train"]
print(train_datset)
val_dataset = split_dataset["test"]

logger.info("Train data size: %d", split_dataset["train"].num_rows)
logger.info("Test data size: %d", split_dataset["test"].num_rows)

# Create the Data Collator for this Projects

In [None]:

# Batches are padded to a multiple of 8 tokens; see NVIDIA's matrix-multiplication guide:
# https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer = pythia_tokenizer,
    mlm = False ,  # causal language modeling, not masked language modeling
    return_tensors="pt", 
    pad_to_multiple_of = 8
)

# Training Arguments 
1. WarmUp Steps 
2. Learning Rate 
   1.  0.0001 is recommended by the QLoRA repository (https://github.com/artidoro/qlora/tree/main) if the model is bigger than 13 billion parameters
   2.  The learning rate of LoRA for the GPT model was 2.00e-04. I believe the learning rate needs to be higher for the LoRA model
   3.  What should the learning rate be for the Dolly model with a batch size of 32?
       1.  2.00e-04 .3 
       2.  2.00e-03
       3.  1e-5
       4.  0.001
       5.  

In [None]:
def generate_run_name():
    """Build a run name from the model id, bit width, optimizer, bf16 support and a random digit.

    The optimizer part is derived from the module-level ``OPTIMIZER`` rather than a fixed
    "paged-adamw-8bits" label, so the name reflects the optimizer that is actually configured.
    All parts are joined with "-", including the trailing random digit.

    Returns:
        str: e.g. ``run-databricks-dolly-v2-3b-4bit-LoRa-paged-lion-8bit-bf16-7``.
    """
    import random

    parts = [
        f"run-{PRETRAINED_MODEL_NAME_OR_PATH.replace('/', '-')}",
        f"{BITS}bit",
        "LoRa",
        OPTIMIZER.replace("_", "-"),
    ]
    if torch.cuda.is_bf16_supported():
        parts.append("bf16")
    # A single random digit to tell apart runs that share the same configuration.
    parts.append(str(random.randint(0, 9)))
    return "-".join(parts)

## Determine the Batch Size

In [None]:
'''
The larger the batch size the worse of the performance of the model, and harder to overfit and can easily get diverged.
'''
import torch
def determine_batch_size(optimizer:str = "adamw", bits:int = 4, gpus:int = 1):
    """Pick a per-device batch size from the optimizer, the quantization width and the GPU model.

    Args:
        optimizer (str): Optimizer name, e.g. ``"paged_adamw_8bit"`` or ``"paged_lion_8bit"``.
        bits (int): Quantization width; only 4 has tuned values.
        gpus (int): Number of GPUs. Currently not used in the decision.

    Returns:
        int: 32/64 on an A100 and 8/4 on other GPUs for paged AdamW/Lion in 4-bit; 2 otherwise.
    """
    # (batch size on an A100, batch size on any other GPU) per optimizer. The optimizer names
    # were previously misspelled "paded_...", so these rows could never match a real optimizer.
    tuned_sizes = {
        "paged_adamw_8bit": (32, 8),
        "paged_lion_8bit": (64, 4),
    }
    if bits == 4 and optimizer in tuned_sizes:
        # The GPU is only queried when a tuned row applies, so the fallback works without CUDA.
        on_a100, elsewhere = tuned_sizes[optimizer]
        return on_a100 if "A100" in torch.cuda.get_device_name(0) else elsewhere
    # Conservative default for every other combination.
    return 2
print(determine_batch_size(OPTIMIZER, BITS, 1))

In [None]:
from transformers import Trainer, TrainingArguments , EarlyStoppingCallback
from packaging import version
# Per-device batch size chosen from the optimizer, bit width and GPU model.
batch_size: int = determine_batch_size(optimizer = OPTIMIZER, bits = BITS, gpus = n_gpus)
# Gradients are accumulated over this many steps before each optimizer update.
gradient_accumulation_steps = 4
print(f"My effictive batch size is {batch_size * gradient_accumulation_steps}")
learning_rate: float = 2e-4
# Train longer on the 40GB A100 than on any other GPU.
epochs = 16 if "NVIDIA A100-SXM4-40GB" in torch.cuda.get_device_name(0) else 4
print(f"learning_rate: {learning_rate}")
trainer_arguments = TrainingArguments(
            output_dir = "outputs",
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps, # https://lightning.ai/pages/blog/gradient-accumulation/
            num_train_epochs = epochs,
            learning_rate=learning_rate,
            lr_scheduler_type = "constant", # the QLoRA paper used constant and cosine learning-rate schedules
            # Author's observation: the loss stopped decreasing after the learning rate reached its peak.
            # NOTE(review): the plain "constant" schedule presumably has no warmup phase, so this
            # ratio may have no effect — confirm against the transformers scheduler docs.
            warmup_ratio = .8,
            # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
            bf16 =  True if torch.cuda.is_bf16_supported() else False,
            fp16 =   False if  torch.cuda.is_bf16_supported() else True,
            bf16_full_eval = True if torch.cuda.is_bf16_supported() else False,
            # Log and evaluate on every optimizer step.
            logging_steps = 1, 
            eval_steps = 1,
            evaluation_strategy = "steps",
            optim = OPTIMIZER,
            # torch.compile is only requested for PyTorch >= 2.0 when BITS is above 8 (i.e. not quantized).
            torch_compile =  True if version.parse(torch.__version__) >= version.parse("2.0.0") and BITS > 8 else False,
            save_strategy="steps",
            report_to = ["wandb", "tensorboard"],
            # Reload the checkpoint with the best eval_loss once training finishes.
            load_best_model_at_end = True,
            metric_for_best_model="eval_loss",
            seed = DEFAULT_SEED,
            max_grad_norm = .3 # the max grad norm was set to 0.3 in the QLoRA paper
)

In [None]:
# Assemble the Trainer from the PEFT model, tokenizer, datasets and completion-only collator.
trainer = Trainer(
    model=model,
    tokenizer = pythia_tokenizer,
    args = trainer_arguments,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
    # Early stopping: patience of 2 evaluations on the 40GB A100 and 1 elsewhere,
    # with an improvement threshold of 0.005.
    callbacks = [ EarlyStoppingCallback(
        early_stopping_patience = 2 if "NVIDIA A100-SXM4-40GB" in torch.cuda.get_device_name(0) else 1 , 
        early_stopping_threshold = 0.005
        )
                 ],
)

# Start training

In [None]:
# Run training. A CUDA out-of-memory error is handled by releasing gradients and cached
# memory so the notebook session stays usable; every other RuntimeError is re-raised
# instead of being silently discarded.
try:
    logger.info("Training")
    trainer.train()
    wandb.finish()
except RuntimeError as e:
    if 'out of memory' not in str(e):
        raise
    # Training is NOT retried here; lower the batch size and re-run the cell.
    print('| WARNING: ran out of memory, freeing gradients and the CUDA cache')
    for p in model.parameters():
        if p.grad is not None:
            p.grad = None  # free some memory
    torch.cuda.empty_cache()

In [None]:
# Log records collected by the Trainer during training.
train_logs = trainer.state.log_history
print(train_logs)

# Tensorboard

In [None]:
# Load the TensorBoard notebook extension and point it at the Trainer output directory.
# https://discuss.huggingface.co/t/how-to-read-the-logs-created-by-hugging-face-trainer/32279/4
%load_ext tensorboard
%tensorboard --logdir outputs

# Push the Model to the Hugging Face Model Hub

In [None]:
# Upload the model to this Hub repository; the same repository id is read back in the inference section.
model.push_to_hub("Rami/dolly_prompt_generator")

# Text Generation inference

# Text Generation Configuration

In [None]:
from transformers import AutoModelForCausalLM, GenerationConfig
import random
# Decoding settings handed to the instruction pipeline defined below.
generation_config =  GenerationConfig(
    max_new_tokens = 256, # Maximum number of tokens to generate, not counting the tokens in the prompt.
    num_beams = 1, # 1 means no beam search.
    temperature = .3, # Value used to modulate the next-token probabilities.
    top_p = 0.92, # Nucleus sampling: keep the smallest set of tokens whose probabilities add up to top_p or higher.
    top_k = 50, # Keep only the top_k highest-probability tokens when filtering.
    do_sample = True ,# Sample the next token instead of always choosing the one with the highest probability.
    use_cache = True, # Whether the model should use the past key/values attentions (if applicable to the model) to speed up decoding.
    repetition_penalty = 1.02, # The parameter for repetition penalty. 1.0 means no penalty.
)

## Download the adapter config if not present

In [None]:
from peft import PeftConfig, PeftModel

# Hub repository that holds the trained adapter (the one pushed in the cell above).
repo_name = "Rami/dolly_prompt_generator"
# Load the adapter configuration stored in that repository.
config = PeftConfig.from_pretrained(repo_name) 

## Combine the Model and Adapter

In [None]:
from peft import PeftConfig, PeftModel

# Attach the adapter from the Hub to the base model. When `model` is not defined (for example
# in a fresh kernel where the training cells were not run) the NameError branch loads the
# base model first.
inference_model = None
try:
    inference_model = PeftModel.from_pretrained(
        model,
        repo_name,
    )
except NameError as e:
    ## Download the base model from the Hugging Face Hub
    model = model_init()

    inference_model = PeftModel.from_pretrained(
        model,
        repo_name,
    )

## Create the Instruction Generation Pipeline

In [None]:
import logging
import re
from typing import List

import numpy as np
from transformers import Pipeline, PreTrainedTokenizer


logger = logging.getLogger(__name__)

INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# This is the prompt that is used for generating responses using an already trained model.  It ends with the response
# key, where the job of the model is to provide the completion that follows it (i.e. the response itself).
# The blank lines between the sections are significant: they mirror the layout of the training
# prompts defined earlier in this notebook, so the model sees the same format at inference time.
PROMPT_FOR_GENERATION_FORMAT = """{intro}

{instruction_key}
{instruction}

{response_key}
""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
)


def get_special_token_id(tokenizer: "PreTrainedTokenizer", key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.
    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.
    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token
    Raises:
        ValueError: if the key does not encode to exactly one token ID
    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    # An empty encoding is rejected with the same exception type as a multi-token one, so callers
    # that catch ValueError also cover that case instead of hitting an IndexError.
    if len(token_ids) != 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]

from transformers import AutoModelForCausalLM, GenerationConfig
class InstructionTextGenerationPipeline(Pipeline):
    """Pipeline that wraps an instruction in the generation prompt and extracts the model's response.

    ``preprocess`` formats and tokenizes the instruction, ``_forward`` calls ``model.generate`` with
    the configured ``GenerationConfig``, and ``postprocess`` decodes the text found between the
    response key and the end key.
    """

    def __init__(
        self,
        generation_config: GenerationConfig = None,
        **kwargs,
    ):
        """Initialize the pipeline.

        Args:
            generation_config (GenerationConfig, optional): Decoding settings passed to
                ``model.generate``. It must be provided in practice: ``_sanitize_parameters`` and
                ``_forward`` assert that it is not None.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__`` (e.g. ``model``, ``tokenizer``, ``task``).
        """
        # Assigned before super().__init__() so that it is already available should the base
        # initializer call _sanitize_parameters, which asserts on it.
        self.generation_config: GenerationConfig = generation_config
        super().__init__(**kwargs)

    def _sanitize_parameters(self,
                             return_full_text: bool = None,
                             **generate_kwargs):
        """Split call-time keyword arguments into preprocess, forward and postprocess parameters.

        Also resolves the token IDs of the response and end keys and, when both resolve, makes the
        end key the ``eos_token_id`` so that generation stops at "### End".

        Returns:
            tuple: ``(preprocess_params, forward_params, postprocess_params)``.
        """
        preprocess_params = {}
        assert self.generation_config is not None, "Generation config is not initialized."

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None
        if tokenizer_response_key:
            try:
                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id
                self.generation_config.eos_token_id = end_key_token_id
            except ValueError as err:
                # Best effort: without single-token keys, postprocess falls back to a regex search.
                logger.warning(f"Response/end keys are not single special tokens: {err}")

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id
        }

        if return_full_text is not None:
            postprocess_params["return_full_text"] = return_full_text

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        """Format the instruction with the generation prompt and tokenize it as PyTorch tensors."""
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        # Carried along so that postprocess can prepend the instruction when the full text is requested.
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, eos_token_id=None, **generate_kwargs):
        """Generate token sequences for the tokenized prompt.

        Args:
            model_inputs: Output of ``preprocess``.
            eos_token_id (int, optional): Token that ends generation. It is optional so that the
                call also works when ``_sanitize_parameters`` could not resolve the end key.
            **generate_kwargs: Any further keyword arguments, forwarded to ``model.generate``.

        Returns:
            dict: ``generated_sequence``, ``input_ids`` and ``instruction_text``.
        """
        assert self.model is not None, "Model is not initialized."
        assert self.generation_config is not None, "Generation config is not initialized."
        assert self.tokenizer is not None, "Tokenizer is not initialized."
        assert self.tokenizer.pad_token_id is not None, "Tokenizer does not have a pad token ID."
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)

        if input_ids.shape[1] == 0:
            # Empty prompt: generate from scratch with an input batch size of 1.
            input_ids = None
            attention_mask = None
            in_b = 1
        else:
            in_b = input_ids.shape[0]

        if eos_token_id is not None:
            generate_kwargs["eos_token_id"] = eos_token_id
        # A caller-supplied pad_token_id wins; otherwise use the tokenizer's.
        generate_kwargs.setdefault("pad_token_id", self.tokenizer.pad_token_id)

        generated_sequence = self.model.generate(
            # input_ids is None for an empty prompt, so only move real tensors to the model device.
            input_ids=input_ids.to(self.model.device) if input_ids is not None else None,
            attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None,
            generation_config=self.generation_config,
            **generate_kwargs,
        )

        # Regroup the flat output as (input batch, sequences per input, tokens).
        out_b = generated_sequence.shape[0]
        if self.framework == "pt":
            generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
        elif self.framework == "tf":
            # Imported lazily: TensorFlow is only needed on this branch.
            import tensorflow as tf

            generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))

        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_full_text: bool = False):
        """Decode the generated sequences and keep only the response part.

        Args:
            model_outputs (dict): Output of ``_forward``.
            response_key_token_id (int or None): Token ID of the response key, if it is a single token.
            end_key_token_id (int or None): Token ID of the end key, if it is a single token.
            return_full_text (bool): When True, prepend the instruction text to each response.

        Returns:
            list: One ``{"generated_text": ...}`` dict per generated sequence.
        """
        generated_sequence = model_outputs["generated_sequence"][0]
        instruction_text = model_outputs["instruction_text"]

        generated_sequence: List[List[int]] = generated_sequence.numpy().tolist()
        records = []
        for sequence in generated_sequence:

            # The response will be set to this variable if we can identify it.
            decoded = None

            # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
            if response_key_token_id is not None and end_key_token_id is not None:
                # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
                # prompt, we should definitely find it.  We will return the tokens found after this token.
                try:
                    response_pos = sequence.index(response_key_token_id)
                except ValueError:
                    logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
                    response_pos = None

                # Compare against None explicitly: index 0 is a valid position.
                if response_pos is not None:
                    # Next find where "### End" is located.  The model has been trained to end its responses with this
                    # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                    # this token, as the response could be truncated.  If we don't find it then just return everything
                    # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                    # The search starts after the response key so an earlier match cannot empty the slice.
                    try:
                        end_pos = sequence.index(end_key_token_id, response_pos + 1)
                    except ValueError:
                        end_pos = None

                    decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()

            if not decoded:
                # Otherwise we'll decode everything and use a regex to find the response and end.

                fully_decoded = self.tokenizer.decode(sequence)

                # The response appears after "### Response:".  The model has been trained to append "### End" at the
                # end.
                m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

                if m:
                    decoded = m.group(1).strip()
                else:
                    # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                    # return everything after "### Response:".
                    m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                    if m:
                        decoded = m.group(1).strip()
                    else:
                        logger.warning(f"Failed to find response in:\n{fully_decoded}")

            # If the full text is requested, then append the decoded text to the original instruction.
            # This technically isn't the full text, as we format the instruction in the prompt the model has been
            # trained on, but to the client it will appear to be the full text.
            if return_full_text:
                # An unidentified response contributes an empty string rather than the text "None".
                decoded = f"{instruction_text}\n{decoded or ''}"

            rec = {"generated_text": decoded}

            records.append(rec)

        return records

In [None]:
from rich import print
# Build the instruction-following text-generation pipeline from the model,
# tokenizer and generation config prepared in earlier cells.
generate_text = InstructionTextGenerationPipeline(
    model=model,
    tokenizer=pythia_tokenizer,
    task="text-generation",
    return_full_text=True,
    generation_config=generation_config,
)

# Smoke test: show the registered task name, whatever `_sanitize_parameters()`
# yields when called with no overrides, and the output for one sample prompt.
print(generate_text.task)
print(generate_text._sanitize_parameters())
print(generate_text("### Instruction: What is the capital of France? ### Response:"))



## Langchain Prompt with Hugging Face Pipeline

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline

# Expose the custom generation pipeline to LangChain as an LLM.
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

# Template for an instruction with no input, and the chain that uses it.
prompt = PromptTemplate(
    template="{instruction}",
    input_variables=["instruction"],
)
llm_chain = LLMChain(prompt=prompt, llm=hf_pipeline)

# Template for an instruction that comes with an input passage, and its chain.
prompt_with_context = PromptTemplate(
    template="{instruction}\n\nInput:\n{context}",
    input_variables=["instruction", "context"],
)
llm_context_chain = LLMChain(prompt=prompt_with_context, llm=hf_pipeline)



# Instruction-only chain: only the `instruction` variable is filled in.
# Leading whitespace is stripped from the answer before printing.
print(
    llm_chain.predict(
        instruction="Explain to me the difference between nuclear fission and fusion."
    ).lstrip()
)

# Passage supplied as the `context` variable of the context-aware chain.
context = """George Washington (February 22, 1732[b] – December 14, 1799) was an American military officer, statesman,
and Founding Father who served as the first president of the United States from 1789 to 1797."""

# Instruction + context chain: the question is asked against the passage above.
print(
    llm_context_chain.predict(
        instruction="When was George Washington president?",
        context=context,
    ).lstrip()
)

# Hyperparameter Tuning

## Model Init

In [None]:
def model_int():
    """Placeholder for the model factory of the hyperparameter-tuning section.

    Not implemented yet: takes no arguments and returns ``None``.

    NOTE(review): the name looks like a typo for ``model_init`` (the section
    is titled "Model Init") -- confirm before renaming, since callers may
    already reference ``model_int``.
    """
    return None

# References
- [1] [Dolly GitHub](https://github.com/databrickslabs/dolly/blob/5021d941d95dddcf1f00d978d7f944709873f419/training/trainer.py#L138)
- [2] <https://gist.github.com/Birch-san/57878c4a27cf34f57d3e861865a7d0a2>
- [3] <https://github.com/artidoro/qlora/blob/main/qlora.py>
- [4] <https://github.com/tloen/alpaca-lora/blob/main/finetune.py>