# Example of X-LoRA API
Eric Buehler 2024

In [None]:
import torch
import xlora
from transformers import AutoConfig, AutoModelForCausalLM  # type: ignore
from xlora.xlora_utils import load_model  # type: ignore

In [None]:
# Checkpoint used for both the base model weights and its configuration.
base_checkpoint = "mistralai/Mistral-7B-Instruct-v0.1"

# Keyword arguments common to both `from_pretrained` calls below.
shared_load_kwargs = {
    "trust_remote_code": True,
    "use_flash_attention_2": False,
}

# Load the base causal LM in bfloat16, placed on the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
    **shared_load_kwargs,
)

# Load the matching configuration; `config.hidden_size` is read later when
# the X-LoRA config is built.
# NOTE(review): `device_map` / `use_flash_attention_2` look like model-loading
# options -- confirm whether AutoConfig actually needs them.
config = AutoConfig.from_pretrained(
    base_checkpoint,
    device_map="auto",
    **shared_load_kwargs,
)

## Convert the model to X-LoRA

In [None]:
# Adapter name -> checkpoint directory (placeholder paths; replace with real ones).
adapter_checkpoints = {
    "adapter_1": "./path/to/the/checkpoint/",
    "adapter_2": "./path/to/the/checkpoint/",
    "adapter_n": "./path/to/the/checkpoint/",
}

# Describe the X-LoRA setup: the base model's hidden size, the base model ID,
# the X-LoRA depth, the target device and the adapters to load.
xlora_settings = xlora.xLoRAConfig(
    config.hidden_size,
    base_model_id="mistralai/Mistral-7B-Instruct-v0.1",
    xlora_depth=8,
    device=torch.device("cuda"),
    adapters=adapter_checkpoints,
)

# Convert the base model to X-LoRA using the settings above.
model_created = xlora.add_xlora_to_model(
    model=model,
    xlora_config=xlora_settings,
    verbose=True,
)

## Set the adapters to trainable
This means that when loading the model again we should specify only the adapter names.

In [None]:
# Toggle and then query the "trainable adapters" flag on the X-LoRA model.

# Use trainable adapters: mark all adapters as trainable.
model_created.set_use_trainable_adapters(True)

# Get the current status of the trainable adapters; returns True here because
# the flag was just set above. As the last expression of the cell, the value is
# displayed as the cell output.
model_created.get_use_trainable_adapters()

## Setting and resetting the scaling pass value

In [None]:
# Override the scaling pass value, then restore the default behaviour.

# Set the scaling pass value to 0, meaning that no adapters will contribute to
# the scaling pass output.
model_created.set_scaling_pass_value(0)

# Passing None removes the override, allowing the model to use its default
# scaling pass value again.
model_created.set_scaling_pass_value(None)

## Setting and getting the global LoRA weight

In [None]:
# Global factor applied to the output of each LoRA adapter, in addition to the
# scalings.
global_weight = 2
model_created.set_global_scaling_weight(global_weight)

# Read the weight back; returns 2 after the call above.
res = model_created.get_global_scaling_weight()

## Example of scalings logging

In [None]:
# Walk through the scalings-logging API: enable, flush to disk, inspect,
# disable, clear, and reload.

# Enable scalings logging and begin a log.
model_created.enable_scalings_logging()

# Run forward passes here to accumulate a log.
# NOTE(review): this cell runs no forward pass itself, so the log is presumably
# empty at this point -- insert inference calls here in real use.

# Write the log to one or more files at the given path.
model_created.flush_log_scalings("./path/to/output/file")

# Get a shallow copy of the scalings log.
log_copy = model_created.get_scalings_log()

# Disable scalings logging.
model_created.disable_scalings_logging()

# Clear the scalings log.
model_created.clear_scalings_log()

# Get the latest scalings prediction.
scalings_pred = model_created.get_latest_scalings()

# Load the scalings log back from the file (or from multiple files,
# automatically); `verbose=True` is passed to the loader.
loaded_log = xlora.xlora_utils.load_scalings_log("./path/to/output/file", verbose=True)

## Example of getting and printing trainable parameters

In [None]:
# Fetch the parameter counts, then unpack them into the number of trainable
# parameters and the number of all parameters.
param_counts = model_created.get_nb_trainable_parameters()
num_trainable, num_all_params = param_counts

# Print the trainable-parameter summary.
model_created.print_trainable_parameters()

## Setting and getting the top-k LoRA value

In [None]:
# Restrict the model to the top 2 LoRA experts.
top_k = 2
model_created.set_topk_lora(top_k)

# Read the value back; returns 2 after the call above.
res = model_created.get_topk_lora()

## From pretrained

In [None]:
# Location of the saved model (placeholder path) and the device to load it on.
saved_model_dir = "./path/to/saved/model"
target_device = "cuda"

# Load the saved X-LoRA model using the already-loaded base `model`.
model_created = xlora.from_pretrained(saved_model_dir, model, target_device)

## Using the simpler API

In [None]:
# Repository ID of the X-LoRA model to load (placeholder value).
XLoRA_model_name = "myuser/repo"

# Adapter name -> checkpoint directory (placeholder paths; replace with real ones).
simple_api_adapters = {
    "adapter_1": "./path/to/the/checkpoint/",
    "adapter_2": "./path/to/the/checkpoint/",
    "adapter_n": "./path/to/the/checkpoint/",
}

# One call that returns both the loaded model and its tokenizer.
model_loaded, tokenizer = load_model(
    model_name=XLoRA_model_name,
    device="cuda:0",
    dtype=torch.bfloat16,
    adapters=simple_api_adapters,
)