# AI should be accessible to all
GPU poor or GPU rich!

### Load Qwen1.5-0.5B and Run Inference
Here we do a quick test run of the raw model with vLLM to get an idea of its performance and memory usage.

In [None]:
!pip install vllm

In [None]:
import time

from vllm import LLM, SamplingParams

def load_model(model_name, dtype):
    """
    Load a language model with vLLM and report how long loading took.

    Args:
        model_name (str): The name or path of the model to load.
        dtype (str): The data type to use for the model (e.g. "half").

    Returns:
        LLM: The loaded language model instance.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    loading_start = time.perf_counter()
    llm = LLM(model=model_name, dtype=dtype)
    loading_time = time.perf_counter() - loading_start
    print(f"--- Loading time: {loading_time:.2f} seconds ---")
    return llm

def generate_text(llm, prompts, sampling_params):
    """
    Generate text for the given prompts and report how long generation took.

    Args:
        llm (LLM): The loaded language model instance.
        prompts (list): A list of prompts to generate text from.
        sampling_params (SamplingParams): The sampling parameters to use for text generation.

    Returns:
        list: Whatever ``llm.generate`` returns for the prompts (one output per prompt).
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    generation_start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    generation_time = time.perf_counter() - generation_start
    print(f"--- Generation time: {generation_time:.2f} seconds ---")
    return outputs

def print_generated_text(outputs):
    """
    Display the first completion of every output, each followed by a separator line.

    Args:
        outputs (list): Generation results; each item exposes its completions
            through an ``outputs`` sequence whose entries carry a ``text`` field.
    """
    separator = '------'
    for result in outputs:
        # Only the first completion of each result is shown.
        first_completion = result.outputs[0]
        print(first_completion.text)
        print(separator)

In [None]:
# Prompt(s) used for the smoke test of the raw model.
prompts = [
    "Driving a manual car requires knowledge of"
]
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, top_k=20, max_tokens=150)

llm = load_model("Qwen/Qwen1.5-0.5B", "half")

# The try/finally has to live in a single cell: split across two cells, the
# `try:` and the `finally:` are each a syntax error on their own.
try:
    outputs = generate_text(llm, prompts, sampling_params)
    print_generated_text(outputs)
finally:
    # NOTE(review): vLLM's LLM class has no documented close() method, so the
    # cleanup is done best-effort: drop the last reference to the engine, run
    # the garbage collector and return cached CUDA blocks to the driver.
    import gc

    import torch

    del llm
    gc.collect()
    torch.cuda.empty_cache()

### Fine-Tuning
Using the QLoRA technique.

In [None]:
!pip install --upgrade bitsandbytes transformers peft accelerate datasets trl

In [None]:
# Import necessary libraries
import torch
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer


def load_tokenizer(model_name):
    """
    Build the tokenizer for ``model_name`` and set up its padding.

    The EOS token is reused as the padding token and padding is applied on
    the left side.

    Args:
        model_name (str): The name of the pre-trained model.

    Returns:
        tokenizer (AutoTokenizer): The configured tokenizer instance.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        add_eos_token=True,
        use_fast=True,
    )

    # Padding setup: left-side padding, with EOS (token and id) standing in as pad.
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return tokenizer


def load_model(model_name, tokenizer):
    """
    Load the pre-trained model with 4-bit quantization and prepare it for k-bit training.

    Args:
        model_name (str): The name of the pre-trained model.
        tokenizer (AutoTokenizer): The tokenizer instance; its pad token id is
            copied into the model config.

    Returns:
        model (AutoModelForCausalLM): The configured model instance.
    """
    # 4-bit NF4 quantization with double quantization; compute runs in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        device_map={"": 0},
    )
    model = prepare_model_for_kbit_training(base_model)

    # Keep the model's pad token in sync with the tokenizer.
    model.config.pad_token_id = tokenizer.pad_token_id
    # Caching is switched off: gradient checkpointing is used by default and
    # is not compatible with it.
    model.config.use_cache = False

    return model


def configure_peft_lora():
    """
    Build the LoRA (Low-Rank Adaptation) configuration used for PEFT fine-tuning.

    Returns:
        peft_config (LoraConfig): The configured PEFT LoRA instance.
    """
    # Projection layers that receive LoRA adapters.
    adapted_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]

    return LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=adapted_modules,
    )


def configure_training_arguments():
    """
    Build the hyper-parameters for the PEFT fine-tuning run.

    Returns:
        training_arguments (TrainingArguments): The configured training arguments instance.
    """
    settings = {
        # NOTE(review): Google Drive is usually mounted as "MyDrive" — confirm
        # that the "Mydrive" spelling is intended.
        "output_dir": "./drive/Mydrive/results_qlora",
        # Optimisation schedule.
        "optim": "paged_adamw_8bit",
        "learning_rate": 2e-5,
        "lr_scheduler_type": "linear",
        "warmup_steps": 30,
        "max_steps": 300,
        # Batch sizes.
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 8,
        # Evaluate, log and checkpoint every 50 steps.
        # NOTE(review): newer transformers releases may expect `eval_strategy`
        # instead of `evaluation_strategy` — verify against the installed version.
        "evaluation_strategy": "steps",
        "do_eval": True,
        "eval_steps": 50,
        "logging_steps": 50,
        "save_steps": 50,
        "log_level": "debug",
    }
    return TrainingArguments(**settings)

In [None]:
# Base checkpoint to fine-tune
model_name = "Qwen/Qwen1.5-0.5B"

# Tokenizer comes first: the model loader reads its pad token id
tokenizer = load_tokenizer(model_name)

# Dataset providing the 'train' and 'test' splits used by the trainer
ds = load_dataset("timdettmers/openassistant-guanaco")

# Base model, configured for training
model = load_model(model_name, tokenizer)

# LoRA settings and training hyper-parameters
peft_config = configure_peft_lora()
training_arguments = configure_training_arguments()

# Assemble the supervised fine-tuning trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    dataset_text_field="text",
    max_seq_length=512,
)

# Run the fine-tuning loop
trainer.train()


### Inference with the fine-tuned model

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model_and_tokenizer(output_dir):
    """
    Restore a model and its tokenizer from a saved directory.

    Any exception raised while loading is printed rather than propagated, and
    the failure is signalled by returning ``(None, None)``.

    Args:
        output_dir (str): The path to the output directory containing the model and tokenizer files.

    Returns:
        model (AutoModelForCausalLM): The loaded language model, or None on failure.
        tokenizer (AutoTokenizer): The loaded tokenizer, or None on failure.
    """
    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
        loaded_model = AutoModelForCausalLM.from_pretrained(output_dir, device_map="auto")
    except Exception as e:
        # Best-effort loading: report the problem and hand back empty handles.
        print(f"Error loading model and tokenizer: {e}")
        return None, None
    else:
        return loaded_model, loaded_tokenizer

def inference(model, tokenizer, messages):
    """
    Generate the model's reply to a chat conversation.

    Args:
        model (AutoModelForCausalLM): The loaded language model.
        tokenizer (AutoTokenizer): The loaded tokenizer.
        messages (list): A list of dictionaries containing the conversation history.

    Returns:
        output (str): The newly generated response only (the prompt is not
            echoed back). If the model or tokenizer is missing, or anything
            fails during generation, a fixed apology string is returned
            instead of raising.
    """
    fallback = "Sorry, I encountered an error and couldn't generate a response."

    # A failed load yields None for both objects; report that explicitly
    # instead of surfacing an opaque AttributeError message.
    if model is None or tokenizer is None:
        print("Error during inference: model or tokenizer is not loaded.")
        return fallback

    try:
        input_ids = tokenizer.apply_chat_template(
            messages,
            truncation=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(model.device)

        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
        )

        # The generated sequence starts with the prompt tokens; skip them so
        # that only the reply is decoded and returned.
        prompt_length = input_ids.shape[-1]
        new_tokens = outputs[0][prompt_length:]
        output = tokenizer.decode(new_tokens, skip_special_tokens=True)
    except Exception as e:
        # Deliberate best-effort: report the error and fall back to an apology.
        print(f"Error during inference: {e}")
        output = fallback

    return output

In [None]:
# Checkpoint directory to load the fine-tuned model from.
# NOTE(review): Google Drive is usually mounted as "MyDrive" — confirm that
# the "Mydrive" spelling matches where the checkpoints were written.
output_dir = "/content/drive/Mydrive/results_qlora/checkpoint-300"

model, tokenizer = load_model_and_tokenizer(output_dir)

# Conversation: a system persona followed by a single user question.
messages = [
    {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate"},
    {
        "role": "user",
        "content": "How many helicopters can a human eat in one sitting?",
    },
]

# Generate the reply and show it.
output = inference(model, tokenizer, messages)
print(output)