We recommend running this notebook on a **free** Nvidia L4 Google Colab instance!
<div class="align-center">
  <a href="https://github.com/addy-ai/langdrive">Star us on GitHub</a>
  <br>
  <a href="https://discord.gg/G8eYmcaTTd">Join our Discord</a>
  <br>
  <a href="https://github.com/sponsors/addy-ai/">Sponsor us on Github sponsors</a>
</div>

This notebook replicates the training environment on one of our training images. It spins up a Python project with the finetuning code, uses Flask to turn it into a web server, and then uses ngrok to expose an endpoint to the internet.

You can use that endpoint to send your data for finetuning. Feel free to modify as necessary.
Thanks to @vilsonrodrigues for sharding falcon 7b that we use here.

# Installing pip, bnb, and other requirements

In [None]:
# Upgrade pip itself first, then install the pinned training stack one package at a time.
!pip install -Uqqq pip
# bitsandbytes supplies the 4-bit quantization used through BitsAndBytesConfig below.
!pip install -qqq bitsandbytes==0.42.0
!pip install -qqq torch==2.0.1
# transformers, peft and accelerate are installed from fixed git commits rather than releases.
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@e03a9cc
!pip install -qqq -U git+https://github.com/huggingface/peft.git@42a184f
!pip install -qqq -U git+https://github.com/huggingface/accelerate.git@c9fbb71
!pip install -qqq datasets==2.12.0
!pip install -qqq loralib==0.1.1
!pip install -qqq einops==0.6.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m119.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m98.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Initialize the model
We're initializing the model and running a test prompt. This is a well-known, widely used prompt template for the Midjourney dataset.

In [None]:
import bitsandbytes as bnb
import torch
import os

import locale
locale.getpreferredencoding = lambda: "UTF-8" #Some dependencies won't install without this


from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)

from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

# Make only the first GPU visible to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

print("Initializing model")
model = AutoModelForCausalLM.from_pretrained(
    "vilsonrodrigues/falcon-7b-instruct-sharded",
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

print("Initializing tokenizer")
tokenizer = AutoTokenizer.from_pretrained("vilsonrodrigues/falcon-7b-instruct-sharded")
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

print("Initializing Lora Configuration")
config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    lora_dropout=0.05,
    target_modules=["query_key_value"],
    lora_alpha=32,
    r=16,
)

print("Loading model")

# Wrap the quantized base model with the LoRA adapters described by `config`.
model = get_peft_model(model, config)

# Generation settings are written onto the model's own generation config object.
generation_config = model.generation_config
for option, value in (
    ("max_new_tokens", 200),
    ("temperature", 0.7),
    ("top_p", 0.7),
    ("num_return_sequences", 1),
    ("pad_token_id", tokenizer.eos_token_id),
    ("eos_token_id", tokenizer.eos_token_id),
):
    setattr(generation_config, option, value)

device = "cuda:0"

# Two-line dialogue prompt: the human turn followed by an open assistant turn.
prompt = "<human>: midjourney prompt for a girl sit on the mountain\n<assistant>:"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        generation_config=generation_config,
        attention_mask=encoding.attention_mask,
        input_ids=encoding.input_ids,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Initializing model


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of FalconForCausalLM were not initialized from the model checkpoint at vilsonrodrigues/falcon-7b-instruct-sharded and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initializing tokenizer
Initializing Lora Configuration
Loading model
<human>: midjourney prompt for a girl sit on the mountain
<assistant>: What do you want to do on the mountain?
<human>: I want to take a break and enjoy the view.
<assistant>: Alright, take your time.
<human>: 


# Install Flask and Ngrok
We use flask to make this a webserver for the API endpoint.
Then NGROK is used to open that endpoint to the web so you can call it from your machine.

## Note: You must provide an NGROK Token

In [None]:
!pip install -Uqqq pip
!pip install -qqq bitsandbytes==0.39.0
!pip install -qqq torch==2.0.1
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@e03a9cc
!pip install -qqq -U git+https://github.com/huggingface/peft.git@42a184f
!pip install -qqq -U git+https://github.com/huggingface/accelerate.git@c9fbb71
!pip install -qqq datasets==2.12.0
!pip install -qqq loralib==0.1.1
!pip install -qqq einops==0.6.1

!pip install flask_ngrok
!pip install -qqq huggingface_hub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0mCollecting flask_ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask_ng

In [None]:
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null && echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list && sudo apt update && sudo apt install ngrok

deb https://ngrok-agent.s3.amazonaws.com buster main
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 https://ngrok-agent.s3.amazonaws.com buster InRelease [20.3 kB]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [872 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 https://ngrok-agent.s3.amazonaws.com buster/main amd64 Packages [4,219 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:9 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd

# Set NGROK Token
> REMOVE TOKEN BEFORE SHIPPING
- If you're a user, remember to use your own NGROK token here. Sign up for ngrok at https://ngrok.com/ to get one.

In [None]:
# Register the ngrok authtoken without hardcoding it in the notebook: take it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it (typed input is not echoed).
import os
import subprocess
from getpass import getpass

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")

# An argument list (no shell) means the token is never parsed by the shell, unlike a
# literal `<token>` placeholder, which the shell would treat as I/O redirections.
ngrok_result = subprocess.run(
    ["ngrok", "authtoken", ngrok_authtoken],
    check=True,
    capture_output=True,
    text=True,
)
print(ngrok_result.stdout.strip())

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


# Import Globally Required Libs

In [None]:
import sys

# Create Training Class
The training class bundles utilities to create, prepare, and train a model.

In [None]:
# Import the necessary modules and set environment variables
import json
import os
from pprint import pprint
# import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import (
    load_dataset,
    Dataset
)
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)


from huggingface_hub import HfFolder, _login

# Make only GPU index 0 visible to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

"""
Train LLMs
"""


class LLMTrain:
    """Fine-tune a causal language model with 4-bit quantization and LoRA adapters.

    The usual entry point is ``run_train``: it loads the base model, attaches the
    LoRA adapters, tokenizes the supplied examples, trains for one epoch and,
    when requested, pushes the trained model to the Hugging Face Hub.
    """

    def __init__(self, MODEL_NAME, training_data, hf_token) -> None:
        """Store the run settings and register the Hugging Face token.

        Side effects: removes ``HF_HOME`` from the environment if present, exports
        the token as ``HUGGINGFACE_HUB_TOKEN`` and saves it with
        ``HfFolder.save_token``.

        Args:
            MODEL_NAME: Model identifier passed to ``from_pretrained``.
            training_data: Records of the form ``{'input': ..., 'output': ...}``.
            hf_token: Hugging Face access token.
        """
        self.MODEL_NAME = MODEL_NAME
        self.training_data = training_data
        self.HUGGINGFACE_TOKEN = hf_token

        if 'HF_HOME' in os.environ:
            print("HF_HOME:", os.environ['HF_HOME'])
        else:
            print("HF_HOME environment variable is not set.")

        os.environ.pop('HF_HOME', None)  # Remove HF_HOME if it exists

        os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token
        HfFolder.save_token(hf_token)

        self.check_if_hugging_face_token_is_set()

    def create_model_and_tokenizer(self):
        """Load the 4-bit quantized base model and its tokenizer.

        The tokenizer is also stored on ``self.tokenizer`` because
        ``generate_and_tokenize_prompt`` reads it from there.

        Returns:
            tuple: ``(model, tokenizer)``.
        """
        # 4-bit NF4 weights with double quantization; compute runs in bfloat16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME,
            device_map="auto",
            trust_remote_code=True,
            quantization_config=bnb_config
        )

        tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        # Reuse the end-of-sequence token for padding.
        tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer
        return model, tokenizer

    def prepare_and_configure_model(self, model):
        """Prepare a quantized model for training and attach LoRA adapters.

        Args:
            model: The model returned by ``create_model_and_tokenizer``.

        Returns:
            The PEFT-wrapped model.
        """
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        # LoRA (Low-Rank Adaptation) settings, applied to the "query_key_value" modules.
        config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["query_key_value"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        model = get_peft_model(model, config)
        self.print_trainable_parameters(model)
        return model

    def generate_future_with_prompt(self, model, tokenizer, prompt):
        """Generate a completion for ``prompt`` and print the decoded text.

        The generation settings are written onto ``model.generation_config`` itself,
        so they persist on the model after this call.

        Args:
            model: Model to generate with.
            tokenizer: Tokenizer matching ``model``.
            prompt: Text prompt to complete.
        """
        generation_config = model.generation_config

        generation_config.max_new_tokens = 200
        generation_config.temperature = 0.7
        generation_config.top_p = 0.7
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = tokenizer.eos_token_id
        generation_config.eos_token_id = tokenizer.eos_token_id

        device = "cuda:0"
        encoding = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.inference_mode():
            outputs = model.generate(
                input_ids=encoding.input_ids,
                attention_mask=encoding.attention_mask,
                generation_config=generation_config
            )
        print("Decoded: ", tokenizer.decode(outputs[0], skip_special_tokens=True))

    def load_training_data(self, data):
        """Build a shuffled, tokenized dataset from the raw training records.

        Args:
            data: A list of objects, each of the format::

                {
                    'input': '{{user_input}}',
                    'output': '{{model_output}}'
                }

        Returns:
            A ``Dataset`` whose rows have been passed through
            ``generate_and_tokenize_prompt``.
        """
        # Convert the list of records to the column-oriented dict Dataset expects.
        data_dict = {
            'input': [obj['input'] for obj in data],
            'output': [obj['output'] for obj in data]
        }
        d = Dataset.from_dict(data_dict)
        d = d.shuffle().map(self.generate_and_tokenize_prompt)
        return d

    def fine_tune_model(self, model, data, tokenizer, deploy_to_hf, hf_token, hub_model_id, model_path):
        """Build the ``Trainer`` for one fine-tuning run; training is NOT started here.

        Args:
            model: PEFT-wrapped model to train.
            data: Tokenized training dataset.
            tokenizer: Tokenizer handed to the language-modeling data collator.
            deploy_to_hf: Unused here; ``run_train`` pushes to the Hub itself.
            hf_token: Token forwarded as ``hub_token``.
            hub_model_id: Repository name forwarded as ``hub_model_id``.
            model_path: Unused here.

        Returns:
            A configured ``transformers.Trainer``; the caller invokes ``train()``.
        """
        training_args = transformers.TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            num_train_epochs=1,
            learning_rate=2e-4,
            fp16=True,
            save_total_limit=3,
            logging_steps=1,
            output_dir="experiments-1",
            overwrite_output_dir=True,
            optim="paged_adamw_8bit",
            lr_scheduler_type="cosine",
            warmup_ratio=0.05,

            # push_to_hub is deliberately not set: run_train pushes explicitly
            # after training finishes.
            hub_model_id=hub_model_id,
            hub_token=hf_token,
        )
        trainer = transformers.Trainer(
            model=model,
            train_dataset=data,
            args=training_args,
            data_collator=transformers.DataCollatorForLanguageModeling(
                tokenizer, mlm=False),
        )

        return trainer

    @staticmethod
    def _hub_model_id_from_path(model_path):
        """Return everything after the first "/" of ``model_path``, or "" if there is none."""
        if model_path and "/" in model_path:
            return model_path.split("/", 1)[1]
        return ""

    def run_train(self, MODEL_NAME, training_data, deploy_to_hf, hf_token, model_path):
        """Run a complete training cycle.

        Args:
            MODEL_NAME: Base model identifier; overwrites ``self.MODEL_NAME``.
            training_data: Records of the form ``{'input': ..., 'output': ...}``.
            deploy_to_hf: When truthy, push the trained model to the Hub.
            hf_token: Hugging Face access token.
            model_path: Hub repository to push to (e.g. ``"user/model"``).

        Returns:
            bool: ``True`` once training (and the optional push) has completed.
        """
        self.MODEL_NAME = MODEL_NAME
        model, tokenizer = self.create_model_and_tokenizer()
        model = self.prepare_and_configure_model(model)

        # Smoke-test prompt in the same two-line layout generate_prompt produces.
        prompt = "<human>: midjourney prompt for a girl sit on the mountain\n<assistant>:"

        self.generate_future_with_prompt(model, tokenizer, prompt)
        data = self.load_training_data(training_data)

        push_to_hub_model_id = self._hub_model_id_from_path(model_path)

        trainer = self.fine_tune_model(model, data, tokenizer, deploy_to_hf, hf_token, push_to_hub_model_id, model_path)
        model.config.use_cache = False

        trainer.train()

        # Deploy model to Hugging Face Model Hub if necessary
        if deploy_to_hf and self.check_if_hugging_face_token_is_set():
            trainer.model.push_to_hub(model_path)
            print("Successfully deployed to hub", model_path)

        # If everything went well, return true
        return True

    def deploy_to_hugging_face(self, model, model_path, hf_token):
        """Save the model locally to ``trained-model`` and push it to ``model_path``.

        Args:
            model: Trained model to save and push.
            model_path: Hub repository to push to.
            hf_token: Unused; the push relies on the saved token.
        """
        model.save_pretrained("trained-model")
        PEFT_MODEL = model_path
        model.push_to_hub(PEFT_MODEL, use_auth_token=True)

    def check_if_hugging_face_token_is_set(self):
        """Report whether the Hugging Face token file exists.

        Returns:
            bool: ``True`` if the file at ``HfFolder.path_token`` exists.
        """
        token_file = HfFolder.path_token

        print("Token file path:", token_file)

        if os.path.isfile(token_file):
            # Never echo the token itself: notebook outputs get saved and shared.
            print("Token found.")
            return True

        print("No token found.")
        return False

    def generate_prompt(self, data_point):
        """Format one record as a two-line ``<human>``/``<assistant>`` dialogue.

        The lines are joined explicitly so that no source-code indentation ends up
        inside the training text.

        Args:
            data_point: Mapping with ``"input"`` and ``"output"`` keys.

        Returns:
            str: The formatted prompt, stripped of surrounding whitespace.
        """
        return (
            f'<human>: {data_point["input"]}\n'
            f'<assistant>: {data_point["output"]}'
        ).strip()

    def generate_and_tokenize_prompt(self, data_point):
        """Format a record with ``generate_prompt`` and tokenize it.

        Requires ``self.tokenizer``, which ``create_model_and_tokenizer`` sets.

        Args:
            data_point: Mapping with ``"input"`` and ``"output"`` keys.

        Returns:
            The tokenizer output for the formatted prompt.
        """
        full_prompt = self.generate_prompt(data_point)

        # padding and truncation are set to True for handling sequences of different length.
        tokenized_full_prompt = self.tokenizer(
            full_prompt, padding=True, truncation=True)

        return tokenized_full_prompt

    def print_trainable_parameters(self, model):
        """Print the trainable and total parameter counts and their ratio.

        Args:
            model: Object exposing ``named_parameters()``.
        """
        trainable_params = 0
        all_param = 0

        for _, param in model.named_parameters():
            all_param += param.numel()  # Total parameters
            if param.requires_grad:
                trainable_params += param.numel()  # Trainable parameters

        # A model without parameters would otherwise divide by zero.
        trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
        print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {trainable_pct}"
        )

# Run Flask Server
- Get Colab public URL
- Run flask API listening on that public URL

In [None]:
# Run Flask Server

from google.colab.output import eval_js

from flask import Flask, render_template, request, jsonify
from flask_ngrok import run_with_ngrok

# Print the Colab proxy URL for local port 5000.
print(eval_js("google.colab.kernel.proxyPort(5000)"))
# Flask application; templates are looked up in the Drive folder given below.
app = Flask(__name__, template_folder='/content/drive/MyDrive/templates')

# NOTE(review): flask_ngrok presumably opens the ngrok tunnel when app.run() is called — confirm.
run_with_ngrok(app)

def validate_request(request_json, required_fields):
    """
    Validates if all the required fields are present in the request json.

    A missing body (``None``) or any body that is not a JSON object is treated
    as invalid rather than raising.

    Args:
        request_json (dict): The request json to validate.
        required_fields (list): A list of strings representing the required fields.

    Returns:
        bool: True if all the required fields are present, False otherwise.
    """
    # Only a JSON object can carry named fields; None, lists and scalars cannot.
    if not isinstance(request_json, dict):
        return False
    return all(field in request_json for field in required_fields)

"""
Root simply returns whether the server is active
"""

@app.route("/")
def index():
    """Health check: respond with ``{"response": true}`` and HTTP 200.

    Any exception is logged with the handler's file, function and line, and
    reported to the client as HTTP 500.
    """
    try:
        return jsonify({"response": True}), 200

    except Exception as e:
        message = str(e)
        exc_type, exc_value, tb = sys.exc_info()
        frame_code = tb.tb_frame.f_code
        summary = f"{exc_type.__name__}: {exc_value}"
        app.logger.error(
            f"Error: {summary}, File: {frame_code.co_filename}, Function: {frame_code.co_name}, Line: {tb.tb_lineno}, Error(e): {e}")
        return jsonify({"error": "Internal server error", "message": message}), 500


"""
Runs the training method
"""
@app.route('/train', methods=['POST'])
def completion():
    """Run one fine-tuning job described by the JSON request body.

    Expected body fields: ``baseModel``, ``trainingData`` (a non-empty list of
    ``{"input": ..., "output": ...}`` objects), ``hfToken``, ``deployToHf`` and
    ``hfModelPath``.

    Returns:
        A ``(json, status)`` pair: 200 with ``success`` and ``model_path`` when
        training completes, 400 for a malformed request, 500 if training fails.
    """
    required_fields = ["baseModel", "trainingData",
                       "hfToken", "deployToHf",
                       "hfModelPath"]

    # silent=True yields None for a missing or invalid JSON body instead of raising,
    # so a malformed request is answered with 400 rather than falling into the 500 path.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not validate_request(payload, required_fields):
        return jsonify({"error": "Missing required params"}), 400

    # Reject badly shaped training data up front instead of failing deep inside training.
    training_data = payload["trainingData"]
    records_ok = (
        isinstance(training_data, list)
        and len(training_data) > 0
        and all(
            isinstance(record, dict) and "input" in record and "output" in record
            for record in training_data
        )
    )
    if not records_ok:
        return jsonify(
            {"error": "trainingData must be a non-empty list of objects with 'input' and 'output'"}), 400

    try:
        model_name = payload["baseModel"]
        hf_token = payload["hfToken"]
        deploy_to_hugging_face = payload["deployToHf"]
        model_path = payload["hfModelPath"]

        # The token is deliberately left out of this log line.
        print(model_name, deploy_to_hugging_face, model_path)

        llm_train = LLMTrain(model_name, training_data, hf_token)
        train = llm_train.run_train(model_name, training_data, deploy_to_hugging_face, hf_token, model_path)

        if not train:
            raise ValueError("ResponseUndefined")

        return jsonify({"success": True,
                        "model_path": model_path}), 200

    except Exception as e:
        # Use the exception's own traceback so this handler needs no module imported
        # by another cell.
        tb = e.__traceback__
        frame_code = tb.tb_frame.f_code
        error_msg = f"{type(e).__name__}: {e}"
        # logger.exception logs at ERROR level and appends the full traceback.
        app.logger.exception(
            f"Error: {error_msg}, File: {frame_code.co_filename}, Function: {frame_code.co_name}, Line: {tb.tb_lineno}, Error(e): {e}")
        return jsonify({"error": "Internal server error", "message": error_msg}), 500



# Start the Flask server when this code runs as the main module; this call blocks
# while the server is running.
if __name__ == "__main__":
    app.run()

https://1wagbd663i9-496ff2e9c6d22116-5000-colab.googleusercontent.com/
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-12:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 203, in _new_conn
    sock = connection.create_connection(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 791, in urlopen
    response = self._make_request(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 497, in _make_request
    conn.request(
  File "/usr/local/lib/python3.10/dist-packages/urllib3