In [2]:
import argparse
import torch
from transformers import AutoModel, AutoTokenizer
from model.openllama import OpenLLAMAPEFTModel
import json

In [2]:
# Sanity check: report whether PyTorch can see a CUDA device.
# initialize_model() (defined in a later cell) loads the checkpoint onto
# 'cuda' and calls .cuda() on the model, so this is expected to be True.
torch.cuda.is_available()

True

In [3]:
def initialize_model():
    """Construct the OpenLLAMA PEFT model and prepare it for GPU inference.

    The model is built from a fixed set of keyword arguments, the delta
    checkpoint is loaded on top of it, and the result is switched to eval
    mode, cast to half precision and moved to the CUDA device.

    Returns:
        The object returned by ``.eval().half().cuda()`` on the constructed
        ``OpenLLAMAPEFTModel``.
    """
    # All checkpoint paths are relative to the notebook's working directory.
    delta_weights_file = '../pretrained_ckpt/pandagpt_ckpt/7b/pytorch_model.pt'
    model_kwargs = {
        'model': 'openllama_peft',
        'imagebind_ckpt_path': '../pretrained_ckpt/imagebind_ckpt',
        'vicuna_ckpt_path': '../pretrained_ckpt/vicuna_ckpt/7b_v0',
        'delta_ckpt_path': delta_weights_file,
        'stage': 2,
        'max_tgt_len': 128,
        'lora_r': 32,
        'lora_alpha': 32,
        'lora_dropout': 0.1,
    }

    peft_model = OpenLLAMAPEFTModel(**model_kwargs)

    # strict=False lets load_state_dict proceed when the checkpoint's keys do
    # not exactly match the model's (presumably the delta file only holds the
    # fine-tuned weights -- TODO confirm).
    delta_state = torch.load(delta_weights_file, map_location=torch.device('cuda'))
    peft_model.load_state_dict(delta_state, strict=False)

    return peft_model.eval().half().cuda()

In [4]:
def parse_text(text):
    """Convert Markdown-style text with fenced code blocks into an HTML string.

    Adapted from https://github.com/GaiZhenbiao/ChuanhuChatGPT/

    Processing rules:
      * Empty lines are dropped.
      * A line containing a triple backtick toggles "code block" state. An
        opening fence becomes ``<pre><code class="language-X">`` where ``X``
        is whatever follows the last backtick on that line; a closing fence
        becomes ``<br></code></pre>``.
      * Every other line except the very first is prefixed with ``<br>``.
        While inside a code block its special characters are escaped first.

    Args:
        text: The raw text to convert (a ``str``).

    Returns:
        The converted lines concatenated into a single string.
    """
    # Single-pass escape table for lines inside a code block. This matches the
    # previous chain of str.replace calls because no replacement text contains
    # a character that is itself a key of the table.
    code_escapes = str.maketrans({
        "`": "\\`",
        "<": "&lt;",
        ">": "&gt;",
        " ": "&nbsp;",
        "*": "&ast;",
        "_": "&lowbar;",
        "-": "&#45;",
        ".": "&#46;",
        "!": "&#33;",
        "(": "&#40;",
        ")": "&#41;",
        "$": "&#36;",
    })

    lines = [line for line in text.split("\n") if line != ""]
    fence_count = 0  # odd => currently inside a fenced code block
    for i, line in enumerate(lines):
        if "```" in line:
            fence_count += 1
            if fence_count % 2 == 1:
                # Opening fence: the text after the last backtick is the language tag.
                language = line.split("`")[-1]
                lines[i] = f'<pre><code class="language-{language}">'
            else:
                lines[i] = "<br></code></pre>"
        elif i > 0:
            # The first line is left untouched so the output has no leading <br>.
            if fence_count % 2 == 1:
                line = line.translate(code_escapes)
            lines[i] = "<br>" + line
    return "".join(lines)

In [5]:
def generate_response(model, input_text, image_path=None, audio_path=None, video_path=None, thermal_path=None, max_length=256, top_p=0.9, temperature=1.0):
    """Send a single generation request to ``model`` and return its result.

    Args:
        model: Object exposing ``generate(inputs)`` where ``inputs`` is a dict.
        input_text: Prompt text, forwarded unchanged under ``'prompt'``.
        image_path: Optional image path; sent as a one-element list when truthy.
        audio_path: Optional audio path; sent as a one-element list when truthy.
        video_path: Optional video path; sent as a one-element list when truthy.
        thermal_path: Optional thermal path; sent as a one-element list when truthy.
        max_length: Forwarded under the ``'max_tgt_len'`` key.
        top_p: Forwarded under the ``'top_p'`` key.
        temperature: Forwarded under the ``'temperature'`` key.

    Returns:
        Whatever ``model.generate`` returns for the assembled request.
    """
    def _as_path_list(path):
        # A missing (falsy) path becomes an empty list rather than [None].
        return [path] if path else []

    request = {
        'prompt': input_text,
        'image_paths': _as_path_list(image_path),
        'audio_paths': _as_path_list(audio_path),
        'video_paths': _as_path_list(video_path),
        'thermal_paths': _as_path_list(thermal_path),
        'top_p': top_p,
        'temperature': temperature,
        'max_tgt_len': max_length,
        # Assuming no modality embeddings are needed in this one-shot context.
        'modality_embeds': [],
    }
    return model.generate(request)

In [6]:
# Build the model once and keep it in the notebook-level `model` variable so
# later cells can reuse it without reloading the checkpoints.
model = initialize_model()
print("[!] Model initialized successfully.")

Initializing visual encoder from ../pretrained_ckpt/imagebind_ckpt ...
Visual encoder initialized.
Initializing language decoder from ../pretrained_ckpt/vicuna_ckpt/7b_v0 ...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at ../pretrained_ckpt/vicuna_ckpt/7b_v0 and are newly initialized: ['model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn

trainable params: 33554432 || all params: 6771978240 || trainable%: 0.49548936530546206
Language decoder initialized.
[!] Model initialized successfully.


In [15]:
# Ask the model a question about the demo climbing video. Only the video
# modality is supplied; the remaining media inputs are left unset.
# NOTE(review): the video path is an absolute, machine-specific location --
# adjust it when running this notebook elsewhere.
demo_request = {
    "input_text": "What technical suggestion can you give to the climber in the video?",
    "image_path": None,
    "audio_path": None,
    "video_path": "/leonardo_work/IscrC_LAMPE/VLMs/SkillGPT/code/assets/videos/DemoClimb.mp4",
    "thermal_path": None,
    "max_length": 256,
    "top_p": 0.9,
    "temperature": 1.0,
}
response = generate_response(model, **demo_request)

# Show the reply after running it through parse_text.
# NOTE(review): parse_text emits HTML markup, which print() shows as plain text.
print(f"Model Response: {parse_text(response)}")

Model Response: A possible technical suggestion for the climber in the video is to practice their bouldering skills on different types of rock formations or colorful holds to challenge themselves and enhance their overall climbing abilities. By climbing on various shapes, sizes, and materials of holds, the climber can develop greater control, balance, and adaptability while climbing. This will help improve their technique and confidence, ultimately making them a more skilled and competent climber.
