# Model API

**Instalación**

In [None]:
!python3.11 -m venv venv
!source venv/bin/activate
!pip install autotrain-advanced peft accelerate bitsandbytes Pillow

!autotrain llm --train --project_name depresslm --model daryl149/llama-2-7b-chat-hf --data_path data --peft --quantization int4 --lr 2e-4 --batch-size 4 --epochs 2 --trainer sft  --token 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 

**Carga del modelo**

In [None]:
import os

import accelerate
import bitsandbytes as bnb
import torch
from PIL import Image

# Blip 2
from transformers import AutoProcessor
from transformers import Blip2ForConditionalGeneration

# Quantization settings used by the model loaders
from transformers import BitsAndBytesConfig

# Llama + Lora
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig


# Preferred torch device string: "cuda" when torch reports an available GPU, otherwise "cpu".
TORCH_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_transcriptor():
    """Load the BLIP-2 processor and its 4-bit quantized captioning model.

    Returns:
        tuple: ``(processor, model)`` where ``processor`` comes from
        ``AutoProcessor.from_pretrained`` and ``model`` from
        ``Blip2ForConditionalGeneration.from_pretrained``, both for the
        ``Salesforce/blip2-opt-2.7b`` checkpoint.
    """
    checkpoint = "Salesforce/blip2-opt-2.7b"

    # `quantization_config` is documented to take a BitsAndBytesConfig. A plain dict
    # is not accepted by every transformers release (some read attributes such as
    # `.load_in_4bit` directly from the object), so build the real config object.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    processor = AutoProcessor.from_pretrained(checkpoint)
    model = Blip2ForConditionalGeneration.from_pretrained(
        checkpoint,
        device_map="auto",
        quantization_config=quantization,
    )

    return processor, model


def get_caption(file, processor, model):
    """Generate a short caption for an image.

    Args:
        file: Path or file object accepted by ``PIL.Image.open``.
        processor: Callable that turns the image into model inputs and exposes
            ``batch_decode`` (here: the BLIP-2 processor).
        model: Model exposing ``generate`` and ``device`` (here: the BLIP-2 model).

    Returns:
        str: The first decoded sequence with surrounding whitespace stripped.
    """
    # The context manager closes the underlying file handle; `convert` returns an
    # independent in-memory image, so it remains usable after the block exits.
    with Image.open(file) as source_image:
        image = source_image.convert("RGB")

    # Send the inputs to wherever the model was actually placed instead of relying
    # on a global device guess (the model is loaded with `device_map="auto"`).
    inputs = processor(images=image, return_tensors="pt").to(model.device, torch.float16)
    generated_ids = model.generate(**inputs, max_new_tokens=30)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

def load_model(adapter_model, base_model="daryl149/llama-2-7b-chat-hf"):
    """Load a 4-bit quantized causal LM and attach a PEFT adapter to it.

    Args:
        adapter_model: Identifier or path passed to ``PeftModel.from_pretrained``.
        base_model: Checkpoint for the base model and tokenizer. Defaults to the
            checkpoint that was previously hard-coded in this function.

    Returns:
        tuple: ``(model, tokenizer)`` — the adapter-wrapped model and the
        tokenizer of ``base_model``.
    """
    # `quantization_config` is documented to take a BitsAndBytesConfig; a plain dict
    # is not accepted by every transformers release.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Device placement is decided at load time. Calling `.to("cuda")` afterwards is
    # not supported for bitsandbytes 4-bit models and would also hard-code CUDA.
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map="auto",
    )
    # The base model is already quantized, so the adapter loader is not given
    # any quantization settings.
    model = PeftModel.from_pretrained(model, adapter_model)
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    return model, tokenizer


def inference_model(procesed_data, model, tokenizer):
    """Generate text for a prompt and return the decoded first sequence.

    Args:
        procesed_data: Prompt text handed to ``tokenizer``.
        model: Object exposing ``generate`` and ``device``.
        tokenizer: Callable returning a mapping with ``"input_ids"`` (and
            optionally ``"attention_mask"``) tensors, and exposing ``batch_decode``.

    Returns:
        str: ``batch_decode(...)[0]`` of the generated ids with special tokens
        skipped. The generated ids are decoded as returned by ``generate``.
    """
    inputs = tokenizer(procesed_data, return_tensors="pt")

    # Follow the model's own placement rather than assuming a CUDA device exists.
    device = model.device
    generate_kwargs = {
        "input_ids": inputs["input_ids"].to(device),
        "max_new_tokens": 200,
    }

    # Forward the attention mask when the tokenizer provides one so that
    # `generate` does not have to guess which positions are padding.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    with torch.no_grad():
        outputs = model.generate(**generate_kwargs)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
# Adapter identifier handed to `load_model`; presumably the output directory of the
# training run above (same name as its `--project_name`) — verify before running.
adapter_model = "depresslm"

processor, transcriptor = load_transcriptor()
model, tokenizer = load_model(adapter_model)

# Image to caption. Set the CAPTION_IMAGE_PATH environment variable to point at
# another file; the default keeps the path this notebook was originally run with.
file = os.environ.get("CAPTION_IMAGE_PATH", "/home/fercho/Descargas/Test.png")

generated_text = get_caption(file, processor, transcriptor)
response = inference_model("Hola, cómo estás?", model, tokenizer)