In [1]:
LLAVA_EXEC_PATH = "..\\..\\llama.cpp\\build\\bin\\Release\\llama-llava-cli"
# MODEL_PATH = "../../Models/ggml_llava-v1.5-7b/ggml-model-f16.gguf"
MODEL_PATH = "../../Models/ggml_llava-v1.5-7b/ggml-model-f16.gguf"
MMPROJ_PATH = "../../Models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf"

In [2]:
from pathlib import Path

DATA_DIR = "data"
IMAGE_DIR = Path(DATA_DIR, "image")
TXT_DIR = Path(DATA_DIR, "txt")

In [3]:
import glob

image_paths = sorted(glob.glob(str(IMAGE_DIR.joinpath("*.jpg"))))


# image_paths output
# [''data/image/anthony-delanoix-Q0-fOL2nqZc-unsplash.jpg'',
# ''data/image/arthur-humeau-3xwdarHxHqI-unsplash.jpg'',
# ''data/image/bastien-nvs-SprV1eqNrqM-unsplash.jpg'',
# ''data/image/marloes-hilckmann-EUzxLX8p8IA-unsplash.jpg'',
# ''data/image/michael-fousert-Ql9PCaOhyyE-unsplash.jpg'']

In [4]:
TEMP = 0.1
PROMPT = (
    "Scaled Dot-Product Attention."
)

bash_command = f'{LLAVA_EXEC_PATH} -m {MODEL_PATH} --mmproj {MMPROJ_PATH} --temp {TEMP} -p "{PROMPT}"'

# Bash command output
# ~/Code/llama.cpp/build/bin/llava -m ~/Models/ggml_llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj ~/Models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf --temp 0.1 -p "The image shows a site in Paris. Describe the image like a tourist guide would."

In [5]:
import subprocess
import time

for image_path in image_paths:
    print(f"Processing {image_path}")
    image_name = Path(image_path).stem
    image_summary_path = TXT_DIR.joinpath(image_name + ".txt")

    # add input image and output txt filenames to bash command
    bash_command_cur = f'{bash_command} --image "{image_path}" > "{image_summary_path}"'

    # run the bash command
    time_start = time.time()
    process = subprocess.Popen(
        bash_command_cur, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    # get the output and error from the command
    output, error = process.communicate()

    # commment output and error for less verbose output
    print("Output:")
    print(output.decode("utf-8"))

    print("Error:")
    print(error.decode("utf-8"))

    # return the code of the command
    return_code = process.returncode
    time_end = time.time()
    exec_time_sec = time_end - time_start

    print(f"Return code: {return_code}. Finished in {exec_time_sec:.2f} sec")
    print()

print("Done")

Processing data\image\test8.jpg
Output:

Error:
build: 3862 (3f1ae2e3) with MSVC 19.41.34120.0 for x64
llama_model_loader: loaded meta data with 18 key-value pairs and 291 tensors from ../../Models/ggml_llava-v1.5-7b/ggml-model-f16.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dime

In [6]:
filepaths = sorted(glob.glob(str(TXT_DIR.joinpath("*.txt"))))
image_texts = []

for filepath in filepaths:
    with open(filepath, "r") as f:
        image_text = f.read()
    image_texts.append(image_text)

In [7]:
print(image_texts[0])

clip_model_load: model name:   openai/clip-vit-large-patch14-336
clip_model_load: description:  image encoder for LLaVA
clip_model_load: GGUF version: 2
clip_model_load: alignment:    32
clip_model_load: n_tensors:    377
clip_model_load: n_kv:         18
clip_model_load: ftype:        f16

clip_model_load: loaded meta data with 18 key-value pairs and 377 tensors from ../../Models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf
clip_model_load: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
clip_model_load: - kv   0:                       general.architecture str              = clip
clip_model_load: - kv   1:                      clip.has_text_encoder bool             = false
clip_model_load: - kv   2:                    clip.has_vision_encoder bool             = true
clip_model_load: - kv   3:                   clip.has_llava_projector bool             = true
clip_model_load: - kv   4:                          general.file_type u32              = 1
clip_model_l