<a href="https://colab.research.google.com/github/Domqwerty/llava-video-renamer/blob/main/llava_video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Work from the Colab content root.
%cd /content
# Clone the v1.0 branch of the camenduru LLaVA fork and move into the checkout.
!git clone -b v1.0 https://github.com/camenduru/LLaVA
%cd /content/LLaVA

# Pin transformers, then install gradio together with the package in the
# current directory (the "." target, i.e. the LLaVA checkout entered above).
!pip install -q transformers==4.36.2
!pip install -q gradio .
# OpenCV provides the cv2 module used for frame extraction.
!pip install -q opencv-python

In [None]:
import os
import requests
from PIL import Image
from io import BytesIO
import cv2
from transformers import AutoTokenizer, BitsAndBytesConfig, TextStreamer
from llava.model import LlavaLlamaForCausalLM
import torch
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

def extract_first_frame(video_path, output_image_path):
    """Save the first frame of a video as an image file.

    Args:
        video_path: Path of the video to read.
        output_image_path: Destination path of the image; passed unchanged
            to ``cv2.imwrite``.

    Returns:
        True only if a frame was read from the video AND written to
        ``output_image_path``; False otherwise.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        success, frame = cap.read()
    finally:
        # Release the capture handle even if reading raises.
        cap.release()

    if not success:
        return False

    # imwrite reports a failed write through its return value. Ignoring it
    # would let the caller caption a stale image left over from an earlier
    # video instead of this one.
    return bool(cv2.imwrite(output_image_path, frame))

def rename_video(video_path, description):
    """Rename a video file after a textual description, keeping its extension.

    The description is turned into a file name by replacing spaces with
    underscores and dropping commas, periods, colons and apostrophes. Path
    separators are replaced with underscores as well, so the renamed file
    always stays in the directory of the original video.

    Args:
        video_path: Path of the existing video file.
        description: Free text used to build the new file name.

    Returns:
        The path the video was renamed to. If the target name is already
        taken, ``_1``, ``_2``, ... is appended before the extension.

    Raises:
        OSError: If ``os.rename`` fails (for example, the name is too long).
    """
    directory, original_name = os.path.split(video_path)
    _, file_extension = os.path.splitext(original_name)

    # One pass over the text: map to "_" or delete (None).
    cleanup = str.maketrans({
        " ": "_",
        "/": "_",
        "\\": "_",
        ",": None,
        ".": None,
        ":": None,
        "'": None,
        "\0": None,
    })
    new_name = description.translate(cleanup)
    new_video_path = os.path.join(directory, f"{new_name}{file_extension}")

    # Pick the first free name by appending an increasing counter.
    counter = 1
    while os.path.exists(new_video_path):
        new_video_path = os.path.join(directory, f"{new_name}_{counter}{file_extension}")
        counter += 1

    os.rename(video_path, new_video_path)
    return new_video_path

# Model path (you will need to download it or use one you already have)
model_path = "4bit/llava-v1.5-13b-3GB" # Example; make sure the model is in this folder
# If you want to download a different model:
# model_path = "liuhaotian/llava-v1.5-7b"

In [None]:
# Loading options: automatic device placement plus 4-bit NF4 quantisation
# with double quantisation and float16 compute.
kwargs = dict(
    device_map="auto",
    load_in_4bit=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
)

# Model first, then its tokenizer (slow tokenizer implementation).
model = LlavaLlamaForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    **kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
)

# Make sure the vision tower is loaded and on the GPU, then expose the
# image processor attached to it.
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device='cuda')
image_processor = vision_tower.image_processor

In [None]:
def caption_image(image_path, prompt):
    """Describe a local image with the loaded LLaVA model.

    Uses the notebook-level ``model``, ``tokenizer`` and ``image_processor``
    objects, which must already exist when this function is called.

    Args:
        image_path: Path of an image file on disk.
        prompt: Instruction given to the model about the image.

    Returns:
        The generated text, with the ``</s>`` marker, a trailing
        conversation stop string and surrounding whitespace removed.
    """
    # Convert inside a context manager so the file handle is closed before
    # the caller overwrites the same image path for the next video.
    with Image.open(image_path) as image_file:
        image = image_file.convert('RGB')
    disable_torch_init()
    conv_mode = "llava_v0"
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
    inp = f"{roles[0]}: {prompt}"
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(raw_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2,
                                    max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria])
    # The slice skips the first input_ids.shape[1] tokens, i.e. it assumes
    # generate() returns the prompt ids followed by the new tokens.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    output = outputs.rsplit('</s>', 1)[0].strip()
    # Generation is halted on stop_str, so it can be the tail of the text;
    # drop it so it does not leak into the description.
    if stop_str and output.endswith(stop_str):
        output = output[:-len(stop_str)].strip()
    return output

In [None]:
def caption_image(image_path, prompt):
    """Describe a local image with the loaded LLaVA model.

    Uses the notebook-level ``model``, ``tokenizer`` and ``image_processor``
    objects, which must already exist when this function is called.

    Args:
        image_path: Path of an image file on disk.
        prompt: Instruction given to the model about the image.

    Returns:
        The generated text, with the ``</s>`` marker, a trailing
        conversation stop string and surrounding whitespace removed.
    """
    # Convert inside a context manager so the file handle is closed before
    # the caller overwrites the same image path for the next video.
    with Image.open(image_path) as image_file:
        image = image_file.convert('RGB')
    disable_torch_init()
    conv_mode = "llava_v0"
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
    inp = f"{roles[0]}: {prompt}"
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(raw_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2,
                                    max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria])
    # The slice skips the first input_ids.shape[1] tokens, i.e. it assumes
    # generate() returns the prompt ids followed by the new tokens.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    output = outputs.rsplit('</s>', 1)[0].strip()
    # Generation is halted on stop_str, so it can be the tail of the text;
    # drop it so it does not leak into the description.
    if stop_str and output.endswith(stop_str):
        output = output[:-len(stop_str)].strip()
    return output

In [None]:


def process_videos_in_folder(video_folder, output_image_path, prompt="Descrivi l'immagine in dettaglio in italiano.."):
    """Caption the first frame of every video in a folder and rename the video.

    For each regular file in ``video_folder`` whose name ends in ``.mp4``,
    ``.avi`` or ``.mkv`` (case-insensitive), the first frame is written to
    ``output_image_path``, described with ``caption_image`` and the video is
    renamed with ``rename_video``. Other entries are skipped.

    Args:
        video_folder: Directory containing the videos.
        output_image_path: Path of the temporary frame image; it is
            overwritten for every video.
        prompt: Instruction passed to ``caption_image``.
    """
    video_extensions = (".mp4", ".avi", ".mkv")

    # Sorted so that runs over the same folder visit files in the same order.
    for video_file in sorted(os.listdir(video_folder)):
        video_path = os.path.join(video_folder, video_file)

        # Lower-case the name so "CLIP.MP4" is not silently skipped.
        if not video_file.lower().endswith(video_extensions):
            continue
        # A directory that merely has a video-like name is not a video.
        if not os.path.isfile(video_path):
            continue

        print(f"Processando: {video_path}")
        if not extract_first_frame(video_path, output_image_path):
            print(f"Impossibile estrarre il primo fotogramma da: {video_path}")
            continue

        # Use LLaVA to describe the extracted frame.
        description = caption_image(output_image_path, prompt)
        print(f"Descrizione rilevata: {description}")
        rename_video(video_path, description)

In [None]:
import hashlib

def rename_video(video_path, description):
    """Rename a video after a description, with truncation and a hash suffix.

    The new name is ``<cleaned description>_<hash><extension>``. The cleaned
    description has spaces and path separators replaced with underscores and
    commas, periods, colons and apostrophes removed, and is cut to 200
    characters. ``<hash>`` is the first 8 hex digits of the SHA-256 of the
    full, uncleaned description.

    Args:
        video_path: Path of the existing video file.
        description: Free text used to build the new file name.

    Returns:
        The path the video was renamed to. If the target name is already
        taken, ``_1``, ``_2``, ... is inserted before the extension.

    Raises:
        OSError: If ``os.rename`` fails.
    """
    directory, original_name = os.path.split(video_path)
    _, file_extension = os.path.splitext(original_name)

    # One pass over the text: map to "_" or delete (None). Separators are
    # replaced so the renamed file cannot leave `directory`.
    cleanup = str.maketrans({
        " ": "_",
        "/": "_",
        "\\": "_",
        ",": None,
        ".": None,
        ":": None,
        "'": None,
        "\0": None,
    })

    # Truncate the description to 200 characters (adjust if needed).
    max_length = 200
    stem = description.translate(cleanup)[:max_length]

    # Hash the full description so truncated names remain distinguishable.
    hash_hex = hashlib.sha256(description.encode()).hexdigest()
    base_name = f"{stem}_{hash_hex[:8]}"

    new_video_path = os.path.join(directory, f"{base_name}{file_extension}")

    # Duplicate handling: every candidate is built from the same base name,
    # so the extension and hash are never repeated inside the file name.
    counter = 1
    while os.path.exists(new_video_path):
        new_video_path = os.path.join(directory, f"{base_name}_{counter}{file_extension}")
        counter += 1

    os.rename(video_path, new_video_path)
    return new_video_path

In [None]:
# Paths
video_folder = "/content/videos"  # Create a 'videos' folder in Colab and upload the videos
output_image_path = "temp_frame.jpg"

# Make sure the video folder exists; exist_ok creates it only when missing,
# without a separate check-then-create step.
os.makedirs(video_folder, exist_ok=True)

# Run
process_videos_in_folder(video_folder, output_image_path)

In [None]:
# Recursively bundle /content/videos into /content/videos.zip (presumably for download).
!zip -r /content/videos.zip /content/videos