In [1]:
# Install into the environment of the running kernel. `%pip` targets the kernel's
# interpreter, whereas `!pip` runs whichever `pip` is first on the shell PATH.
# NOTE(review): versions are unpinned, so results may drift between runs; pin them
# once the known-good versions have been confirmed.
%pip install -q -U transformers
%pip install -q bitsandbytes accelerate

In [None]:
import os

def get_all_files_in_folder(folder_path):
    """
    Recursively collect every file located beneath the given folder.

    Args:
    folder_path (str): Directory to walk; its subdirectories are visited as well.

    Returns:
    List[str]: Absolute path of each file found, in ``os.walk`` traversal order.
    """
    return [
        # Join with the directory currently being visited, then normalise to absolute.
        os.path.abspath(os.path.join(current_dir, name))
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]

# Directory holding the meme files to describe; replace with your own path.
folder_path = '/kaggle/input/exist-memes-test'
# Sort the listing so the order of the files (and of everything derived from
# them later) does not depend on filesystem traversal order.
absolute_paths = get_all_files_in_folder(folder_path)
absolute_paths.sort()

In [4]:
import torch
from transformers import BitsAndBytesConfig
from transformers import pipeline
from datasets import Dataset
# bitsandbytes settings handed to the model loader: load the model in 4-bit
# and use float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)
from transformers.pipelines.pt_utils import KeyDataset


# Checkpoint identifier passed to the pipeline as `model`.
model_id = "llava-hf/llava-1.5-7b-hf"
# Image-to-text pipeline; the quantization settings are forwarded to the model
# loader through `model_kwargs`.
pipe = pipeline(
    task="image-to-text",
    model=model_id,
    model_kwargs=dict(quantization_config=quantization_config),
)
# Upper bound on the number of tokens generated per image.
max_new_tokens = 128
# Prompt sent with every image: `<image>` placeholder, then the instruction,
# ending on "ASSISTANT:" for the model to continue from.
prompt = "USER: <image>\nDescribe the content of the meme, but ignore the text caption of the meme. Provide a clear,concise and short answer.\nASSISTANT:"

2024-08-24 17:03:42.189135: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-24 17:03:42.189248: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-24 17:03:42.331525: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

In [5]:
# Run the captioning pipeline over every meme path and collect the generated text.
# The list is reset here so that re-running the cell does not duplicate entries.
generated_descriptions = []
dataset = Dataset.from_dict({"path_memes": absolute_paths})

# Build the single-column view once and reuse it for both the length and the
# pipeline input instead of constructing it twice.
meme_paths = KeyDataset(dataset, "path_memes")
total = len(meme_paths)
generation_options = {"max_new_tokens": max_new_tokens, "num_beams": 2}

# `enumerate` supplies the progress counter, so no manual increment is needed.
for i, output in enumerate(pipe(meme_paths, prompt=prompt, generate_kwargs=generation_options)):
    print(f"Iteration {i} / {total}")
    # Each `output` is iterated as a collection of dicts carrying "generated_text".
    # NOTE(review): the text appears to include the prompt (it is split on
    # "ASSISTANT:" during cleaning) - confirm against a sample.
    generated_descriptions.extend(o["generated_text"] for o in output)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


Iteration 0 / 1053
Iteration 1 / 1053
Iteration 2 / 1053
Iteration 3 / 1053
Iteration 4 / 1053
Iteration 5 / 1053
Iteration 6 / 1053
Iteration 7 / 1053
Iteration 8 / 1053
Iteration 9 / 1053
Iteration 10 / 1053
Iteration 11 / 1053
Iteration 12 / 1053
Iteration 13 / 1053
Iteration 14 / 1053
Iteration 15 / 1053
Iteration 16 / 1053
Iteration 17 / 1053
Iteration 18 / 1053
Iteration 19 / 1053
Iteration 20 / 1053
Iteration 21 / 1053
Iteration 22 / 1053
Iteration 23 / 1053
Iteration 24 / 1053
Iteration 25 / 1053
Iteration 26 / 1053
Iteration 27 / 1053
Iteration 28 / 1053
Iteration 29 / 1053
Iteration 30 / 1053
Iteration 31 / 1053
Iteration 32 / 1053
Iteration 33 / 1053
Iteration 34 / 1053
Iteration 35 / 1053
Iteration 36 / 1053
Iteration 37 / 1053
Iteration 38 / 1053
Iteration 39 / 1053
Iteration 40 / 1053
Iteration 41 / 1053
Iteration 42 / 1053
Iteration 43 / 1053
Iteration 44 / 1053
Iteration 45 / 1053
Iteration 46 / 1053
Iteration 47 / 1053
Iteration 48 / 1053
Iteration 49 / 1053
Iteration 

In [6]:
def clean_all_text(generated_descriptions):
    """
    Extract the assistant's reply from each generated string.

    Each text is split on the literal marker "ASSISTANT:" and the segment that
    follows the first marker is returned with surrounding whitespace stripped.

    Args:
    generated_descriptions (Iterable[str]): Raw generated texts.

    Returns:
    List[str]: One cleaned entry per input. Entries that cannot be processed
    (no "ASSISTANT:" marker, or not a string) become "ERROR:<original value>"
    so the output stays aligned with the input.
    """
    def get_assistant_part(text):
        try:
            return text.split("ASSISTANT:")[1].strip()
        # Only the failures this best-effort fallback is meant to cover:
        # missing marker (IndexError) or a non-string item (AttributeError /
        # TypeError). A bare `except:` would also swallow KeyboardInterrupt
        # and SystemExit, making a long run impossible to interrupt cleanly.
        except (IndexError, AttributeError, TypeError):
            return f"ERROR:{text}"

    return [get_assistant_part(t) for t in generated_descriptions]


def save_list_to_text_file(description_list, filename, encoding="utf-8"):
    """
    Write each description to ``filename``, one per line.

    Line breaks inside a description are replaced by single spaces so that
    line N of the file always corresponds to item N of the list.

    Args:
    description_list (Iterable[str]): Texts to write.
    filename (str): Destination path; an existing file is overwritten.
    encoding (str): Text encoding of the output file. Defaults to UTF-8 so the
        result does not depend on the platform's locale and non-ASCII
        characters in the text can always be written.
    """
    with open(filename, "w", encoding=encoding) as f:
        for item in description_list:
            # Collapse multi-line text onto one line to keep one item per line.
            item = " ".join(item.splitlines())
            f.write(f"{item}\n")
        
# Keep the untouched generated text on disk.
save_list_to_text_file(
    description_list=generated_descriptions,
    filename="test_generated_descriptions.txt",
)
# Reduce each text to the part after "ASSISTANT:" and store that alongside it.
clean_generated_descriptions = clean_all_text(generated_descriptions)
save_list_to_text_file(
    description_list=clean_generated_descriptions,
    filename="test_clean_generated_descriptions.txt",
)