In [None]:
# Configure the PyTorch CUDA caching allocator before the model is loaded.
# "expandable_segments:True" is an allocator option intended to reduce memory
# fragmentation; it does not by itself guarantee that the model will fit.
import os

# Set the option before importing torch so that it is already in place no
# matter when the allocator reads its configuration.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # noqa: E402  (deliberately imported after the environment is configured)

# Release cached GPU memory that is no longer in use; mainly useful when this
# cell is re-run in a kernel that has already done GPU work.
torch.cuda.empty_cache()

In [None]:
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Hugging Face Hub identifier of the checkpoint used for OCR.
model_path = "moonshotai/Kimi-VL-A3B-Instruct"

# Options shared by both loaders.
# NOTE: trust_remote_code=True allows Python code shipped with the checkpoint
# repository to run locally; only use it with repositories you trust.
hub_options = {"trust_remote_code": True}

# Load the weights onto the GPU, letting the library pick the dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype="auto", device_map="cuda", **hub_options
)

# The processor prepares the chat prompt and the image inputs for the model.
processor = AutoProcessor.from_pretrained(model_path, **hub_options)

In [None]:
# Remove the results file left behind by a previous run, if there is one.
# NOTE(review): the OCR cell of this notebook writes to the relative path
# "KimiOCR.json"; confirm that it resolves to this same location.
file_path = '/home/jovyan/Evaluation/OCR/KimiOCR.json'
try:
    # Attempt the removal directly instead of checking for existence first:
    # this avoids a check-then-act race and also clears a dangling symlink.
    os.remove(file_path)
except FileNotFoundError:
    print("⚠️ File does not exist.")
else:
    print("✅ File deleted successfully.")

In [None]:
import re
import json
# Run OCR with the loaded model over every file in the image directory and
# store the recognised words per image in a JSON file.

# Directory containing the images to process.
directory_path = "/home/jovyan/images"

# Keep regular files only (sub-directories are skipped). The names are sorted
# so the processing order and the key order of the JSON output are reproducible.
files = sorted(
    file
    for file in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, file))
)

# Instruction sent with every image. It does not depend on the image, so it is
# built once, outside the loop. The text is sent verbatim, indentation included.
# NOTE(review): "Object Character Recognition" is probably meant to be
# "Optical Character Recognition"; left unchanged so that results stay
# comparable with earlier runs of this prompt.
prompt = """
    TASK : Object Character Recognition by Region.
    Detect all text in the image.
    Output:
        - Do not include any header.
        - Include only the text detected and nothing else.
        - Ensure a consistent output.
    """

# Maps each image file name to the list of whitespace-separated tokens of the
# model's response.
data = {}

for index, file in enumerate(files):
    print(f"Processing image {file}")

    image_path = os.path.join(directory_path, file)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                # The text entry must be the prompt string itself; wrapping it
                # in braces would build a one-element set, not a string.
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

    # The context manager closes the image file handle as soon as the model
    # inputs have been built, so handles do not pile up over a large directory.
    with Image.open(image_path) as image:
        inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=512)

    # Drop the leading len(input_ids) tokens of every output sequence so that
    # only the tokens after the prompt are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Split on any whitespace to obtain the list of detected words.
    data[file] = response.split()
    print(f"Successfully processed Image {file} ,{index+1}/{len(files)} Images Processed")

# NOTE(review): this is a relative path, so the file lands in the kernel's
# current working directory; confirm that this is the intended location.
with open("KimiOCR.json", "w") as f:
    json.dump(data, f, indent=4)
