In [1]:
# import sys
# !{sys.executable} -m pip install ultralytics

In [2]:
from ultralytics import YOLO
import os
from PIL import Image
import glob
import json
from tqdm import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration, MarianMTModel, MarianTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Image-captioning checkpoint: the processor and the generator are loaded
# from the same Hugging Face model id so that they stay in sync.
_CAPTION_CHECKPOINT = "Salesforce/blip-image-captioning-base"
caption_processor = BlipProcessor.from_pretrained(_CAPTION_CHECKPOINT)
caption_model = BlipForConditionalGeneration.from_pretrained(_CAPTION_CHECKPOINT)

# Translation checkpoint (English -> Indonesian, going by the "en-id" model id)
# used to translate the generated captions.
_TRANSLATION_CHECKPOINT = "Helsinki-NLP/opus-mt-en-id"
translator = MarianMTModel.from_pretrained(_TRANSLATION_CHECKPOINT)
translator_tokenizer = MarianTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [25]:
def generate_caption(img_url, img_obj):
    """Caption one detected region of a page image and return the translation.

    Args:
        img_url: path of the page image, passed to ``Image.open``.
        img_obj: dict with the pixel box of the region
            (``x_min``, ``y_min``, ``x_max``, ``y_max``).

    Returns:
        The caption decoded from the translation model's output, i.e. the
        translated form of the caption generated for the cropped region.
    """
    top, bottom = img_obj["y_min"], img_obj["y_max"]
    left, right = img_obj["x_min"], img_obj["x_max"]

    # PIL crop boxes are given as (left, upper, right, lower).
    region = Image.open(img_url).convert('RGB').crop((left, top, right, bottom))

    # Conditional captioning: the model continues this text prompt.
    prompt = "an image of"
    model_inputs = caption_processor(region, prompt, return_tensors="pt")
    generated_ids = caption_model.generate(**model_inputs)
    english_caption = caption_processor.decode(generated_ids[0], skip_special_tokens=True)

    # Translate the generated caption with the translation model.
    encoded = translator_tokenizer(english_caption, return_tensors="pt", padding=True)
    translated_ids = translator.generate(**encoded)
    return translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)

In [5]:
# Run the fold-4 detector weights over every page image in the source folder.
# The label files written by this run are read back later from
# 'detect_result/predict/labels' with six columns per line (class, box, confidence).
# NOTE(review): the later cells assume the run folder is exactly
# 'detect_result/predict' -- confirm this still holds when the cell is re-run.
model = YOLO("models/kfold_result/kfold_training/fold_4/weights/best.pt")
results = model.predict(
    source="../data/raw/source_file",
    project="detect_result",
    save=True,
    save_txt=True,
    save_conf=True,
)


image 1/661 d:\Research-Methodology\yolo\..\data\raw\source_file\1.png: 640x480 2 PageNumbers, 134.8ms
image 2/661 d:\Research-Methodology\yolo\..\data\raw\source_file\10.png: 640x480 1 PageNumber, 105.1ms
image 3/661 d:\Research-Methodology\yolo\..\data\raw\source_file\100.png: 640x480 4 Captions, 1 PageNumber, 4 Pictures, 105.4ms
image 4/661 d:\Research-Methodology\yolo\..\data\raw\source_file\101.png: 640x480 1 Caption, 1 PageNumber, 2 Pictures, 105.6ms
image 5/661 d:\Research-Methodology\yolo\..\data\raw\source_file\102.png: 640x480 (no detections), 104.3ms
image 6/661 d:\Research-Methodology\yolo\..\data\raw\source_file\103.png: 640x480 1 PageNumber, 106.5ms
image 7/661 d:\Research-Methodology\yolo\..\data\raw\source_file\104.png: 640x480 2 PageNumbers, 105.6ms
image 8/661 d:\Research-Methodology\yolo\..\data\raw\source_file\105.png: 640x608 (no detections), 153.9ms
image 9/661 d:\Research-Methodology\yolo\..\data\raw\source_file\106.png: 640x480 (no detections), 102.9ms
image 10

In [26]:
# Class ids used in the detector's label files.
id2label = {0: "Caption", 1: "PageNumber", 2: "Picture"}

# Pages are numbered 1..661; entry k of each list belongs to page k + 1.
_page_numbers = range(1, 662)
img_file_names = [f'../data/raw/source_file/{page}.png' for page in _page_numbers]
img_json = [f'../data/raw/ocr_dict/dict_{page}.json' for page in _page_numbers]

# Folder holding the per-page YOLO label files.
label_dir = 'detect_result/predict/labels' #yolo

In [27]:
def get_detected_result(image_path, label_path):
    """Parse one YOLO label file into pixel-space boxes grouped by class id.

    Args:
        image_path: path of the page image; only its size is read, to scale
            the normalised label coordinates to pixels.
        label_path: path of a text file with one detection per line in the
            form ``class x_center y_center width height [confidence]``.

    Returns:
        dict mapping class id (int) to a list of detections. Keys 0, 1 and 2
        are always present. Each detection is a dict with integer ``x_min``,
        ``x_max``, ``y_min``, ``y_max`` and ``conf`` (float, or None when the
        line carries no confidence column).

    Raises:
        ValueError: if a non-blank line has fewer than five columns or a
            column is not numeric.
    """
    detected = {class_id: [] for class_id in range(3)}

    # Context manager so the image file handle is released right away;
    # only the size is needed here.
    with Image.open(image_path) as img:
        img_width, img_height = img.size

    with open(label_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. trailing newline).
                continue

            # Class ids are written as integers; go through float() so a
            # value such as "2.0" is still accepted.
            label = int(float(fields[0]))
            x_center, y_center, w, h = map(float, fields[1:5])
            confidence = float(fields[5]) if len(fields) > 5 else None

            # Convert centre/size (normalised) to corner coordinates in pixels.
            obj = {
                'x_min': int((x_center - w / 2) * img_width),
                'x_max': int((x_center + w / 2) * img_width),
                'y_min': int((y_center - h / 2) * img_height),
                'y_max': int((y_center + h / 2) * img_height),
                'conf': confidence,
            }
            detected.setdefault(label, []).append(obj)

    return detected

        #     print(f"Class {id2label[int(label)]}: (x_min={x_min}, y_min={y_min}), (x_max={x_max}, y_max={y_max})")
        #     print(f"Image width: {img_width}, image height: {img_height}")
        #     print()
        # print("=====================================================================")


In [28]:
def intersect_rect(image_obj, x_min2, x_max2, y_min2, y_max2, tolerance=2):
    """Tell whether a detected box, grown by ``tolerance``, overlaps a rectangle.

    The tolerance is applied outwards on every side of ``image_obj`` (min
    edges move down/left, max edges move up/right) so that rectangles lying
    just outside the detected box on any side are still treated as
    overlapping. Rectangles that merely touch the grown box edge-to-edge do
    not count as intersecting.

    Args:
        image_obj: dict with ``x_min``, ``x_max``, ``y_min``, ``y_max``.
        x_min2, x_max2, y_min2, y_max2: edges of the second rectangle.
        tolerance: margin added around ``image_obj``; defaults to 2.

    Returns:
        True if the two rectangles overlap, False otherwise.
    """
    x_min1 = image_obj['x_min'] - tolerance
    x_max1 = image_obj['x_max'] + tolerance
    y_min1 = image_obj['y_min'] - tolerance
    y_max1 = image_obj['y_max'] + tolerance

    overlaps_x = x_min1 < x_max2 and x_min2 < x_max1
    overlaps_y = y_min1 < y_max2 and y_min2 < y_max1
    return overlaps_x and overlaps_y

In [29]:
def clean_text(ocr_result, detected, caption_pos):
    """Join the OCR words of a page, dropping words that lie inside excluded boxes.

    Words whose box intersects a detected page number (class 1) or picture
    (class 2) are left out, as are empty strings. Words at an index listed in
    ``caption_pos`` are always kept and are emitted exactly once, regardless
    of the exclusion test.

    Args:
        ocr_result: dict of parallel lists ``text``, ``left``, ``top``,
            ``width`` and ``height``, one entry per OCR word.
        detected: dict mapping class id to a list of box dicts; keys 1 and 2
            are read.
        caption_pos: indices into ``ocr_result['text']`` that hold captions
            which must be kept.

    Returns:
        The kept words joined with single spaces, stripped of leading and
        trailing whitespace.
    """
    # Loop-invariant: boxes whose content must not end up in the text.
    exclude_list = detected[1] + detected[2]  # 1: page number, 2: picture
    forced_positions = set(caption_pos)

    kept_words = []
    for i, text in enumerate(ocr_result['text']):
        if i in forced_positions:
            # Captions are kept unconditionally; skip the normal path so the
            # same word is not appended a second time.
            kept_words.append(text)
            continue

        if text == "":
            continue

        x_min2 = ocr_result['left'][i]
        x_max2 = x_min2 + ocr_result['width'][i]
        y_min2 = ocr_result['top'][i]
        y_max2 = y_min2 + ocr_result['height'][i]

        # Do not keep text that the OCR picked up inside an excluded box.
        if any(intersect_rect(obj, x_min2, x_max2, y_min2, y_max2) for obj in exclude_list):
            continue

        kept_words.append(text)

    return " ".join(kept_words).strip()

In [30]:
def match_caption_img(detected):
    """Pair detected captions with detected pictures by vertical distance.

    Args:
        detected: dict mapping class id to a list of box dicts. Key 0 is read
            as the captions and key 2 as the pictures; each box is read
            through its 'y_min' and 'y_max' entries.

    Returns:
        A list of dicts with keys 'caption' (caption index or None), 'img'
        (picture index) and 'img_obj' (the picture box). Records whose 'img'
        is None are removed before returning. An empty list is returned
        whenever at least one caption was detected (see the guard below).
    """
    # NOTE(review): this guard returns as soon as any caption exists, so the
    # greedy matching loop further down only ever iterates over an empty
    # caption list. Confirm that skipping such pages entirely is intended.
    if len(detected[0]) != 0: # if there is a caption
        return []
    
    caption_img = [] # list of dicts: {caption: caption index, img: picture index, img_obj: picture box}
    img_used = {} # tracks which pictures are already claimed: picture index -> {caption, value (distance)}

    # greedy caption-to-picture matching loop
    for id, caption in enumerate(detected[0]):
        y_min_c, y_max_c = caption['y_min'], caption['y_max']
        min_val = 1e5
        ambil_idx = -1   # index of the chosen picture; stays -1 when there are no pictures
        ambil_obj = None # box of the chosen picture

        for idx, img in enumerate(detected[2]):
            y_min_img, y_max_img = img['y_min'], img['y_max']

            # caption above the picture: gap between caption bottom and picture top
            diff_up = abs(y_min_img-y_max_c)
            # caption below the picture: gap between picture bottom and caption top
            diff_down = abs(y_min_c - y_max_img)
            
            # keep the picture with the smallest of the two gaps seen so far
            if(diff_up < diff_down): 
                if diff_up < min_val: 
                    min_val = diff_up
                    ambil_idx = idx
                    ambil_obj = img
            else: 
                if diff_down < min_val: 
                    min_val = diff_down
                    ambil_idx = idx
                    ambil_obj = img
        
        # print(id, min_val, ambil_idx)

        # more captions than pictures: the chosen picture was already claimed;
        # if this caption is closer, the earlier caption loses the picture
        if(ambil_idx in img_used):
            if(min_val < img_used[ambil_idx]['value']):
                prev_caption_idx = img_used[ambil_idx]['caption']
                caption_img[prev_caption_idx]['img'] = None
                caption_img[prev_caption_idx]['img_obj'] = None
        
        # NOTE(review): the record is appended and img_used overwritten even
        # when this caption is NOT closer than the previous owner, so two
        # records can end up pointing at the same picture -- verify.
        caption_img.append({"caption":id, "img":ambil_idx, "img_obj":ambil_obj}) 
        img_used[ambil_idx] = {"caption":id, "value":min_val}

    # more pictures than captions: add every unclaimed picture with no caption
    for image_id, image in enumerate(detected[2]):
        if image_id not in img_used:
            caption_img.append({"caption":None, "img":image_id, "img_obj":image}) 

    # cleanup, pictures take priority: drop records that have no picture
    caption_img = [record for record in caption_img if record["img"] is not None]
    return caption_img

In [31]:
def append_caption(record, ocr_data, excludes):
    """Insert a caption into the OCR word lists near its picture.

    The caption is placed right after the OCR word that is closest (Manhattan
    distance on ``left``/``top``) to the picture centre among the words lying
    above and to the left of that centre. Words at indices listed in
    ``excludes`` (previously inserted captions) are not considered. When no
    such word exists the caption is inserted at the beginning.

    Args:
        record: dict with ``caption`` (text to insert) and ``img_obj`` (box
            dict with ``x_min``, ``x_max``, ``y_min``, ``y_max``).
        ocr_data: dict of parallel lists ``left``, ``top``, ``width``,
            ``height`` and ``text``; modified in place.
        excludes: list of indices of captions inserted so far; modified in
            place so that it keeps pointing at the same captions after this
            insertion, then extended with the new caption's index.

    Returns:
        The tuple ``(ocr_data, excludes)`` (the same objects that were passed in).
    """
    box = record['img_obj']
    center_x = (box['x_min'] + box['x_max']) / 2
    center_y = (box['y_min'] + box['y_max']) / 2

    min_dist = float('inf')
    # Default to the front of the page: a negative sentinel would make
    # list.insert() place the caption before the last word and record an
    # index that never matches a real position.
    insert_index = 0

    for i, (left, top) in enumerate(zip(ocr_data['left'], ocr_data['top'])):
        # Only words above and to the left of the picture centre qualify.
        if left > center_x or top > center_y:
            continue
        if i in excludes:
            continue

        manhattan = abs(top - center_y) + abs(left - center_x)
        # "<=" keeps the last of several equally distant words.
        if manhattan <= min_dist:
            min_dist = manhattan
            insert_index = i + 1

    # Everything at or after the insertion point moves one slot to the right,
    # so previously recorded caption indices have to follow.
    for k, pos in enumerate(excludes):
        if pos >= insert_index:
            excludes[k] = pos + 1
    excludes.append(insert_index)

    # The caption's box is anchored just past the picture's bottom-right
    # corner and has the picture's size.
    ocr_data['left'].insert(insert_index, box['x_max'] + 1)
    ocr_data['top'].insert(insert_index, box['y_max'] + 1)
    ocr_data['width'].insert(insert_index, abs(box['x_max'] - box['x_min']))
    ocr_data['height'].insert(insert_index, abs(box['y_max'] - box['y_min']))
    ocr_data['text'].insert(insert_index, record['caption'])

    return ocr_data, excludes

In [33]:
# main loop: for every page, drop OCR text that lies inside detected page
# numbers / pictures, add generated captions for pictures that have none,
# and write the resulting text to ./yolo_res/res_<page>.txt

# Make sure the output folder exists before the first write.
os.makedirs('./yolo_res', exist_ok=True)

for i, dict_path in tqdm(enumerate(img_json), total=len(img_json), desc='Cleaning text'):
    page_no = i + 1  # files are numbered from 1, list indices from 0
    label_path = f"{label_dir}/{page_no}.txt"
    output_path = f'./yolo_res/res_{page_no}.txt'

    with open(dict_path, 'r') as file:
        ocr_data = json.load(file)

    # No label file for this page: keep the OCR text unchanged.
    if not os.path.exists(label_path):
        print(f"Skipping {label_path}")
        raw_text = " ".join(ocr_data['text'])  # do not shadow the builtin `str`
        with open(output_path, 'w') as file:
            file.write(raw_text)
        continue

    detected_objs = get_detected_result(img_file_names[i], label_path)

    matches = match_caption_img(detected_objs)
    excludes = []  # indices of generated captions inserted into ocr_data

    for record in matches:
        # Compare against None explicitly: a matched caption is stored as its
        # index, and index 0 is falsy.
        if record['img_obj'] is not None and record['caption'] is None:
            # Replace the missing caption with a generated one and insert it
            # into the OCR word lists next to its picture.
            record['caption'] = generate_caption(img_file_names[i], record['img_obj'])
            ocr_data, excludes = append_caption(record, ocr_data, excludes)

    cleaned = clean_text(ocr_data, detected_objs, excludes)
    with open(output_path, 'w') as file:
        file.write(cleaned)
    

Cleaning text: 5it [00:00, 40.26it/s]

Skipping detect_result/predict/labels/2.txt
Skipping detect_result/predict/labels/3.txt
Skipping detect_result/predict/labels/4.txt
Skipping detect_result/predict/labels/5.txt
Skipping detect_result/predict/labels/6.txt
Skipping detect_result/predict/labels/7.txt
Skipping detect_result/predict/labels/8.txt
Skipping detect_result/predict/labels/9.txt


Cleaning text: 15it [00:02,  6.63it/s]

Skipping detect_result/predict/labels/16.txt
Skipping detect_result/predict/labels/17.txt


Cleaning text: 33it [00:03, 11.57it/s]

Skipping detect_result/predict/labels/20.txt
Skipping detect_result/predict/labels/21.txt
Skipping detect_result/predict/labels/22.txt
Skipping detect_result/predict/labels/23.txt
Skipping detect_result/predict/labels/24.txt
Skipping detect_result/predict/labels/28.txt
Skipping detect_result/predict/labels/29.txt
Skipping detect_result/predict/labels/30.txt
Skipping detect_result/predict/labels/33.txt
Skipping detect_result/predict/labels/34.txt
Skipping detect_result/predict/labels/36.txt
Skipping detect_result/predict/labels/37.txt


Cleaning text: 47it [00:08,  5.16it/s]

Skipping detect_result/predict/labels/40.txt
Skipping detect_result/predict/labels/43.txt
Skipping detect_result/predict/labels/45.txt
Skipping detect_result/predict/labels/46.txt
Skipping detect_result/predict/labels/47.txt
Skipping detect_result/predict/labels/48.txt
Skipping detect_result/predict/labels/50.txt


Cleaning text: 55it [00:10,  5.01it/s]

Skipping detect_result/predict/labels/62.txt


Cleaning text: 97it [00:21,  5.97it/s]

Skipping detect_result/predict/labels/74.txt
Skipping detect_result/predict/labels/97.txt
Skipping detect_result/predict/labels/98.txt
Skipping detect_result/predict/labels/102.txt
Skipping detect_result/predict/labels/105.txt
Skipping detect_result/predict/labels/106.txt
Skipping detect_result/predict/labels/107.txt


Cleaning text: 116it [00:21, 10.63it/s]

Skipping detect_result/predict/labels/108.txt
Skipping detect_result/predict/labels/109.txt
Skipping detect_result/predict/labels/110.txt
Skipping detect_result/predict/labels/111.txt
Skipping detect_result/predict/labels/114.txt
Skipping detect_result/predict/labels/115.txt
Skipping detect_result/predict/labels/116.txt


Cleaning text: 130it [00:22, 10.76it/s]

Skipping detect_result/predict/labels/125.txt
Skipping detect_result/predict/labels/126.txt
Skipping detect_result/predict/labels/127.txt
Skipping detect_result/predict/labels/128.txt
Skipping detect_result/predict/labels/129.txt
Skipping detect_result/predict/labels/130.txt
Skipping detect_result/predict/labels/131.txt
Skipping detect_result/predict/labels/132.txt
Skipping detect_result/predict/labels/133.txt
Skipping detect_result/predict/labels/134.txt
Skipping detect_result/predict/labels/135.txt
Skipping detect_result/predict/labels/136.txt
Skipping detect_result/predict/labels/137.txt


Cleaning text: 143it [00:23, 16.36it/s]

Skipping detect_result/predict/labels/138.txt
Skipping detect_result/predict/labels/139.txt
Skipping detect_result/predict/labels/140.txt
Skipping detect_result/predict/labels/141.txt
Skipping detect_result/predict/labels/142.txt
Skipping detect_result/predict/labels/143.txt
Skipping detect_result/predict/labels/146.txt
Skipping detect_result/predict/labels/147.txt
Skipping detect_result/predict/labels/148.txt


Cleaning text: 157it [00:23, 25.65it/s]

Skipping detect_result/predict/labels/149.txt
Skipping detect_result/predict/labels/150.txt
Skipping detect_result/predict/labels/151.txt
Skipping detect_result/predict/labels/152.txt
Skipping detect_result/predict/labels/153.txt
Skipping detect_result/predict/labels/154.txt
Skipping detect_result/predict/labels/155.txt
Skipping detect_result/predict/labels/156.txt
Skipping detect_result/predict/labels/157.txt


Cleaning text: 163it [00:25,  9.00it/s]

Skipping detect_result/predict/labels/159.txt
Skipping detect_result/predict/labels/160.txt
Skipping detect_result/predict/labels/161.txt
Skipping detect_result/predict/labels/167.txt
Skipping detect_result/predict/labels/169.txt
Skipping detect_result/predict/labels/170.txt


Cleaning text: 184it [00:38,  2.42it/s]

Skipping detect_result/predict/labels/177.txt
Skipping detect_result/predict/labels/178.txt
Skipping detect_result/predict/labels/179.txt
Skipping detect_result/predict/labels/180.txt
Skipping detect_result/predict/labels/181.txt
Skipping detect_result/predict/labels/182.txt
Skipping detect_result/predict/labels/183.txt
Skipping detect_result/predict/labels/184.txt


Cleaning text: 194it [00:41,  3.16it/s]

Skipping detect_result/predict/labels/186.txt
Skipping detect_result/predict/labels/187.txt
Skipping detect_result/predict/labels/188.txt
Skipping detect_result/predict/labels/189.txt
Skipping detect_result/predict/labels/191.txt
Skipping detect_result/predict/labels/192.txt
Skipping detect_result/predict/labels/193.txt
Skipping detect_result/predict/labels/194.txt
Skipping detect_result/predict/labels/195.txt
Skipping detect_result/predict/labels/196.txt
Skipping detect_result/predict/labels/197.txt
Skipping detect_result/predict/labels/198.txt


Cleaning text: 204it [00:41,  5.71it/s]

Skipping detect_result/predict/labels/199.txt
Skipping detect_result/predict/labels/200.txt
Skipping detect_result/predict/labels/201.txt
Skipping detect_result/predict/labels/202.txt
Skipping detect_result/predict/labels/203.txt
Skipping detect_result/predict/labels/204.txt
Skipping detect_result/predict/labels/206.txt
Skipping detect_result/predict/labels/207.txt
Skipping detect_result/predict/labels/208.txt


Cleaning text: 219it [00:41, 12.66it/s]

Skipping detect_result/predict/labels/210.txt
Skipping detect_result/predict/labels/211.txt
Skipping detect_result/predict/labels/212.txt
Skipping detect_result/predict/labels/213.txt
Skipping detect_result/predict/labels/214.txt
Skipping detect_result/predict/labels/215.txt
Skipping detect_result/predict/labels/216.txt
Skipping detect_result/predict/labels/217.txt
Skipping detect_result/predict/labels/218.txt
Skipping detect_result/predict/labels/219.txt
Skipping detect_result/predict/labels/220.txt
Skipping detect_result/predict/labels/221.txt


Cleaning text: 225it [00:45,  4.07it/s]

Skipping detect_result/predict/labels/224.txt
Skipping detect_result/predict/labels/225.txt
Skipping detect_result/predict/labels/226.txt
Skipping detect_result/predict/labels/227.txt
Skipping detect_result/predict/labels/228.txt


Cleaning text: 242it [00:47,  7.33it/s]

Skipping detect_result/predict/labels/230.txt
Skipping detect_result/predict/labels/231.txt
Skipping detect_result/predict/labels/233.txt
Skipping detect_result/predict/labels/235.txt
Skipping detect_result/predict/labels/236.txt
Skipping detect_result/predict/labels/237.txt
Skipping detect_result/predict/labels/238.txt
Skipping detect_result/predict/labels/239.txt
Skipping detect_result/predict/labels/240.txt
Skipping detect_result/predict/labels/241.txt
Skipping detect_result/predict/labels/242.txt
Skipping detect_result/predict/labels/243.txt
Skipping detect_result/predict/labels/244.txt


Cleaning text: 252it [00:48,  7.35it/s]

Skipping detect_result/predict/labels/246.txt
Skipping detect_result/predict/labels/247.txt
Skipping detect_result/predict/labels/249.txt
Skipping detect_result/predict/labels/250.txt
Skipping detect_result/predict/labels/251.txt
Skipping detect_result/predict/labels/252.txt
Skipping detect_result/predict/labels/254.txt
Skipping detect_result/predict/labels/255.txt
Skipping detect_result/predict/labels/256.txt
Skipping detect_result/predict/labels/257.txt


Cleaning text: 261it [00:52,  4.14it/s]

Skipping detect_result/predict/labels/260.txt
Skipping detect_result/predict/labels/261.txt
Skipping detect_result/predict/labels/264.txt
Skipping detect_result/predict/labels/265.txt
Skipping detect_result/predict/labels/267.txt
Skipping detect_result/predict/labels/268.txt


Cleaning text: 269it [00:56,  2.73it/s]

Skipping detect_result/predict/labels/271.txt


Cleaning text: 290it [00:59,  5.59it/s]

Skipping detect_result/predict/labels/277.txt
Skipping detect_result/predict/labels/278.txt
Skipping detect_result/predict/labels/280.txt
Skipping detect_result/predict/labels/281.txt
Skipping detect_result/predict/labels/282.txt
Skipping detect_result/predict/labels/283.txt
Skipping detect_result/predict/labels/284.txt
Skipping detect_result/predict/labels/286.txt
Skipping detect_result/predict/labels/287.txt
Skipping detect_result/predict/labels/288.txt
Skipping detect_result/predict/labels/290.txt
Skipping detect_result/predict/labels/293.txt
Skipping detect_result/predict/labels/294.txt
Skipping detect_result/predict/labels/295.txt


Cleaning text: 315it [01:01, 12.28it/s]

Skipping detect_result/predict/labels/297.txt
Skipping detect_result/predict/labels/298.txt
Skipping detect_result/predict/labels/299.txt
Skipping detect_result/predict/labels/300.txt
Skipping detect_result/predict/labels/301.txt
Skipping detect_result/predict/labels/302.txt
Skipping detect_result/predict/labels/303.txt


Cleaning text: 438it [01:15, 15.94it/s]

Skipping detect_result/predict/labels/447.txt
Skipping detect_result/predict/labels/449.txt


Cleaning text: 484it [01:26,  5.29it/s]

Skipping detect_result/predict/labels/486.txt


Cleaning text: 661it [03:25,  3.22it/s]
