In [50]:
%%bash
# Summarise the state of every GPU as a fixed-width table.
#   1. nvidia-smi emits one CSV row per GPU with the fields
#      index, memory.used, memory.total, memory.free, utilization.gpu (no header row).
#   2. The first awk prints its own CSV header, then for each GPU: the index,
#      used/total memory as a percentage, the free memory field and the utilization field.
#   3. sed deletes every "<whitespace>%" sequence (presumably the " %" unit that
#      nvidia-smi appends to the utilization value -- confirm against the raw output).
#   4. The second awk left-aligns every comma-separated field in a 20-character column.
# NOTE(review): OFS is assigned in the second awk, but every row is emitted with printf,
# so the assignment does not appear to influence the output.
nvidia-smi --query-gpu=index,memory.used,memory.total,memory.free,utilization.gpu --format=csv,noheader | awk -F, '
BEGIN {
    printf "GPU,Memory [%%],Free,Utilization [%%]\n"
}
{
    printf "%s,%6.2f,%-12s,%-12s\n", $1, ($2/$3)*100, $4, $5
}' | sed 's|\s%||g' | awk -F, '
BEGIN {
    OFS="\t";   # Output field separator, adjust as needed
}
{
    for (i=1; i<=NF; i++) {
        printf "%-20s", $i;   # Adjust the width (20 in this example) as needed
    }
    print "";   # Print a new line after each row
}'

GPU                 Memory [%]          Free                Utilization [%]     
0                    92.82               2909 MiB            0                  
1                    97.87               863 MiB             0                  
2                    95.76               1720 MiB            3                  
3                    35.19               26271 MiB           98                 
4                    95.35               1884 MiB            0                  
5                    81.75               7399 MiB            0                  
6                    98.49               614 MiB             0                  
7                    53.50               18851 MiB           8                  


In [None]:
# Select the GPU to use (the original note gives GPU #7 as the example; the value assigned below is '5')
import os
import torch
# NOTE(review): the variable is set after `import torch`; presumably it is still honoured
# because CUDA has not been initialised at this point -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '5'
# Last expression of the cell, so its return value is what the cell displays.
torch.cuda.current_device()

In [12]:
# !python3 edit.py {output_dir} {truth_dir}   # Edit Distance between OCR Result and Ground Truth

In [30]:
# import libraries, constants, singletons, and functions
import os
import time
import json
import base64
import pickle
import cv2 as cv
import numpy as np
import multiprocessing as mp

from tqdm import tqdm
from glob import glob
from character_segmentation import segment
from segmentation import extract_words
from train import prepare_char, featurizer

In [31]:
# Constants
# Root directory of the annotated dataset; the per-folder *.json annotation files are globbed from here.
dataset_dir = '/workspace/Dataset/pegon-annotated-dataset'

# Annotation tokens, presumably meant to be mapped to an "unknown" label
# (not referenced by the other cells visible in this notebook).
tokens_to_unknown = [
    '[CALLIGRAPHY]',
    '[NASTALIQ]',
    '[UNKNOWN]',
    '[VERT]',
]

In [32]:
# Helper function
def get_image_from_json(json_path):
    """Load the image embedded in an annotation JSON file.

    The JSON document must contain an ``imagePath`` entry and an ``imageData``
    entry holding the base64-encoded image bytes.

    Args:
        json_path: Path of the UTF-8 encoded annotation file.

    Returns:
        tuple: ``(imagePath value, image)`` where ``image`` is whatever
        ``cv.imdecode`` returns for the decoded bytes with ``cv.IMREAD_COLOR``.
    """
    with open(json_path, encoding="utf8") as handle:
        annotation = json.load(handle)

    image_path = annotation['imagePath']
    raw_bytes = base64.b64decode(annotation['imageData'])
    pixel_buffer = np.frombuffer(raw_bytes, np.uint8)
    return image_path, cv.imdecode(pixel_buffer, cv.IMREAD_COLOR)

def clear_running_time(path):
    """Make sure ``path`` exists and reset ``running_time.txt`` inside it to an empty file.

    Args:
        path: Directory that holds (or will hold) ``running_time.txt``.

    Returns:
        None
    """
    # makedirs also creates missing parent directories and, with exist_ok,
    # does not fail when the directory is already there (os.mkdir did both).
    os.makedirs(path, exist_ok=True)
    # Opening in 'w' mode truncates an existing file or creates an empty one.
    with open(f'{path}/running_time.txt', 'w'):
        pass
    return

def write_running_time(running_time_list, path):
    """Write the per-image running times to ``{path}/running_time.txt``, sorted.

    Args:
        running_time_list: Iterable of ``(image_id, seconds)`` pairs.
        path: Existing directory that receives ``running_time.txt``.

    Returns:
        None
    """
    # Sort a copy so the caller's list keeps its original order.
    ordered = sorted(running_time_list)
    # utf8 matches the encoding used for the OCR text files written by this notebook.
    with open(f'{path}/running_time.txt', 'w', encoding='utf8') as r:
        for image_id, seconds in ordered:
            # One line per image, prefixed with 'image#<id>'.
            r.write(f'image#{image_id}: {seconds}\n')
    return

def extract_ground_truth(json_path):
    """Read the reference transcription from an annotation JSON file.

    Args:
        json_path: Path of the UTF-8 encoded annotation file. It must contain
            an ``imagePath`` entry and a ``shapes`` list whose items each have
            a ``label``.

    Returns:
        tuple: ``(imagePath value, labels of all shapes joined by single spaces)``.
    """
    with open(json_path, encoding="utf8") as handle:
        annotation = json.load(handle)

    image_path = annotation['imagePath']
    labels = []
    for shape in annotation['shapes']:
        labels.append(shape['label'])
    return image_path, ' '.join(labels)

def run_pool(obj, model=None):
    """Recognise one word image and return the predicted characters as a string.

    Args:
        obj: ``(word, line)`` pair; both are handed to ``segment(line, word)``.
        model: Object with a ``predict`` method. When omitted, the notebook
            global ``model`` is looked up at call time.

    Returns:
        str: Concatenation of the predicted characters. Characters for which
        ``prepare_char`` raises are skipped.

    Raises:
        NameError: If no model is passed and no global ``model`` is defined.
    """
    # A `model=model` default would be evaluated once, when this cell is run:
    # it fails on a fresh kernel and keeps pointing at the old object after
    # `model` is reassigned. Resolving the global here always uses the model
    # that is currently loaded.
    if model is None:
        try:
            model = globals()['model']
        except KeyError:
            raise NameError(
                "name 'model' is not defined; load a model before calling run_pool"
            ) from None

    word, line = obj
    predicted_chars = []
    for char_img in segment(line, word):
        try:
            ready_char = prepare_char(char_img)
        except Exception:
            # Best effort: skip characters that cannot be prepared, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue
        feature_vector = featurizer(ready_char)
        predicted_chars.append(model.predict([feature_vector])[0])
    return ''.join(predicted_chars)

def run_ocr(json_path, current_folder):
    """Run OCR on the image embedded in one annotation JSON and save the text.

    The predicted text is written to
    ``{result_dir}/{current_folder}/text/{image_name}.txt`` (``result_dir`` is
    a notebook global; the target directory must already exist).

    Args:
        json_path: Path of the annotation JSON that embeds the image.
        current_folder: Sub-folder of ``result_dir`` that receives the output.

    Returns:
        tuple: ``(image_name, seconds)`` where ``seconds`` covers word
        extraction plus prediction, including pool start-up and shutdown.
    """
    image_name, image = get_image_from_json(json_path)

    # Start timer
    before = time.time()
    words = extract_words(image)
    pool = mp.Pool(mp.cpu_count())
    try:
        predicted_words = pool.map(run_pool, words)
    finally:
        # Release the worker processes even when a worker raises.
        pool.close()
        pool.join()

    # Stop timer
    after = time.time()
    predicted_text = ' '.join(predicted_words)
    exc_time = after-before

    with open(f'{result_dir}/{current_folder}/text/{image_name}.txt','w',encoding='utf8') as fo:
        fo.write(predicted_text)

    return image_name, exc_time

def run_folder(folder):
    """Run OCR on every annotation JSON of one dataset folder.

    Text output goes to ``{result_dir}/{folder}/text`` and the per-image
    running times to ``{result_dir}/{folder}/running_time.txt``
    (``result_dir`` and ``dataset_dir`` are notebook globals).

    Args:
        folder: Name of the sub-folder below ``dataset_dir`` to process.

    Returns:
        None
    """
    folder_dir = f'{result_dir}/{folder}'
    destination = f'{folder_dir}/text'

    # Create the whole output tree first: clear_running_time needs the folder
    # directory's parent to exist, which is not the case on a first run.
    os.makedirs(destination, exist_ok=True)
    clear_running_time(folder_dir)
    json_paths = glob(f'{dataset_dir}/{folder}/*.json')

    running_time = []
    before = time.time()
    for json_path in tqdm(json_paths,total=len(json_paths)):
        result = run_ocr(json_path,folder)
        running_time.append(result)
    write_running_time(running_time,folder_dir)

    # The total includes writing the running-time file.
    after = time.time()
    print()
    print(f'total time to finish {len(running_time)} images: {after - before}')
    print(f'Successfully processing {len(running_time)} out of {len(json_paths)} images')
    print()
    
def main():
    """Run the OCR pipeline on each dataset folder, one after another."""
    for folder_name in ('Majmuah Syariah', 'Mujarobat Doa', 'Mujarobat Kubro'):
        run_folder(folder_name)

In [33]:
# Character Error Rate for evaluation
from jiwer import cer, wer

def eval_cer(filename,folder,result_dir):
    """Character error rate of one OCR result against its ground truth.

    Args:
        filename: Path (or name) of the annotation JSON; only its base name
            without the final extension is used.
        folder: Dataset folder the annotation belongs to.
        result_dir: Root directory of the OCR results to evaluate.

    Returns:
        tuple: ``('{folder}/{name}', cer(ground_truth, prediction))``.
    """
    # Strip directories and only the last extension, so names containing
    # dots are not truncated.
    stem = os.path.splitext(os.path.basename(filename))[0]
    # The OCR step writes its text files as utf8; read them back the same way
    # and close the file when done.
    with open(f'{result_dir}/{folder}/text/{stem}.bmp.txt','r',encoding='utf8') as pred_file:
        predicted = pred_file.read()
    _, truth = extract_ground_truth(f'{dataset_dir}/{folder}/{stem}.json')
    return f'{folder}/{stem}', cer(truth, predicted)

def eval_wers(filename,folder,result_dir):
    """Word error rate of one OCR result against its ground truth.

    Args:
        filename: Path (or name) of the annotation JSON; only its base name
            without the final extension is used.
        folder: Dataset folder the annotation belongs to.
        result_dir: Root directory of the OCR results to evaluate.

    Returns:
        tuple: ``('{folder}/{name}', wer(ground_truth, prediction))``.
    """
    # Strip directories and only the last extension, so names containing
    # dots are not truncated.
    stem = os.path.splitext(os.path.basename(filename))[0]
    # The OCR step writes its text files as utf8; read them back the same way
    # and close the file when done.
    with open(f'{result_dir}/{folder}/text/{stem}.bmp.txt','r',encoding='utf8') as pred_file:
        predicted = pred_file.read()
    _, truth = extract_ground_truth(f'{dataset_dir}/{folder}/{stem}.json')
    return f'{folder}/{stem}', wer(truth, predicted)

def get_cer_avg(result_dir):
    """Mean character error rate over every annotation of the three dataset folders.

    Args:
        result_dir: Root directory of the OCR results to evaluate.

    Returns:
        The sum of the per-file scores from ``eval_cer`` divided by their count.
    """
    scores = []
    for folder in ['Majmuah Syariah','Mujarobat Doa','Mujarobat Kubro']:
        for json_file in glob(f'{dataset_dir}/{folder}/*.json'):
            _, score = eval_cer(json_file, folder, result_dir)
            scores.append(score)
    return sum(scores) / len(scores)

def get_wer_avg(result_dir):
    """Mean word error rate over every annotation of the three dataset folders.

    Args:
        result_dir: Root directory of the OCR results to evaluate.

    Returns:
        The sum of the per-file scores from ``eval_wers`` divided by their count.
    """
    scores = []
    for folder in ['Majmuah Syariah','Mujarobat Doa','Mujarobat Kubro']:
        for json_file in glob(f'{dataset_dir}/{folder}/*.json'):
            _, score = eval_wers(json_file, folder, result_dir)
            scores.append(score)
    return sum(scores) / len(scores)

In [34]:
# Model: 1L-NN
# result_dir is read as a global by run_ocr/run_folder and passed to the evaluation helpers.
result_dir  = '/workspace/Arabic-OCR/src/pegon-result-page-1lnn'
# model_name is the file looked up under models/ by the cell that loads the model.
model_name = '1L_NN.sav'

In [None]:
# Run OCR with Model above
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the model file once it has been read.
with open(f'models/{model_name}','rb') as model_file:
    model = pickle.load(model_file)
main()

In [35]:
# Eval result: average character and word error rate over all dataset folders
# for the text files stored under the current result_dir.
print(f'Average CER:{get_cer_avg(result_dir)}\nAverage WER:{get_wer_avg(result_dir)}')

Average CER:0.9999789578363117
Average WER:1.0


In [36]:
# Next Model: 2L-NN
# result_dir is read as a global by run_ocr/run_folder and passed to the evaluation helpers.
result_dir  = '/workspace/Arabic-OCR/src/pegon-result-page-2lnn'
# model_name is the file looked up under models/ by the cell that loads the model.
model_name = '2L_NN.sav'

In [37]:
# Run OCR with Model above
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the model file once it has been read.
with open(f'models/{model_name}','rb') as model_file:
    model = pickle.load(model_file)
main()

100%|██████████| 15/15 [06:13<00:00, 24.87s/it]
  0%|          | 0/49 [00:00<?, ?it/s]


total time to finish 15 images: 373.0619041919708
Successfully processing 15 out of 15 images



100%|██████████| 49/49 [20:21<00:00, 24.93s/it]
  0%|          | 0/50 [00:00<?, ?it/s]


total time to finish 49 images: 1221.8094124794006
Successfully processing 49 out of 49 images



100%|██████████| 50/50 [20:50<00:00, 25.00s/it]


total time to finish 50 images: 1250.1710419654846
Successfully processing 50 out of 50 images






In [38]:
# Eval result: average character and word error rate over all dataset folders
# for the text files stored under the current result_dir.
# NOTE(review): the recorded scores for 2L-NN, GaussNB and LinearSVM are identical,
# which suggests the same predictions were evaluated each time -- verify which model was used.
print(f'Average CER:{get_cer_avg(result_dir)}\nAverage WER:{get_wer_avg(result_dir)}')

Average CER:0.9999763291489205
Average WER:1.0


In [39]:
# Next Model: GaussNB
# result_dir is read as a global by run_ocr/run_folder and passed to the evaluation helpers.
result_dir  = '/workspace/Arabic-OCR/src/pegon-result-page-gaussnb'
# model_name is the file looked up under models/ by the cell that loads the model.
model_name = 'Gaussian_Naive_Bayes.sav'

In [40]:
# Run OCR with Model above
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the model file once it has been read.
with open(f'models/{model_name}','rb') as model_file:
    model = pickle.load(model_file)
main()

100%|██████████| 15/15 [06:13<00:00, 24.91s/it]
  0%|          | 0/49 [00:00<?, ?it/s]


total time to finish 15 images: 373.6137704849243
Successfully processing 15 out of 15 images



100%|██████████| 49/49 [20:17<00:00, 24.84s/it]
  0%|          | 0/50 [00:00<?, ?it/s]


total time to finish 49 images: 1217.3779122829437
Successfully processing 49 out of 49 images



100%|██████████| 50/50 [20:44<00:00, 24.90s/it]


total time to finish 50 images: 1244.905256986618
Successfully processing 50 out of 50 images






In [41]:
# Eval result: average character and word error rate over all dataset folders
# for the text files stored under the current result_dir.
# NOTE(review): the recorded scores for 2L-NN, GaussNB and LinearSVM are identical,
# which suggests the same predictions were evaluated each time -- verify which model was used.
print(f'Average CER:{get_cer_avg(result_dir)}\nAverage WER:{get_wer_avg(result_dir)}')

Average CER:0.9999763291489205
Average WER:1.0


In [42]:
# Next Model: LinearSVM
# result_dir is read as a global by run_ocr/run_folder and passed to the evaluation helpers.
result_dir  = '/workspace/Arabic-OCR/src/pegon-result-page-linsvm'
# model_name is the file looked up under models/ by the cell that loads the model.
model_name = 'LinearSVM.sav'

In [43]:
# Run OCR with Model above
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the model file once it has been read.
with open(f'models/{model_name}','rb') as model_file:
    model = pickle.load(model_file)
main()

100%|██████████| 15/15 [06:14<00:00, 24.95s/it]
  0%|          | 0/49 [00:00<?, ?it/s]


total time to finish 15 images: 374.3252499103546
Successfully processing 15 out of 15 images



100%|██████████| 49/49 [20:12<00:00, 24.75s/it]
  0%|          | 0/50 [00:00<?, ?it/s]


total time to finish 49 images: 1212.7852368354797
Successfully processing 49 out of 49 images



100%|██████████| 50/50 [20:39<00:00, 24.80s/it]


total time to finish 50 images: 1239.915227651596
Successfully processing 50 out of 50 images






In [44]:
# Eval result: average character and word error rate over all dataset folders
# for the text files stored under the current result_dir.
# NOTE(review): the recorded scores for 2L-NN, GaussNB and LinearSVM are identical,
# which suggests the same predictions were evaluated each time -- verify which model was used.
print(f'Average CER:{get_cer_avg(result_dir)}\nAverage WER:{get_wer_avg(result_dir)}')

Average CER:0.9999763291489205
Average WER:1.0


In [None]:
# New Line Segmentation Method
