In [1]:
import cv2, json, boto3
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from matplotlib import rcParams

In [2]:
# Directory of images to run the code on
img_dir = '../AllData/processed_images/Part2'

In [3]:
def detectText(path, image, image_text, img_text):
    
    img_height, img_width, channels = image.shape
    _, im_buf = cv2.imencode("." + path.name.split(".")[-1], image)
        
    response = client.detect_text(
        Image = {
            "Bytes" : im_buf.tobytes()
        }
    )
    
    if path.name not in image_text:
        image_text[path.name] = {}
        image_text[path.name]['TextDetections'] = response['TextDetections']
    else:
        image_text[path.name]['TextDetections'].extend(response['TextDetections'])
        
    textDetections = response['TextDetections']
        
    if path.name not in img_text:
        img_text[path.name] = []
            
    for text in textDetections:
        if text['Type'] == 'WORD' and text['Confidence'] >= 80:
                
            vertices = [[vertex['X'] * img_width, vertex['Y'] * img_height] for vertex in text['Geometry']['Polygon']]
            vertices = np.array(vertices, np.int32)
            vertices = vertices.reshape((-1, 1, 2))
            
            image = cv2.fillPoly(image, [vertices], (255, 255, 255))
                  
            left = np.amin(vertices, axis=0)[0][0]
            top = np.amin(vertices, axis=0)[0][1]
            right = np.amax(vertices, axis=0)[0][0]
            bottom = np.amax(vertices, axis=0)[0][1]
            
            img_text[path.name].append(
                (
                    text['DetectedText'],
                    (
                        int(left),
                        int(top),
                        int(right - left),
                        int(bottom - top)
                    )
                )
            )

    return image

In [4]:
img_text = {}
image_text = {}
client = boto3.client('rekognition')

for index, path in enumerate(Path(img_dir).iterdir()):
    if path.name.endswith('.png') or path.name.endswith('.jpg') or path.name.endswith('.jpeg'):       
        filepath = img_dir + "/" + path.name
        
        print("[{0}] file name: {1}".format(index, path.name))
        
        image = cv2.imread(filepath)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            
        image = detectText(path, image, image_text, img_text)
        detectText(path, image, image_text, img_text)
        
with open('../data/aws-rekognition-output.json', 'w') as out:
    json.dump(image_text, out)
    
with open('../data/ocr-image-text.json', 'w') as out:
    json.dump(img_text, out)

[0] file name: 08813216-Figure1-1.png
[1] file name: 08813216-Figure4-1.png
[2] file name: 1-s2.0-S0009250913008336-main-Figure10-1.png
[3] file name: 1-s2.0-S0009250913008336-main-Figure11-1.png
[4] file name: 1-s2.0-S0009250913008336-main-Figure2-1.png
[5] file name: 1-s2.0-S0009250913008336-main-Figure3-1.png
[6] file name: 1-s2.0-S0009250913008336-main-Figure8-1.png
[7] file name: 1-s2.0-S0009250913008336-main-Figure9-1.png
[8] file name: 1-s2.0-S0009250914000463-main-Figure1-1.png
[9] file name: 1-s2.0-S0009250916306625-main-Figure4-1.png
[10] file name: 1-s2.0-S0009250916306625-main-Figure6-1.png
[11] file name: 1-s2.0-S0016236111006739-main-Figure2-1.png
[12] file name: 1-s2.0-S0016236112010988-main-Figure4-1.png
[13] file name: 1-s2.0-S0016236114000477-main-Figure5-1.png
[14] file name: 1-s2.0-S0016236114000477-main-Figure8-1.png
[15] file name: 1-s2.0-S0016236114004244-main-Figure3-1.png
[16] file name: 1-s2.0-S0016236114008667-main-Figure2-1.png
[17] file name: 1-s2.0-S001623

[137] file name: 1-s2.0-S0196890416310159-main-Figure6-1.png
[138] file name: 1-s2.0-S0196890416310445-main-Figure10-1.png
[139] file name: 1-s2.0-S0196890416311475-main-Figure1-1.png
[140] file name: 1-s2.0-S0196890417301693-main-Figure2-1.png
[141] file name: 1-s2.0-S0196890417303643-main-Figure2-1.png
[142] file name: 1-s2.0-S0196890417303643-main-Figure3-1.png
[143] file name: 1-s2.0-S0196890417303643-main-Figure4-1.png
[144] file name: 1-s2.0-S0196890417303643-main-Figure5-1.png
[145] file name: 1-s2.0-S0196890417303643-main-Figure6-1.png
[146] file name: 1-s2.0-S0196890417310452-main-Figure4-1.png
[147] file name: 1-s2.0-S0196890417310452-main-Figure6-1.png
[148] file name: 1-s2.0-S0196890417310634-main-Figure2-1.png
[149] file name: 1-s2.0-S0196890417310634-main-Figure3-1.png
[150] file name: 1-s2.0-S0196890417310634-main-Figure4-1.png
[151] file name: 1-s2.0-S0196890417310634-main-Figure8-1.png
[152] file name: 1-s2.0-S0196890417311330-main-Figure1-1.png
[153] file name: 1-s2.0

[272] file name: 1-s2.0-S0360544219302749-main-Figure2-1.png
[273] file name: 1-s2.0-S0360544219302749-main-Figure3-1.png
[274] file name: 1-s2.0-S0360544219302749-main-Figure4-1.png
[275] file name: 1-s2.0-S0360544219302749-main-Figure5-1.png
[276] file name: 1-s2.0-S0360544219302749-main-Figure6-1.png
[277] file name: 1-s2.0-S0360544219302749-main-Figure7-1.png
[278] file name: 1-s2.0-S0360544219302865-main-Figure3-1.png
[279] file name: 1-s2.0-S0360544219302865-main-Figure4-1.png
[280] file name: 1-s2.0-S0360544219302865-main-Figure5-1.png
[281] file name: 1-s2.0-S0360544219303020-main-Figure4-1.png
[282] file name: 1-s2.0-S0360544219303020-main-Figure5-1.png
[283] file name: 1-s2.0-S0360544219308230-main-Figure1-1.png
[284] file name: 1-s2.0-S0360544219308230-main-Figure2-1.png
[285] file name: 1-s2.0-S0360544219308230-main-Figure3-1.png
[286] file name: 1-s2.0-S0360544219318316-main-Figure8-1.png
[287] file name: 1-s2.0-S0360544219319917-main-Figure3-1.png
[288] file name: 1-s2.0-

[407] file name: 1-s2.0-S0956053X17304403-main-Figure3-1.png
[408] file name: 1-s2.0-S0956053X17305809-main-Figure3-1.png
[409] file name: 1-s2.0-S0956053X17305809-main-Figure5-1.png
[410] file name: 1-s2.0-S0956053X17305809-main-Figure6-1.png
[411] file name: 1-s2.0-S0956053X17307638-main-Figure7-1.png
[412] file name: 1-s2.0-S0956053X17307638-main-Figure8-1.png
[413] file name: 1-s2.0-S0956053X1830312X-main-Figure4-1.png
[414] file name: 1-s2.0-S0956053X18303404-main-Figure2-1.png
[415] file name: 1-s2.0-S0956053X18303404-main-Figure3-1.png
[416] file name: 1-s2.0-S0956053X18303404-main-Figure7-1.png
[417] file name: 1-s2.0-S0956053X18304677-main-Figure2-1.png
[418] file name: 1-s2.0-S0956053X19305410-main-Figure2-1.png
[419] file name: 1-s2.0-S0956053X19305410-main-Figure3-1.png
[420] file name: 1-s2.0-S0956053X19305410-main-Figure5-1.png
[421] file name: 1-s2.0-S0956053X19306993-main-Figure2-1.png
[422] file name: 1-s2.0-S0956053X19306993-main-Figure5-1.png
[423] file name: 1-s2.0-

[541] file name: 1-s2.0-S0960852416305533-main-Figure4-1.png
[542] file name: 1-s2.0-S0960852416308719-main-Figure2-1.png
[543] file name: 1-s2.0-S0960852416308719-main-Figure3-1.png
[544] file name: 1-s2.0-S0960852416308719-main-Figure4-1.png
[545] file name: 1-s2.0-S0960852416309026-main-Figure2-1.png
[546] file name: 1-s2.0-S0960852416309026-main-Figure4-1.png
[547] file name: 1-s2.0-S0960852416309129-main-Figure1-1.png
[548] file name: 1-s2.0-S0960852416310264-main-Figure1-1.png
[549] file name: 1-s2.0-S0960852416310264-main-Figure2-1.png
[550] file name: 1-s2.0-S0960852416310264-main-Figure3-1.png
[551] file name: 1-s2.0-S0960852416310264-main-Figure4-1.png
[552] file name: 1-s2.0-S0960852416310264-main-Figure5-1.png
[553] file name: 1-s2.0-S0960852416311816-main-Figure2-1.png
[554] file name: 1-s2.0-S0960852416311816-main-Figure4-1.png
[555] file name: 1-s2.0-S0960852416312482-main-Figure3-1.png
[556] file name: 1-s2.0-S0960852416313360-main-Figure3-1.png
[557] file name: 1-s2.0-

[676] file name: 1-s2.0-S0961953419303629-main-Figure1-1.png
[677] file name: 1-s2.0-S0961953419303629-main-Figure2-1.png
[678] file name: 1-s2.0-S0961953419303629-main-Figure4-1.png
[679] file name: 1-s2.0-S0961953419303629-main-Figure6-1.png
[680] file name: 1-s2.0-S0961953419303629-main-Figure8-1.png
[681] file name: 1-s2.0-S0961953419303666-main-Figure7-1.png
[682] file name: 1-s2.0-S0961953419303666-main-Figure9-1.png
[683] file name: 1-s2.0-S0961953420300118-main-Figure4-1.png
[684] file name: 1-s2.0-S0961953420300192-main-Figure3-1.png
[685] file name: 1-s2.0-S0961953420300386-main-Figure10-1.png
[686] file name: 1-s2.0-S0961953420300386-main-Figure11-1.png
[687] file name: 1-s2.0-S0961953420300386-main-Figure12-1.png
[688] file name: 1-s2.0-S0961953420300386-main-Figure13-1.png
[689] file name: 1-s2.0-S0961953420300386-main-Figure3-1.png
[690] file name: 1-s2.0-S0961953420300386-main-Figure6-1.png
[691] file name: 1-s2.0-S0961953420300386-main-Figure8-1.png
[692] file name: 1-s

[810] file name: 1-s2.0-S2211926418305988-main-Figure7-1.png
[811] file name: 1-s2.0-S2211926418307756-main-Figure4-1.png
[812] file name: 1-s2.0-S2211926418307756-main-Figure6-1.png
[813] file name: 1-s2.0-S2211926418307756-main-Figure8-1.png
[814] file name: 1-s2.0-S2211926418307756-main-Figure9-1.png
[815] file name: 1-s2.0-S221192641830804X-main-Figure4-1.png
[816] file name: 1-s2.0-S221192641830804X-main-Figure5-1.png
[817] file name: 1-s2.0-S221192641830804X-main-Figure7-1.png
[818] file name: 1-s2.0-S221192641830804X-main-Figure9-1.png
[819] file name: 1-s2.0-S2211926418308403-main-Figure7-1.png
[820] file name: 1-s2.0-S2211926418308403-main-Figure8-1.png
[821] file name: 1-s2.0-S2211926418309688-main-Figure6-1.png
[822] file name: 1-s2.0-S2211926418311317-main-Figure1-1.png
[823] file name: 1-s2.0-S2211926418311317-main-Figure2-1.png
[824] file name: 1-s2.0-S2211926418311317-main-Figure6-1.png
[825] file name: 1-s2.0-S2211926419303704-main-Figure12-1.png
[826] file name: 1-s2.0

[960] file name: acssuschemeng.6b01957-Figure1-1.png
[961] file name: acssuschemeng.6b02367-Figure5-1.png
[962] file name: acssuschemeng.6b02367-Figure6-1.png
[963] file name: acssuschemeng.6b02367-Figure7-1.png
[964] file name: acssuschemeng.7b00226-Figure6-1.png
[965] file name: acssuschemeng.7b00233-Figure1-1.png
[966] file name: acssuschemeng.7b00233-Figure3-1.png
[967] file name: acssuschemeng.7b01473-Figure1-1.png
[968] file name: acssuschemeng.7b02052-Figure8-1.png
[969] file name: acssuschemeng.7b02226-Figure8-1.png
[970] file name: acssuschemeng.7b02854-Figure1-1.png
[971] file name: acssuschemeng.7b03328-Figure2-1.png
[972] file name: acssuschemeng.7b04338-Figure5-1.png
[973] file name: acssuschemeng.7b04338-Figure6-1.png
[974] file name: acssuschemeng.8b01368-Figure1-1.png
[975] file name: acssuschemeng.8b01368-Figure3-1.png
[976] file name: acssuschemeng.8b01368-Figure5-1.png
[977] file name: acssuschemeng.8b01368-Figure8-1.png
[978] file name: acssuschemeng.8b02012-Figure1

[1123] file name: ef502574b-Figure6-1.png
[1124] file name: ef502773w-Figure2-1.png
[1125] file name: ef700497d-Figure6-1.png
[1126] file name: ef900027d-Figure5-1.png
[1127] file name: ef900379p-Figure8-1.png
[1128] file name: Effect of hydration calcination CaO on the deoxygenation of bio oil from pyrolysis of Nannochloropsis sp-Figure4-1.png
[1129] file name: energies-09-00888-Figure1-1.png
[1130] file name: energies-11-00521-v2-Figure7-1.png
[1131] file name: energies-11-00521-v2-Figure8-1.png
[1132] file name: energies-11-00564-Figure4-1.png
[1133] file name: energies-11-01693-v3-Figure5-1.png
[1134] file name: energies-12-00723-Figure5-1.png
[1135] file name: energies-12-00729-Figure2-1.png
[1136] file name: energies-12-00809-v2-Figure1-1.png
[1137] file name: energies-13-00124-v3 (1)-Figure3-1.png
[1138] file name: energies-13-00124-v3 (1)-Figure8-1.png
[1139] file name: energies-13-00124-v3-Figure3-1.png
[1140] file name: energies-13-00124-v3-Figure8-1.png
[1141] file name: ent

[1246] file name: The Effect of Wood Vinegar on Hydrothermal Liquefaction of Cotton Stalk Under CO Atmosphere-Figure2-1.png
[1247] file name: The Effect of Wood Vinegar on Hydrothermal Liquefaction of Cotton Stalk Under CO Atmosphere-Figure3-1.png
[1248] file name: Tobacco biomass as a source of advanced biofuels-Figure1-1.png
[1249] file name: Wagner2018_Article_HydrothermalConversionOfLipid--Figure2-1.png
[1250] file name: wene.319-Figure5-1.png
[1251] file name: wene.319-Figure6-1.png
[1252] file name: Xie2019_Article_DewaterabilityEnhancementAndHe-Figure5-1.png
[1253] file name: Yu2014_Article_NutrientFlowsAndQualityOfBio-c-Figure1-1.png
