## Importing dependencies

In [1]:
import boto3
import re
import os
from pprint import pprint
import IPython.display as disp
from PIL import Image, ImageDraw
from pdf2image import convert_from_path

## Defining functions

In [2]:
def convert_pdf2jpeg(pdf_file, exam_name, output_dir):
    """Save every page of a PDF as a JPEG file and return the number of pages.

    Each page is written to ``{output_dir}/{exam_name}_{N}pg.jpg`` where N is
    the 1-based page number.

    Args:
        pdf_file: Path of the PDF handed to ``convert_from_path``.
        exam_name: Prefix used for the generated file names.
        output_dir: Directory that receives the JPEG files.

    Returns:
        The number of pages saved (0 when the conversion yields no pages).
    """
    rendered_pages = convert_from_path(pdf_file)
    last_number = 0
    # enumerate from 1 so the file suffix matches the human page number
    for last_number, rendered in enumerate(rendered_pages, start=1):
        rendered.save(f'{output_dir}/{exam_name}_{last_number}pg.jpg', 'JPEG')
    return last_number

def load_image(filename):
    """Read a file in binary mode and return its content as a ``bytearray``.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``bytearray`` holding the full content of the file.
    """
    with open(filename, "rb") as handle:
        raw_bytes = handle.read()
    return bytearray(raw_bytes)
    
def save_lines(textract_response):
    """Return the blocks of a response whose ``BlockType`` is ``'LINE'``.

    Args:
        textract_response: Mapping with a ``'Blocks'`` list; every block is a
            mapping that has a ``'BlockType'`` key.

    Returns:
        A new list with the ``'LINE'`` blocks, in their original order.
    """
    return [
        entry
        for entry in textract_response['Blocks']
        if entry['BlockType'] == 'LINE'
    ]

def identify_questions(lines):
    """Keep only the lines that mark a question or an exam-area heading.

    A line is kept when its ``'Text'`` starts with ``'Questão'`` or with one
    of the area/instruction headings listed below.

    Args:
        lines: Iterable of mappings that each have a ``'Text'`` string.

    Returns:
        A list with the matching lines, in their original order.
    """
    # str.startswith accepts a tuple and matches if any prefix matches
    marker_prefixes = (
        'Questão',
        'CIÊNCIAS HUMANAS',
        'CIÊNCIAS DA NATUREZA',
        'MATEMÁTICA',
        'LINGUAGENS',
        'INSTRUÇÕES PARA',
    )
    return [entry for entry in lines if entry['Text'].startswith(marker_prefixes)]

def localize_questions(questions,exam_area):
    """Tag each marker's bounding box with its exam area and text.

    Walks the markers in order. When a marker's text starts with one of the
    area headings, the current area code switches to the matching code; the
    code stays in effect for the following markers. Each marker's
    ``['Geometry']['BoundingBox']`` dict is modified in place: it receives an
    ``'Area'`` key (current area code) and a ``'Question'`` key (marker text).

    Args:
        questions: List of mappings with ``'Text'`` and
            ``['Geometry']['BoundingBox']``.
        exam_area: Area code used until the first heading is seen.

    Returns:
        A list with the (mutated) bounding-box dicts, in input order.
    """
    # checked in order; the first matching heading wins
    heading_codes = (
        ('CIÊNCIAS HUMANAS', 'CH'),
        ('CIÊNCIAS DA NATUREZA', 'CN'),
        ('MATEMÁTICA', 'MT'),
        ('LINGUAGENS', 'LC'),
        ('INSTRUÇÕES PARA', 'RD'),
    )
    current_area = exam_area
    tagged_boxes = []
    for entry in questions:
        text = entry['Text']
        for heading, code in heading_codes:
            if text.startswith(heading):
                current_area = code
                break
        bbox = entry['Geometry']['BoundingBox']
        bbox['Area'] = current_area
        bbox['Question'] = text
        tagged_boxes.append(bbox)
    return tagged_boxes

def order_questions(questions):
    """Order entries column by column, top to bottom.

    Entries whose ``'Left'`` is below 0.5 come first, sorted by ``'Top'``;
    entries whose ``'Left'`` is 0.5 or more follow, also sorted by ``'Top'``.

    Args:
        questions: Iterable of mappings with ``'Left'`` and ``'Top'`` values.

    Returns:
        A new list with the entries in reading order.
    """
    def by_top(entry):
        return entry.get('Top')

    left_column = sorted((q for q in questions if q.get('Left') < 0.5), key=by_top)
    right_column = sorted((q for q in questions if q.get('Left') >= 0.5), key=by_top)
    return left_column + right_column

In [3]:
def detect_page_style(width, questions_locations):
    """Classify a page layout as 'centered', 'columns' or 'broken_columns'.

    The entries are scanned in order until the first one that lies in the
    right half of the page:

    * no entry in the right half -> ``'centered'``
    * first right-half entry starts more than 10% below the first entry's
      ``'Top'`` -> ``'broken_columns'``
    * otherwise -> ``'columns'``

    The vertical comparison is done directly on the relative ``'Top'``
    values. The page height would scale both sides of the comparison equally,
    so it is not needed and the function no longer depends on a ``height``
    variable existing in the notebook's global namespace.

    Args:
        width: Page width in pixels.
        questions_locations: Ordered list of mappings with relative ``'Left'``
            and ``'Top'`` values.

    Returns:
        ``'centered'``, ``'columns'`` or ``'broken_columns'``; an empty string
        when ``questions_locations`` is empty.
    """
    middle_horizontal = width/2
    # a right-column entry this much lower than the first entry counts as "broken"
    broken_factor = 1.10
    page_style = '' # centered, columns, broken_columns
    for question in questions_locations:
        if width * question['Left'] > middle_horizontal:
            # page has 2 columns
            if question['Top'] > questions_locations[0]['Top'] * broken_factor:
                # the right column opens with the tail of a left-column question
                page_style = 'broken_columns'
            else:
                page_style = 'columns'
            # only the first right-half entry decides the style
            break
        page_style = 'centered'
    return page_style

def create_question_bbox(page_style, box, width, height, previous_question, next_question):
    """Compute the pixel crop box ``[x0, y0, x1, y1]`` for one question.

    ``box``, ``previous_question`` and ``next_question`` carry relative
    ``'Left'``/``'Top'`` values that are scaled by ``width``/``height``.
    The box always starts at the question's own top; where it ends depends on
    the page style and on where the next question sits.

    Args:
        page_style: ``'centered'`` selects the full-width logic; any other
            value selects the two-column logic.
        box: Location of the question being cut.
        width: Page width in pixels.
        height: Page height in pixels.
        previous_question: Location of the preceding entry.
        next_question: Location of the following entry.

    Returns:
        ``[x0, y0, x1, y1]`` in pixels.
    """
    half_width = width/2
    top_px = height*box['Top']

    if page_style == 'centered':
        # NOTE(review): if entries arrive sorted by 'Top', the second test only
        # holds for the first entry (or ties), so later entries run to the page
        # bottom -- confirm this is the intended rule.
        stops_at_next = box['Top'] != next_question['Top'] and box['Top'] <= previous_question['Top']
        bottom_px = height*next_question['Top'] if stops_at_next else height
        return [0, top_px, width, bottom_px]

    if width * box['Left'] < half_width:
        # left column: run to the page bottom when the next entry is on the right
        next_is_right = width * next_question['Left'] > half_width
        bottom_px = height if next_is_right else height*next_question['Top']
        return [0, top_px, width/2, bottom_px]

    # right column: stop at the next entry only if it lies below this one
    next_is_below = box['Top'] < next_question['Top']
    bottom_px = height*next_question['Top'] if next_is_below else height
    return [width/2, top_px, width, bottom_px]

def create_broken_piece_bbox(width, height, next_question, first_question=None):
    """Compute the crop box for the part of a question that spills into the right column.

    The box spans the right half of the page, from the top of the page's
    first entry down to the top of ``next_question``.

    Args:
        width: Page width in pixels.
        height: Page height in pixels.
        next_question: Location (relative ``'Top'``) of the entry that ends
            the spilled piece.
        first_question: Location (relative ``'Top'``) of the entry whose top
            marks where the piece starts. When omitted, the first element of
            the notebook-level ``questions_locations`` list is used, which is
            what this function always did before the parameter existed.

    Returns:
        ``[x0, y0, x1, y1]`` in pixels.
    """
    if first_question is None:
        # backward-compatible fallback to the notebook global set by the page loop
        first_question = questions_locations[0]
    x0 = width/2
    y0 = height*first_question['Top']
    x1 = width
    y1 = height*next_question['Top']
    return [x0, y0, x1, y1]

def show_bbox(draw, box, color):
    """Outline ``box`` on ``draw`` with a 4-pixel border in ``color``.

    Args:
        draw: Object exposing ``rectangle(box, outline=..., width=...)``.
        box: Rectangle coordinates forwarded unchanged to ``draw.rectangle``.
        color: Outline colour forwarded unchanged.

    Returns:
        Whatever ``draw.rectangle`` returns.
    """
    border_px = 4
    return draw.rectangle(box, outline=color, width=border_px)
    
def cut_question(image, question_bbox):
    """Return the region of ``image`` delimited by ``question_bbox``.

    Args:
        image: Object exposing ``crop(box)``.
        question_bbox: Crop box forwarded unchanged to ``image.crop``.

    Returns:
        The result of ``image.crop(question_bbox)``.
    """
    return image.crop(question_bbox)

In [4]:
def save_question_cuts(questions_locations,orig_file):
    """Crop every located question out of the current page and save it as a JPEG.

    Besides its arguments, this function reads the notebook-level variables
    ``page_style``, ``width``, ``height``, ``image`` and ``middle_horizontal``,
    so they must describe the page being processed when it is called.

    Each output path is ``orig_file`` with ``'.jpg'`` replaced by
    ``_{index}_a{Area}_q{number}.jpg``; the path is printed before saving.

    Args:
        questions_locations: Ordered list of location dicts carrying
            ``'Question'``, ``'Area'``, ``'Left'`` and ``'Top'``.
        orig_file: Template path ending in ``.jpg``.

    Returns:
        The list of cropped images, in processing order.
    """
    crops = []
    right_column_reached = False
    last_index = len(questions_locations) - 1

    for position, entry in enumerate(questions_locations):
        number_label = find_number(entry['Question'], 'Questão ')
        suffix = '_' + str(position) + '_a' + entry['Area'] + '_q' + number_label + '.jpg'
        target_path = orig_file.replace('.jpg', suffix)

        # neighbours are clamped to the list bounds, so the first/last entry
        # is its own previous/next neighbour
        neighbour_before = questions_locations[max(0, position - 1)]
        neighbour_after = questions_locations[min(last_index, position + 1)]
        crop_box = create_question_bbox(page_style, entry, width, height, neighbour_before, neighbour_after)
        piece = cut_question(image, crop_box)

        # On a 'broken_columns' page, the first entry that is followed by a
        # right-half entry gets an extra crop from the right column appended.
        # NOTE(review): position + 1 is indexed without a bounds check.
        if (not right_column_reached
                and page_style == 'broken_columns'
                and width * questions_locations[position + 1]['Left'] > middle_horizontal):
            spill_box = create_broken_piece_bbox(width, height, questions_locations[position + 1])
            spill_piece = cut_question(image, spill_box)
            # stack the spilled piece under the main crop
            piece = get_concat_v_blank(piece, spill_piece)
            right_column_reached = True

        print(target_path)
        piece.save(target_path, 'JPEG')
        crops.append(piece)
    return crops

def get_concat_v_blank(im1, im2, color=(0, 0, 0)):
    """Stack two images vertically on a new RGB canvas.

    The canvas is as wide as the wider image and as tall as both heights
    combined; ``im1`` is pasted at the top-left corner and ``im2`` directly
    below it. Any area not covered by either image keeps ``color``.

    Args:
        im1: Upper image (needs ``width`` and ``height``).
        im2: Lower image (needs ``width`` and ``height``).
        color: Fill colour for the new canvas.

    Returns:
        The new combined image.
    """
    canvas_size = (max(im1.width, im2.width), im1.height + im2.height)
    canvas = Image.new('RGB', canvas_size, color)
    for piece, offset_y in ((im1, 0), (im2, im1.height)):
        canvas.paste(piece, (0, offset_y))
    return canvas

def find_number(text, c):
    """Extract the question number that follows the prefix ``c`` in ``text``.

    Args:
        text: Marker text, e.g. ``'Questão 91'``.
        c: Prefix expected right before the digits. It is interpolated into a
            regular expression as-is.

    Returns:
        The first run of digits found right after ``c`` as a string, or the
        placeholder ``'xx'`` when ``text`` does not start with ``'Questão'``
        or when no digits follow the prefix (for example when the space or
        the number is missing from the text).
    """
    if not text.startswith('Questão'):
        return 'xx'
    # get digits after text (question number)
    found = re.findall(r'%s(\d+)' % c, text)
    # fall back to the placeholder instead of raising IndexError on no match
    return found[0] if found else 'xx'

## Defining variables

In [5]:
# Exam to process (PDF file name without the extension).
# Other values that were used with this notebook:
#exam_name = 'custom_exam'
#exam_name = 'enem_2019_01'
#exam_name = 'ENEM_2019_P1_CAD_01_DIA_1_AZUL'
exam_name = 'ENEM_2019_P1_CAD_07_DIA_2_AZUL'

# Input and output folders, relative to the notebook location.
exams_dir = '../exams'
output_dir = '../outputs'
exam_file = exams_dir + '/' + exam_name + '.pdf'

# NOTE(review): presumably the S3 bucket and AWS region for the exam files;
# neither is referenced by the cells shown here -- confirm where they are used.
exam_bucket = 'exams-szacca'
region = 'us-east-1'


## Extract pages from pdf

In [6]:
# Page images go into a folder named after the exam, inside the exams folder.
output_pages_dir = exams_dir+'/'+exam_name

# exist_ok=True keeps the cell re-runnable without a separate existence check
os.makedirs(output_pages_dir, exist_ok=True)

# Writes one JPEG per page; the page count is shown as the cell output.
npages = convert_pdf2jpeg(exam_file, exam_name, output_pages_dir)
npages

32

## Cut questions from each page in separate files

In [10]:
# Inclusive range of page numbers to process.
start_page = 2
end_page = 31
output_question_dir = output_dir+'/'+exam_name
# Area code used until the first area heading is found; carried across pages.
exam_area = 'xx'

# exist_ok=True keeps the cell re-runnable without a separate existence check
os.makedirs(output_question_dir, exist_ok=True)

# One client is enough for every page; it is bound to the region configured above.
textract = boto3.client('textract', region_name=region)

for page in range(start_page,end_page+1):
    orig_image = f'{output_pages_dir}/{exam_name}_{page}pg.jpg'

    print('Processing page '+str(page)+' in '+orig_image+'')

    # image, width, height, middle_horizontal, page_style and
    # questions_locations are read as globals by save_question_cuts and
    # create_broken_piece_bbox, so they must be (re)assigned for every page.
    image = Image.open(orig_image)
    width, height =image.size
    middle_horizontal = width/2
    draw = ImageDraw.Draw(image)

    response = textract.detect_document_text(
        Document={
            'Bytes': load_image(orig_image),
        }
    )

    lines = save_lines(response)
    questions = identify_questions(lines)
    questions_locations = localize_questions(questions,exam_area)

    if not questions_locations:
        # nothing to cut, and indexing [-1] below would raise IndexError
        print('No question markers found on page '+str(page)+'; skipping.')
        continue

    #store the actual exam area to keep using it in next page
    exam_area = questions_locations[-1]['Area']
    questions_locations = order_questions(questions_locations)
    page_style = detect_page_style(width, questions_locations)

    dest_file =output_question_dir+'/'+exam_name+'_pg'+str(page)+'.jpg'
    question_cuts = save_question_cuts(questions_locations,dest_file)

print('Done.')

Processing page 2 in ../exams/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_2pg.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg2_0_aCN_qxx.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg2_1_aCN_q91.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg2_2_aCN_q92.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg2_3_aCN_q93.jpg
Processing page 3 in ../exams/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_3pg.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg3_0_aCN_q94.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg3_1_aCN_q95.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg3_2_aCN_q96.jpg
Processing page 4 in ../exams/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_4pg.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_D

../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg23_0_aMT_q155.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg23_1_aMT_q156.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg23_2_aMT_q157.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg23_3_aMT_q158.jpg
Processing page 24 in ../exams/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_24pg.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg24_0_aMT_q159.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg24_1_aMT_q160.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg24_2_aMT_q161.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg24_3_aMT_q162.jpg
../outputs/ENEM_2019_P1_CAD_07_DIA_2_AZUL/ENEM_2019_P1_CAD_07_DIA_2_AZUL_pg24_4_aMT_q163.jpg
Processing page 25 in ../exams/ENEM_2019_P1_CAD_07_DIA_2_AZUL