In [1]:
import pytesseract
import cv2
import numpy as np
import os
import re
import pandas as pd

In [2]:
def image_to_conf(image, text_type, n=0):
    '''
    Using tesseract to get text and confidence

    Args:
        image: image (or crop) passed straight to ``pytesseract.image_to_data``.
        text_type: selects the tesseract configuration:
            0 -> a single line of text (``-l eng --psm 7``),
            1 -> a block of text (``-l eng --oem 1 --psm 6``),
            anything else -> digits-only whitelist (``--psm 13 --oem 3``).
        n: confidence cut-off; only rows whose ``conf`` is strictly greater
            than ``n`` are kept.

    Returns:
        tuple ``(text, conf)``: the kept tokens joined by single spaces, and
        their mean confidence weighted by token length (0 if nothing is kept).
    '''
    if text_type == 0:
        # to recognize a line of text
        config = '-l eng --psm 7'
    elif text_type == 1:
        # to recognize a block of text
        config = '-l eng --oem 1 --psm 6'
    else:
        # digits only
        config = '--psm 13 --oem 3 -c tessedit_char_whitelist=0123456789'
    data = pytesseract.image_to_data(image, config=config, output_type='data.frame')

    text_list = []
    sum_conf = 0
    len_conf = 0
    # Walk the two columns directly instead of rebuilding a row Series with
    # data.iloc[i] several times per row.
    for conf_value, token in zip(data['conf'], data['text']):
        # `not (x > n)` also rejects NaN confidences, exactly like a plain `>` test.
        if not conf_value > n:
            continue
        # Rows without text would otherwise be stringified to 'nan' and end up
        # both in the output text and in the confidence weighting.
        if pd.isna(token):
            continue
        token = str(token)
        text_list.append(token)
        sum_conf += len(token) * conf_value
        len_conf += len(token)

    if len_conf == 0:
        conf = 0
    else:
        conf = sum_conf / len_conf
    return ' '.join(text_list), conf


class CardDetect(object):
    """
    OCR for card recognition.

    The stages call each other in a chain:
    ``ocr_text`` -> ``cell_detect`` -> ``get_coordinate`` -> ``find_form``
    -> ``first_line``. They draw on / crop ``self.image`` and replace
    ``self.gray`` along the way, so one instance is meant to serve a single
    ``ocr_text()`` call.
    """
    def __init__(self, img_path):
        """Load ``img_path`` as a colour image and keep a grayscale copy.

        Raises:
            OSError: if OpenCV could not read the file.
        """
        self.image = cv2.imread(img_path, 1)
        if self.image is None:
            # cv2.imread reports failure by returning None; fail here with the
            # path instead of with an opaque cvtColor error one line later.
            raise OSError('Cannot read image: {}'.format(img_path))
        self.gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)

    # detect first line and draw Auxiliary line
    def first_line(self):
        """Draw three red auxiliary lines on ``self.image`` and return it.

        Two fixed vertical lines (x=23 and x=930) are always drawn. The third
        is horizontal, placed 55 px above the first row whose mean brightness
        (after binarisation and median filtering) differs from the next row by
        more than 100, provided that row is above y=150; otherwise the fixed
        fallback ``[0, 104, 930, 104]`` is used.
        """
        # Image Binarization
        ret, thresh1 = cv2.threshold(self.gray, 200, 255, cv2.THRESH_BINARY)
        # Median filter
        blur = cv2.medianBlur(thresh1, 3)  # ksize: 3*3
        blur = cv2.medianBlur(blur, 3)
        h, w = self.gray.shape

        # Find the split rows in one vectorised pass: jumps[k] is a row index i
        # with |mean(row i) - mean(row i + 1)| > 100.
        row_means = blur.mean(axis=1)
        jumps = np.flatnonzero(np.abs(np.diff(row_means)) > 100)
        if jumps.size and jumps[0] < 150:
            y = int(jumps[0])
            first_line = [0, y, w, y]
        else:
            first_line = [0, 104, 930, 104]

        # plot lines
        lines = [[23, 30, 23, 239], [930, 30, 930, 239]]
        first_line[1] -= 55
        first_line[3] -= 55
        lines.append(first_line)
        for l in lines:
            self.image = cv2.line(self.image, (l[0], l[1]), (l[2], l[3]), (0, 0, 255), 2)

        return self.image

    # detect cell
    def find_form(self):
        """Extract the table grid from the annotated image.

        Replaces ``self.gray`` with the grayscale of the image returned by
        ``first_line()``.

        Returns:
            tuple ``(vertex, merge)``: ``vertex`` is non-zero where a detected
            horizontal and a detected vertical line overlap; ``merge`` is the
            sum of both line masks.
        """
        image = self.first_line()
        self.gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Image Binarization (on the inverted image)
        binary = cv2.adaptiveThreshold(~self.gray, 255,
                                       cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -10)
        rows, cols = binary.shape
        scale = 20

        # detect horizontal line: erode then dilate with a wide, 1-px-high kernel
        kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (cols // scale, 1))
        eroded = cv2.erode(binary, kernel, iterations=2)
        dilatedcol = cv2.dilate(eroded, kernel, iterations=2)

        # detect vertical line: same with a tall, 1-px-wide kernel
        kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (1, rows // scale))
        eroded = cv2.erode(binary, kernel, iterations=2)
        dilatedrow = cv2.dilate(eroded, kernel, iterations=2)

        # get vertex
        vertex = cv2.bitwise_and(dilatedcol, dilatedrow)

        # plot cell
        merge = cv2.add(dilatedcol, dilatedrow)
        return vertex, merge

    @staticmethod
    def _cluster_ends(values, gap=20):
        """Return the last value of every run in sorted ``values``.

        A new run starts wherever two neighbours differ by more than ``gap``.
        ``values`` must be non-empty.
        """
        ends = [values[i] for i in range(len(values) - 1)
                if values[i + 1] - values[i] > gap]
        # Close the final run with the last value itself. Indexing with the
        # leftover loop variable picked values[-2], which duplicated the previous
        # run end when the final run held a single pixel.
        ends.append(values[-1])
        return ends

    # get the coordinate
    def get_coordinate(self):
        """Turn the vertex mask from ``find_form`` into grid coordinates.

        Returns:
            tuple ``(x_list, y_list)`` of ascending coordinates, one per group
            of vertex pixels lying within 20 px of each other. Both lists are
            empty when no vertex pixel was detected.
        """
        vertex, _ = self.find_form()
        ys, xs = np.where(vertex > 0)
        if xs.size == 0:
            # No grid found: return empty lists so the later stages fall
            # through to an empty result instead of raising.
            return [], []

        # sort list
        xs, ys = np.sort(xs), np.sort(ys)

        x_list = self._cluster_ends(xs)
        if x_list[0] > 30:
            # the left border was not detected: add a fixed one at x=24
            x_list.append(24)
        x_list = sorted(x_list)

        y_list = self._cluster_ends(ys)
        # fixed pixel corrections for the second and third horizontal lines
        if len(y_list) > 1:
            y_list[1] += 18
        if len(y_list) > 2:
            y_list[2] -= 5

        return x_list, y_list

    # find cell
    def cell_detect(self):
        """Build the cell rectangles, draw them in blue, and return them.

        When a third horizontal coordinate exists, ``self.image`` is cropped to
        the rows above it and to the first 950 columns.

        Returns:
            list of ``(x1, y1, x2, y2)`` tuples: first the (up to two) cells of
            the top band, then the grid cells column by column.
        """
        x_list, y_list = self.get_coordinate()

        # crop the image by second line
        if len(y_list) >= 3:
            self.image = self.image[0:y_list[2], 0:950].copy()

        rects = []

        # top band: split at 25 px left of the third-from-last x coordinate
        if len(x_list) >= 3:
            first_line = [x_list[0], x_list[-3] - 25, x_list[-1]]
            for i in range(0, len(first_line) - 1):
                for j in range(len(y_list[:2]) - 1):
                    rects.append((first_line[i], y_list[j], first_line[i + 1], y_list[j + 1] - 20))

        if len(y_list) > 3:
            y_list.pop()

        if len(x_list) >= 8:
            x_list.pop(-4)

        for i in range(0, len(x_list) - 1):
            for j in range(1, len(y_list) - 1):
                rects.append((x_list[i], y_list[j], x_list[i + 1], y_list[j + 1]))

        # plot rect:
        for rect in rects:
            self.image = cv2.rectangle(self.image, (rect[0], rect[1]), (rect[2], rect[3]), (255, 0, 0), 2)

        return rects

    # extract text:
    def ocr_text(self):
        """Run OCR on the five target cells and return the cleaned fields.

        Returns:
            dict with keys 'Name', 'Date of interment', 'Section', 'Lot', 'GR'
            and 'Avg_conf' (mean of the five confidences, 2 decimals). If
            fewer rectangles than expected were detected, the dict only holds
            the fields read before the missing one and has no 'Avg_conf'.
        """
        rects = self.cell_detect()
        thresh = self.gray

        target = [0, 2, 3, 4, 5]
        file_name = ['Name', 'Date of interment', 'Section', 'Lot', 'GR']
        # NOTE(review): this also drops lowercase 'u' and 'v' - presumably
        # recurring OCR noise on these cards; confirm it is intended.
        special_char = '‘’,|-_<"=;«“&—]uv'
        file_type = [0, 0, 1, 1, 2]
        threshold = [-1, -1, -1, -1, -1]

        result = {}
        conf = 0
        try:
            for i in range(5):
                rect1 = rects[target[i]]
                detect_img = thresh[rect1[1]:rect1[3], rect1[0]:rect1[2]]
                name = file_name[i]
                text1, conf1 = image_to_conf(detect_img, file_type[i], threshold[i])
                text1 = ''.join([char for char in text1 if char not in special_char])
                conf += conf1
                if i == 0:
                    text1 = text1.lstrip('7')
                    text1 = re.sub(r"\s+", " ", ''.join(re.findall(r'[A-Za-z0-9]\s*',
                                                                   re.sub("[,|()|-]", " ", text1)))).upper()
                elif i == 1:
                    text1 = re.sub(r"\s+", " ", ''.join(re.findall(r'[A-Za-z0-9]\s*\/*', text1)))
                else:
                    # Drop a literal '.0' at the end of a token (presumably a
                    # float-formatting artifact of numeric OCR output).
                    # str.rstrip('.0') strips *every* trailing '0' and '.',
                    # which turned '10.0' into '1' and '120.0' into '12'.
                    text1 = re.sub(r'\.0(?=\s|$)', '', text1)
                    text1 = re.sub(r"\s+", " ", ''.join(re.findall(r'[A-Za-z0-9]\s*', text1)))

                result[name] = text1
            result['Avg_conf'] = round((conf / 5), 2)
        except IndexError:
            # Best effort: a card with too few detected cells yields a partial
            # (possibly empty) result instead of aborting the whole batch.
            pass

        return result

    # show image
    def show_image(self):
        """Show ``self.image`` in a window and block until a key is pressed."""
        cv2.imshow('result', self.image)
        cv2.waitKey(0)


def main():
    """Run OCR over the card images and write the fields to ``result.csv``.

    Reads the module-level globals ``path`` (image folder) and ``images``
    (file names inside it). Returns whatever ``DataFrame.to_csv`` returns.
    """
    # NOTE(review): the first entry of `images` is skipped - presumably a
    # non-image file that sorts first in the folder; confirm.
    records = [
        CardDetect(path + '/' + file_name).ocr_text()
        for file_name in images[1:]
    ]
    field_names = ['Name', 'Date of interment', 'Section', 'Lot', 'GR', 'Avg_conf']
    table = pd.DataFrame(records, columns=field_names)

    return table.to_csv('result.csv')


if __name__ == '__main__':
    # Globals read by main(): the image folder and its sorted file names.
    path = 'Form_A'
    images = os.listdir(path)
    images.sort()
    # Left disabled: uncomment to re-run the OCR and rewrite result.csv.
    # main()




In [3]:
# OCR output as written by main(); the first CSV column is the saved row index.
ocr_data = pd.read_csv('result.csv', index_col=0)
# Reference values (presumably hand-labelled - confirm); keep only Form 'A' rows.
ori_csv = pd.read_csv('actual_result.csv')
ori_csv = ori_csv[ori_csv['Form'].eq('A')]

In [4]:
# Build the reference table with one row per Form-A card and columns 0..4 in
# the same order as the OCR fields: name, date, section, lot, gr.
# Columns of actual_result.csv are addressed by position; the name is column 7
# followed by column 6, both upper-cased.
# Use .iloc[row, col]: indexing a row Series with a bare integer (row[7]) is
# deprecated positional access when the labels are strings.
records = []
for i in range(len(ori_csv)):
    text_name = ori_csv.iloc[i, 7].upper() + " " + ori_csv.iloc[i, 6].upper()
    text_date = ori_csv.iloc[i, 8]
    text_section = ori_csv.iloc[i, 9]
    text_lot = ori_csv.iloc[i, 11]
    text_gr = ori_csv.iloc[i, 12]
    records.append([text_name, text_date, text_section, text_lot, text_gr])

# Create the frame once instead of inserting one column per card and
# transposing; dtype=object keeps every value exactly as it was read.
ori_data = pd.DataFrame(records, dtype=object)

In [5]:
from difflib import SequenceMatcher

def similarity(a, b):
    """Return the difflib ``SequenceMatcher`` ratio of ``a`` and ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

# Mean string similarity between OCR output and reference, per field.
# Rows are matched by position: row i of ocr_data against row i of ori_data.
# Values are read with .iloc[row, col]; ocr_data has string column labels, so
# row[0]-style integer indexing would be deprecated positional access.
field_labels = ["name", "date", "section", "lot", "gr"]
field_totals = [0] * len(field_labels)
for i in range(len(ocr_data)):
    for k in range(len(field_labels)):
        field_totals[k] += similarity(str(ocr_data.iloc[i, k]), str(ori_data.iloc[i, k]))

# Keep the per-field totals available under their original names.
score_name, score_date, score_section, score_lot, score_gr = field_totals

for label, total in zip(field_labels, field_totals):
    print("score_" + label, total / len(ocr_data))

score_name 0.8790624447914716
score_date 0.7914588809119252
score_section 0.7934391534391535
score_lot 0.673345331998583
score_gr 0.37719576719576714
