In [1]:
import cv2
import os

def extract_cell_images_from_table(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    vertical = horizontal = img_bin.copy()
    SCALE = 5
    image_width, image_height = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    
    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
    
    mask = horizontally_dilated + vertically_dilated
    contours, heirarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )
    
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.05 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    
    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
    
    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]
    
    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    bounding_rects = [b for b in bounding_rects if b is not largest_rect]
    
    cells = [c for c in bounding_rects]
    def cell_in_same_row(c1, c2):
        c1_center = c1[1] + c1[3] - c1[3] / 2
        c2_bottom = c2[1] + c2[3]
        c2_top = c2[1]
        return c2_top < c1_center < c2_bottom
    
    orig_cells = [c for c in cells]
    rows = []
    while cells:
        first = cells[0]
        rest = cells[1:]
        cells_in_same_row = sorted(
            [
                c for c in rest
                if cell_in_same_row(c, first)
            ],
            key=lambda c: c[0]
        )
    
        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
        rows.append(row_cells)
        cells = [
            c for c in rest
            if not cell_in_same_row(c, first)
        ]
    
    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        centers = [y + h - h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)
    
    rows.sort(key=avg_height_of_center)
    cell_images_rows = []
    for row in rows:
        cell_images_row = []
        for x, y, w, h in row:
            cell_images_row.append(image[y:y+h, x:x+w])
        cell_images_rows.append(cell_images_row)
    return cell_images_rows

def main(f):
    results = []
    directory, filename = os.path.split(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    out_path = '/home/vimalkumar/Documents/Back up/cv/cells'
    paths = []
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(out_path, cell_filename)
            cv2.imwrite(path, cell)
            paths.append(path)
    return paths

f = '/home/vimalkumar/Downloads/final_tables_2.png'

main(f)

['/home/vimalkumar/Documents/Back up/cv/cells/000-000.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/000-001.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/000-002.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/000-003.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/000-004.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/000-005.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/001-000.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/001-001.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/001-002.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/001-003.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/001-004.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/001-005.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/002-000.png',
 '/home/vimalkumar/Documents/Back up/cv/cells/002-001.png']

In [3]:
import math
import os
import sys

import cv2
import numpy as np
import pytesseract

def main(image_file, tess_args):
    """
    OCR the image and output the text to a file with an extension that is ready
    to be used in Tesseract training (.gt.txt).
    Tries to crop the image so that only the relevant text gets passed to Tesseract.
    Returns the name of the text file that contains the text.
    """
    #file_path = '/home/vimal/Documents/table_detect_samples/structured images/input/011364700/cells'
    #image_file = os.listdir(file_path)
    
    for f in image_file:
        #print(f)
        directory, filename = os.path.split(f)
        filename_sans_ext, ext = os.path.splitext(filename)
        image = cv2.imread(os.path.join(file_path,f), cv2.IMREAD_GRAYSCALE)
        cropped = crop_to_text(image)
        #ocr_data_dir = os.path.join(directory, "ocr_data")
        #os.makedirs(ocr_data_dir, exist_ok=True)
        #out_imagepath = os.path.join(ocr_data_dir, filename)
        out_path = '/home/vimalkumar/Documents/Back up/cv/text'
        out_txtpath = os.path.join(out_path, "{}.gt.txt".format(filename_sans_ext))
        #cv2.imwrite(out_imagepath, cropped)
        if not tess_args:
            d = os.path.dirname(sys.modules["table_ocr"].__file__)
            tessdata_dir = os.path.join(d, "tessdata")
            tess_args = ["--psm", "7", "-l", "table-ocr", "--tessdata-dir", tessdata_dir]
        txt = ocr_image(cropped, " ".join(tess_args))
        with open(out_txtpath, "w") as txt_file:
            txt_file.write(txt)
        #return out_txtpath
def crop_to_text(image):
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    both = horizontal_lines + vertical_lines
    cleaned = img_bin - both

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    opened = cv2.dilate(opened, kernel)

    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    NUM_PX_COMMA = 6
    MIN_CHAR_AREA = 5 * 9
    char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
    if char_sized_bounding_rects:
        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
        for x, y, w, h in char_sized_bounding_rects:
            minx = min(minx, x)
            miny = min(miny, y)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + h)
        x, y, w, h = minx, miny, maxx - minx, maxy - miny
        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
    else:
        # If we morphed out all of the text, assume an empty image.
        cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
def ocr_image(image, config):
    return pytesseract.image_to_string(
            image,
            config='--psm 6'
        )

file_path = '/home/vimalkumar/Documents/Back up/cv/cells'
image_file = os.listdir(file_path)
tess_args = os.listdir(file_path)

main(image_file, tess_args)


In [4]:
import io 
import csv

def text_files_to_csv(files):
    """Files must be sorted lexicographically
    Filenames must be <row>-<colum>.txt.
    000-000.txt
    000-001.txt
    001-000.txt
    etc...
    """
    rows = []
    for f in files:
        directory, filename = os.path.split(f)
        with open(os.path.join(file_path,f)) as of:
            txt = of.read().strip()
        row, column = map(int, filename.split(".")[0].split("-"))
        if row == len(rows):
            rows.append([])
        rows[row].append(txt)

    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    return csv_file.getvalue()

def main(files):
    return text_files_to_csv(files)


file_path = '/home/vimalkumar/Documents/Back up/cv/text'
file = file = os.listdir(file_path)
files = sorted(file)


a = main(files)
#print(a)


text_file = open("/home/vimalkumar/Documents/Back up/cv/txt/HL_txt.txt", "wt")
n = text_file.write(a)
text_file.close()


import pandas as pd

df = pd.read_csv('/home/vimalkumar/Documents/Back up/cv/txt/HL_txt.txt',header=None,delimiter=',', 
                     names=list(range(11)))
    #df = df.dropna(how='all', axis=1)
    #df.columns = df.iloc[0]
    #df = df[1:]
df = df.dropna(thresh = 3)

df.to_csv('/home/vimalkumar/Documents/Back up/cv/txt/HL_csv.csv')
#df = df.dropna(thresh = 3)
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Kind of Tax\n(a),Tax Period\nEnding\n(b},Identifying Number\nfe,Date of\nAssessment\n(dj,Last Day for\nRefiling\nfe),Unpaid Balance\nof Assessment\n(f},,,,,
1,CIVP\nCIVP\nCIVP\nCIVP\nCIVP\nCIVP\nCIVP\nCIVP...,06/30/2011\n06/30/2012\n09/30/2012\n12/31/2012...,XXX-XX-0496\nXXX-XX-0496\nXXX-XX-0496\nXXX-XX-...,11/02/2015\n11/02/2015\n11/02/2015\n11/02/2015...,12/02/2025\n12/02/2025\n12/02/2025\n12/02/2025...,1934.88\n3098.94\n2166.40\n2047.09\n2740.05\n3...,,,,,


In [5]:
df.shape

(2, 11)

In [9]:
df.iloc[0,:]

0                       Kind of Tax\n(a)
1                Tax Period\nEnding\n(b}
2                 Identifying Number\nfe
3               Date of\nAssessment\n(dj
4            Last Day for\nRefiling\nfe)
5     Unpaid Balance\nof Assessment\n(f}
6                                    NaN
7                                    NaN
8                                    NaN
9                                    NaN
10                                   NaN
Name: 0, dtype: object