# Installation part

Please run this code snippet to install all the required packages

In [None]:
# We need to install a number of libraries and system packages before we start.
# NOTE(review): only torch, torchvision and pyyaml are version-pinned below; the other
# packages install whatever version is current when the cell runs.
# poppler-utils is the system package installed alongside pdf2image.
!apt-get install poppler-utils
!pip install pdf2image

# install dependencies: (use cu100 because colab is on CUDA 10.0)
!pip install -U torch==1.4+cu100 torchvision==0.5+cu100 -f https://download.pytorch.org/whl/torch_stable.html 
!pip install cython pyyaml==5.1
!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
# NOTE(review): the bare `torch.__version__` expression below is not the last statement of
# the cell, so its value is not displayed; it only confirms that the import succeeds.
import torch, torchvision
torch.__version__
!gcc --version
# opencv is pre-installed on colab

# install detectron2:
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu100/index.html

# install pytesseract (Python wrapper) together with the Tesseract OCR system packages
! apt install tesseract-ocr
! apt install libtesseract-dev
!pip install pytesseract
!pip install tox


# Major Functions Snippet

Run this code snippet to enable extraction and detection function.

In [None]:
import os, fnmatch
from pdf2image import convert_from_path
from pdf2image.generators import (
    ThreadSafeGenerator,
    counter_generator)

from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError)

# You may need to restart your runtime prior to this, to let your installation take effect
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np
from numpy import array
from numpy import argmax
from numpy import log
import cv2
import random
import math
from google.colab.patches import cv2_imshow
import json
from detectron2.structures import BoxMode


# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data.datasets import register_coco_instances
from detectron2.data import DatasetCatalog, MetadataCatalog

from detectron2.data.datasets import register_coco_instances
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.visualizer import ColorMode

# Pytesseract
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
import tox

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import matplotlib.pyplot as plt

from google.colab import files
from google.colab import drive

import sqlite3

# Module-level SQLite connection to the file 'Table_DB.db' (relative to the current
# working directory). NOTE(review): `conn` is not used or closed anywhere in the part of
# the notebook visible here -- confirm where it is consumed and closed.
conn = sqlite3.connect('Table_DB.db')

def convert_pdf_to_jpg(input_path,output_path):
    """Render every page of one PDF (or of every ``*.pdf`` in a folder) to JPEG files.

    For each PDF a sub-folder ``<output_path>/<pdf file name>_dir`` is created and the
    pages are written there by ``pdf2image.convert_from_path`` at 200 dpi. The files are
    then renamed from ``<pdfName>.1-<page>.jpg`` to ``<page, 4 digits>_<pdfName>.jpg`` so
    that a plain sort of the folder is in page order.

    Args:
        input_path: path of a single PDF file, or of a directory containing PDF files.
        output_path: directory that receives one ``*_dir`` folder per PDF (created if
            missing).

    Returns:
        list of str: the PDF paths that were converted, sorted when ``input_path`` is a
        directory.
    """
    # create the folder to store all images from multiple pdfs
    os.makedirs(output_path, exist_ok=True)

    # pdfNames is the output: list of all pdf files' paths
    if os.path.isfile(input_path):
        print('The input is one pdf file.')
        pdfNames = [input_path]
    else:
        print('The input is multiple pdf files.')
        pdfNames = sorted(input_path + '/' + entry
                          for entry in os.listdir(input_path)
                          if fnmatch.fnmatch(entry, "*.pdf"))

    for inputpdf in pdfNames:
        print("Transfering to JPG for table detection:\n",inputpdf,'\nThis may take a while...')

        # create the folder to store all images from one pdf
        pdf_basename = inputpdf.split('/')[-1]
        image_folder = output_path + '/' + pdf_basename + '_dir'
        os.makedirs(image_folder, exist_ok=True)

        # convert pdf pages into images automatically
        # pdfName.pdf.1-pageNumber(3 digits).jpg
        convert_from_path(inputpdf, dpi=200, fmt = 'jpeg',
                          output_folder = image_folder,
                          output_file = counter_generator(pdf_basename + '.', padding_goal=0))

        # pageNumber_pdfName.pdf.jpg
        for filename in os.listdir(image_folder):
            # Split on the LAST '1-' so a PDF whose own name contains '1-' is still
            # parsed correctly, and skip anything that is not a freshly generated page
            # (e.g. files already renamed by a previous run of this cell).
            parts = filename.rsplit('1-', 1)
            if len(parts) != 2 or not parts[1].endswith('.jpg'):
                continue
            page_token = parts[1][:-len('.jpg')]
            if not page_token.isdigit():
                continue

            num_page = int(page_token)
            new_name = '{0:04d}'.format(num_page) + '_' + parts[0] + "jpg"
            src = image_folder + '/' + filename
            dst = image_folder + '/' + new_name

            # rename the page image
            os.rename(src, dst)
    print('Convert all PDF files succussfully!')
    return pdfNames

# loading the detection model
def load_detection_model(training_data_json, training_data, final_detection_model):
    """Build a detectron2 ``DefaultPredictor`` for the single-class table detector.

    Args:
        training_data_json: path of the COCO-format annotation file used to register the
            dataset "my_dataset_train".
        training_data: path of the image folder belonging to that annotation file.
        final_detection_model: weights file; joined onto ``cfg.OUTPUT_DIR`` (an absolute
            path is therefore used as-is by ``os.path.join``).

    Returns:
        tuple: ``(predictor, table_train_metadata)`` where the metadata is
        ``MetadataCatalog.get("my_dataset_train")``.
    """
    ## model should be put in the same workspace.
    cfg = get_cfg()
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))

    # Registering the same dataset name twice fails inside detectron2, which made a second
    # call (re-running the cell, or building a second TableHero) crash. Register only once.
    # NOTE: a later call with different paths keeps the first registration.
    if "my_dataset_train" not in DatasetCatalog.list():
        register_coco_instances("my_dataset_train", {}, training_data_json, training_data)

    cfg.SOLVER.IMS_PER_BATCH = 2
    cfg.SOLVER.BASE_LR = 0.001
    cfg.SOLVER.MAX_ITER = 450
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class (table)

    ## We can change the model to be used
    ## Detection model address may need to be changed
    cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, final_detection_model)
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.95  # set the testing threshold for this model

    predictor = DefaultPredictor(cfg)
    table_train_metadata = MetadataCatalog.get("my_dataset_train")

    return predictor, table_train_metadata


# 
def matt_filter(crop):
  """OCR a cropped table image and report whether it passes ``firstFilter``.

  The crop is sent to Tesseract (``--oem 3 --psm 6``, English) and the resulting
  DataFrame is handed to ``firstFilter``.

  Returns:
      bool: True when ``firstFilter`` accepts the OCR result, otherwise False.
  """
  ocr_frame = pytesseract.image_to_data(
      crop,
      lang = 'eng',
      output_type = pytesseract.Output.DATAFRAME,
      config = r'--oem 3 --psm 6')
  return True if firstFilter(ocr_frame) else False

def firstFilter(pdFrame):
  """Return True when at least two distinct target keywords occur in the OCR text.

  Column index 11 of ``pdFrame`` is scanned; every string cell is compared
  case-insensitively (substring match) against the keyword list. Each keyword is
  counted at most once over the whole frame.

  Args:
      pdFrame: DataFrame whose column at position 11 holds the recognised text
          (the layout produced by ``pytesseract.image_to_data`` in ``matt_filter``).

  Returns:
      bool: True as soon as two different keywords have been seen, else False.
  """
  remaining = ["class","Pass-Through", "S&P","Fitch", "Moody's", "LIBOR Rate", "Reallowance Discount"]
  counter = 0
  for cell in pdFrame.iloc[:,11]:
    # Non-text cells (NaN for empty OCR rows, or numbers) cannot contain a keyword.
    if not isinstance(cell, str):
      continue
    folded = cell.casefold()
    # Collect the hits first and drop them afterwards: removing from the list that is
    # being iterated skipped the keyword following each hit (e.g. "S&P/Fitch").
    matched = [target for target in remaining if target.casefold() in folded]
    if not matched:
      continue
    remaining = [target for target in remaining if target not in matched]
    counter += len(matched)
    if counter >= 2:
      print('matt_true')
      return True

  return False

class TableHero(object):
    """Convert PDFs to page images, detect tables on every page and crop them.

    Creating an instance immediately converts ``input_path`` to JPEG pages (via
    ``convert_pdf_to_jpg``) and loads the detection model (via
    ``load_detection_model``). ``detect_tables`` then runs the predictor on every page.

    Attributes set by ``__init__``:
        input_path, jpgFileName, pdfNames, training_data_json, training_data,
        final_detection_model, predictor, table_train_metadata, crop_table_list.
    """
    def __init__(
        self,
        input_path           = None,
        # accepted for backward compatibility only; the value is recomputed below
        pdfNames             = None,
        jpgFileName          = '/content/allPage_Image',
        # change the path of json file of training data
        training_data_json   = '/content/drive/My Drive/unit of study/capstone/dataset/user_version/training_data.json',
        # change the path of training data
        training_data        = '/content/drive/My Drive/unit of study/capstone/dataset/user_version/training_data',
        # change the path of detection model
        final_detection_model= '/content/drive/My Drive/unit of study/capstone/model/model_final_15_450.pth',
        # accepted for backward compatibility only; both are rebuilt by load_detection_model
        predictor            = None,
        table_train_metadata = None,
        crop_table_list      = None,
    ):
        """Convert the input PDFs to page images and load the detection model.

        Args:
            input_path: a PDF file or a folder of PDF files, passed to
                ``convert_pdf_to_jpg``.
            pdfNames: ignored; ``self.pdfNames`` is the list returned by
                ``convert_pdf_to_jpg``.
            jpgFileName: folder that receives the page images.
            training_data_json, training_data, final_detection_model: forwarded to
                ``load_detection_model``.
            predictor, table_train_metadata: ignored; replaced by the values returned
                from ``load_detection_model``.
            crop_table_list: initial value of ``self.crop_table_list``; overwritten by
                ``detect_tables``.
        """
        self.input_path           = input_path
        self.jpgFileName          = jpgFileName
        self.pdfNames             = convert_pdf_to_jpg(input_path,jpgFileName)

        self.training_data_json   = training_data_json
        self.training_data        = training_data
        self.final_detection_model= final_detection_model
        self.predictor, self.table_train_metadata = load_detection_model(training_data_json,
                                                                         training_data,
                                                                         final_detection_model)
        # Defined up front so the attribute exists before detect_tables() has run.
        self.crop_table_list      = crop_table_list

    def detect_tables(self):
        """Detect tables on every page image, save annotated pages and table crops.

        For each page with at least one detection the page with drawn bounding boxes
        is written below ``/content/Page_with_Bbox/<pdf>_dir``. Each detected box is
        cropped; crops accepted by ``matt_filter`` are displayed and written to
        ``/content/Table_JPG``. One record per detected box is collected and saved to
        ``/content/detectionResult.csv``. ``self.crop_table_list`` is set to the sorted
        content of ``/content/Table_JPG``.

        Raises:
            Exception: when ``cv2.imwrite`` reports that an image could not be written.
        """
        # Create the directories that are actually written to below. The relative
        # 'Page_with_Bbox' / 'Table_JPG' folders created previously only coincided with
        # these paths when the working directory was /content.
        bbox_root  = '/content/Page_with_Bbox'
        table_root = '/content/Table_JPG'
        ## Matthew: This is the path for img with bounding box of tables
        os.makedirs(bbox_root, exist_ok=True)
        ## Matthew: This is the path for cropped table img
        os.makedirs(table_root, exist_ok=True)

        table_info = []
        table_id = 0
        print(len(self.pdfNames))
        for file_name in self.pdfNames:
            pdf_basename = file_name.split('/')[-1]

            # create folder for each pdf file
            bbox_dir = bbox_root + '/' + pdf_basename + '_dir'
            os.makedirs(bbox_dir, exist_ok=True)

            # read each pdf file and its list of all page images
            print('Detecting pdf: ', file_name)
            page_dir = self.jpgFileName + '/' + pdf_basename + '_dir'
            jpg_name_list = sorted(os.listdir(page_dir))

            # detect tables in each page image
            for image in jpg_name_list:
                page = image.split('_')[0]
                # d is the path of each page image
                d = page_dir + '/' + image

                im = cv2.imread(d)
                if im is None:
                    # cv2.imread returns None for unreadable / non-image files
                    print('Could not read image, skipped: ', d)
                    continue

                outputs = self.predictor(im)
                table_coordinates = outputs["instances"].pred_boxes.tensor.tolist()

                # nothing to draw or crop when no table was detected on this page
                if not table_coordinates:
                    continue

                print("In page: " + str(page), ', There are ' + str(len(table_coordinates)) + ' tables.')
                print("Table edge coordinates are: ", table_coordinates)

                # Drawing the page with tables
                v = Visualizer(im[:, :, ::-1],
                                metadata= self.table_train_metadata,
                                scale=0.5,
                                )
                v = v.draw_instance_predictions(outputs["instances"].to("cpu"))
                page_with_bbox = v.get_image()[:, :, ::-1]

                # save image, page with bounding boxes, in the folder
                if not cv2.imwrite(os.path.join(bbox_dir, image), page_with_bbox):
                    raise Exception("Could not write image")

                # loop tables in one page
                for i in range(len(table_coordinates)):
                    table_imageName = str(page) + '_' + str(i+1) + '_' + image.split('_',1)[1]
                    x0 = int(table_coordinates[i][0])
                    y0 = int(table_coordinates[i][1])
                    x1 = int(table_coordinates[i][2])
                    y1 = int(table_coordinates[i][3])
                    crop = im[y0:y1, x0:x1]

                    table_id += 1
                    pdfFileName = pdf_basename
                    pageNum  = str(page)
                    tableNum = str(i+1)
                    pageWithBbox = image

                    # save image, table under bounding box, in the folder
                    if matt_filter(crop):
                        cv2_imshow(crop)
                        tableJPG = table_imageName
                        if not cv2.imwrite(os.path.join(table_root, table_imageName), crop):
                            raise Exception("Could not write image")
                    else:
                        # rejected crops are still recorded, with an empty image name
                        tableJPG = ''
                    table_info.append((table_id, pdfFileName, pageNum, tableNum, pageWithBbox, tableJPG))

        cropfileName = '/content/Table_JPG/'
        self.crop_table_list = os.listdir(cropfileName)
        self.crop_table_list.sort()

        header = ['id', 'PDF File Name', 'Page Number', 'Table Number', 'Page With Bounding Box', 'Table Image Name']
        table_info_df = pd.DataFrame.from_records(table_info, columns = header)
        print(table_info_df)

        # save detection result into csv
        table_info_df.to_csv('/content/detectionResult.csv')


"""# Convert image into dataframe

### Func: image_to_data
1. image with title: par_num: 0, 1; block_num = 1; the texts are the title
2. image without title: par_num: 0, 1, 2; header ordered by the columns' location
"""

# Function to convert a jpg into a dataframe
def to_data_psm6(image_path):
    """Run Tesseract on ``image_path`` and return its word-level output as a DataFrame.

    The call uses English, ``--oem 3`` and ``--psm 6`` (psm 6 = assume a single uniform
    block of text).
    """
    return pytesseract.image_to_data(
        image_path,
        lang = 'eng',
        output_type = pytesseract.Output.DATAFRAME,
        config = r'--oem 3 --psm 6',
    )

"""### cutting title in table"""

# Function to decide whether the title exists
def if_title(image_psm6):
    """Report whether the OCR frame contains a paragraph number equal to 2.

    NOTE: despite the name, the only caller visible here (``update_image``) treats a
    True result as "has no title" and a False result as "may contain a title".

    Args:
        image_psm6: DataFrame with a ``par_num`` column.

    Returns:
        bool: True when the value 2 occurs in ``par_num``, else False.
    """
    # par_num = 2 means the body
    paragraph_numbers = image_psm6.get('par_num').array
    return 2 in paragraph_numbers

# Function to cut the title, based on if_title()
def cut_title(image,image_psm6,image_path):
    """Crop away the first text line (taken to be the title) and overwrite the image file.

    The OCR rows are scanned in order; for every row whose line number is 1 the lower
    edge (``top + height``) becomes the new top coordinate, and scanning stops at the
    first row whose line number is 2. The image is cropped 10 pixels below that edge,
    displayed and written back to ``image_path``.

    Args:
        image: the image array as loaded by ``cv2.imread`` (the caller passes it so).
        image_psm6: OCR DataFrame; columns 4, 7, 9 and the last column are read as
            line number, top, height and text.
        image_path: file that is overwritten with the cropped image.

    Raises:
        Exception: when ``cv2.imwrite`` reports failure.
    """
    # image -> x0:left,y0:top,x1:right,y1:below
    x0 = 0
    y0 = 0
    x1 = image.shape[1]
    y1 = int(image.shape[0])

    # find y0, the new top coordinate below the title
    for row in image_psm6.values:
        line   = row[4]
        top    = row[7]
        height = row[9]
        text   = row[-1]

        # generally, the title is in one line
        if line == 1:
            if str(text) != 'nan' and str(text) != ' ':
                print(text)
            y0 = int(top + height)
        elif line == 2:
            # break in second line
            break

    # Crop the image under the title. The channel order is left untouched: the array
    # comes from cv2.imread and both cv2_imshow and cv2.imwrite expect that same order,
    # so the former [:, :, ::-1] reversal saved the file with red and blue swapped.
    crop = image[y0+10:y1, x0:x1]
    cv2_imshow(crop)

    # replace the image
    if not cv2.imwrite(image_path, crop):
        raise Exception("Could not replace the image")
    else: print('The title was deleted successfully.')

# Function to save the image after cutting the title, based on cut_title()
def update_image(image_path):
    """OCR a table image and, when ``if_title`` returns False, cut its title in place.

    Args:
        image_path: path of the table image; the file may be overwritten by
            ``cut_title``.

    Returns:
        The unchanged ``image_path``.
    """
    image = cv2.imread(image_path)

    # psm 6
    image_psm6 = to_data_psm6(image_path)

    # if_title() returning anything but False means there is nothing to cut
    if if_title(image_psm6) is not False:
        print('The table '+ image_path + ' has no title.')
        return image_path

    print('The table '+ image_path + ' may exist title.')
    cut_title(image,image_psm6,image_path)
    return image_path

"""## image to row list 
combine texts from each cell in one row
"""

# Function to extract info in one row
def extract_each_row_info(image_psm6):
    """Group the OCR words of a table image into rows.

    Every record of ``image_psm6`` is read positionally (4 = line, 5 = word number,
    6 = left, 7 = top, 8 = width, 9 = height, 11 = text). Records whose text is NaN, a
    single space, '—', '_' or '__' are dropped. A word joins the current row when it
    has the same line number as the row's first word, its word number is greater than
    1 and its top is less than 50 below the row's first word; otherwise it starts a new
    row. A word that follows a lone '$' replaces it and gets the '$' prefixed.

    Args:
        image_psm6: DataFrame in the layout returned by ``to_data_psm6``.

    Returns:
        dict: ``{row_index: [[line, left, top, width, height, text], ...]}`` with row
        indices 0, 1, 2, ... and every row sorted by ``left``.
    """
    ignored_texts = ('nan', ' ', '—', '_', '__')
    each_row_info = {}

    for record in image_psm6.values:
        # info in each cell
        line   = record[4]
        word   = record[5]
        left   = record[6]
        top    = record[7]
        width  = record[8]
        height = record[9]
        text   = record[11]

        if str(text) in ignored_texts:
            continue

        info = [line, left, top, width, height, text]

        # number of rows collected so far
        num_row = len(each_row_info)

        # 'same row' means same line and close top coordinate, and starting word > 1
        same_row = (num_row > 0
                    and line == each_row_info[num_row-1][0][0]
                    and word > 1
                    and top - each_row_info[num_row-1][0][2] < 50)

        if same_row:
            current_row = each_row_info[num_row-1]
            if current_row[-1][5] == '$':
                # merge a detached currency sign with the amount that follows it
                info[5] = '$' + str(text)
                current_row[-1] = info
            else:
                current_row.append(info)
        else:
            # the previous row is complete: sort it by left coordinate
            if num_row > 0:
                each_row_info[num_row-1].sort(key=lambda x: x[1])

            # create a new row
            each_row_info[num_row] = [info]

    # The loop only sorts a row when the NEXT row starts, so the final row was left
    # unsorted; sort it here so every row obeys the same ordering.
    if each_row_info:
        each_row_info[len(each_row_info)-1].sort(key=lambda x: x[1])

    return each_row_info

# Function to merge the cells of one group into a single [center, text] pair
def text_rc(each_row_column):
    """Collapse a group of ``[left, width, top, text]`` fragments into one cell.

    Returns:
        list: ``[center, text]`` where ``center`` is the smallest ``left + width/2`` of
        the fragments and ``text`` is their texts joined with single spaces in the
        given order; an empty list for empty input.
    """
    merged = []
    for position, fragment in enumerate(each_row_column):
        center = fragment[0] + fragment[1]/2
        text   = fragment[3]

        if position == 0:
            merged = [center, text]
            continue

        # keep the left-most center seen so far
        if center < merged[0]:
            merged[0] = center
        # integrate the text in one cell
        merged[1] = merged[1] + ' ' + text
    return merged

# Function to integrate texts in each row
def combine_info(row_list):
    """Merge neighbouring words of one row into cells.

    Words are taken in the given order. A word whose horizontal gap to the previous
    word (``left - last_left - last_width``) is below 26 joins the current cell: it is
    inserted before the cell's last fragment when it lies more than 10 above it
    (``v_space < -10``) and appended otherwise. Any other word closes the current cell
    (via ``text_rc``) and opens a new one.

    Args:
        row_list: list of ``[line, left, top, width, height, text]`` entries.

    Returns:
        list: one ``[center, text]`` pair per cell; ``[]`` for an empty row.
    """
    # An empty row used to raise IndexError on the final each_row[-1] access.
    if len(row_list) == 0:
        return []

    each_row = []
    for i in range(len(row_list)):
        # info without combination
        left   = row_list[i][1]
        top    = row_list[i][2]
        width  = row_list[i][3]
        text   = row_list[i][5]

        # combine text if the space is too short
        if i > 0:
            # Last fragment of the cell currently being built. (The previous code indexed
            # with a `length` variable frozen at 0, i.e. it always meant each_row[-1].)
            last_left  = each_row[-1][-1][0]
            last_width = each_row[-1][-1][1]
            last_top   = each_row[-1][-1][2]

            # horizontal space between two texts
            h_space = left - last_left - last_width
            # vertical space between two texts
            v_space = top - last_top

            if h_space < 26 and v_space < -10:
                each_row[-1].insert(-1,[left,width,top,text])
            elif h_space < 26 and v_space >= -10:
                each_row[-1].append([left,width,top,text])
            else:
                each_row[-1] = text_rc(each_row[-1])
                each_row.append([[left,width,top,text]])
        else:
            each_row.append([[left,width,top,text]])

    # close the cell that is still open
    each_row[-1] = text_rc(each_row[-1])
    return each_row

# Function to output each row
def output_row(each_row_info):
    """Apply ``combine_info`` to every row of ``each_row_info``.

    Args:
        each_row_info: dict keyed 0..n-1, as built by ``extract_each_row_info``.

    Returns:
        list: the ``combine_info`` result of row 0, row 1, ... in key order.
    """
    row_count = len(each_row_info.values())
    return [combine_info(each_row_info[k]) for k in range(row_count)]

"""## row list to dataframe"""

from bisect import bisect_left

def take_closest(myList, myNumber):
    """
    Return the element of the sorted sequence myList that is nearest to myNumber.

    When myNumber is exactly half-way between two neighbours the smaller one wins.
    """
    insert_at = bisect_left(myList, myNumber)

    # myNumber lies at or beyond one end of the list
    if insert_at == 0 or insert_at == len(myList):
        edge = 0 if insert_at == 0 else -1
        return myList[edge]

    lower, upper = myList[insert_at - 1], myList[insert_at]
    return upper if upper - myNumber < myNumber - lower else lower

def structure_body(row_dic):
    """Align the cells of every row to the columns of the longest row.

    The row with the most cells (the last one on ties) defines the column centers.
    Every cell center is snapped to the nearest of those centers, then each row is
    replaced by a dict ``{column_index: text or NaN}``. Finally the first row is
    dropped when it has exactly one non-empty cell holding more than four
    space-separated words.

    The input list and its cells are modified in place.

    Args:
        row_dic: list of rows, each a list of ``[center, text]`` cells (the output of
            ``output_row``).

    Returns:
        list: ``row_dic`` itself, now holding one dict per row.
    """
    # Nothing to align; previously this fell through to row_dic[None].
    if not row_dic:
        return row_dic

    # max number of cells in one row
    max_len = 0
    max_index = None
    for idx, cell in enumerate(row_dic):
        if len(cell) >= max_len:
            max_len   = len(cell)
            max_index = idx

    # Center coordinates of the longest row. take_closest() bisects this list, so it
    # must be sorted; the cells are not guaranteed to arrive in ascending order.
    center_list = sorted(x[0] for x in row_dic[max_index])

    # snap every cell to its nearest column center
    for cell in row_dic:
        for i in cell:
            i[0] = take_closest(center_list,i[0])

    for key, cell in enumerate(row_dic):
        cell_in_row = {}
        for idx in range(max_len):
            center = center_list[idx]
            cell_in_row[idx] = np.nan
            for i in cell:
                # NOTE(review): cells were already snapped onto a center above, so this
                # +/-100 window also matches neighbouring columns that are closer than
                # 100 apart, and only the first matching cell is kept -- confirm intent.
                if center-100 <= i[0] <= center+100:
                    cell_in_row[idx] = str(i[1])
                    break

        row_dic[key] = cell_in_row

    # `== 1` instead of `<= 1`: an all-empty first row made first_r[0] raise IndexError.
    first_r = [x for x in row_dic[0].values() if str(x) != 'nan']
    if len(first_r) == 1 and len(first_r[0].split(' ')) > 4:
        del row_dic[0]

    return row_dic

"""# Separate header and body

## Beam-search
"""

# Commented out IPython magic to ensure Python compatibility.
# NOTE(review): the two corpus lines below that start with "# % of" sit INSIDE the string
# literal, so the "# " is part of the training text. They look like "% of ..." headings that
# the notebook-to-script export mistook for IPython magics -- TODO confirm against the
# original notebook. "ofInitial" and the trailing ” after "Initial Certificate Principal
# Balance" also look like copy artefacts. The string is left untouched here because it is
# the training corpus and any edit changes the fitted vocabulary.
## this is a naive dataset for typical financial table headings 
data = '''Mortgage Loan
Cut-off Date Principal Balance
# % ofInitial Outstanding Pool Balance
Lock- Out Period (months from Cut-off Date)
Year
Approximate Initial Principal Amount of Year Securities
Property Type
Number of Mortgage Loans
Aggregate Cut-off Date Principal Balance ($)
# % of Mortgage Pool by Aggregate Cut-off Date Principal Balance (4)
Weighted Average Gross Interest Rate (%)
Weighted Average Remaining Term (months)
Weighted Average Combined Original LTV (%)
Loan Purpose
Number of Mortgage Loans
Principal Balance
Principal Amount
Standard Interest Rate
Percentage of Mortgage Loans
Average Principal Balance
Weighted Average Credit Score
Weighted Average Loan-to-Value Ratio
Initial Certificate Principal Balance”
Class
Pass-Through Rate
Scheduled Final Maturity Date
Distribution Date
Offered Certificates
Stated Maturity
Initial Offering Price
S&P Rating
Moody's Rating
'''
# Module-level tokenizer shared (as a global) by dataset_preparation, ind_to_word,
# generate_text and beam_next.
tokenizer = Tokenizer()
# Placeholder; rebound to the real value by the training statements further down.
# beam_next reads this module-level name directly.
max_sequence_len = None

## model functions
def dataset_preparation(data):
    """Turn the heading corpus into next-word training arrays.

    The text is lower-cased and split into lines, the module-level ``tokenizer`` is
    fitted on those lines, and every prefix of at least two tokens of every line
    becomes one training sequence (left-padded to the longest sequence). The last
    token of each sequence is the label, the rest the predictors.

    Args:
        data: newline-separated corpus string.

    Returns:
        tuple: ``(predictors, label, max_sequence_len, total_words)`` where ``label`` is
        one-hot encoded over ``total_words`` classes.
    """
    # basic cleanup
    corpus = data.lower().split("\n")

    # tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # every n-gram prefix (length >= 2) of every line
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

    # pad sequences to a common length
    max_sequence_len = max([len(sequence) for sequence in input_sequences])
    padded = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # split into predictors and label, then one-hot encode the label
    predictors = padded[:,:-1]
    label = ku.to_categorical(padded[:,-1], num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

def create_model(predictors, label, max_sequence_len, total_words):
    """Build and fit the two-layer LSTM next-word model.

    Args:
        predictors: padded input sequences of length ``max_sequence_len - 1``.
        label: one-hot next-word targets with ``total_words`` classes.
        max_sequence_len: length of the padded sequences including the label token.
        total_words: vocabulary size (embedding input size and output width).

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    # LSTM model
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
    model.add(LSTM(150, return_sequences = True))

    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit() is called without validation data, so 'val_loss' never exists and the
    # callback could not trigger; monitor the training loss instead.
    earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')

    model.fit(predictors, label, epochs=20, verbose=1, callbacks=[earlystop])

    # summary() prints by itself; wrapping it in print() added a stray "None" line
    model.summary()
    return model

## beam-search functions
# convert index to word


def ind_to_word(predicted_ind):
  """Translate token indices back to words using the module-level ``tokenizer``.

  Returns:
      str: the words in input order, each followed by one space; indices that are not
      in ``tokenizer.word_index`` contribute nothing.
  """
  word_for_index = {index: word for word, index in tokenizer.word_index.items()}
  return ''.join(word_for_index[i] + ' ' for i in predicted_ind if i in word_for_index)


# get the top k most predicted results
###

def get_topK(predicted, k):
    """Return the indices and scores of the ``k`` highest entries of ``predicted[0]``.

    Both arrays are ordered by ascending score, so the best entry comes last.
    """
    scores = predicted[0]
    ascending = np.argsort(scores)
    best_indices = ascending[-k:]
    return best_indices, scores[best_indices]


# generate text, currently only works with k=1 
# (does not store candidates, you need to modify the code to store candidates and pick the highest scored sequence)

def generate_text(seed_text, initial_candidate_number, max_sequence_len, k=1):
  """Predict likely next words for ``seed_text`` with the module-level ``model``.

  Args:
      seed_text: text to continue.
      initial_candidate_number: how many top-scoring next words to start from.
      max_sequence_len: padded sequence length used when the model was trained.
      k: beam width passed to ``beam_next``.

  Returns:
      tuple: ``(seed_text + " " + best candidate text, candidates, candidate texts)``
      where ``candidates`` is a list of ``[token_index_list, score]`` pairs.
  """
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
  predicted = model.predict(token_list)

  first_round_candidates, first_probs = get_topK(predicted,initial_candidate_number)

  # First set of candidates based on seed text (never more than the model returned)
  candidate_count = min(initial_candidate_number, len(first_round_candidates))
  candidates = [[[first_round_candidates[i]], first_probs[i]]
                for i in range(candidate_count)]

  # Using beam search to get the top candidates
  candidates = beam_next(candidates, seed_text, k)

  # get_topK returns ascending scores, so the first candidate is the WORST of the
  # initial set when k == 1; pick the best one by score instead of by position.
  best_sequence = max(candidates, key=lambda candidate: candidate[1])
  output_word = ind_to_word(best_sequence[0])

  seed_text += " " + output_word

  return seed_text, candidates, candidates_to_text(candidates)

def candidates_to_text(candidates):
  """Convert each candidate's token-index list (its first element) to text."""
  return [ind_to_word(candidate[0]) for candidate in candidates]

# you can add more function if you want to
def beam_next(candidates, seed_text, k):
  """Extend the candidates for ``k - 1`` rounds and keep the ``k`` best each round.

  In every round the first ``k`` candidates are each extended with the model's ``k``
  top next tokens; the extensions are appended to the pool, the pool is sorted by
  score (descending) and cut to ``k`` entries. With ``k == 1`` no round runs and the
  input is returned unchanged.

  Uses the module-level ``tokenizer``, ``model`` and ``max_sequence_len``. The list
  passed in is appended to in place during the first round.

  NOTE(review): scores are summed probabilities, un-extended parents stay in the pool,
  and the first round extends the first ``k`` list entries rather than the ``k`` best
  -- confirm whether that is intended before using k > 1.

  Args:
      candidates: list of ``[token_index_list, score]`` pairs.
      seed_text: text the candidates continue.
      k: beam width.

  Returns:
      list: the surviving ``[token_index_list, score]`` pairs.
  """
  for _ in range(k - 1):

    for i in range(k):
      parent_tokens, parent_score = candidates[i][0], candidates[i][1]
      temp_text = seed_text + " " + ind_to_word(parent_tokens)
      token_list = tokenizer.texts_to_sequences([temp_text])[0]
      token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
      predicted = model.predict(token_list)
      round_candidates, probs = get_topK(predicted,k)
      for x in range(k):
        new_candidate = list(parent_tokens)
        new_candidate.append(round_candidates[x])
        # Score of the extension = score of ITS parent (index i) + token score; the
        # previous code read candidates[x][1], i.e. an unrelated candidate's score.
        new_prob = parent_score + probs[x]
        candidates.append([new_candidate,new_prob])

    ordered = sorted(candidates, key=lambda tup:tup[1], reverse=True)
    candidates = ordered[:k]

  return candidates

# Train a naive s2s model for table headers
# These statements run when the cell executes: they build the training arrays from `data`
# and fit the LSTM. They rebind the module-level `max_sequence_len` and create the
# module-level `model`, both of which generate_text() / beam_next() read as globals.
predictors, label, max_sequence_len, total_words = dataset_preparation(data)
model = create_model(predictors, label, max_sequence_len, total_words)

def s2s_beamsearch_extract(table): 
  """Split a table into header and body using next-word guesses from the LSTM model.

  Rows are scanned from the top. For each non-empty cell ``generate_text`` proposes
  likely following words; a cell in the next row is treated as a continuation of the
  header when one of those guesses occurs in it, and its text is appended to that
  column's header. The first row (after row 0) in which no column continues its
  header marks the start of the body. A first row whose only non-empty cell sits at
  column ``trunc(columnNo/2) - 1`` is treated as a table title and discarded.

  Args:
      table: DataFrame addressed as ``table[j][i]`` (column label ``j``, row label
          ``i``), i.e. integer column and row labels 0..n-1. Empty cells are expected
          to be the empty string.

  Returns:
      pandas.DataFrame: the body rows (every cell text prefixed with one space) with
      the accumulated header strings as column names.
  """
  rowNo = len(table)
  columnNo = len(table.columns)
  savedList = []
  headerFrame = [""] * columnNo
  titleList = [""] * columnNo
  previous_guest_list = [[]] * columnNo
  pointToBreak = 0
  booleanList = [False] * columnNo
  skip = False
  previousRowIsTitle = False

  # Row loop: walk down the table until the row after the last header row is reached.
  for i in range(0,rowNo):
    # `and`, not `&`: with `&` the expression parsed as the chained comparison
    # `i == (pointToBreak & i) != 0`.
    if i == pointToBreak and i != 0:
      break
    else:

      # guesses produced for the previous row (bound before the column loop rebinds
      # previous_guest_list)
      tempList = previous_guest_list
      guessList = [[]] * columnNo

      # Column loop
      for j in range(0,columnNo):
        if (i==0):
          if (str(table[j][i])==""):
            guessList[j] = []
            headerFrame[j] = ''
            titleList[j] = False

          else:
            targetString = str(str(table[j][i]))
            targetStringLower = targetString.lower()
            _, _, candidates_result = generate_text(targetStringLower, 10, max_sequence_len, k=1)
            guessList[j] = candidates_result
            headerFrame[j] = headerFrame[j] + targetString
            titleList[j] = True

        else:
          if (skip == True):
            # first column of the row that follows a discarded title row: start over
            skip = False
            headerFrame = [""] * columnNo
            previous_guest_list = [[]] * columnNo
            if (str(table[j][i])==""):
              guessList[j] = []
              headerFrame[j] = ''

            else:
              targetString = str(table[j][i])
              headerFrame[j] = headerFrame[j] + targetString

          else:
            if (str(table[j][i])==""):
              if(headerFrame[j] ==""):

                guessList[j] = []
                headerFrame[j] = ''
              else:
                booleanList[j] = False

            else:

              targetString = str(str(table[j][i]))
              targetStringLower = targetString.lower()

              if not tempList[j]:
                _, _, candidates_result = generate_text(targetString, 10, max_sequence_len, k=1)
                guessList[j] = candidates_result
                headerFrame[j] = headerFrame[j] + targetString
              else:
                for item in tempList[j]:
                  ##### This part has to be reviewed #######
                  if(item.strip() in targetStringLower):
                    headerFrame[j] = headerFrame[j] +" " + targetString
                    booleanList[j] = True
                    _, _, candidates_result = generate_text(headerFrame[j], 10, max_sequence_len, k=1)
                    guessList[j] = candidates_result
                    break
                  else:
                    booleanList[j] = False

        previous_guest_list= guessList

      if (i != 0):
        if any(booleanList) :
          print(" ")
        else:
          # no column continued its header in this row: the body starts here, unless
          # the row above was a discarded title
          if (previousRowIsTitle != True):
            pointToBreak = i+1
          else:
            previousRowIsTitle = False

    if(i == 0):

      for count in range(len(titleList)):

        number = math.trunc(columnNo/2) - 1
        # a lone cell at that column is treated as a table title, not a header
        if (titleList[count] == True and count == number and sum(titleList) == 1):
          headerFrame = [""] * columnNo
          previous_guest_list = [[]] * columnNo
          skip = True
          previousRowIsTitle = True
        else:
          print("  ")
    else:
      skip = False

  # pointToBreak stays 0 when no body start was found; starting at -1 produced a
  # spurious blank first row (the failed lookup was swallowed), so clamp to row 0.
  contentStartRow = max(pointToBreak - 1, 0)
  for i in range(contentStartRow, rowNo):

    cellFrame = [""] * columnNo
    for j in range(0, columnNo):
      try:
        cellFrame[j] = cellFrame[j] +" "+ str(table[j][i])
      except LookupError:
        # best effort: a missing row/column label leaves the cell empty
        continue
    savedList.append(tuple(cellFrame))

  result = pd.DataFrame(savedList , columns = headerFrame)
  return result

"""## regular"""

def is_a_header_row(s):
    """Heuristically decide whether the text of a table row is a header row.

    Digits and newlines are removed and the text is split on single spaces. A token
    that equals a body marker scores one hit; otherwise each of its characters that
    equals a body marker scores one hit. Any token equal to a header keyword resets
    the score to zero.

    Returns:
        bool: True (header) when the score is below 2, False (body) otherwise.
    """
    body_markers = ['class','%','$','billion',
                    'November','A-','February',
                    'II','M-','I-A-A','I-A-B',
                    'II-A-','A-A','A-B',
                    'Ifl-A-','September','+','AAA','Class',
                   '.%',',',',,',',,,']

    header_markers = ['Date','Balance','Rate','S&P','Moody','Certificates']

    without_digits = ''.join(ch for ch in s if not ch.isdigit())
    tokens = without_digits.replace('\n', '').split(' ')

    # a header keyword anywhere in the row overrides every body marker
    if any(token in header_markers for token in tokens):
        return True

    hits = 0
    for token in tokens:
        if token in body_markers:
            hits += 1
        else:
            hits += sum(1 for ch in token if ch in body_markers)

    return hits < 2
    
    
def regular_rows_identifier(df):
    """Classify every entry of ``df.text`` as header (0) or body (1).

    Each value of the ``text`` column is passed to ``is_a_header_row``; the
    returned list has one label per row, in row order.
    """
    return [0 if is_a_header_row(row_text) else 1 for row_text in df.text.values]


def table_row_to_text(table):
    """Flatten each row of ``table`` into a single string.

    Every cell is converted with ``str``, truncated at the first ``'...'``,
    and prefixed with a space; the pieces of a row are concatenated in column
    order. Returns one string per row.
    """
    def render_row(r_idx):
        # Keep only the text in front of the first '...' in every cell.
        pieces = (
            str(table.values[r_idx][c_idx]).split('...')[0]
            for c_idx in range(table.shape[1])
        )
        return ''.join(' ' + piece for piece in pieces)

    return [render_row(r_idx) for r_idx in range(table.shape[0])]

def regular_expression_identifier(table):
    """Split a table into merged header labels and body rows.

    Each row is flattened to text (``table_row_to_text``) and labelled header
    (0) or body (1) by ``regular_rows_identifier``. Row 0 is always taken as
    a header row; the rows that follow are added to the header for as long as
    they are labelled 0. Header rows are merged column by column (joined with
    a single space); all remaining rows form the body.

    Args:
        table: a pandas DataFrame (``.shape`` and ``.values`` are used).

    Returns:
        ``(header_array, body_array, reformed_df)`` where ``header_array`` is
        a list with one merged label per column, ``body_array`` is a list of
        rows (lists of strings) and ``reformed_df`` is the DataFrame built
        from the two. An empty table yields two empty lists and an empty
        DataFrame.
    """
    import re  # local import: the notebook's import cell is outside this block

    def clean_cell(cell):
        # Flatten embedded newlines and blank out NaN placeholders. Only a
        # standalone "nan" token is removed: a plain substring replace also
        # mangles ordinary words ("Financial" -> "Ficial").
        return re.sub(r'\bnan\b', '', str(cell).replace('\n', ' '))

    df = pd.DataFrame(table_row_to_text(table), columns=['text'])
    header_body_list = regular_rows_identifier(df)
    rows_number = len(header_body_list)
    cols_number = table.shape[1]
    cells = table.values

    # count how many leading rows belong to the header; the first row always
    # does (unless the table has no rows at all)
    header_index = min(1, rows_number)
    while header_index < rows_number and header_body_list[header_index] == 0:
        header_index += 1

    # making header array: merge the header rows column by column
    header_array = []
    for r_idx in range(header_index):
        row_header = [clean_cell(cells[r_idx][c_idx]) for c_idx in range(cols_number)]
        if header_array:
            header_array = [
                merged + ' ' + part for merged, part in zip(header_array, row_header)
            ]
        else:
            header_array = row_header

    # making body arrays: every row after the header block
    body_array = [
        [clean_cell(cells[r_idx][c_idx]) for c_idx in range(cols_number)]
        for r_idx in range(header_index, rows_number)
    ]

    reformed_df = pd.DataFrame(body_array, columns = header_array)

    return header_array, body_array, reformed_df

"""# Read image and Save CSV"""

def read_from_path(folder_path):
    """Return the path of every entry found in ``folder_path``.

    Entries are returned in ``os.listdir`` order, each joined onto
    ``folder_path``; nothing is filtered by extension.
    """
    return [os.path.join(folder_path, entry) for entry in os.listdir(folder_path)]

def extract_table(folder_path,method_name):
    """OCR every table image in ``folder_path`` and persist the results.

    For each ``.jpg`` file the image is OCR'd (``to_data_psm6``), its rows are
    rebuilt (``extract_each_row_info`` / ``output_row`` / ``structure_body``)
    and the header is separated from the body with the chosen method. The
    resulting table is written through the module-level ``conn`` (defined
    outside this block) into a table named after the image file, and to
    ``/content/csv/<file name>.csv``. Finally one row per (file, column) is
    written to the ``ColumnSummary`` table.

    Args:
        folder_path: directory holding the table images. The first two
            underscore-separated fields of each file name are parsed as the
            integer page number and table number.
        method_name: ``'beamsearch'`` or ``'regular'``.

    Raises:
        ValueError: if ``method_name`` is not a supported method.
    """
    supported_methods = ('beamsearch', 'regular')
    if method_name not in supported_methods:
        # An unknown method used to leave final_df undefined and fail with a
        # NameError at the first image; fail early with a clear message.
        raise ValueError(
            "method_name must be one of %s, got %r" % (supported_methods, method_name))

    listOfJPG = read_from_path(folder_path)

    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs('/content/csv', exist_ok=True)

    summary_columns = ['file name', 'page number', 'table number', 'column']
    summary_rows = []

    for image_path in listOfJPG:
        # file name without directory and extension; expected to start with
        # "<page number>_<table number>" (parsed further down)
        file_name   = image_path.split('/')[-1].split('.')[0]

        # only .jpg files are table images: report anything else and skip it
        # instead of feeding it to the OCR step
        if image_path.split('.')[-1] != 'jpg':
            print(image_path + ' is not an image, skipping.')
            continue
        print('image_path: --',image_path)

        ## TODO (Tin): update_image could be used to cut titles first, but its
        ## results are not stable yet.
        # image_path = update_image(image_path)

        # extract image TSV by py-tesseract
        image_psm6 = to_data_psm6(image_path)

        each_row_info = extract_each_row_info(image_psm6)
        row_dic = structure_body(output_row(each_row_info))
        df = pd.DataFrame(row_dic).dropna(how='all')

        # combine header by beam search or regular
        if method_name == 'beamsearch':
            final_df = s2s_beamsearch_extract(df)
        else:
            header,body,final_df = regular_expression_identifier(df)

        # NOTE(review): `False in ...duplicated()` is true whenever at least one
        # column is NOT a duplicate, i.e. for practically every table. If the
        # intent was "only when duplicate column names exist" this should be
        # `.any()` -- left unchanged because it affects every table's output.
        if False in final_df.columns.duplicated():
            final_df = final_df.groupby(final_df.columns.values, axis=1).agg(lambda x: x.values.tolist()).sum().apply(pd.Series).T

        final_df.columns = list(i.strip() for i in final_df.columns)

        # write into DB
        final_df.to_sql(file_name, conn, if_exists='replace', index=True)

        # save dataframe into csv as well
        final_df.to_csv('/content/csv/' + file_name + '.csv')

        # file name, page number, table number, columns
        page_num = int(file_name.split('_')[0])
        table_num = int(file_name.split('_')[1])

        # one summary record per column of this table
        for col in final_df.columns:
            summary_rows.append([file_name, page_num, table_num, col])

    # build the summary frame once instead of concatenating inside the loop
    dataf = pd.DataFrame(summary_rows, columns=summary_columns)
    dataf.to_sql('ColumnSummary', conn, if_exists='replace', index=True)

# Demo 


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there can be read; Colab asks for an authorization code when run.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Input path of root folder

This part sets up the directory path of your root folder.

The easiest way to get the path of the folder that contains all of your PDF
documents is as follows:

1. First click the Files button on the left navigation bar; you should be able to
see a "drive" folder there.

2. Click it and you will see a folder named "My Drive". After you
expand the "My Drive" folder, you will be able to see all the files in your Google Drive.

3. Right click the folder that contains all of your pdf documents
that you want to detect/extract

4. Click copy path, and paste the new root folder path into the root_folder params

5. After everything is set, run this code snippet.

In [None]:
# Folder that holds the PDF documents to process (editable through the Colab form field).
root_folder = "/content/drive/My Drive/capstone/Fianl slides/test files" #@param {type:"string"}
# List the folder's entries so their names can be copied into the pick_file options further down.
listFiles = os.listdir(root_folder)
print(listFiles)

['20060131_!!00NE2G_Prospectus_SD000000000017379366.pdf', '20070619_!!00XY0D_Prospectus_SD000000000064855320.pdf', '20070705_!!00YPNU_Prospectus_SD000000000065504230.pdf', 'Statement.pdf']


### Select PDF files needed to be extracted

In this part you can select either a single PDF as your target or all of the PDF
documents that exist in your root_folder.

1. Please copy the output from the previous code snippet

2. Replace the old list from the parameters, "pick_file". 

3. Select whether you want single file or whole folder from the dropdown option on the right.

4. If you select "single file", make sure you also choose the file you want to extract in the pick_file option. If you select "whole folder", the pick_file option is ignored.

5. After everything is ready, run this code snippet.

In [None]:
#@markdown Single file or multiple files as input.
singleFile_or_multipleFiles = 'single file'  #@param ['single file', 'whole folder'] {allow-input: true}
pick_file = "20060131_!!00NE2G_Prospectus_SD000000000017379366.pdf" #@param ['20060131_!!00NE2G_Prospectus_SD000000000017379366.pdf', '20070619_!!00XY0D_Prospectus_SD000000000064855320.pdf', '20070705_!!00YPNU_Prospectus_SD000000000065504230.pdf'] {allow-input: true}


# 'single file' points input_p at the chosen PDF inside root_folder;
# any other choice passes the whole root folder.
if singleFile_or_multipleFiles == 'single file':
    input_p = root_folder + '/' + pick_file
else:
    input_p = root_folder

# Read pdf files by using class object TableHero
# And save it to a folder called 'allPaga_Image'
tableHero = TableHero(input_path = input_p)

The input is one pdf file.
Transfering to JPG for table detection:
 /content/drive/My Drive/capstone/Fianl slides/test files/20060131_!!00NE2G_Prospectus_SD000000000017379366.pdf 
This may take a while...
Convert all PDF files succussfully!


AssertionError: ignored

### Demo - detect_tables()

tableHero.detect_tables() performs a three-step operation.

**Output: 1. Page_with_Bbox folder, 2. Table_JPG, 3. DetectionResult.csv**

1. tableHero first identifies all the tables contained in the PDF documents, regardless of whether each one is a target table or a non-target table, and stores the results in the folder named "Page_with_Bbox". This folder contains the original PDF pages, with every table on a page enclosed in a bounding box.

2. tableHero then filters the results in the "Page_with_Bbox" folder: it filters out the PDF pages with non-target tables, since we only want tables that have principal amounts, ratings, etc.

3. Finally, tableHero generates DetectionResult.csv. This CSV file contains several columns: PDF file name, page number, table number, page with bounding box, and Table Image Name. If the "Table Image Name" cell of a row is not empty, that row refers to a required table image; if it is empty, it does not.



In [None]:
# Run table detection for the input that was given to TableHero in the previous cell.
tableHero.detect_tables()

### Demo - extract_table()

**Output: 1. Table_DB.db**

extract_table('/content/Table_JPG', 'regular')

This function here :

1. Extracts all the images in the "Table_JPG" folder, generated by the previous code snippet, into a .db file. It uses the "regular" method (keyword matching on each row's text) to determine whether a row should be identified as a header row. For details, see the "regular" section of the Major Functions Snippet.

P.S. The .db file contains all the tables and can be imported into an SQLite database. We also create a special table named "ColumnSummary" that lists all the headers from the different tables, per user requirements.



In [None]:
 # Extract every table image found in /content/Table_JPG using the 'regular' header-detection method.
 extract_table('/content/Table_JPG', 'regular')