In [6]:
from paddleocr import PaddleOCR,draw_ocr
import layoutparser as lp
import os
import cv2
import pandas as pd
from transformers import TableTransformerForObjectDetection
from transformers import DetrFeatureExtractor
import torch
from PIL import Image
from pdf2image import convert_from_path
import csv
import ast
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
import os
import re
import pdfplumber
import pandas as pd
import PyPDF2
import tabula.io as tabula
from pdf2image import convert_from_path
from PIL import Image as PilImage
from img2table.document import Image as TableImage
from tabula.io import read_pdf
import joblib
import numpy as np


# Detection 

In [4]:
def table_detection_microsoft(image, threshold=0.3):
    model_microsoft = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
    feature_extractor = DetrFeatureExtractor()
    width, height = image.size
    target_sizes = [(height, width)]
    encoding = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model_microsoft(**encoding)
    results = feature_extractor.post_process_object_detection(outputs, threshold=threshold, target_sizes=target_sizes)[0]
    return {
        'microsoft_probabilities': str(results['scores'].tolist())
    }

def table_detection_paddle(image):
    model = lp.PaddleDetectionLayoutModel(
        config_path="lp://TableBank/ppyolov2_r50vd_dcn_365e_tableBank_latex/config",
        label_map={0: "Table"},
        threshold=0.3,
        enforce_cpu=False,
        enable_mkldnn=True
    )
    image_cv = np.array(image)[:, :, ::-1]  # Convert PIL.Image to cv2 image (BGR)
    layout = model.detect(image_cv)
    scores = [l.score for l in layout if l.type == 'Table']
    return {
        'paddle_probabilities': str(scores)
    }


In [7]:

def detect_table_pdfplumber(img_pdf_path):
    try:
        with pdfplumber.open(img_pdf_path) as pdf:
            page = pdf.pages[0]
            table = page.extract_table()
            return bool(table)
    except Exception as e:
        return False

def detect_table_tabula(img_pdf_path):
    try:
        tables = tabula.read_pdf(img_pdf_path, pages=1)
        return len(tables) > 0
    except Exception as e:
        return False

def detect_table_img2table(img_path):
    try:
        ti = TableImage(img_path)
        table_data = ti.get_table()
        return bool(table_data)
    except Exception as e:
        return False

In [69]:
def final_table_detector(pdf_directory, output_file, table_directory, no_table_directory):
    data = []

    # Iterate over the PDF files in the directory
    for file in os.listdir(pdf_directory):
        if not file.endswith(".pdf"):  # Skip non-PDF files
            continue
        file_path = os.path.join(pdf_directory, file)
        # Convert PDF to images
        images = convert_from_path(file_path)
        for i, img in enumerate(images, start=1):
            pdf_name = os.path.splitext(file)[0]
            page_number = i

            # Temporary image file path for pdfplumber and tabula
            temp_img_pdf_path = f'temp_{pdf_name}_page_{page_number}.pdf'
            img.save(temp_img_pdf_path, 'PDF')

            # Detect tables using pdfplumber
            table_pdfplumber = detect_table_pdfplumber(temp_img_pdf_path)

            # Detect tables using tabula
            table_tabula = detect_table_tabula(temp_img_pdf_path)

            # Detect tables using img2table
            table_img2table = detect_table_img2table(f'temp_{pdf_name}_page_{page_number}.png')

            # Cleanup temporary files
            os.remove(temp_img_pdf_path)

            # Detect tables using Microsoft model
            ms_result = table_detection_microsoft(img)
            ms_probabilities = ast.literal_eval(ms_result['microsoft_probabilities'])
            ms_max_prob = max(ms_probabilities) if ms_probabilities else 0
            
            # Detect tables using Paddle model
            paddle_result = table_detection_paddle(img)
            paddle_probabilities = ast.literal_eval(paddle_result['paddle_probabilities'])
            paddle_max_prob = max(paddle_probabilities) if paddle_probabilities else 0

            # Append the results
            data.append({
                'pdf_name': pdf_name,
                'page_number': page_number,
                'microsoft_max_probabilities': ms_max_prob,
                'paddle_max_probabilities': paddle_max_prob,
                'pdfplumber_table_found': table_pdfplumber,
                'img2table_table_found': table_tabula,
                'tabula_table_found': table_img2table
            })
    
    # Create a DataFrame
    tables = pd.DataFrame(data)

    X_test= tables[["microsoft_max_probabilities", "paddle_max_probabilities","pdfplumber_table_found", "img2table_table_found", "tabula_table_found" ]]
    
    model_CatBoost_loaded_model = CatBoostClassifier().load_model("./model_CatBoost.cbm final_trained_model.cbm")

    # Load the model from the file

    # Use the loaded model to make predictions
    prediction = model_CatBoost_loaded_model.predict(X_test)

    tables["table_found"] = prediction
    
    
    # Save the images to corresponding folders
    for i, row in tables.iterrows():
        img = images[i]
        if row['table_found'] == "True":
            img.save(os.path.join(table_directory, f'{row["pdf_name"]}_page_{row["page_number"]}.png'))
        else:
            img.save(os.path.join(no_table_directory, f'{row["pdf_name"]}_page_{row["page_number"]}.png'))

    # Remove unnecessary columns
    tables = tables.drop(columns = ["microsoft_max_probabilities", "paddle_max_probabilities", "pdfplumber_table_found", "img2table_table_found", "tabula_table_found"])

    # Save the DataFrame to a CSV file
    tables.to_csv(output_file, index=False)

# Prediction

In [70]:
final_table_detector("./Test_data", "./Test_data/output.csv", "./Test_data/Table","./Test_data/No_Table")


E0802 16:23:26.917349 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
E0802 16:23:34.069947 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
E0802 16:23:40.642962 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
E0802 16:23:47.181788 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
E0802 16:23:53.730849 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
E0802 16:24:00.213634 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
E0802 16:24:06.722815 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
E0802 16:24:13.345325 229287424 analysis_config.cc:121] Please use PaddlePaddle with GPU version.
