## Importing necessary libraries

In [1]:
from img2table.document import Image
from img2table.ocr import PaddleOCR
from PIL import Image as PIL_Image, ImageDraw
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt

In [2]:
#######################################################################
# Code just for displaying images in full size, remove for final code #
#######################################################################

def display(im_path):
    dpi = 80
    im_data = plt.imread(im_path)

    height, width  = im_data.shape[:2]
    
    # What size does the figure need to be in inches to fit the image?
    figsize = width / float(dpi), height / float(dpi)

    # Create a figure of the right size with one axes that takes up the full figure
    fig = plt.figure(figsize=figsize)
    ax = fig.add_axes([0, 0, 1, 1])

    # Hide spines, ticks, etc.
    ax.axis('off')

    # Display the image.
    ax.imshow(im_data, cmap='gray')

    plt.show()

In [3]:
# Dictionary for storing marks of each papers
my_dict = {'1a': [], '1b': [], '1c': [], '2a': [], '2b': [], '2c': [], '3a': [], '3b': [], '3c': [], '4a': [], '4b': [], '4c': [], '5a': [], '5b': [], '5c': [], '6a': [], '6b': [], '6c': [], '7a': [], '7b': [], '7c': [], '8a': [], '8b': [], '8c': [], '9a': [], '9b': [], '9c': [], '10a': [], '10b': [], '10c': [], '11a': [], '11b': [], '11c': [], '12a': [], '12b': [], '12c': []}

## Image opening & performing OCR

In [4]:
from img2table.ocr.tesseract import TesseractOCR
from img2table.document import Image
src = "D:/AJAYMON/AJAY/Programming/S6_Mini_Project/Codes/image to table/ad_cut.jpg"

path = "C:/Program Files/Tesseract-OCR/tessdata/eng.traineddata"
# Instantiation of OCR
tess_ocr = TesseractOCR(n_threads=1, lang="eng", tessdata_dir=path)

# tessdata_dir="C:/Program Files/Tesseract-OCR/tesseract.exe"

# Instantiation of document, either an image or a PDF
doc = Image(src, dpi=200)

# Table extraction
extracted_tables = doc.extract_tables(ocr=tess_ocr,
                                      implicit_rows=True,
                                      min_confidence=50)

extracted_tables

CalledProcessError: Command 'tesseract C:\Users\ASUS\AppData\Local\Temp\tmp7xax2brh.jpg stdout --psm 11 -l eng hocr' returned non-zero exit status 1.

In [None]:
from IPython.display import display_html

table = extracted_tables.pop()
display_html(table.html_repr(title="Regular table"), raw=True)

In [None]:
paddle_ocr = PaddleOCR(lang="en")
src = "D:/AJAYMON/AJAY/Programming/S6_Mini_Project/Codes/image to table/ab_cut.jpg"

In [None]:
# doc = Image(src, dpi=200)
# extracted_tables = doc.extract_tables(ocr=paddle_ocr, implicit_rows=True, min_confidence=50)

In [None]:
# Load the image using PIL
img = PIL_Image.open(src)

# Create a draw object
draw = ImageDraw.Draw(img)

img.save("temp/ori_img.jpg")
display("temp/ori_img.jpg")

## Getting red-lines on border lines of the image

In [None]:
for table in extracted_tables:
    for row in table.content.values():
        for cell in row:
            draw.rectangle((cell.bbox.x1, cell.bbox.y1, cell.bbox.x2, cell.bbox.y2), outline="red", width=3)
            
img.save("temp/img_with_redlines.jpg")
display("temp/img_with_redlines.jpg")

## Extracting info & Preprocessing the dataframe

In [None]:
df = table.df
df

In [None]:
df = df.iloc[1:, 1:] # deleting first row and column
df = df.drop(index=df.index[-1]) # remove the last row
df

## Flattening & adding marks to my_dict

In [None]:
arr = df.to_numpy()
flat = arr.flatten(order='F') # flattening column-wise
cell_vals = [str(i) for i in flat]
cell_vals

In [None]:
len(cell_vals) # 36 should be the result

In [None]:
# Adding values to dictionary
i = 0
for key in my_dict:
    my_dict[key].append(cell_vals[i])
    i+=1

my_dict

## Dictionary to dataframe & it's preprocessing

In [None]:
# dictionary pre-processes
dict_df = pd.DataFrame(my_dict)
dict_df # original df of dictionary

### The df output may have 'None' values but they're actually NaN, so they won't come in exported CSV file

In [None]:
col_name = dict_df.columns[(dict_df == 'None').all()] # finding the column with "None" word
dict_df = dict_df.drop(col_name, axis=1) # delete the identified columns
dict_df = dict_df.replace(to_replace="None", value=np.nan) # Replacing all "None" to NaN, which will be empty when converted to CSV
dict_df

In [None]:
# saving dict as csv
dict_df.to_csv("dict_csv.csv", index=False)