<a href="https://colab.research.google.com/github/Bhavya05shah/OCR-Digitalisation/blob/main/easyOCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import easyocr
import numpy as np
import pandas as pd

def order_boxes_reading_order(boxes, row_tolerance=None):
    """Return bounding boxes in reading order (top-to-bottom, left-to-right).

    Boxes are ``(x, y, w, h)`` tuples as returned by ``cv2.boundingRect``.
    Sorting by ``y`` alone leaves cells of the same table row in arbitrary
    horizontal order, so boxes are first grouped into rows and each row is
    then sorted by ``x``.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` tuples.
        row_tolerance: Maximum vertical offset (in pixels) from the first box
            of a row for another box to be considered part of that row.
            Defaults to half the height of the row's first box.

    Returns:
        A flat list of the same boxes in reading order.
    """
    rows = []
    for box in sorted(boxes, key=lambda b: (b[1], b[0])):
        if rows:
            # The first box of a row has the smallest y in that row because
            # the input is processed in ascending y order.
            anchor = rows[-1][0]
            tolerance = anchor[3] / 2 if row_tolerance is None else row_tolerance
            if box[1] - anchor[1] <= tolerance:
                rows[-1].append(box)
                continue
        rows.append([box])
    return [box for row in rows for box in sorted(row, key=lambda b: b[0])]


def cells_to_table(cell_texts, n_cols):
    """Arrange a flat sequence of cell texts into an ``(n_rows, n_cols)`` array.

    If the number of cells is not a multiple of ``n_cols`` the last row is
    padded with empty strings (and a warning is issued) instead of failing
    in ``reshape`` and discarding everything that was recognised.

    Args:
        cell_texts: Iterable of strings in reading order.
        n_cols: Number of table columns; must be positive.

    Returns:
        A 2-D NumPy string array with ``n_cols`` columns.

    Raises:
        ValueError: If ``n_cols`` is not positive.
    """
    if n_cols <= 0:
        raise ValueError("n_cols must be a positive integer")

    texts = list(cell_texts)
    remainder = len(texts) % n_cols
    if remainder:
        import warnings

        warnings.warn(
            f"{len(texts)} cells detected, which is not a multiple of "
            f"n_cols={n_cols}; padding the last row with empty cells",
            stacklevel=2,
        )
        texts.extend([''] * (n_cols - remainder))
    return np.array(texts, dtype=str).reshape(len(texts) // n_cols, n_cols)


# The guard does not introduce a new scope, so every name below is still a
# module/notebook global when the cell or script is executed directly.
if __name__ == "__main__":
    # Initialize the EasyOCR reader
    reader = easyocr.Reader(['hi'])

    # Step 1: Load the image
    image_path = 'OCRexp1.png'
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Step 2: Convert to grayscale and apply thresholding
    # THRESH_BINARY_INV makes dark ink/lines white (255) on a black background.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV)

    # Step 3: Detect table lines using morphological operations
    # NOTE(review): a 5-px kernel also keeps any text stroke at least 5 px
    # long; consider scaling the kernel with the image size -- confirm on
    # sample images.
    kernel = np.ones((1, 5), np.uint8)  # Horizontal kernel for detecting horizontal lines
    horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    kernel = np.ones((5, 1), np.uint8)  # Vertical kernel for detecting vertical lines
    vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Combine horizontal and vertical lines to get the table structure
    table_structure = cv2.add(horizontal_lines, vertical_lines)

    # Step 4: Detect and extract table cells
    # Use contours to detect individual cells
    contours, _ = cv2.findContours(table_structure, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Order the cell boxes row by row, left to right within each row
    boxes = order_boxes_reading_order([cv2.boundingRect(cnt) for cnt in contours])

    table_data = []
    for x, y, w, h in boxes:
        cell = image[y:y+h, x:x+w]

        # Extract text using easyocr; joining an empty result yields ''
        result = reader.readtext(cell, detail=0)
        table_data.append(' '.join(result).strip())

    # Here we assume the table has a consistent number of columns.
    # If needed, manually define n_cols based on the image structure.
    n_cols = 5  # Example: Assuming 5 columns in the table

    # Into a 2D array (padded if the cell count is not a multiple of n_cols)
    table_array = cells_to_table(table_data, n_cols)

    df = pd.DataFrame(table_array)

    df.to_csv('digital_health_record.csv', index=False)

    print(df)




[[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0 255]
 [  0   0   0 ...   0   0 255]
 [255 255 255 ... 255 255 255]]


TypeError: cv2_imshow() takes 1 positional argument but 2 were given

# New Section