In [6]:
# import torch
# torch.cuda.is_available()

In [1]:
import json
import os
import re

import cv2
import numpy as np
import pytesseract
from PIL import Image

In [8]:
# Relative path of the input image; it is opened once with PIL and once with OpenCV in the cells below.
img_file = "media/semester-4.png"

In [9]:

# Open the image with PIL; this object is what the whole-image OCR call below receives.
img = Image.open(img_file)

In [10]:
# OCR the whole image in a single pass (no extra Tesseract config is passed here).
# NOTE(review): ocr_result is only referenced by the commented-out print below in the
# cells visible here; the table extraction further down OCRs each cropped box itself.
ocr_result = pytesseract.image_to_string(img)
# print(ocr_result)

In [11]:
# Load the same file again as an OpenCV array for the contour-based table detection.
image = cv2.imread(img_file)
# cv2.imread reports a missing or undecodable file by returning None instead of raising,
# which would otherwise surface as an unrelated error in the cvtColor call of the next cell.
if image is None:
    raise OSError(f"cv2.imread could not read an image from {img_file!r}")

In [12]:
# Pre-processing for thresholding: reduce the BGR image to one grayscale channel,
# then smooth it with a 5x5 Gaussian (sigma argument 0) before binarisation.
blur_kernel = (5, 5)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, blur_kernel, 0)


In [13]:
# Convert into a binary image. THRESH_OTSU makes OpenCV choose the threshold itself
# (the 0 passed as the threshold is a placeholder), and THRESH_BINARY_INV inverts the
# result so that pixels darker than the threshold come out white (255).
_, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# cv2.imwrite does not create missing directories; without this the debug image is
# silently not written on a fresh checkout.
os.makedirs('temp', exist_ok=True)
cv2.imwrite('temp/binary_image.png', thresh)

True

In [14]:
# Rectangular structuring element, 10 px wide by 2 px tall.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 2))
# Dilation thickens the white regions of the binary image (the text, since the
# threshold step inverted it). Presumably the wide, short kernel is meant to merge
# neighbouring characters of one table cell into a single blob — confirm on the debug image.
dilated = cv2.dilate(thresh, kernel, iterations=1)
# cv2.imwrite does not create missing directories; make sure the debug folder exists
# so the image is actually saved.
os.makedirs('temp', exist_ok=True)
cv2.imwrite('temp/white_thickend_dilated.png', dilated)

True

In [15]:
# Find the outlines of the white blobs in the dilated image. RETR_EXTERNAL keeps only
# the outermost contours and CHAIN_APPROX_SIMPLE stores a compressed point list for each.
# The second return value (hierarchy) is discarded.
# NOTE(review): the two-value unpacking matches the OpenCV 4.x return signature — confirm
# the installed version.
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)


In [16]:
# Turn every contour into its (x, y, w, h) bounding rectangle, then order the
# rectangles by top edge first and left edge second (reading order).
boxes = list(map(cv2.boundingRect, contours))
boxes.sort(key=lambda rect: (rect[1], rect[0]))

In [17]:
# Group the y-sorted boxes into table rows.
# A box joins the row being built when its top edge is within `row_y_threshold`
# pixels of the top edge of the box added just before it; otherwise the current
# row is closed (sorted left to right) and a new one is started.
row_y_threshold = 15
rows = []
current_row = []

for box in boxes:
    top = box[1]  # boxes are (x, y, w, h)
    starts_new_row = bool(current_row) and abs(top - current_row[-1][1]) > row_y_threshold
    if starts_new_row:
        rows.append(sorted(current_row, key=lambda b: b[0]))
        current_row = []
    current_row.append(box)

# Flush the last row, which the loop never closes on its own.
if current_row:
    rows.append(sorted(current_row, key=lambda b: b[0]))

# OCR every box separately: crop its region of interest out of the original image
# and run Tesseract in page-segmentation mode 6, stripping surrounding whitespace.
# The result is a list of rows, each a list of cell strings in left-to-right order.
table = [
    [
        pytesseract.image_to_string(
            image[top:top + height, left:left + width],
            config='--psm 6',
        ).strip()
        for (left, top, width, height) in row_boxes
    ]
    for row_boxes in rows
]
print(table)

[['Subject Code', 'Subject', 'Credit'], ['MATH 207', 'Differential Equations and Complex Variables', '4'], ['MCSC 202', 'Numerical Methods', '3'], ['COMP 204', 'Communication and Networking', '3'], ['COMP 231', 'Microprocessor and Assembly Language', '3'], ['COMP 232', 'Database Management Systems', '3'], ['COMP 207', 'Computer Project II', '2']]


In [18]:
# Create JSON from table
# The first OCR'd row supplies the column names that become the JSON keys.
# Fail with a clear message instead of an IndexError when no rows were detected.
if not table:
    raise ValueError("OCR produced no table rows; cannot build JSON")
header = table[0]
json_data = []

# Credits fixed per subject name; these override whatever the OCR read for that row.
hardcoded_credits = {
    "Laboratory Work": 1,
}

def extract_credit(val, default=2):
    """Return the first integer found in an OCR'd credit cell.

    Args:
        val: Cell content. Normally a string; any other value is converted
            with ``str()`` first so an already-numeric credit does not raise.
        default: Value returned when the cell contains no digits. Defaults
            to 2, the fallback this notebook has always used.

    Returns:
        The first run of digits in ``val`` as an ``int``, or ``default``.
    """
    match = re.search(r'\d+', str(val))  # first run of digits anywhere in the cell
    return int(match.group()) if match else default

# Build one dict per data row, keyed by the header names.
# The subject column is looked up once; its value decides whether a hard-coded credit applies.
subject_idx = header.index("Subject")

for ocr_row in table[1:]:
    if len(ocr_row) == 2:  # for elective subjects with no subject code
        row = ["", ocr_row[0], ocr_row[1]]  # Fill Subject Code as empty
    else:
        row = list(ocr_row)  # copy, so the rows stored in `table` are never modified
    # Pad short rows with empty strings; a negative count yields [] so long rows are untouched.
    row += [""] * (len(header) - len(row))

    subj_val = row[subject_idx].strip()
    item = {}
    # zip stops at the header width, so stray extra OCR cells are ignored
    # instead of raising IndexError on header[i].
    for key, val in zip(header, row):
        if "Credit" in key:
            if subj_val in hardcoded_credits:
                val = hardcoded_credits[subj_val]
            else:
                val = extract_credit(val)
        item[key] = val
    json_data.append(item)

# Output the result
print(json.dumps(json_data, indent=2))

[
  {
    "Subject Code": "MATH 207",
    "Subject": "Differential Equations and Complex Variables",
    "Credit": 4
  },
  {
    "Subject Code": "MCSC 202",
    "Subject": "Numerical Methods",
    "Credit": 3
  },
  {
    "Subject Code": "COMP 204",
    "Subject": "Communication and Networking",
    "Credit": 3
  },
  {
    "Subject Code": "COMP 231",
    "Subject": "Microprocessor and Assembly Language",
    "Credit": 3
  },
  {
    "Subject Code": "COMP 232",
    "Subject": "Database Management Systems",
    "Credit": 3
  },
  {
    "Subject Code": "COMP 207",
    "Subject": "Computer Project II",
    "Credit": 2
  }
]
