In [None]:
import os
import cv2
import imutils
import numpy as np
from imutils.object_detection import non_max_suppression
import pytesseract

In [None]:
# Collect the paths of all entries under data/.
# os.listdir returns names in arbitrary order, so sort them: this makes
# dataset[0] refer to the same file on every run and on every machine.
dataset = [os.path.join("data", name) for name in sorted(os.listdir("data"))]
print("Total number of images in the dataset : {}".format(len(dataset)))

In [None]:
# Load the first image of the dataset.
# cv2.imread does not raise when the file is missing or cannot be decoded,
# it returns None instead, so fail right here with a readable message.
base_image = cv2.imread(dataset[0])
if base_image is None:
    raise OSError("Could not read image: {}".format(dataset[0]))

# WINDOW_NORMAL creates a window that the user is allowed to resize.
cv2.namedWindow("Base Image", cv2.WINDOW_NORMAL)
cv2.imshow("Base Image", base_image)
cv2.waitKey(0)  # block until a key is pressed in the window
cv2.destroyAllWindows()
base_image.shape  # resolution of the image: (height, width, channels)

In [None]:
# The EAST detector model is used to detect text in the image. It requires
# the input width and height to both be multiples of 32, so the image is
# resized to 1280 x 1984 (width x height).
# Note: cv2.resize takes the target size as (width, height), while
# .shape reports (height, width, channels).
east_input_size = (1280, 1984)
resized_image = cv2.resize(base_image, east_input_size)
resized_image.shape

In [None]:
# Create the window explicitly with WINDOW_NORMAL so that it can be resized;
# the resized image (1280 x 1984) is taller than most screens.
cv2.namedWindow("Resized Image", cv2.WINDOW_NORMAL)
cv2.imshow("Resized Image", resized_image)
cv2.waitKey(0)  # block until a key is pressed in the window
cv2.destroyAllWindows()

In [None]:
# Scale factors that map coordinates found in the resized image back onto
# the original image (original size divided by resized size, per axis).
orig_height, orig_width = base_image.shape[0], base_image.shape[1]
new_height, new_width = resized_image.shape[0], resized_image.shape[1]
# The spelling "heigth_ratio" is kept on purpose: later cells use this name.
heigth_ratio = float(orig_height) / new_height
width_ratio = float(orig_width) / new_width

In [None]:
# The two EAST output layers we are interested in:
#   1. the sigmoid layer, which yields the text probabilities, and
#   2. the concat layer, from which the text bounding boxes are derived.
layerNames = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3",
]

# Load the frozen EAST model from the working directory.
east_model_path = "frozen_east_text_detection.pb"
east_text_detector = cv2.dnn.readNet(east_model_path)
print("EAST text detector loaded :  ", east_text_detector)

In [None]:
# Build a blob from the resized image and forward-pass it through EAST.
# The blob size equals the resized image size, given as (width, height).
blob_size = (new_width, new_height)
# Per-channel mean values subtracted from the image by blobFromImage.
channel_means = (123.68, 116.78, 103.94)
blob = cv2.dnn.blobFromImage(
    resized_image,
    1.0,
    blob_size,
    channel_means,
    swapRB=True,
    crop=False,
)

print("Blob shape ----> ", blob.shape, blob.dtype)
print("resized_image shape ----> ", resized_image.shape, resized_image.dtype)

# Run the network and read both output layers, in the order of layerNames.
east_text_detector.setInput(blob)
scores, geometry = east_text_detector.forward(layerNames)

In [None]:
# Inspect both output volumes: full shape, number of dimensions, and the
# (rows, cols) part of the shape.
#   scores   -> probability of each region containing text
#   geometry -> data for the box coordinates of the text
for output_volume in (scores, geometry):
    print(output_volume.shape, output_volume.ndim, output_volume.shape[2:4])

In [None]:
def decode_text_predictions(scores, geometry, min_confidence=0.5):
    """Decode EAST output volumes into bounding boxes and their scores.

    Args:
        scores: 4-D array indexed as ``scores[0, 0, y, x]``; each value is
            the probability that the cell at row ``y``, column ``x``
            contains text.
        geometry: 4-D array indexed as ``geometry[0, c, y, x]`` with five
            channels ``c``. Channels 0 and 2 add up to the box height,
            channels 1 and 3 add up to the box width, and channel 4 is the
            rotation angle passed to ``np.cos`` / ``np.sin``.
        min_confidence: cells whose score is below this value are ignored.
            Defaults to 0.5.

    Returns:
        A tuple ``(rects, confidences)``. ``rects`` is a list of
        ``(startX, startY, endX, endY)`` integer tuples expressed in the
        coordinates of the network input image, and ``confidences`` is the
        list of the matching scores, in the same order.
    """
    rows, cols = scores.shape[2:4]
    rects = []
    confidences = []

    for y in range(rows):
        # extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates that
        # surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        for x in range(cols):
            # if our score does not have sufficient probability, ignore it
            if scoresData[x] < min_confidence:
                continue

            # the resulting feature maps are 4x smaller than the input
            # image, so scale the cell index back to image coordinates
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # rotation angle of the prediction and its sine / cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # height and width of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # end corner of the box first, then the start corner by
            # stepping back one full width / height
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    return (rects, confidences)

In [None]:
rects, confidences = decode_text_predictions(scores, geometry)
# apply non-maxima suppression to suppress weak, overlapping bounding boxes
boxes = non_max_suppression(np.array(rects), probs=confidences)

# Draw on a copy so that base_image itself stays unmodified: re-running the
# resize / blob cells must not feed an already annotated image to the detector.
annotated_image = base_image.copy()

# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
    # the boxes are in resized-image coordinates; scale them back to the
    # original image using the respective ratios
    startX = int(startX * width_ratio)
    startY = int(startY * heigth_ratio)
    endX = int(endX * width_ratio)
    endY = int(endY * heigth_ratio)
    # draw the bounding box on the image (green, 2 px thick)
    cv2.rectangle(annotated_image, (startX, startY), (endX, endY), (0, 255, 0), 2)

cv2.imshow("Text Detection", annotated_image)
cv2.waitKey(0)  # block until a key is pressed in the window
cv2.destroyAllWindows()

In [None]:
# Now for live video

In [None]:
# Real-time text detection on frames grabbed from the default camera (index 0).
# Press `q` in the "Realtime" window to stop.
(newW, newH) = (480, 640)  # EAST input size as (width, height)
(W, H) = (None, None)      # source frame size, filled in from the first frame
(rW, rH) = (None, None)    # ratios of source frame size to EAST input size
cap = cv2.VideoCapture(0)
cv2.namedWindow("Realtime", cv2.WINDOW_NORMAL)  # window the user can resize

try:
    while True:
        ret, frame = cap.read()
        # A failed grab (camera unavailable / end of stream) gives no usable
        # frame, so stop before touching it.
        if not ret:
            break

        # On the first frame, compute the ratio of the source frame
        # dimensions to the EAST input dimensions; the detected boxes are
        # scaled back to frame coordinates with these ratios.
        first_frame = W is None or H is None
        if first_frame:
            (H, W) = frame.shape[:2]
            rW = W / float(newW)
            rH = H / float(newH)
            print("Source Frame shape ,", frame.shape, frame.dtype)

        # blobFromImage resizes the frame to (newW, newH), ignoring the
        # aspect ratio, so no separate cv2.resize call is needed.
        frame_blob = cv2.dnn.blobFromImage(frame, 1.0, (newW, newH), (123.68, 116.78, 103.94), swapRB=True, crop=False)
        if first_frame:
            print("Blob shape ---->", frame_blob.shape, frame_blob.dtype)

        east_text_detector.setInput(frame_blob)
        try:
            (scores, geometry) = east_text_detector.forward(layerNames)
        except Exception as ex:
            # Best effort: report the failure and show this frame without
            # boxes, so the window keeps updating and `q` still works.
            print(ex)
            boxes = []
        else:
            # decode the predictions, then apply non-maxima suppression to
            # suppress weak, overlapping bounding boxes
            (rects, confidences) = decode_text_predictions(scores, geometry)
            boxes = non_max_suppression(np.array(rects), probs=confidences)

        # loop over the bounding boxes
        for (startX, startY, endX, endY) in boxes:
            # scale the bounding box coordinates based on the respective
            # ratios
            startX = int(startX * rW)
            startY = int(startY * rH)
            endX = int(endX * rW)
            endY = int(endY * rH)
            # draw the bounding box on the frame
            cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)

        cv2.imshow("Realtime", frame)
        key = cv2.waitKey(1) & 0xFF
        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break
finally:
    # Always free the camera and close the windows, even on an error or a
    # keyboard interrupt of the cell.
    cap.release()
    cv2.destroyAllWindows()