In [121]:
import numpy as np
import cv2

In [122]:
def create_cross_kernel(size):
    """Build a ``size`` x ``size`` plus-shaped structuring element.

    The row and the column at index ``size // 2`` are set to 1; every
    other cell is 0. For an even ``size`` the cross therefore sits one
    cell past the geometric centre.

    Args:
        size: Side length of the square kernel.

    Returns:
        A ``uint8`` NumPy array of shape ``(size, size)``.
    """
    mid = size // 2
    cross = np.zeros((size, size), dtype=np.uint8)
    # Vertical bar first, then the horizontal bar (they overlap at the middle).
    cross[:, mid] = 1
    cross[mid, :] = 1
    return cross

def filterSmallBoxes(bounding_boxes, min_w=10, min_h=10):
    """Drop boxes that are narrower than ``min_w`` or shorter than ``min_h``.

    Args:
        bounding_boxes: Iterable of boxes indexed as ``(x, y, w, h)``;
            only indices 2 (width) and 3 (height) are inspected.
        min_w: Minimum width (inclusive) a box needs to be kept.
        min_h: Minimum height (inclusive) a box needs to be kept.

    Returns:
        A new list with the surviving boxes, in their original order.
    """
    kept = []
    for candidate in bounding_boxes:
        if candidate[2] >= min_w and candidate[3] >= min_h:
            kept.append(candidate)
    return kept

def gaussian_blur(image, kernel_size):
    """Blur ``image`` with a hand-built square Gaussian kernel.

    The kernel is ``kernel_size`` x ``kernel_size``, centred on
    ``(kernel_size - 1) / 2`` with a standard deviation of
    ``kernel_size / 2``. It is normalised to sum to 1 before being applied.

    Args:
        image: Image array handed to ``cv2.filter2D``.
        kernel_size: Side length of the kernel.

    Returns:
        The filtered image returned by ``cv2.filter2D`` (called with a
        ddepth of -1).
    """
    sigma = kernel_size / 2
    center = (kernel_size - 1) / 2

    # Float coordinate grids, the same ones np.fromfunction would pass in.
    rows, cols = np.indices((kernel_size, kernel_size), dtype=float)
    dx = rows - center
    dy = cols - center

    weights = (1/(2*np.pi*sigma**2)) * np.exp(-(dx**2 + dy**2) / (2*sigma**2))
    weights /= np.sum(weights)
    return cv2.filter2D(image, -1, weights)

# Word Detection

In [123]:
def word_filter(frame, width_percentage=0.80, height_percentage=0.135):
    """Isolate near-white subtitle pixels and smear them horizontally.

    A bottom-aligned, horizontally centred band of the frame is searched
    for pixels whose HSV value lies between ``[0, 0, 200]`` and
    ``[255, 15, 255]``. Those pixels are copied onto an otherwise black
    frame, which is then dilated with a 1x15 kernel.

    Args:
        frame: 3-channel BGR image (unpacked as ``height, width, _``).
        width_percentage: Fraction of the frame width covered by the band.
        height_percentage: Fraction of the frame height covered by the band.

    Returns:
        An array with the same shape and dtype as ``frame``: black
        everywhere except the dilated near-white pixels inside the band.
    """
    frame_h, frame_w, _ = frame.shape

    # Locate the band where the subtitle is expected: centred, flush with the bottom.
    band_w = int(frame_w * width_percentage)
    band_h = int(frame_h * height_percentage)
    left = int((frame_w - band_w) / 2)
    top = int(frame_h - band_h)
    rows = slice(top, top + band_h)
    cols = slice(left, left + band_w)

    # Low saturation + high value in HSV picks out (near-)white pixels.
    hsv_band = cv2.cvtColor(frame[rows, cols], cv2.COLOR_BGR2HSV)
    near_white = cv2.inRange(
        hsv_band,
        np.array([0, 0, 200], dtype=np.uint8),
        np.array([255, 15, 255], dtype=np.uint8),
    )
    keep = near_white != 0

    # Copy only the selected pixels onto a black canvas of the full frame size.
    canvas = np.zeros_like(frame)
    canvas[rows, cols][keep] = frame[rows, cols][keep]

    # A wide, one-pixel-tall kernel stretches the kept pixels sideways only.
    return cv2.dilate(canvas, np.ones((1, 15), np.uint8), iterations=1)



def find_bounding_boxes(frame, kernel_size=3):
    """Return bounding boxes of the connected bright regions in ``frame``.

    The frame is converted to grayscale, Gaussian-blurred, thresholded at 1
    and split into 8-connected components. Components must be larger than
    5 px in both dimensions and must also pass ``filterSmallBoxes`` with its
    default limits.

    Args:
        frame: BGR image to analyse.
        kernel_size: Side length of the square Gaussian kernel (sigma is 5).

    Returns:
        A list of ``(x, y, w, h)`` tuples taken from the component stats.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(gray, (kernel_size, kernel_size), 5)
    _, binary = cv2.threshold(smoothed, 1, 255, cv2.THRESH_BINARY)

    # Only the per-component stats (third return value) are needed here.
    stats = cv2.connectedComponentsWithStats(binary, connectivity=8)[2]

    # stats[0] is skipped: label 0 is the background in OpenCV's convention.
    candidates = [
        (row[0], row[1], row[2], row[3])
        for row in stats[1:]
        if row[2] > 5 and row[3] > 5
    ]
    return filterSmallBoxes(candidates)



def draw_word_box(frame, bounding_boxes):
    """Outline every box on ``frame`` with a 1 px rectangle in (0, 255, 0).

    Args:
        frame: Image to draw on; it is modified in place.
        bounding_boxes: Iterable of ``(x, y, w, h)`` boxes.

    Returns:
        The same ``frame`` object, after drawing.
    """
    outline_color = (0, 255, 0)
    for x, y, w, h in bounding_boxes:
        top_left = (x, y)
        bottom_right = (x + w, y + h)
        cv2.rectangle(frame, top_left, bottom_right, outline_color, 1)
    return frame

# Line Detection

In [124]:
def white_pixel_filter(frame, width_percentage=0.80, height_percentage=0.135):
    """Isolate near-white subtitle pixels and close small gaps between them.

    A bottom-aligned, horizontally centred band of the frame is searched
    for pixels whose HSV value lies between ``[0, 0, 200]`` and
    ``[255, 15, 255]``. Those pixels are copied onto an otherwise black
    frame, which is then dilated and eroded once each with the kernel from
    ``create_cross_kernel(4)``.

    Args:
        frame: 3-channel BGR image (unpacked as ``height, width, _``).
        width_percentage: Fraction of the frame width covered by the band.
        height_percentage: Fraction of the frame height covered by the band.

    Returns:
        An array with the same shape and dtype as ``frame``: black
        everywhere except the processed near-white pixels inside the band.
    """
    frame_h, frame_w, _ = frame.shape

    # Locate the band where the subtitle is expected: centred, flush with the bottom.
    band_w = int(frame_w * width_percentage)
    band_h = int(frame_h * height_percentage)
    left = int((frame_w - band_w) / 2)
    top = int(frame_h - band_h)
    rows = slice(top, top + band_h)
    cols = slice(left, left + band_w)

    # Low saturation + high value in HSV picks out (near-)white pixels.
    hsv_band = cv2.cvtColor(frame[rows, cols], cv2.COLOR_BGR2HSV)
    near_white = cv2.inRange(
        hsv_band,
        np.array([0, 0, 200], dtype=np.uint8),
        np.array([255, 15, 255], dtype=np.uint8),
    )
    keep = near_white != 0

    # Copy only the selected pixels onto a black canvas of the full frame size.
    canvas = np.zeros_like(frame)
    canvas[rows, cols][keep] = frame[rows, cols][keep]

    # Dilate, then erode with the same cross kernel (a morphological closing).
    cross = create_cross_kernel(4)
    grown = cv2.dilate(canvas, cross, iterations=1)
    return cv2.erode(grown, cross, iterations=1)



def group_words_into_line(bounding_boxes, max_vertical_gap=10):
    """Group boxes into text lines based on their top (``y``) coordinate.

    Boxes are ordered by ``y``; a box joins the current line when its ``y``
    is within ``max_vertical_gap`` of the previous box's ``y``, otherwise it
    starts a new line. The comparison is against the immediately preceding
    box, so a line can drift by more than ``max_vertical_gap`` overall.

    The input sequence is not modified: a sorted copy is used, so callers
    keep their original ordering.

    Args:
        bounding_boxes: Sequence of ``(x, y, w, h)`` boxes.
        max_vertical_gap: Largest allowed difference in ``y`` between two
            consecutive boxes of the same line.

    Returns:
        A list of lines, each a list of boxes, ordered top to bottom.
        An empty input yields an empty list.
    """
    if not bounding_boxes:
        return []

    # sorted() is stable and leaves the caller's list untouched.
    ordered = sorted(bounding_boxes, key=lambda box: box[1])

    grouped_lines = [[ordered[0]]]
    for box in ordered[1:]:
        prev_y = grouped_lines[-1][-1][1]
        if abs(box[1] - prev_y) <= max_vertical_gap:
            grouped_lines[-1].append(box)
        else:
            grouped_lines.append([box])
    return grouped_lines



def draw_line_box(frame, grouped_lines, padding=5):
    """Draw one enclosing rectangle per line of boxes, in colour (0, 0, 255).

    The rectangle spans all boxes of the line. It is widened by ``padding``
    on the left and right, and clamped horizontally to ``[0, frame width]``.
    No padding or clamping is applied vertically.

    Args:
        frame: Image to draw on; it is modified in place.
        grouped_lines: Iterable of lines, each a non-empty collection of
            ``(x, y, w, h)`` boxes.
        padding: Horizontal margin added on each side of a line.

    Returns:
        The same ``frame`` object, after drawing.
    """
    frame_width = frame.shape[1]
    outline_color = (0, 0, 255)

    for line in grouped_lines:
        padded_lefts = [box[0] - padding for box in line]
        padded_rights = [box[0] + box[2] + padding for box in line]
        tops = [box[1] for box in line]
        bottoms = [box[1] + box[3] for box in line]

        top_left = (max(0, min(padded_lefts)), min(tops))
        bottom_right = (min(frame_width, max(padded_rights)), max(bottoms))
        cv2.rectangle(frame, top_left, bottom_right, outline_color, 1)
    return frame

In [125]:
def display_filtered_video_using_mask_filtration(video_path):
    """Play a video while showing subtitle line and word segmentation.

    For each frame, three windows are updated:

    * ``'Line Segmentation'`` - output of ``white_pixel_filter``.
    * ``'Word Segmentation'`` - output of ``word_filter``.
    * ``'All is good'`` - the original frame with line boxes and word boxes
      drawn on top.

    Playback stops when the video ends, a frame cannot be read, or the user
    presses ``q``. If the capture cannot be opened the loop body never runs.

    Args:
        video_path: Path (or other source) passed to ``cv2.VideoCapture``.
    """
    cap = cv2.VideoCapture(video_path)

    # try/finally so the capture and the windows are cleaned up even when
    # processing a frame raises.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            filtered_frame = white_pixel_filter(frame)
            cv2.imshow('Line Segmentation', filtered_frame)
            filtered_word = word_filter(frame)
            cv2.imshow('Word Segmentation', filtered_word)

            line_boxes = find_bounding_boxes(filtered_frame)
            word_boxes = find_bounding_boxes(filtered_word)

            # Draw on a copy so the raw frame itself is never altered.
            lines = group_words_into_line(line_boxes, max_vertical_gap=10)
            annotated = draw_line_box(frame.copy(), lines)
            annotated = draw_word_box(annotated, word_boxes)
            cv2.imshow('All is good', annotated)

            # waitKey also services the GUI event loop; 'q' quits playback.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# main

In [126]:
# Entry point: run the segmentation demo on the bundled project video.
if __name__ == "__main__":
    video_path = "Project Video.mp4"
    display_filtered_video_using_mask_filtration(video_path)