In [None]:
import os
import warnings
from collections import Counter

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Folder that holds the downloaded manga chapters (one sub-folder per chapter).
# The empty last component keeps a trailing separator on the value, so a
# sub-path can be appended to it by plain string concatenation.
source_dir = '\\'.join(('D:', 'OPM', 'source', ''))
# Folder that receives the split manga pages; same trailing-separator convention.
target_dir = '\\'.join(('D:', 'OPM', 'target', ''))

In [None]:
def _is_near_patch_boundary(y, patch_height, margin=4):
    """Return True if ``y`` is within ``margin`` pixels of a non-zero multiple of ``patch_height``."""
    nearest_multiple = round(y / patch_height) * patch_height
    # The multiple 0 is the top edge of the page; a line there matches every
    # patch height and therefore carries no information.
    return nearest_multiple > 0 and abs(y - nearest_multiple) <= margin


def find_patch_size(chapter_dir):
    """
    Determine the height (in pixels) of the patches the pages of a chapter are made of.

    Every third file of ``chapter_dir`` is read, thresholded and searched for
    horizontal lines with a probabilistic Hough transform. The y coordinates of
    the most frequent horizontal lines are then compared with the multiples of
    the two known patch heights (280 and 312), allowing a margin of a few pixels.
    The end and the beginning of a patch are often hard to distinguish, which is
    why several pages are evaluated instead of one.

    If neither height matches, the last sampled page is shown with both
    candidate grids drawn on it and the user is asked to type the height.

    Parameters
    ----------
    chapter_dir : str
        Folder containing the page images of one chapter.

    Returns
    -------
    int
        280 or 312 when detected automatically, otherwise the positive integer
        entered by the user.

    Raises
    ------
    ValueError
        If none of the sampled files in ``chapter_dir`` can be read as an image.
    """
    known_heights = (280, 312)
    candidate_count = 6

    # List the folder once and sort it, so the same pages are sampled on every run.
    page_names = sorted(os.listdir(chapter_dir))

    horizontals = []
    img = None
    for name in page_names[::3]:
        page = cv2.imread(os.path.join(chapter_dir, name), cv2.IMREAD_COLOR)
        if page is None:
            # cv2.imread reports an unreadable / non-image file with None instead of raising.
            continue
        img = page

        gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (5, 5), 3)
        _, thresh = cv2.threshold(blur, 200, 255, cv2.THRESH_BINARY)
        dst = cv2.Canny(thresh, 50, 200, apertureSize=5)

        lines = cv2.HoughLinesP(dst, rho=1, theta=np.pi / 180, threshold=100,
                                minLineLength=100, maxLineGap=50)
        if lines is None:
            # No line segment was found on this page.
            continue

        for line in lines:
            x1, y1, x2, y2 = line[0]
            angle = np.arctan2(y2 - y1, x2 - x1) * 180. / np.pi

            # Truncation keeps every segment that deviates less than one degree from horizontal.
            if int(angle) == 0:
                horizontals.extend([int(y1), int(y2)])

    if img is None:
        raise ValueError(f'No readable image found in {chapter_dir}')

    # The six most frequent y coordinates of horizontal lines.
    candidates = [y for y, _ in Counter(horizontals).most_common(candidate_count)]

    # Check whether a frequent line lies within a few pixels of a patch boundary.
    # 280 is tested first, so it wins if both heights match.
    for height in known_heights:
        if any(_is_near_patch_boundary(y, height) for y in candidates):
            return height

    # Automatic detection failed: show a page with both candidate grids and ask the user.
    fig, ax = plt.subplots(dpi=200)
    # OpenCV loads images as BGR, matplotlib expects RGB.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    for height, color in zip(known_heights, ('red', 'blue')):
        ax.axhline(height, color=color, linewidth=1)
        ax.axhline(2 * height, color=color, linewidth=1)

    ax.set_yticks([*ax.get_yticks(), 280, 312, 2 * 280, 2 * 312],
                  labels=[*ax.get_yticklabels(), 280, 312, '2*280', '2*312'])
    ax.set_ylim([img.shape[0], 0])
    plt.show()
    plt.close(fig)

    while True:
        answer = input('Patch height (280 or 312): ')
        try:
            height = int(answer)
        except ValueError:
            height = 0
        # A non-positive height would later be used as a range() step and fail there.
        if height > 0:
            return height
        print('Please enter a positive whole number.')

In [None]:
def crop_image(src_dir, trgt_dir, s_height):
    """
    Split every ``.jpeg`` page below ``src_dir`` into patches.

    The directory tree is walked recursively. The patches of a page are written
    to ``<trgt_dir>/<name of the folder that contains the page>/`` and named
    after the page file (without its extension).

    Parameters
    ----------
    src_dir : str
        Folder to search for ``.jpeg`` pages; a trailing separator is allowed.
    trgt_dir : str
        Root folder for the patches, with or without a trailing separator.
    s_height : int
        Height of one patch in pixels.
    """
    for subdir, _dirs, files in os.walk(src_dir):
        # normpath removes a trailing separator, so the folder name is never empty.
        chapter_name = os.path.basename(os.path.normpath(subdir))
        # The empty last component leaves a trailing separator on the result, so
        # code that appends a file name by plain concatenation keeps working.
        chapter_target = os.path.join(trgt_dir, chapter_name, '')

        for file in sorted(files):
            if not file.endswith('.jpeg'):
                continue
            # splitext strips only the extension; dots inside the name are kept,
            # so 'page.1.jpeg' and 'page.2.jpeg' do not collide.
            page_name = os.path.splitext(file)[0]
            _crop_image_helper(os.path.join(subdir, file), chapter_target, page_name, s_height)


def _crop_image_helper(img_path, trgt_dir, page_num, s_height, s_width=200):
    """
    Cut one page into patches and write them to ``trgt_dir``.

    The page is tiled row by row, left to right. Rows that are not a full
    ``s_height`` tall are dropped; a narrower last column is still written,
    because only the height of a patch is checked. Patches are saved as
    ``<page_num>_<running index>.jpeg``.

    Parameters
    ----------
    img_path : str
        Path of the page image.
    trgt_dir : str
        Output folder (created if missing), with or without trailing separator.
    page_num : str
        Prefix for the patch file names.
    s_height : int
        Patch height in pixels.
    s_width : int, optional
        Patch width in pixels, 200 by default.
    """
    os.makedirs(trgt_dir, exist_ok=True)

    img_data = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img_data is None:
        # cv2.imread returns None for a missing or corrupt file instead of raising.
        warnings.warn(f'Skipping unreadable image: {img_path}')
        return

    img_height, img_width = img_data.shape[:2]

    count = 0

    # Stopping at img_height - s_height skips the incomplete bottom row up front.
    for y in range(0, img_height - s_height + 1, s_height):
        for x in range(0, img_width, s_width):
            segment = img_data[y:y + s_height, x:x + s_width]
            patch_path = os.path.join(trgt_dir, f'{page_num}_{count}.jpeg')

            # cv2.imwrite signals failure through its return value.
            if not cv2.imwrite(patch_path, segment):
                warnings.warn(f'Could not write patch: {patch_path}')
            count += 1

In [None]:
# Process every chapter folder: detect its patch height, then split its pages.
for subdir in tqdm(sorted(os.listdir(source_dir))):
    chapter_dir = os.path.join(source_dir, subdir)
    if not os.path.isdir(chapter_dir):
        # A stray file next to the chapter folders would make the directory
        # listing inside find_patch_size fail.
        continue

    size = find_patch_size(chapter_dir)
    # Report the chapters that use the 312 px patch height.
    if size == 312:
        print(subdir)

    crop_image(chapter_dir, target_dir, size)