In [1]:
import sys
sys.path.append('../../30_data_tools/')

In [2]:
from pathlib import Path
from tqdm import tqdm
from PIL import Image
import numpy as np
import pickle
import random
import cv2
import plotly.express as px

import pandas as pd
import sqlite3
import pytesseract
import os

In [3]:
from helper import load_dotenv

In [4]:
from mask_functions import is_text_mask, filter_intersected_masks, is_above_min_size, load_masks, get_whole_mask, save_masks

In [5]:
# Load the project settings; other cells read the 'DB_PATH' and 'DATA_DIR' entries from this mapping.
dotenv = load_dotenv()

In [6]:
# Open the SQLite database located at dotenv['DB_PATH'].
# NOTE(review): the connection is never closed in the visible cells of this notebook.
con = sqlite3.connect(dotenv['DB_PATH'])

In [7]:
# Select the 'halftone300dpi' / '4c' rows of related_file and LEFT JOIN them
# (on job and pdf_filename) with the 'halftone300dpi' / 'masks' rows.
# The final WHERE keeps only rows for which a matching masks row exists.
# NOTE(review): the name suggests pages that still lack masks, but the filter
# keeps the pages that already have one — confirm which is intended.
# .sample(frac=1) shuffles the rows; no random_state is passed, so the order
# differs between runs.
remaining_pages = pd.read_sql(
    '''
        SELECT cf.* FROM (
        	SELECT * FROM related_file
        	WHERE variant_name = 'halftone300dpi' AND "type" = '4c'
        ) cf
        LEFT JOIN (
        	SELECT job, pdf_filename, 1 AS has_mask FROM related_file 
        	WHERE variant_name = 'halftone300dpi' AND "type" = 'masks'
        ) mf ON cf.job=mf.job AND cf.pdf_filename=mf.pdf_filename 
        WHERE mf.has_mask IS NOT NULL
    ''',
    con
).sample(frac=1)

# Verarbeitung (Processing)

In [8]:
# Collect the 300dpi mask files that have no counterpart in the sibling
# 'halftone600dpi' folder yet.
available_masks = list(dotenv['DATA_DIR'].glob('./*/halftone300dpi/*.masks.*'))
masks_to_process = []

for mask_path in tqdm(available_masks):
    # same job folder and file name, different variant directory
    mask_path_out = mask_path.parents[1] / 'halftone600dpi' / mask_path.name

    if mask_path_out.exists():
        continue

    masks_to_process.append(mask_path)

100%|████████████████████████████████████| 1921/1921 [00:00<00:00, 27836.15it/s]


In [9]:
# Progress summary: (masks still to convert, masks found, share still to convert).
pending_count = len(masks_to_process)
total_count = len(available_masks)
# an empty glob result would otherwise raise ZeroDivisionError here
pending_count, total_count, (pending_count / total_count if total_count else 0.0)

(672, 1921, 0.3498178032274857)

In [10]:
# NOTE(review): always raises ZeroDivisionError; presumably a deliberate stop so
# that "Run All" halts before the long-running cell that follows — confirm.
0 / 0

ZeroDivisionError: division by zero

In [11]:
# Convert every pending 300dpi mask file into its 600dpi counterpart:
# filter the masks against the 300dpi image, scale the bounding boxes and
# write the result into the sibling 'halftone600dpi' folder.
mask_scale_factor = 600 / 300  # loop-invariant, so computed once up front

for mask_path in tqdm(masks_to_process):
    img_path = mask_path.parent / mask_path.name.replace('masks.pkl','4c.jpg')
    mask_path_out = mask_path.parent.parent / 'halftone600dpi' / mask_path.name

    # pages whose mask file or 4c image is missing are skipped silently
    if not (mask_path.exists() and img_path.exists()):
        continue

    # Context manager: the image file is closed after each page instead of
    # leaving one open handle per iteration.
    with Image.open(img_path) as img:
        scaled_img_size = (
            int(round(img.size[0] * mask_scale_factor)),
            int(round(img.size[1] * mask_scale_factor))
        )

        masks = load_masks( mask_path )

        # set img_size where it is missing (300dpi image size times the scale factor)
        for m in masks:
            if 'img_size' not in m:
                m['img_size'] = scaled_img_size

        # filter by size
        masks = [m for m in masks if is_above_min_size(m)]
        # filter by text box
        masks = [m for m in masks if not is_text_mask( img, m )]
        # filter duplicates
        masks = filter_intersected_masks( masks )

    # scale the bounding boxes to 600dpi
    # NOTE(review): only 'bbox' is scaled here; the 'mask' entry that a later
    # cell inspects is written out unchanged — confirm this is intended.
    for m in masks:
        m['bbox'] = [int(val * mask_scale_factor) for val in m['bbox']]

    save_masks( masks, mask_path_out )

  0%|                                       | 1/672 [04:05<45:39:55, 245.00s/it]


KeyboardInterrupt: 

In [None]:
# Debug helper: unpickle the raw content of the file `mask_path` currently points to.
# pickle executes code on load, so only use this on trusted mask files.
masks = pickle.loads(mask_path.read_bytes())

In [None]:
# Inspect the shape of the first loaded mask's 'mask' entry.
masks[0]['mask'].shape

In [None]:
# Inspect the bounding box stored with the first loaded mask.
masks[0]['bbox']

In [None]:
# NOTE(review): always raises ZeroDivisionError; presumably a deliberate stop so
# that "Run All" does not continue into the cells below — confirm.
0 / 0

In [None]:
# Repair pass over the converted files: make sure the 'img_size' stored with
# the masks of each 600dpi mask file equals the size of the 600dpi image.
# The path list gets its own name so that it is no longer rebound by the
# per-file `masks` list inside the loop.
mask_paths_600dpi = list(dotenv['DATA_DIR'].glob('./*/halftone600dpi/*.masks.pkl'))

for mask_path in tqdm(mask_paths_600dpi):
    img_path = mask_path.parent / mask_path.name.replace( ".masks.pkl", ".4c.jpg" )

    # mask files without a matching image are left untouched
    if not img_path.exists():
        continue

    masks = load_masks( mask_path )

    # only the size is needed, so the image file is closed again right away
    with Image.open( img_path ) as img:
        actual_size = img.size

    # Rewrite the file when any mask (not only the first one) lacks the size
    # or carries a different one; .get avoids a KeyError on masks that have
    # no 'img_size' entry yet. An empty mask list is never rewritten.
    if any(m.get('img_size') != actual_size for m in masks):
        for m in masks:
            m['img_size'] = actual_size

        save_masks( masks, mask_path )

# Alt (old / legacy code)

In [None]:
# Legacy sampling experiment: draw random pages and record which mask indices
# survive each filter stage (size, text box, duplicates).
# NOTE(review): `config` is not defined in the visible part of this notebook, and
# filter_intersected_masks receives a second argument here but only one in the
# 600dpi conversion loop — confirm both before running this cell.
mask_rows = []

for sample_no in tqdm(range(10)):
    row = remaining_pages.sample(n=1).iloc[0]
    img_path = dotenv['DATA_DIR'] / row['job'] / row['variant_name'] / row['filename']

    # Remove the literal ".<type><suffix>" ending from the image name.
    # str.strip() would treat its argument as a set of characters and also eat
    # matching characters from both ends of the base name
    # (e.g. "page.4c.jpg" would become "age").
    img_ending = "." + row["type"] + img_path.suffix
    base_name = img_path.name
    if base_name.endswith( img_ending ):
        base_name = base_name[:-len(img_ending)]
    mask_path = img_path.parent / f'{ base_name }.masks.pkl'

    if mask_path.exists() and img_path.exists():
        # The former get_image_size() call was dropped: its result was unused
        # and the name is not imported anywhere in this notebook.
        masks = load_masks( mask_path )
        for idx, m in enumerate(masks):
            m['idx'] = idx

        # each stage stores the indices (into `masks`) that are still alive
        filtered_size = [
            idx for idx in range(len(masks))
            if masks[idx]['bbox'][2] * masks[idx]['bbox'][3] >= config['mask_min_area']
        ]

        # context manager so the image file is closed after each page
        with Image.open(img_path) as img:
            filtered_text_box = [idx for idx in filtered_size if not is_text_mask( img, masks[idx] )]
            filtered_duplicates = [
                m['idx']
                for m in filter_intersected_masks( [masks[idx] for idx in filtered_text_box], img.size )
            ]

        mask_rows.append((
            img_path, masks, filtered_size, filtered_text_box, filtered_duplicates
        ))

In [None]:
# Collect every mask that was removed by one of the filter stages, paired with
# the path of the image it belongs to.
deleted_masks = []

for r in mask_rows:
    # r = (img_path, masks, filtered_size, filtered_text_box, filtered_duplicates)
    # set lookup instead of scanning the index list once per mask
    kept_indices = set(r[4])

    deleted_masks.extend(
        (r[0], mask)
        for idx, mask in enumerate(r[1])
        if idx not in kept_indices
    )

In [None]:
# Pair every mask that survived all filter stages with the path of its image.
# Each entry of mask_rows is (img_path, masks, ..., indices of the kept masks).
kept_masks = [
    (page[0], page[1][kept_idx])
    for page in mask_rows
    for kept_idx in page[4]
]

In [None]:
# Counts: (all masks of the sampled pages, deleted masks, kept masks).
sum(len(page[1]) for page in mask_rows), len(deleted_masks), len(kept_masks)

In [None]:
# Position in kept_masks of the next mask to display; the viewer cell reads and increments it.
i = 0

In [None]:
# Viewer: display the image crop for kept mask number i, then advance i so
# that re-running this cell steps through kept_masks one entry at a time.
source_path, m = kept_masks[i]
print( f'{i}/{ len(kept_masks) }' )

page_img = Image.open(source_path)
# bbox is read as (left, top, width, height); crop() expects (left, top, right, bottom)
left, top, width, height = m['bbox'][:4]
cropped_img = page_img.crop((left, top, left + width, top + height))
i += 1

cropped_img