In [1]:
import sys
sys.path.append('../../30_data_tools/')
sys.path.append('../process_masks/')

In [2]:
from pathlib import Path
import random

In [3]:
from PIL import Image
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

In [4]:
from load_dataset import get_available_moires, get_train_data, get_moire_path, get_non_moire_path

In [5]:
from helper import load_dotenv
from get_labelstudio_data import get_moires_of_project
from mask_functions import load_masks, get_whole_mask
import cv2
from shapely.geometry import Polygon

In [6]:
import re

In [None]:
# for polygon intersection: share of p1's area that is covered by p2

p1 = Polygon([(0,0), (1,1), (1,0)])
p2 = Polygon([(0,1), (1,0), (1,1)]) 

p1.intersection(p2).area / p1.area

In [7]:
# Project configuration from helper.load_dotenv(); the cells below index it with keys such as
# 'DATA_DIR', 'DB_PATH', 'GENERIC_GENERATED_DATA_DIR', 'TRAIN_DATA_DPI' and 'LOFI_DPI'.
dotenv = load_dotenv()

In [8]:
# Edge lengths of the tiles cut from every image; index 0 is used horizontally, index 1 vertically.
IMG_SIZE = (280, 280)
# A tile is kept only if more than this share of it is covered by the moire mask / moire box.
MIN_MASK_PARTIAL = 0.5

In [9]:
# Root directory of the generated dataset; write_tiles creates one sub-directory per set below it.
DATASET_DIR = Path('./dataset/')

In [10]:
def get_masks_path( row ):
    """Return the path of the pickled masks file that belongs to ``row``.

    The path is ``DATA_DIR / job / variant_name / "<pdf_filename>.masks.pkl"``,
    where ``DATA_DIR`` is read from the notebook-level ``dotenv`` mapping and the
    remaining parts are attributes of ``row``.
    """
    variant_dir = dotenv['DATA_DIR'] / row.job / row.variant_name
    masks_filename = f"{row.pdf_filename}.masks.pkl"
    return variant_dir / masks_filename

In [11]:
def get_img_pair( row, dotenv ):
    """Load the non-moire / moire image pair of ``row`` at the training resolution.

    Parameters
    ----------
    row : object with ``non_moire_path``, ``moire_path``, ``variant_name`` and ``bbox``
        ``bbox`` is a ``';'``-separated string of four integers in moire-image pixels.
    dotenv : mapping
        Must provide ``'TRAIN_DATA_DPI'``.

    Returns
    -------
    tuple
        ``(non_moire_img, moire_img, bbox)``: two 2-D float arrays of identical size
        (``1 - channel[3] / 255``) and the bbox as a list of four ints, rescaled to
        the final moire image size.
    """
    # source resolutions are encoded in the variant name ("<n>dpi") and at the
    # end of the moire file name (".4c_<n>")
    non_moire_dpi = int(re.search(r'(\d+)dpi', row.variant_name).groups()[0])
    moire_basename = row.moire_path.name.replace( row.moire_path.suffix, '' )
    moire_dpi = int(re.match(r'.+\.4c_(\d+)$', moire_basename).groups()[0])
    target_dpi = dotenv['TRAIN_DATA_DPI']

    def to_target_dpi( image, source_dpi ):
        # only resample when the source resolution differs from the target
        if target_dpi == source_dpi:
            return image
        factor = target_dpi / source_dpi
        return image.resize((
            round(image.size[0] * factor),
            round(image.size[1] * factor)
        ))

    non_moire_img = to_target_dpi( Image.open( row.non_moire_path ), non_moire_dpi )

    moire_img = Image.open( row.moire_path )
    original_size = moire_img.size
    # the moire image is finally forced onto the exact size of its counterpart
    moire_img = to_target_dpi( moire_img, moire_dpi ).resize( non_moire_img.size )

    factor_x = moire_img.size[0] / original_size[0]
    factor_y = moire_img.size[1] / original_size[1]
    axis_factors = (factor_x, factor_y, factor_x, factor_y)

    raw_bbox = [int(val) for val in row.bbox.split(';')]
    bbox = [int(round(raw_bbox[pos] * axis_factors[pos])) for pos in range(4)]

    # channel index 3 is inverted and scaled to 0..1
    # (presumably the K channel of a CMYK image; TODO confirm)
    non_moire_arr = 1 - np.array(non_moire_img)[:,:,3] / 255
    moire_arr = 1 - np.array(moire_img)[:,:,3] / 255

    return non_moire_arr, moire_arr, bbox

In [12]:
def cut_img_to_tiles( img, relevant_box, mask=None, tile_size=None, min_mask_partial=None ):
    """Cut ``img`` into half-overlapping tiles and keep those dominated by the moire area.

    Parameters
    ----------
    img : 2-D numpy array
        Image to cut (rows x columns).
    relevant_box : sequence ``[x, y, width, height]``
        Moire box in image pixels; only used when ``mask`` is None.
    mask : 2-D array or None
        Per-pixel moire mask with the same shape as ``img``. When given, the
        covered share of a tile is the mean of the mask inside the tile.
    tile_size : (width, height) or None
        Tile size; defaults to the notebook-level ``IMG_SIZE``.
    min_mask_partial : float or None
        A tile is kept if its covered share is strictly greater than this
        value; defaults to the notebook-level ``MIN_MASK_PARTIAL``.

    Returns
    -------
    list
        Tile arrays (views into ``img``), ordered column by column. Empty when
        the image is smaller than one tile.
    """
    tile_width, tile_height = IMG_SIZE if tile_size is None else tile_size
    threshold = MIN_MASK_PARTIAL if min_mask_partial is None else min_mask_partial
    img_height, img_width = img.shape[0], img.shape[1]

    # An image smaller than one tile would produce negative offsets below and
    # therefore tiles of the wrong size, so no tiles are returned for it.
    if img_width < tile_width or img_height < tile_height:
        return []

    step_horizontal = int(round(tile_width / 2))
    step_vertical = int(round(tile_height / 2))
    tile_area = tile_width * tile_height

    out = []
    for left in range(0, img_width - step_horizontal, step_horizontal):
        # the last column of tiles is shifted back so that it ends on the image border
        left = min(left, img_width - tile_width)

        for top in range(0, img_height - step_vertical, step_vertical):
            top = min(top, img_height - tile_height)

            if mask is not None:
                mask_partial = mask[
                    top:top+tile_height,
                    left:left+tile_width
                ].mean()
            else:
                # without a mask the intersection with the moire box is used
                overlap_width = (
                    min(relevant_box[0] + relevant_box[2], left + tile_width)
                    - max(relevant_box[0], left)
                )
                overlap_height = (
                    min(relevant_box[1] + relevant_box[3], top + tile_height)
                    - max(relevant_box[1], top)
                )

                if overlap_width >= 0 and overlap_height >= 0:
                    mask_partial = (overlap_width * overlap_height) / tile_area
                else:
                    mask_partial = 0

            if mask_partial > threshold:
                out.append(
                    img[
                        top:top+tile_height,
                        left:left+tile_width
                    ]
                )

    return out

In [13]:
def process_sample( row, dotenv ):
    """Turn one dataset row into paired (non-moire, moire) tiles.

    The mask whose ``'bbox'`` equals the row's bbox is loaded (if one exists),
    resized to the image size and used to select the tiles; otherwise the
    rescaled bbox returned by ``get_img_pair`` is used.

    Parameters
    ----------
    row : object with ``bbox``, ``img_name`` and the attributes read by
        ``get_masks_path`` / ``get_img_pair``.
    dotenv : mapping passed through to ``get_img_pair``.

    Returns
    -------
    list of ``(img_name, non_moire_tile, moire_tile)`` tuples.
    """
    bbox = [int(val) for val in row.bbox.split(';')]
    masks = [m for m in load_masks( get_masks_path( row ) ) if m['bbox'] == bbox]

    non_moire_img, moire_img, mask_box = get_img_pair( row, dotenv )

    if masks:
        # the mask is resized as a uint8 image (0/255) and turned back into booleans afterwards
        whole_mask = (get_whole_mask( masks[0] ) * 255).astype('uint8')
        mask = cv2.resize(
            whole_mask,
            (non_moire_img.shape[1], non_moire_img.shape[0])
        )
        mask = (mask / 255).astype('bool')
    else:
        mask = None

    # both images have the same size and share mask / box, so the tiles line up pairwise
    moire_samples = cut_img_to_tiles( moire_img, mask_box, mask=mask )
    non_moire_samples = cut_img_to_tiles( non_moire_img, mask_box, mask=mask )

    return [
        (row.img_name, non_moire_tile, moire_tile)
        for non_moire_tile, moire_tile in zip(non_moire_samples, moire_samples)
    ]

In [14]:
def test_train_val_split( data, labels, test_size=.2, val_size=.1 ):
    idx_list = [i for i in range(len(data))]
    random.shuffle(idx_list)
    test_limiter = round(len(idx_list) * test_size)
    val_limiter = test_limiter + round(len(idx_list) * val_size)

    train_data = [data[idx] for idx in idx_list[val_limiter:]]
    train_labels = [labels[idx] for idx in idx_list[val_limiter:]]

    test_data = [data[idx] for idx in idx_list[:test_limiter]]
    test_labels = [labels[idx] for idx in idx_list[:test_limiter]]

    val_data = [data[idx] for idx in idx_list[test_limiter:val_limiter]]
    val_labels = [labels[idx] for idx in idx_list[test_limiter:val_limiter]]

    return train_data, train_labels, test_data, test_labels, val_data, val_labels

In [15]:
def write_tiles( data, set_name ):
    """Write tiles as JPEG files below ``DATASET_DIR / set_name``.

    Parameters
    ----------
    data : sequence of tuples ``(name, no_moire_tile, moire_tile)``
        Slot 1 is written to ``no_moire/``, every later slot to ``moire/``;
        a slot that is None is skipped. Each tile is multiplied by 255 and
        cast to uint8, i.e. tiles are expected as arrays scaled to 0..1.
    set_name : str
        Name of the sub-directory (e.g. ``'train'``).

    Files are named ``"<name>.<index>.jpg"`` with the zero-padded position of
    the entry in ``data``.
    """
    set_dir = DATASET_DIR / set_name
    # parents=True: also works when DATASET_DIR itself does not exist yet
    set_dir.mkdir(parents=True, exist_ok=True)

    for i, entry in enumerate(tqdm(data)):
        for j in range(1, len(entry)):
            tile = entry[j]
            if tile is None:
                continue

            parent_dir = set_dir / ('no_moire' if j == 1 else 'moire')
            parent_dir.mkdir(exist_ok=True)

            out_path = parent_dir / f"{ entry[0] }.{str(i).zfill(4)}.jpg"

            img = Image.fromarray( np.uint8(tile * 255) ).convert('RGB')
            img.save( out_path, progressive=True )

In [16]:
def generate_tiles( sample, set_name ):
    """Process every row of ``sample`` into tile pairs and write them as set ``set_name``.

    Rows are visited by position; ``process_sample`` is called with the
    notebook-level ``dotenv`` and all resulting tuples are handed to
    ``write_tiles`` in one call.
    """
    rows = (sample.iloc[pos] for pos in tqdm(range(sample.shape[0])))

    combined_data = []
    for row in rows:
        combined_data.extend( process_sample( row, dotenv ) )

    write_tiles( combined_data, set_name )

In [17]:
from get_labelstudio_data import get_results_of_project
import pandas as pd
import sqlite3

In [18]:
def get_available_moires():
    """Return all mask rows that belong to a Label Studio result labelled 'checked_moire'.

    Results of project 2 whose ``rectanglelabels`` contain ``'checked_moire'``
    are joined with the ``mask`` table of the SQLite database on
    ``img_name`` and ``bbox`` (``"x;y;width;height"``).

    NOTE(review): this definition shadows the ``get_available_moires`` imported
    from ``load_dataset`` at the top of the notebook.

    Returns
    -------
    pandas.DataFrame
        The columns of ``mask`` plus ``img_name`` and ``label``, restricted to
        rows with ``label == 'checked_moire'``.
    """
    # load results
    dotenv = load_dotenv()
    moire_results = [r for r in get_results_of_project(2) if 'checked_moire' in r['rectanglelabels']]

    results_frame = pd.DataFrame(
        [
            (
                r['img_name'],
                ";".join([
                    str(r['value']['x']),
                    str(r['value']['y']),
                    str(r['value']['width']),
                    str(r['value']['height'])
                ]),
                r['rectanglelabels'][0]
            )
            for r in moire_results
        ],
        columns=['img_name','bbox','label']
    )

    # load masks; the connection is closed again even if the query fails
    con = sqlite3.connect( dotenv['DB_PATH'] )
    try:
        masks = pd.read_sql(
            'SELECT * FROM mask',
            con
        )
    finally:
        con.close()

    # rebuild the image name under which the generated moire was labelled
    masks.loc[
        :,
        'img_name'
    ] = masks.apply(lambda val: f"{ val.job }.{ val.pdf_filename }.{ val.method }.{ val.idx }.4c_600.jpg", axis=1)

    # merge the frames; masks without a matching result get a NaN label and are dropped below
    merged = pd.merge(
        masks,
        results_frame,
        how="left",
        on=['img_name','bbox']
    )

    merged = merged.loc[
        merged.label == 'checked_moire'
    ]

    return merged

In [19]:
# NOTE(review): this cell always raises ZeroDivisionError; presumably a deliberate stop marker so that
# "Run All" halts before the dataset is regenerated. Confirm, and consider an explicit guard instead.
0 / 0

ZeroDivisionError: division by zero

In [20]:
# All checked moires (one row per mask), extended by the paths of both images of a pair.
df = get_available_moires()
# non-moire counterpart: the 4c_600 JPEG of the same page in the 'ps2400dpi150lpi' variant
df.loc[
    :,
    'non_moire_path'
] = df.apply(lambda row: (dotenv['DATA_DIR'] / row.job / 'ps2400dpi150lpi' / f"{ row.pdf_filename }.4c_600.jpg"), axis=1)

# moire image: looked up by img_name in GENERIC_GENERATED_DATA_DIR
df.loc[
    :,
    'moire_path'
] = df.apply(lambda row: (dotenv['GENERIC_GENERATED_DATA_DIR'] / row.img_name), axis=1)

In [21]:
# Keep only rows for which both image files exist on disk.
df = df.loc[
    df.apply(lambda row: row.moire_path.exists() and row.non_moire_path.exists(), axis=1)
]

In [22]:
# (rows, columns) of the usable samples
df.shape

(550, 17)

In [23]:
# Shares of the rows that go into the test and validation sets; the rest is used for training.
TEST_SIZE = .2
VAL_SIZE = .1

In [24]:
# Shuffle all rows and split them by position into test / val / train.
# NOTE(review): df.sample is called without random_state, so the split differs on every run.
sample = df.sample(frac=1)
test_limiter = round(sample.shape[0] * TEST_SIZE)
val_limiter = test_limiter + round(sample.shape[0] * VAL_SIZE)

test_rows = sample.iloc[:test_limiter]
val_rows = sample.iloc[test_limiter:val_limiter]
train_rows = sample.iloc[val_limiter:]

In [25]:
# Cut every subset into tiles and write them below DATASET_DIR/<set name>/{no_moire,moire}.
generate_tiles( train_rows, 'train' )
generate_tiles( test_rows, 'test' )
generate_tiles( val_rows, 'val' )

  0%|          | 0/385 [00:00<?, ?it/s]



  0%|          | 0/2427 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/610 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

# ok_samples hinzufügen

In [26]:
# Number of additional "ok" tiles to collect: half the number of rows in the shuffled moire frame.
tile_target_count = round(sample.shape[0] * 0.5)

In [27]:
# SQLite connection used by the related_file query further down.
con = sqlite3.connect( dotenv['DB_PATH'] )

In [29]:
def get_overlap( box, row, results_dict ):
    """Return the largest intersection area between ``box`` and the known moire boxes of a page.

    Parameters
    ----------
    box : sequence ``[x, y, width, height]``
    row : object with ``job`` and ``pdf_filename`` attributes
    results_dict : dict
        ``results_dict[job][pdf_filename]`` is a list of dicts with the keys
        ``'x'``, ``'y'``, ``'width'`` and ``'height'``.

    Returns
    -------
    number
        Area of the largest intersection; 0 when the page has no entries or
        no box intersects ``box``.
    """
    page_results = results_dict.get(row.job, {}).get(row.pdf_filename, [])

    largest = 0
    for r in page_results:
        overlap_width = min(box[0] + box[2], r['x'] + r['width']) - max(box[0], r['x'])
        overlap_height = min(box[1] + box[3], r['y'] + r['height']) - max(box[1], r['y'])

        # a negative extent on either axis means the boxes are disjoint
        if overlap_width >= 0 and overlap_height >= 0:
            largest = max(largest, overlap_width * overlap_height)

    return largest

In [30]:
# Bring the moires known to the system into a form that allows excluding them
real_moires = get_results_of_project(1)
for r in real_moires:
    r['dataset'] = 'real'

generic_moires = get_results_of_project(2)
for r in generic_moires:
    r['dataset'] = 'generic'

# only results whose first label is one of the moire labels are kept
results = [
    r for r in real_moires + generic_moires
    if r['rectanglelabels'][0] in ['checked_moire','moire_l_01','moire_l_05','moire_l_10','moire']
]

# the boxes are sorted into a dict
# keyed by job and pdf_filename
results_dict = {}

for r in results:
    # job, page name and dpi are parsed from the image name; the pattern depends on the dataset
    if r['dataset'] == 'real':
        job, pdf_filename, dpi = re.match(r'(.+?)\..+?\.(.+)\.4c_(\d+)\.jpg', r['img_name']).groups()
    else:
        job, pdf_filename, dpi = re.match(r'(.+?)\.(.+)\.soft_light.+\.4c_(\d+)\.jpg', r['img_name']).groups()
 
    dpi = int(dpi)

    out = {
        'x' : r['value']['x'],
        'y' : r['value']['y'],
        'width' : r['value']['width'],
        'height' : r['value']['height'],
        'img_name' : r['img_name']
    }

    # boxes are scaled to LOFI_DPI (600 dpi according to the original comment)
    if dpi != dotenv['LOFI_DPI']:
        rescale_factor = dotenv['LOFI_DPI'] / dpi

        out['x'] = out['x'] * rescale_factor
        out['y'] = out['y'] * rescale_factor
        out['width'] = out['width'] * rescale_factor
        out['height'] = out['height'] * rescale_factor

    # the result is filed under its job and pdf_filename
    if job not in results_dict:
        results_dict[job] = {}

    if pdf_filename not in results_dict[job]:
        results_dict[job][pdf_filename] = []

    results_dict[job][pdf_filename].append(out)

In [31]:
# All 4c_600 files of the 'ps2400dpi150lpi' variant; the ok tiles are sampled from these pages.
related_files = pd.read_sql(
    '''
        SELECT job, pdf_filename, filename FROM related_file rf
        WHERE rf.variant_name == 'ps2400dpi150lpi' AND
        "type" = '4c_600'
    ''',
    con
)

In [32]:
# Collect random tiles that do not touch any known moire box until the target count is reached.
# Each entry is (file name without suffix, (posX, posY), tile); the tile is a float array
# scaled to 0..1 (1 - channel[3] / 255), the same representation as in get_img_pair.
# The previous uint8 form (255 - channel) wrapped around when write_tiles multiplied it by
# 255 and cast to uint8, so the ok tiles were written with a different polarity.
tiles_out = []

with tqdm(total=tile_target_count) as pbar:
    while len(tiles_out) < tile_target_count:
        # named `related_file` so that the shuffled DataFrame `sample` is not overwritten
        related_file = related_files.sample(n=1).iloc[0]
        filepath = dotenv['DATA_DIR'] / related_file.job / 'ps2400dpi150lpi' / related_file.filename
        found_tiles = []
        i = 0

        with Image.open(filepath) as img:
            # the chance to continue with the same page drops quadratically with the number of
            # tiles already found (it reaches zero at 10 tiles); at most 1000 attempts per page
            while random.random() > (len(found_tiles) / 10) ** 2 and i < 1000:
                posX = random.randrange(img.size[0] - IMG_SIZE[0])
                posY = random.randrange(img.size[1] - IMG_SIZE[1])

                overlap = get_overlap( [posX, posY, IMG_SIZE[0], IMG_SIZE[1]], related_file, results_dict )

                if overlap == 0:
                    # the crop is only computed for accepted positions
                    tile_img = 1 - np.array(img.crop((
                        posX,posY,
                        posX+IMG_SIZE[0],posY+IMG_SIZE[1]
                    )))[:,:,3] / 255

                    found_tiles.append((
                        filepath.name.replace(filepath.suffix,''),
                        (posX,posY),
                        tile_img
                    ))

                i += 1

        tiles_out += found_tiles
        pbar.update(len(found_tiles))

  0%|          | 0/275 [00:00<?, ?it/s]

In [33]:
# Shuffle in place so that the positional split below mixes tiles of different pages.
random.shuffle(tiles_out)

In [34]:
# Split shares for the ok tiles (same values as the ones defined for the moire rows).
TEST_SIZE = .2
VAL_SIZE = .1

In [35]:
# Positional split of the shuffled ok tiles.
# NOTE(review): this re-binds test_rows / val_rows / train_rows, which held DataFrame slices before.
test_limiter = round(len(tiles_out) * TEST_SIZE)
val_limiter = test_limiter + round(len(tiles_out) * VAL_SIZE)

test_rows = tiles_out[:test_limiter]
val_rows = tiles_out[test_limiter:val_limiter]
train_rows = tiles_out[val_limiter:]

In [36]:
# Write the ok tiles as 'no_moire' images: the tile sits in slot 1, the moire slot is None.
write_tiles( [(f"ok_sample_{t[0]}", t[2], None) for t in train_rows], 'train' )
write_tiles( [(f"ok_sample_{t[0]}", t[2], None) for t in test_rows], 'test' )
write_tiles( [(f"ok_sample_{t[0]}", t[2], None) for t in val_rows], 'val' )

  0%|          | 0/197 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

# Real Validation schreiben

In [None]:
# A stored mask is only used for a labelled moire if the IoU of their boxes exceeds this value.
MIN_IOU = 0.9

In [None]:
# SQLite connection passed to the related_file lookups of the real-validation functions below.
con = sqlite3.connect(dotenv['DB_PATH'])

In [None]:
# Results of project 1 whose first label is not 'potential_moire' and whose image name
# starts with six digits.
moires = [
    r for r in get_results_of_project(1)
    if (
        r['rectanglelabels'][0] != 'potential_moire' and
        re.match(r'\d{6}.+', r['img_name'])
    )
]

In [None]:
# NOTE(review): a later cell defines get_moire_mask_path( job, page, dotenv, con ) under the same
# name and process_real_moire calls that four-argument form, so this version appears to be unused.
def get_moire_mask_path( img_name ):
    """Derive the masks pickle path of an image from its file name.

    The first dot-separated part of ``img_name`` is taken as the job, the
    second part and the last part before ``.jpg`` are dropped, and the rest is
    the file name. The result is
    ``DATA_DIR / job / "halftone600dpi" / "<filename>.masks.pkl"``.
    """
    jobname, filename = re.match(r'(.+?)\..+?\.(.+)\..+?\.jpg', img_name).groups()
    
    return dotenv['DATA_DIR'] / jobname / "halftone600dpi" / f"{ filename }.masks.pkl"

In [None]:
def calc_bbox_iou( bbox_a, bbox_b ):
    """Return the intersection over union of two axis-aligned boxes.

    Parameters
    ----------
    bbox_a, bbox_b : sequences ``[x, y, width, height]``

    Returns
    -------
    number
        ``intersection / union`` in the range 0..1; 0 when the boxes do not
        intersect or when both boxes have no area.
    """
    overlap_width = min(bbox_a[0] + bbox_a[2], bbox_b[0] + bbox_b[2]) - max(bbox_a[0], bbox_b[0])
    overlap_height = min(bbox_a[1] + bbox_a[3], bbox_b[1] + bbox_b[3]) - max(bbox_a[1], bbox_b[1])

    if overlap_width < 0 or overlap_height < 0:
        return 0

    intersection = overlap_width * overlap_height

    # The union is the sum of both areas minus the part counted twice. The area
    # of the enclosing box is only equal to it in special cases and otherwise
    # underestimates the IoU.
    union = bbox_a[2] * bbox_a[3] + bbox_b[2] * bbox_b[3] - intersection

    if union <= 0:
        return 0

    return intersection / union

In [None]:
def get_moire_mask_path( job, page, dotenv, con ):
    """Look up the masks file of a page in the ``related_file`` table.

    Parameters
    ----------
    job : str
    page : str
        Matched as a substring of ``pdf_filename`` (SQL ``LIKE '%page%'``).
    dotenv : mapping
        Must provide ``'LOFI_DPI'`` and ``'DATA_DIR'`` (a path).
    con : sqlite3 connection

    Returns
    -------
    Path or None
        ``DATA_DIR / job / "halftone<LOFI_DPI>dpi" / <column 4 of the row>`` of
        the first matching row, or None if there is no row or the file does
        not exist.
    """
    variant_name = f'halftone{ dotenv["LOFI_DPI"] }dpi'

    c = con.cursor()
    try:
        # values are bound as parameters, so quotes in job or page names cannot break the query
        c.execute(
            '''
                SELECT * FROM related_file
                WHERE
                    job = ? AND
                    variant_name = ? AND
                    pdf_filename LIKE ?
                    AND "type" = 'masks'
            ''',
            (job, variant_name, f'%{ page }%')
        )
        mask_entry = c.fetchone()
    finally:
        c.close()

    if mask_entry is not None:
        # column 4 of related_file is used as the file name
        mask_path = dotenv['DATA_DIR'] / job / variant_name / mask_entry[4]

        if mask_path.exists():
            return mask_path

    return None

In [None]:
def get_corresponding_ps_img_path( job, page_name, dotenv, con ):
    """Look up the 'ps2400dpi150lpi' image of a page in the ``related_file`` table.

    Parameters
    ----------
    job : str
    page_name : str
        Matched as a substring of ``pdf_filename`` (SQL ``LIKE '%page_name%'``).
    dotenv : mapping
        Must provide ``'LOFI_DPI'`` and ``'DATA_DIR'`` (a path).
    con : sqlite3 connection

    Returns
    -------
    Path or None
        ``DATA_DIR / job / <column 0> / <column 4>`` of the first matching row
        with type ``'4c_<LOFI_DPI>'``, or None if there is no row or the file
        does not exist.
    """
    c = con.cursor()
    try:
        # values are bound as parameters, so quotes in job or page names cannot break the query
        c.execute(
            '''
                SELECT * FROM related_file
                WHERE
                    job = ? AND
                    variant_name = 'ps2400dpi150lpi' AND
                    pdf_filename LIKE ?
                    AND "type" = ?
            ''',
            (job, f'%{ page_name }%', f'4c_{ dotenv["LOFI_DPI"] }')
        )
        ps_entry = c.fetchone()
    finally:
        c.close()

    if ps_entry is not None:
        # column 0 is used as the variant directory, column 4 as the file name
        ps_path = dotenv['DATA_DIR'] / job / ps_entry[0] / ps_entry[4]

        if ps_path.exists():
            return ps_path

    return None

In [None]:
def process_real_moire( moire, dotenv, con ):
    """Cut moire tiles for one labelled real moire out of the matching 'ps' image.

    The labelled box is scaled from the labelled image size
    (``original_width`` / ``original_height``) to the ps image. If a stored
    mask exists whose box has an IoU above ``MIN_IOU`` with the labelled box,
    the best such mask selects the tiles; otherwise the box itself is used.

    Parameters
    ----------
    moire : dict
        Label Studio result with ``'img_name'``, ``'value'`` (x, y, width,
        height), ``'original_width'`` and ``'original_height'``.
    dotenv : mapping
        Must provide ``'TRAIN_DATA_DPI'`` and the keys used by the path lookups.
    con : sqlite3 connection

    Returns
    -------
    list of ``(img_name, tile)`` tuples; empty if no ps image is found.
    """
    # only job and page are needed from the labelled image name
    job, _, page, _ = re.match(r'(.+?)\.(.+?)\.(.+)\.4c_(\d+)\.jpg', moire['img_name']).groups()
    target_dpi = dotenv['TRAIN_DATA_DPI']

    ps_path = get_corresponding_ps_img_path( job, page, dotenv, con )

    if ps_path is None:
        return []

    # the dpi is read from the name of the ps image
    dpi = int(re.match(r'.+\.4c_(\d+)\.jpg', ps_path.name).groups()[0])
    ps_img = Image.open(ps_path)

    scale_factor_x = ps_img.size[0] / moire['original_width']
    scale_factor_y = ps_img.size[1] / moire['original_height']
    moire_bbox = [
        round(moire['value']['x'] * scale_factor_x),
        round(moire['value']['y'] * scale_factor_y),
        round(moire['value']['width'] * scale_factor_x),
        round(moire['value']['height'] * scale_factor_y)
    ]

    # pick the stored mask whose box matches the labelled box best
    mask_path = get_moire_mask_path( job, page, dotenv, con )
    best_entry = None

    if mask_path and mask_path.exists():
        best_iou = -1

        for mask in load_masks(mask_path):
            iou = calc_bbox_iou(
                moire_bbox,
                mask['bbox']
            )

            if iou > MIN_IOU and iou > best_iou:
                best_iou = iou
                best_entry = mask

    # the full-size mask is only materialised for the winning entry
    best_mask = get_whole_mask(best_entry) if best_entry is not None else None

    if dpi != target_dpi:
        rescale_factor = target_dpi / dpi
        ps_img = ps_img.resize((
            round(ps_img.size[0] * rescale_factor),
            round(ps_img.size[1] * rescale_factor)
        ))
        moire_bbox = [round(val * rescale_factor) for val in moire_bbox]

    # The mask has to line up with the image that is cut. It is resized as a uint8 image
    # (0/255) and converted back to booleans, the same way process_sample does it.
    # NOTE(review): assumes get_whole_mask returns a 2-D array with values 0/1 or booleans; confirm.
    if best_mask is not None and best_mask.shape[:2] != (ps_img.size[1], ps_img.size[0]):
        mask_uint8 = (best_mask * 255).astype('uint8')
        best_mask = (cv2.resize(mask_uint8, (ps_img.size[0], ps_img.size[1])) / 255).astype('bool')

    tiles = cut_img_to_tiles(
        1 - np.array(ps_img)[:,:,3] / 255,
        moire_bbox,
        mask=best_mask
    )

    return [(moire['img_name'],t) for t in tiles]

In [None]:
def get_real_non_moire_tiles( moire, moires, dotenv, con ):
    """Sample random tiles of the ps image that do not overlap any labelled moire box.

    All results in ``moires`` with the same ``'img_name'`` as ``moire`` are
    treated as moire boxes of the page. Up to 25 tiles are collected within at
    most 1000 random positions.

    Parameters
    ----------
    moire : dict
        Label Studio result with ``'img_name'``, ``'original_width'`` and
        ``'original_height'``.
    moires : list of dict
        All results; each needs ``'img_name'`` and ``'value'`` (x, y, width, height).
    dotenv : mapping
        Must provide ``'TRAIN_DATA_DPI'`` and the keys used by the path lookup.
    con : sqlite3 connection

    Returns
    -------
    list of ``(img_name, tile)`` tuples; empty if no ps image is found or the
    image is not larger than one tile.
    """
    max_tiles = 25
    max_iterations = 1000

    # only job and page are needed from the labelled image name
    job, _, page, _ = re.match(r'(.+?)\.(.+?)\.(.+)\.4c_(\d+)\.jpg', moire['img_name']).groups()
    target_dpi = dotenv['TRAIN_DATA_DPI']

    ps_path = get_corresponding_ps_img_path( job, page, dotenv, con )

    if ps_path is None:
        return []

    # the dpi is read from the name of the ps image
    dpi = int(re.match(r'.+\.4c_(\d+)\.jpg', ps_path.name).groups()[0])

    ps_img = Image.open(ps_path)

    scale_factor_x = ps_img.size[0] / moire['original_width']
    scale_factor_y = ps_img.size[1] / moire['original_height']
    relevant_boxes = [[
        round(m['value']['x'] * scale_factor_x),
        round(m['value']['y'] * scale_factor_y),
        round(m['value']['width'] * scale_factor_x),
        round(m['value']['height'] * scale_factor_y)]
        for m in moires
        if m['img_name'] == moire['img_name']
    ]

    if dpi != target_dpi:
        rescale_factor = target_dpi / dpi
        ps_img = ps_img.resize((
            round(ps_img.size[0] * rescale_factor),
            round(ps_img.size[1] * rescale_factor)
        ))
        relevant_boxes = [
            [round(val * rescale_factor) for val in rb]
            for rb in relevant_boxes
        ]

    # random.randrange needs a positive range on both axes
    if ps_img.size[0] <= IMG_SIZE[0] or ps_img.size[1] <= IMG_SIZE[1]:
        return []

    iterations = 0
    tiles = []
    while len(tiles) < max_tiles and iterations < max_iterations:
        posX = random.randrange( ps_img.size[0] - IMG_SIZE[0] )
        posY = random.randrange( ps_img.size[1] - IMG_SIZE[1] )

        # A tile overlaps a box if the intersection has a positive extent on both axes.
        # The tile is accepted once, and only if it overlaps none of the boxes.
        overlaps_moire = any(
            max(posX, rb[0]) < min(posX + IMG_SIZE[0], rb[0] + rb[2]) and
            max(posY, rb[1]) < min(posY + IMG_SIZE[1], rb[1] + rb[3])
            for rb in relevant_boxes
        )

        if not overlaps_moire:
            tiles.append(
                1 - np.array(ps_img.crop((
                    posX,
                    posY,
                    posX + IMG_SIZE[0],
                    posY + IMG_SIZE[1]
                )))[:,:,3] / 255
            )

        iterations += 1

    return [(moire['img_name'],t) for t in tiles]

In [None]:
# For every labelled real moire: moire tiles from its box / mask and random tiles from the
# same page that avoid the labelled boxes.
real_moire_tiles = []
real_non_moire_tiles = []

for m in tqdm(moires):
    real_moire_tiles += process_real_moire(m, dotenv, con)
    real_non_moire_tiles += get_real_non_moire_tiles( m, moires, dotenv, con )

In [None]:
# number of moire tiles and non-moire tiles collected
len(real_moire_tiles), len(real_non_moire_tiles)

In [None]:
# write_tiles expects (name, no_moire_tile, moire_tile): moire tiles fill slot 2, the others slot 1.
tiles = [(t[0],None,t[1]) for t in real_moire_tiles] + [(t[0],t[1],None) for t in real_non_moire_tiles]
write_tiles( tiles, 'real_val' )