In [None]:
import sys
sys.path.append('../../30_data_tools/')
sys.path.append('../process_masks/')

In [None]:
from pathlib import Path
import random
from PIL import Image
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

import re
from helper import load_dotenv
from get_labelstudio_data import get_moires_of_project
from mask_functions import load_masks, get_whole_mask
import cv2
from shapely.geometry import Polygon

from load_dataset import get_available_moires, get_train_data, get_moire_path, get_non_moire_path, get_results_of_project
from get_labelstudio_data import get_results_of_project
import pandas as pd
import sqlite3
from skimage.metrics import structural_similarity as ssim
from datetime import datetime

In [None]:
from file_interaction import get_generic_image_filepath, get_related_filepath, open_img

In [None]:
# Project configuration from the local `helper.load_dotenv`. Later cells read the keys
# 'TILE_DATASET_DIR', 'DB_PATH', 'LOFI_DPI' and 'TRAIN_DATA_DPI' from it.
dotenv = load_dotenv()

In [None]:
# Size of a single tile; `cut_img_to_tiles` uses index 0 along the image's second axis
# and index 1 along the first axis.
IMG_SIZE = (
    224,
    224
)
# Target resolutions every page is rescaled to before tiles are cut.
RESOLUTIONS = [300,200,150]
# A tile is kept when more than this share of the tile is covered by the label/mask ...
MIN_MASK_PARTIAL = 0.25
# ... or when more than this share of the label polygon lies inside the tile.
MIN_TILE_PARTIAL = 0.9

In [None]:
# Disable Pillow's pixel-count limit (decompression-bomb check) for large page images.
Image.MAX_IMAGE_PIXELS = None

In [None]:
# Root directory of the tile dataset. It is used with `/` and `.glob(...)` below,
# so it is presumably a pathlib.Path - TODO confirm in `helper.load_dotenv`.
DATASET_DIR = dotenv['TILE_DATASET_DIR']

In [None]:
# Load all masks of the random-train-pages job, attach the Label Studio label of
# project 2 and keep only confirmed moires that have no tiles on disk yet.
con = sqlite3.connect( dotenv['DB_PATH'] )
try:
    data = pd.merge(
        pd.read_sql(
            '''
                SELECT m.* FROM mask m
                LEFT JOIN generic_image gi 
                ON
                	m.job=gi.job AND
                	m.pdf_filename=gi.pdf_filename AND
                	m."type"=gi."type" AND
                	m.variant_name=gi.variant_name AND
                	m."method"=gi."method" AND
                	m.idx=gi.idx 
                WHERE
                	gi.job='24-03-05-01_randomTrainPages'
            ''',
            con
        ),
        pd.DataFrame(
            [(r['id'],r['labels'][0],r['updated_at']) for r in get_results_of_project(2) if 'id' in r],
            columns=['mask_id','label','updated_at']
        ),
        on='mask_id',
        how='left'
    )
finally:
    # `with sqlite3.connect(...)` only manages a transaction and leaves the
    # connection open, so close it explicitly.
    con.close()

# Plain assignment instead of chained `inplace=True`, which does not reliably
# write through to the frame in recent pandas versions.
data['label'] = data['label'].fillna('undefined')
data = data.loc[
    data.label == 'checked_moire'
]

# Mask ids that already have tiles on disk. ok-sample tiles are skipped (as in the
# split cell further down) and file names that do not follow the
# "<mask_id>_<resolution>.<index>" scheme are ignored instead of crashing on `None`.
tile_stems = (
    e.name.replace(e.suffix,'')
    for e in DATASET_DIR.glob('./**/*.jpg')
    if "ok_sample" not in e.name
)
tile_matches = (re.match(r'(.+)_\d{3}.\d+', stem) for stem in tile_stems)
already_processed_mask_ids = list(set([
    m.groups()[0] for m in tile_matches if m is not None
]))

# NOTE(review): the ids parsed from file names are strings; this filter only works
# if `mask_id` is stored as text in the database - TODO confirm.
data = data.loc[
    ~data.mask_id.isin(already_processed_mask_ids)
]

# Funktionen

In [None]:
def rescale_img_by_dpi( img, source_dpi, target_dpi ):
    """Return `img` resized from `source_dpi` to `target_dpi`.

    `img` must offer `.size` (width, height) and `.resize((width, height))`.
    When both resolutions are equal the very same object is returned.
    """
    if target_dpi == source_dpi:
        return img

    scale = target_dpi / source_dpi
    new_size = (
        round(img.size[0] * scale),
        round(img.size[1] * scale)
    )

    return img.resize(new_size)


def rescale_mask_by_target_size( mask, img_size ):
    """Resize the whole-image mask of `mask` to `img_size` and return it as a bool array.

    The mask from `get_whole_mask` is scaled to 0..255 uint8 for `cv2.resize`;
    afterwards every non-zero pixel counts as masked.
    """
    mask_u8 = (get_whole_mask( mask ) * 255).astype('uint8')
    resized = cv2.resize( mask_u8, img_size )

    return (resized / 255).astype('bool')


def rescale_bounding_box_by_rescale_factor( bounding_box, rescale_factor ):
    """Scale a 4-value bounding box and round every entry to an int.

    Entries 0 and 2 are multiplied by `rescale_factor[0]`, entries 1 and 3 by
    `rescale_factor[1]`.
    """
    factor_x, factor_y = rescale_factor[0], rescale_factor[1]
    factor_per_entry = (factor_x, factor_y, factor_x, factor_y)

    return [
        int(round(bounding_box[i] * factor_per_entry[i]))
        for i in range(4)
    ]

In [None]:
def get_img_pair( row, dotenv ):
    """Open the non-moire and the moire image that belong to `row`.

    Both paths are resolved for the `dotenv["LOFI_DPI"]` variant. The moire image
    is opened first, the non-moire image second.

    Returns:
        tuple: (non_moire_img, moire_img)
    """
    path_without_moire = get_related_filepath( row.job, 'ps2400dpi150lpi', f'{ row.pdf_filename }.4c_{ dotenv["LOFI_DPI"] }.jpg' )
    path_with_moire = get_generic_image_filepath( row.pdf_filename, row.job, row.method, row.idx, variant=f'4c_{ dotenv["LOFI_DPI"] }'  )

    img_with_moire = open_img( path_with_moire )
    img_without_moire = open_img( path_without_moire )

    return img_without_moire, img_with_moire

In [None]:
def cut_img_to_tiles( img, relevant_box, mask=None, is_polygon_box=False ):
    """Slide a tile window with 50 % overlap over `img` and return the relevant tiles.

    Args:
        img: 2D array indexed as img[row, column].
        relevant_box: region of interest. With `is_polygon_box=True` a sequence of
            (x, y) points, otherwise `[x, y, width, height]`. Ignored when `mask` is given.
        mask: optional array of the same height/width as `img`; when given, the mean of
            the mask inside the tile decides about relevance.
        is_polygon_box: treat `relevant_box` as polygon points (only used without mask).

    Returns:
        list of `img` slices of size IMG_SIZE. A tile is kept when more than
        MIN_MASK_PARTIAL of the tile is covered by the region, or (polygon mode only)
        when more than MIN_TILE_PARTIAL of the polygon lies inside the tile.

    Changes compared to the previous version:
        * images smaller than one tile return [] (a negative offset used to produce
          malformed crops),
        * polygon mode measures the overlap against the full tile that is cut out,
          not only against its top-left quadrant,
        * the label polygon is built once and a zero-area polygon no longer raises.
    """
    tile_width, tile_height = IMG_SIZE[0], IMG_SIZE[1]
    img_height, img_width = img.shape[0], img.shape[1]

    # no full-size tile fits into the image
    if img_width < tile_width or img_height < tile_height:
        return []

    step_horizontal = int(round(tile_width / 2))
    step_vertical = int(round(tile_height / 2))

    use_polygon = mask is None and is_polygon_box
    if use_polygon:
        p_label = Polygon(relevant_box)
        label_area = p_label.area

    out = []

    for left in range(0, img_width - step_horizontal, step_horizontal):
        # the last window is pulled back so that it ends at the image border
        left = min(left, img_width - tile_width)

        for top in range(0, img_height - step_vertical, step_vertical):
            top = min(top, img_height - tile_height)
            tile_partial = 0

            if mask is not None:
                mask_partial = mask[
                    top:top+tile_height,
                    left:left+tile_width
                ].mean()
            elif use_polygon:
                p_tile = Polygon([
                    (left,top),
                    (left + tile_width,top),
                    (left + tile_width,top + tile_height),
                    (left,top + tile_height)
                ])
                intersection_area = p_label.intersection(p_tile).area

                mask_partial = intersection_area / p_tile.area
                tile_partial = intersection_area / label_area if label_area > 0 else 0
            else:
                # without a mask the intersection with the moire box is used
                intersection_box = [
                    max([relevant_box[0], left]),
                    max([relevant_box[1], top]),
                    min([relevant_box[0] + relevant_box[2], left+tile_width]),
                    min([relevant_box[1] + relevant_box[3], top+tile_height])
                ]

                if intersection_box[0] <= intersection_box[2] and intersection_box[1] <= intersection_box[3]:
                    mask_partial = ((intersection_box[2] - intersection_box[0]) * (intersection_box[3] - intersection_box[1]))  / (tile_width * tile_height)
                else:
                    mask_partial = 0

            if mask_partial > MIN_MASK_PARTIAL or tile_partial > MIN_TILE_PARTIAL:
                out.append(
                    img[
                        top:top+tile_height,
                        left:left+tile_width
                    ]
                )

    return out

In [None]:
def process_sample( row, dotenv ):
    """Cut matching moire / non-moire tiles for one mask row at every target resolution.

    The mask whose bbox equals `row.bbox` is loaded from the masks file (if there is
    one); otherwise the scaled bounding box decides which tiles are relevant.

    Returns:
        list of tuples ("<mask_id>_<resolution>", non_moire_tile, moire_tile)
    """
    bbox = [int(part) for part in row.bbox.split(';')]
    masks_path = get_related_filepath( row.job, f'halftone{dotenv["LOFI_DPI"]}dpi', f'{ row.pdf_filename }.masks.pkl' )
    matching_masks = [m for m in load_masks( masks_path ) if m['bbox'] == bbox]
    orig_non_moire_img, orig_moire_img = get_img_pair( row, dotenv )
    original_size = orig_non_moire_img.size

    # uint8 version of the first matching mask, or None when only the bbox is known
    whole_mask = None
    if len(matching_masks) > 0:
        whole_mask = (get_whole_mask( matching_masks[0] ) * 255).astype('uint8')

    def inverted_channel_3( source_img, resolution ):
        # rescale to the target resolution, take channel index 3 and invert it to 0..1
        rescaled = rescale_img_by_dpi( source_img, dotenv["LOFI_DPI"], resolution )
        return 1 - np.array(rescaled)[:,:,3] / 255

    tiles_by_resolution = []

    for resolution in RESOLUTIONS:
        non_moire_img = inverted_channel_3( orig_non_moire_img, resolution )
        moire_img = inverted_channel_3( orig_moire_img, resolution )

        rescale_factor = [
            moire_img.shape[1] / original_size[0],
            moire_img.shape[0] / original_size[1]
        ]
        mask_box = rescale_bounding_box_by_rescale_factor( bbox, rescale_factor )

        mask = None
        if whole_mask is not None:
            resized_mask = cv2.resize(
                whole_mask,
                (non_moire_img.shape[1], non_moire_img.shape[0])
            )
            mask = (resized_mask / 255).astype('bool')

        moire_tiles = cut_img_to_tiles( moire_img, mask_box, mask=mask)
        non_moire_tiles = cut_img_to_tiles( non_moire_img, mask_box, mask=mask)
        tiles_by_resolution.append((resolution, moire_tiles, non_moire_tiles))

    # pair the tiles of both images position by position
    out = []

    for resolution, moire_tiles, non_moire_tiles in tiles_by_resolution:
        for j in range(len(moire_tiles)):
            out.append((f'{ row.mask_id }_{ resolution }', non_moire_tiles[j], moire_tiles[j]))

    return out

In [None]:
def test_train_val_split( data, labels, test_size=.2, val_size=.1 ):
    idx_list = [i for i in range(len(data))]
    random.shuffle(idx_list)
    test_limiter = round(len(idx_list) * test_size)
    val_limiter = test_limiter + round(len(idx_list) * val_size)

    train_data = [data[idx] for idx in idx_list[val_limiter:]]
    train_labels = [labels[idx] for idx in idx_list[val_limiter:]]

    test_data = [data[idx] for idx in idx_list[:test_limiter]]
    test_labels = [labels[idx] for idx in idx_list[:test_limiter]]

    val_data = [data[idx] for idx in idx_list[test_limiter:val_limiter]]
    val_labels = [labels[idx] for idx in idx_list[test_limiter:val_limiter]]

    return train_data, train_labels, test_data, test_labels, val_data, val_labels

In [None]:
def write_tiles( data, set_name ):
    """Write tiles as progressive JPEGs below `DATASET_DIR / set_name`.

    Args:
        data: sequence of entries `(name, no_moire_tile, moire_tile)`. Position 1 is
            written to the `no_moire` sub directory, every later position to `moire`;
            `None` tiles are skipped. Tiles are multiplied by 255 and cast to uint8
            (presumably float arrays in 0..1 - TODO confirm).
        set_name: name of the dataset split directory, e.g. 'train'.

    Files are named "<name>.<index in data, zero padded to 4 digits>.jpg". Missing
    directories, including a missing DATASET_DIR, are created.
    """
    set_dir = DATASET_DIR / set_name
    # parents/exist_ok: works on a fresh dataset root and when the directory already exists
    set_dir.mkdir( parents=True, exist_ok=True )

    for i, entry in enumerate(data):
        for j in range(1,len(entry)):
            tile = entry[j]
            if tile is None:
                continue

            parent_dir_name = 'no_moire' if j == 1 else 'moire'
            parent_dir = set_dir / parent_dir_name
            parent_dir.mkdir( parents=True, exist_ok=True )

            out_path = parent_dir / f"{ entry[0] }.{str(i).zfill(4)}.jpg"

            img = Image.fromarray( np.uint8(tile * 255) ).convert('RGB')
            img.save( out_path, progressive=True )

In [None]:
def generate_tiles( sample, set_name ):
    """Create and write the tiles of every row in `sample` into the split `set_name`.

    Processing is best effort: a row that fails is reported with its mask id and
    the error, then skipped. Only `Exception` is caught, so KeyboardInterrupt and
    SystemExit still stop the run.
    """
    for i in tqdm(range(sample.shape[0])):
        row = sample.iloc[i]
        try:
            write_tiles( process_sample( row, dotenv ), set_name )
        except Exception as err:
            print( f'{ row.mask_id }: { type(err).__name__ }: { err }' )

In [None]:
def get_available_moires():
    """Return all rows of the `mask` table that Label Studio project 2 confirmed as moire.

    Masks and results are joined on the image name and the "x;y;width;height" bbox
    string; only rows labelled 'checked_moire' are returned.

    NOTE(review): this definition shadows the `get_available_moires` imported from
    `load_dataset` at the top of the notebook. The image name is built with a
    hard-coded '4c_600' variant, whereas other cells use dotenv['LOFI_DPI'].
    """
    # load results
    dotenv = load_dotenv()
    moire_results = [r for r in get_results_of_project(2) if 'checked_moire' in r['labels']]

    results_frame = pd.DataFrame(
        [
            (
                r['img_name'],
                ";".join([
                    str(r['value']['x']),
                    str(r['value']['y']),
                    str(r['value']['width']),
                    str(r['value']['height'])
                ]),
                r['rectanglelabels'][0]
            )
            for r in moire_results
        ],
        columns=['img_name','bbox','label']
    )

    # load masks; the connection is closed again even if the query fails
    con = sqlite3.connect( dotenv['DB_PATH'] )
    try:
        masks = pd.read_sql(
            'SELECT * FROM mask',
            con
        )
    finally:
        con.close()

    masks.loc[
        :,
        'img_name'
    ] = masks.apply(lambda val: f"{ val.job }.{ val.pdf_filename }.{ val.method }.{ val.idx }.4c_600.jpg", axis=1)

    # merge both frames
    merged = pd.merge(
        masks,
        results_frame,
        how="left",
        on=['img_name','bbox']
    )

    merged = merged.loc[
        merged.label == 'checked_moire'
    ]

    return merged

In [None]:
# NOTE(review): this always raises ZeroDivisionError - presumably a deliberate guard
# that stops "Run All" before the cells below start writing tiles. Confirm, and
# consider replacing it with an explicit, documented switch.
0 / 0

# tiles

In [None]:
# Shuffle all rows. No random_state is passed, so the order differs between runs,
# and re-running this cell reshuffles `data` again.
data = data.sample(frac=1)

In [None]:
# Number of masks that still need tiles.
data.shape[0]

In [None]:
# Target shares of the dataset splits; train takes whatever test and val leave over.
TEST_SIZE = .2
VAL_SIZE = .1
TRAIN_SIZE = 1 - TEST_SIZE - VAL_SIZE

# Number of masks handled in this run: at most 1000, fewer if less are available.
N = min(1000, data.shape[0])

In [None]:
# Distinct (split, class directory, mask id) combinations that already exist on disk.
# ok-sample tiles are skipped, and so are file names that do not follow the
# "<mask_id>_<resolution>.<index>" scheme (they used to crash on a `None` match).
entries = set()
for e in DATASET_DIR.glob('./**/*.jpg'):
    if "ok_sample" in e.name:
        continue

    name_match = re.match(r'(.+)_\d{3}.\d+', e.name.replace(e.suffix,''))
    if name_match is None:
        continue

    entries.add((e.parent.parent.name, e.parent.name, name_match.groups()[0]))

entries = list(entries)

current_train = len([e for e in entries if e[0]=='train'])
current_test = len([e for e in entries if e[0]=='test'])
current_val = len([e for e in entries if e[0]=='val'])

train_rows = []
test_rows = []
val_rows = []

# Assign each of the first N rows to a split that is currently below its target share,
# so the overall train/test/val proportions stay balanced across runs.
for i in range(N):
    test_count = current_test + len(test_rows)
    val_count = current_val + len(val_rows)
    train_count = current_train + len(train_rows)
    total = train_count + test_count + val_count
    possible_datasets = []

    # On an empty dataset total is 0: treat every share as 0 instead of dividing by zero.
    test_share = test_count / total if total else 0
    val_share = val_count / total if total else 0
    train_share = train_count / total if total else 0

    if test_share - TEST_SIZE < 0:
        possible_datasets.append('test')

    if val_share - VAL_SIZE < 0:
        possible_datasets.append('val')

    if train_share - TRAIN_SIZE < 0:
        possible_datasets.append('train')

    if len(possible_datasets) == 0:
        possible_datasets = ['train','test','val']

    next_dataset = random.choice(possible_datasets)

    if next_dataset == 'train':
        train_rows.append( data.iloc[i].name )
    elif next_dataset == 'test':
        test_rows.append( data.iloc[i].name )
    elif next_dataset == 'val':
        val_rows.append( data.iloc[i].name )

In [None]:
# Turn the collected index labels into the matching rows of `data`.
# NOTE(review): the names are rebound from lists of index labels to DataFrames, so this
# cell is not idempotent - re-running it without the previous cell filters by the
# DataFrames themselves.
test_rows = data.loc[data.index.isin(test_rows)]
val_rows = data.loc[data.index.isin(val_rows)]
train_rows = data.loc[data.index.isin(train_rows)]

In [None]:
# Rows per split for this run.
len(train_rows), len(test_rows), len(val_rows)

In [None]:
# Write the moire / no_moire tile pairs of every split below DATASET_DIR.
# Rows that fail are only printed by `generate_tiles`, not raised.
generate_tiles( train_rows, 'train' )
generate_tiles( test_rows, 'test' )
generate_tiles( val_rows, 'val' )

# ok_samples hinzufügen

In [None]:
# Count the ok-sample tiles already on disk and all remaining tiles.
# NOTE(review): the ok pattern only matches names that START with "ok_sample_"; the
# real_val tiles named "<id>_ok_sample_<resolution>" are therefore counted in `all_tiles`.
ok_tiles = len(list(dotenv['TILE_DATASET_DIR'].glob('./**/ok_sample_*.jpg')))
all_tiles = len(list(dotenv['TILE_DATASET_DIR'].glob('./**/*.jpg'))) - ok_tiles

In [None]:
# Number of ok-sample tiles still to create: half the count of the other tiles minus
# the ok samples that already exist. The value is negative once enough ok samples
# exist; the sampling loop below then does not run.
tile_target_count = round(all_tiles * 0.5 - ok_tiles)

In [None]:
# Connection used by the `related_files` query below.
# NOTE(review): it is not closed in the visible cells and `con` is rebound later.
con = sqlite3.connect( dotenv['DB_PATH'] )

In [None]:
def get_overlap( box, row, results_dict ):
    """Return the largest intersection area between `box` and the labelled boxes of a page.

    Args:
        box: `[x, y, width, height]`.
        row: object with `.job` and `.pdf_filename`.
        results_dict: `{job: {pdf_filename: [{'x', 'y', 'width', 'height', ...}, ...]}}`.

    Returns:
        The maximum intersection area, or 0 when the page has no labelled boxes or
        nothing overlaps.
    """
    if row.job not in results_dict or row.pdf_filename not in results_dict[row.job]:
        return 0

    box_right = box[0] + box[2]
    box_bottom = box[1] + box[3]
    areas = []

    for labelled in results_dict[row.job][row.pdf_filename]:
        left = max([box[0], labelled['x']])
        top = max([box[1], labelled['y']])
        right = min([box_right, labelled['x'] + labelled['width']])
        bottom = min([box_bottom, labelled['y'] + labelled['height']])

        # touching boxes count as an intersection with area 0
        if left <= right and top <= bottom:
            areas.append((right - left) * (bottom - top))

    return max(areas, default=0)

In [None]:
# Bring the moires known to the system into a shape that allows excluding them later.
real_moires = get_results_of_project(1)
for r in real_moires:
    r['dataset'] = 'real'

generic_moires = get_results_of_project(2)
for r in generic_moires:
    r['dataset'] = 'generic'

results = [
    r for r in real_moires + generic_moires
    if r['labels'][0] in ('checked_moire','moire_l_01','moire_l_05','moire_l_10','moire')
]

# The boxes are sorted into a dict by job and pdf_filename.
results_dict = {}

for r in results:
    # real and generic image names encode job / pdf_filename / dpi differently
    img_name_pattern = (
        r'(.+?)\..+?\.(.+)\.4c_(\d+)\.jpg'
        if r['dataset'] == 'real'
        else r'(.+?)\.(.+)\.soft_light.+\.4c_(\d+)\.jpg'
    )
    job, pdf_filename, dpi = re.match(img_name_pattern, r['img_name']).groups()
    dpi = int(dpi)

    out = {
        'x' : r['value']['x'],
        'y' : r['value']['y'],
        'width' : r['value']['width'],
        'height' : r['value']['height'],
        'img_name' : r['img_name']
    }

    # boxes are scaled to the LOFI_DPI resolution
    if dpi != dotenv['LOFI_DPI']:
        rescale_factor = dotenv['LOFI_DPI'] / dpi

        for key in ('x', 'y', 'width', 'height'):
            out[key] = out[key] * rescale_factor

    # file the result under its job and pdf_filename
    results_dict.setdefault(job, {}).setdefault(pdf_filename, []).append(out)

In [None]:
# All 'ps2400dpi150lpi' page images of type '4c_600' for the random-train-pages job;
# the ok-sample tiles below are cut from randomly drawn rows of this frame.
related_files = pd.read_sql(
    '''
        SELECT job, pdf_filename, filename FROM related_file rf
        WHERE rf.variant_name == 'ps2400dpi150lpi' AND
        "type" = '4c_600' AND job = '24-03-05-01_randomTrainPages'
    ''',
    con
)

In [None]:
def write_ok_samples( found_tiles ):
    """Shuffle ok-sample tiles, split them by TEST_SIZE / VAL_SIZE and write them.

    Args:
        found_tiles: list of tuples `(pdf_filename, dpi, (posX, posY), tile)`.
            The list itself is left untouched; a shuffled copy is split.

    The tile name contains the crop position in addition to pdf name and dpi.
    Without it, a later batch with the same pdf, dpi and list index silently
    overwrote tiles written by an earlier batch. The names still start with
    "ok_sample_", so the existing glob patterns keep matching.
    """
    tiles = list(found_tiles)
    random.shuffle(tiles)

    test_limiter = round(len(tiles) * TEST_SIZE)
    val_limiter = test_limiter + round(len(tiles) * VAL_SIZE)

    tiles_by_set = (
        ('train', tiles[val_limiter:]),
        ('test', tiles[:test_limiter]),
        ('val', tiles[test_limiter:val_limiter]),
    )

    for set_name, set_tiles in tiles_by_set:
        write_tiles(
            [(f"ok_sample_{t[0]}_{t[1]}_{t[2][0]}x{t[2][1]}_", t[3], None) for t in set_tiles],
            set_name
        )

In [None]:
# Draw random pages and cut random tiles from them until the target count of
# ok samples is reached. Tiles are written in batches of at least 100.
tiles_out = 0
found_tiles = []

with tqdm(total=tile_target_count) as pbar: 
    while tiles_out < tile_target_count:
        sample = related_files.sample(n=1).iloc[0]
        filepath = get_related_filepath( sample.job, 'ps2400dpi150lpi', sample.filename )
        orig_img = open_img(filepath)
        source_dpi = dotenv["LOFI_DPI"]
        tiles_of_img = []
        # Resizing a whole page is expensive, so every resolution is resized once per
        # page and reused for all tiles instead of once per tile.
        resized_by_dpi = {}

        i = 0
        # The more tiles a page has already delivered, the more likely the loop stops;
        # at 30 tiles it always does.
        while random.random() > (len(tiles_of_img) / 30) ** 2 and i < 1000:
            target_dpi = random.choice(RESOLUTIONS)

            if target_dpi not in resized_by_dpi:
                resized_by_dpi[target_dpi] = orig_img.resize((
                    round(orig_img.size[0] * (target_dpi / source_dpi)),
                    round(orig_img.size[1] * (target_dpi / source_dpi))
                ))
            img = resized_by_dpi[target_dpi]

            posX = random.randrange(img.size[0] - IMG_SIZE[0])
            posY = random.randrange(img.size[1] - IMG_SIZE[1])

            # channel index 3 of the crop, inverted and scaled to 0..1
            tile_img = 1 - np.array(img.crop((
                posX,posY,
                posX+IMG_SIZE[0],posY+IMG_SIZE[1]
            )))[:,:,3] / 255

            tiles_of_img.append((
                sample.pdf_filename,
                target_dpi,
                (posX,posY),
                tile_img
            ))

            i += 1

        found_tiles += tiles_of_img

        if len(found_tiles) >= 100:
            pbar.update(len(found_tiles))
            tiles_out += len(found_tiles)
            write_ok_samples(found_tiles)
            found_tiles = []

    # write whatever is left after the last full batch
    pbar.update(len(found_tiles))
    write_ok_samples( found_tiles )

# Real Validation schreiben

In [None]:
# NOTE(review): not referenced by any of the visible cells - confirm whether it is still needed.
MIN_IOU = 0.9

In [None]:
# Connection handed to the real-validation helpers below (they currently do not use it).
# NOTE(review): this rebinds `con` without closing the connection opened earlier.
con = sqlite3.connect(dotenv['DB_PATH'])

In [None]:
# Show the tile size used for the real-validation tiles.
IMG_SIZE

In [None]:
# Results of Label Studio project 3 that carry the label 'moire' and have an id.
moires = [
    r for r in get_results_of_project(3)
    if 'moire' in r['labels'] and 'id' in r
]

In [None]:
def process_real_moire( moire, dotenv, con ):
    """Cut moire tiles for one labelled real moire at every target resolution.

    Args:
        moire: Label Studio result with 'img_name', 'id', 'points', 'original_width'
            and 'original_height'.
        dotenv: configuration; only 'LOFI_DPI' is read.
        con: unused, kept for interface compatibility.

    Returns:
        list of tuples ("<id>_<resolution>", None, tile); empty when the related
        page image cannot be found.

    Raises:
        ValueError: when 'img_name' does not follow "<job>.<variant>.<pdf>.4c_<dpi>.jpg".
    """
    name_match = re.match(r'(.+?)\.(.+?)\.(.+)\.4c_(\d+).jpg', moire['img_name'])
    if name_match is None:
        raise ValueError(f"unexpected image name: { moire['img_name'] }")

    # variant name and dpi from the file name are not needed here
    job, _, pdf_filename, _ = name_match.groups()

    ps_path = get_related_filepath( job, 'ps2400dpi150lpi', f'{ pdf_filename }.4c_{ dotenv["LOFI_DPI"] }.jpg' )

    if ps_path is None:
        return []

    dpi = dotenv["LOFI_DPI"]
    ps_orig_img = open_img(ps_path)

    # normalise the label points to the size of the page image
    scale_factor_x = ps_orig_img.size[0] / moire['original_width']
    scale_factor_y = ps_orig_img.size[1] / moire['original_height']
    points_orig = [
        (pt[0] * scale_factor_x, pt[1] * scale_factor_y)
        for pt in moire['points']
    ]
    tiles = []

    for resolution in RESOLUTIONS:
        rescale_factor = resolution / dpi
        ps_img = ps_orig_img.resize((
            round(ps_orig_img.size[0] * rescale_factor),
            round(ps_orig_img.size[1] * rescale_factor)
        ))
        points = [(pt[0] * rescale_factor, pt[1] * rescale_factor) for pt in points_orig]
        # channel index 3 of the page, inverted and scaled to 0..1
        page = 1 - np.array(ps_img)[:,:,3] / 255

        tiles += [
            (f"{moire['id']}_{ resolution }", None, t)
            for t in cut_img_to_tiles(
                page,
                points,
                is_polygon_box=True
            )
        ]

    return tiles

In [None]:
def get_real_non_moire_tiles( moire, moires, dotenv, con ):
    """Draw up to 25 random tiles from the page of `moire` that touch no labelled moire.

    Args:
        moire: Label Studio result with 'img_name', 'id', 'original_width' and
            'original_height'.
        moires: all labelled moires; those with the same 'img_name' are avoided.
        dotenv: configuration; only 'LOFI_DPI' is read.
        con: unused, kept for interface compatibility.

    Returns:
        list of tuples ("<id>_ok_sample_<resolution>", tile, None); empty when the
        page image is missing or no resolution is larger than one tile.

    Raises:
        ValueError: when 'img_name' does not follow "<job>.<variant>.<pdf>.4c_<dpi>.jpg".

    Fixes compared to the previous version:
        * the tile is cropped from the image of the chosen resolution, not from the
          image of the last resolution in RESOLUTIONS,
        * a tile is accepted once, and only if it intersects none of the labelled
          polygons (it used to be appended once per non-intersecting polygon).
    """
    name_match = re.match(r'(.+?)\.(.+?)\.(.+)\.4c_(\d+).jpg', moire['img_name'])
    if name_match is None:
        raise ValueError(f"unexpected image name: { moire['img_name'] }")

    # variant name and dpi from the file name are not needed here
    job, _, pdf_filename, _ = name_match.groups()

    ps_path = get_related_filepath( job, 'ps2400dpi150lpi', f'{ pdf_filename }.4c_{ dotenv["LOFI_DPI"] }.jpg' )

    if ps_path is None:
        return []

    dpi = dotenv["LOFI_DPI"]
    ps_orig_img = open_img(ps_path)

    # normalise the label points to the size of the page image
    scale_factor_x = ps_orig_img.size[0] / moire['original_width']
    scale_factor_y = ps_orig_img.size[1] / moire['original_height']
    relevant_boxes_orig = [
        [(pt[0] * scale_factor_x, pt[1] * scale_factor_y) for pt in m['points']]
        for m in moires
        if m['img_name'] == moire['img_name']
    ]

    # page image and label polygons per target resolution
    resources_by_resolution = {}

    for resolution in RESOLUTIONS:
        rescale_factor = resolution / dpi
        ps_img = ps_orig_img.resize((
            round(ps_orig_img.size[0] * rescale_factor),
            round(ps_orig_img.size[1] * rescale_factor)
        ))

        # random.randrange needs a positive range, so skip pages not larger than a tile
        if ps_img.size[0] <= IMG_SIZE[0] or ps_img.size[1] <= IMG_SIZE[1]:
            continue

        resources_by_resolution[resolution] = {
            'img' : ps_img,
            'polygons' : [
                Polygon([(pt[0] * rescale_factor, pt[1] * rescale_factor) for pt in ptm])
                for ptm in relevant_boxes_orig
            ]
        }

    if len(resources_by_resolution) == 0:
        return []

    candidate_resolutions = list(resources_by_resolution.keys())
    iterations = 0
    tiles = []

    while len(tiles) < 25 and iterations < 1000:
        iterations += 1

        target_resolution = random.choice(candidate_resolutions)
        target = resources_by_resolution[target_resolution]

        posX = random.randrange( target['img'].size[0] - IMG_SIZE[0] )
        posY = random.randrange( target['img'].size[1] - IMG_SIZE[1] )

        p_tile = Polygon([
            (posX,posY),
            (posX + IMG_SIZE[0],posY),
            (posX + IMG_SIZE[0],posY + IMG_SIZE[1]),
            (posX,posY + IMG_SIZE[1])
        ])

        # the tile must be free of every labelled moire on this page
        if any(p_label.intersection(p_tile).area > 0 for p_label in target['polygons']):
            continue

        tiles.append((
            target_resolution,
            1 - np.array(target['img'].crop((
                posX,
                posY,
                posX + IMG_SIZE[0],
                posY + IMG_SIZE[1]
            )))[:,:,3] / 255
        ))

    return [(f"{ moire['id'] }_ok_sample_{ t[0] }", t[1], None ) for t in tiles]

In [None]:
# Write the real-validation tiles of every labelled moire that has no moire tiles yet.
for m in tqdm(moires):
    # Moire tiles are named "<id>_<resolution>.<index>.jpg". The underscore after the id
    # keeps e.g. id 12 from being treated as done because tiles of id 123 exist.
    existing_tiles = list((dotenv['TILE_DATASET_DIR'] / 'real_val' / 'moire').glob(f'./{ m["id"] }_*.jpg'))

    if len(existing_tiles) == 0:
        moire_tiles = process_real_moire(m, dotenv, con)
        write_tiles( moire_tiles, 'real_val' )

        no_moire_tiles = get_real_non_moire_tiles( m, moires, dotenv, con )
        write_tiles( no_moire_tiles, 'real_val' )

In [None]:
# Tiles of the last moire processed by the loop above.
# NOTE(review): this displays the whole list of arrays and is undefined if the loop
# skipped every moire - probably leftover debugging.
moire_tiles

In [None]:
# NOTE(review): `real_moire_tiles` and `real_non_moire_tiles` are not defined in any
# visible cell, so this raises NameError on a fresh kernel unless they come from
# somewhere outside this view - confirm or remove.
len(real_moire_tiles), len(real_non_moire_tiles)