In [None]:
# Make the parent directory importable.
# NOTE(review): presumably that is where the project modules imported below
# (helper, get_labelstudio_data, file_interaction, add_data_to_db) live — confirm.
import sys
sys.path.append('..')

In [2]:
import sqlite3
import pandas as pd
from helper import load_dotenv
from get_labelstudio_data import get_results_of_project, get_tasks
from file_interaction import get_related_filepath, get_blobs, download_blob, upload_file, upload_buffer, copy_blob
import re
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
from datetime import datetime
import requests
from PIL import Image
from azure.storage.blob import BlobServiceClient

In [3]:
# Load the configuration once (load_dotenv comes from the project's helper module).
# Later cells read DB_PATH, LABEL_STUDIO_TOKEN, LOFI_DPI, AZURE_CONNECTION_STRING
# and AZURE_CONTAINER_NAME from this mapping.
dotenv = load_dotenv()

In [4]:
# Open the SQLite database at the configured path; `con` is handed to
# add_related_files further down in this notebook.
con = sqlite3.connect(dotenv['DB_PATH'])

# Label Transfer

In [None]:
def create_annotation( target_task, results ):
    """Post one annotation with polygon regions to a Label Studio task.

    Parameters
    ----------
    target_task : dict
        Task record; the keys ``data_path``, ``id`` and ``project_id`` are read.
    results : iterable of dict
        One entry per region. ``points`` is a list of ``[x, y]`` pairs in pixel
        coordinates of the target image, ``labels`` is the list of label names.

    Returns
    -------
    requests.Response
        The response of the POST request.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Previously the response
        was discarded, so failed uploads went unnoticed.
    """
    # The image is only opened to read its pixel size.
    # NOTE(review): assumes download_blob returns something Image.open accepts
    # (path or file-like object) — confirm against file_interaction.
    with Image.open( download_blob(target_task['data_path']) ) as img:
        img_width, img_height = img.size

    headers = {'Authorization': f'Token { dotenv["LABEL_STUDIO_TOKEN"] }'}

    polygon_results = [
        {
            "original_width": img_width,
            "original_height": img_height,
            "image_rotation": 0,
            "value": {
                # Pixel coordinates are converted to percentages of the image size.
                "points" : [
                    [pt[0] / img_width * 100, pt[1] / img_height * 100]
                    for pt in r['points']
                ],
                "closed": True,
                "polygonlabels": r['labels']
            },
            "from_name": "label",
            "to_name": "image",
            "type": "polygonlabels",
            "origin": "manual"
        }
        for r in results
    ]

    json_data = {
        "completed_by": 1,
        "result": polygon_results,
        "was_cancelled": False,
        "ground_truth": False,
        "draft_created_at": datetime.now().isoformat(),
        "task": target_task["id"],
        "project": target_task["project_id"],
        "updated_by": 1,
        "parent_prediction": None,
        "parent_annotation": None
    }

    response = requests.post(
        f'http://localhost:8080/api/tasks/{ target_task["id"] }/annotations/',
        headers=headers,
        json=json_data,
        # Without a timeout a stalled server would block the notebook forever.
        timeout=60
    )
    # Surface rejected requests instead of silently dropping the annotation.
    response.raise_for_status()

    return response

In [26]:
# Strips the ".4c_<three digits>.jpg" suffix from an image name; the remainder
# is used as the grouping key.
image_key_pattern = re.compile(r'(.+)\.4c_\d{3}\.jpg$')

# Source: results of project 1, restricted to locally stored images and
# excluding results whose first label is 'potential_moire'.
source_results = []
for result in get_results_of_project(1):
    if result['labels'][0] == 'potential_moire':
        continue
    if result['storage_type'] != 'local':
        continue
    source_results.append(result)

# Target: every result of project 3.
target_results = get_results_of_project(3)

# Group the source results by image key.
source_results_by_image = {}
for result in source_results:
    image_key = image_key_pattern.match(result['img_name']).group(1)
    source_results_by_image.setdefault(image_key, []).append(result)

# Group the target results the same way.
target_results_by_image = {}
for result in target_results:
    image_key = image_key_pattern.match(result['img_name']).group(1)
    target_results_by_image.setdefault(image_key, []).append(result)

In [47]:
# Collect every source image key whose 'ps2400dpi150lpi' counterpart has no
# results in the target project yet.
keys_to_process = []

for source_key in source_results_by_image:
    # Key layout: <job>.<variant>.<filename>; the variant part is replaced.
    key_match = re.match(r'(.+?)\.(.+?)\.(.+)', source_key)
    job, filename = key_match.group(1), key_match.group(3)
    target_key = f'{ job }.ps2400dpi150lpi.{ filename }'

    if target_key in target_results_by_image:
        continue

    keys_to_process.append(source_key)

In [72]:
# Transfer the bounding boxes of every pending source image to the matching
# task of project 3 as rectangular polygons.

# The task list is only filtered by data_path below, so it is fetched once
# instead of once per key.
target_project_tasks = get_tasks(3)

for key in tqdm(keys_to_process):
    # Image name layout: <job>.<variant>.<filename>.4c_<dpi>.jpg
    job,_,filename,source_dpi = re.match(r'(.+?)\.(.+?)\.(.+)\.4c_(\d{3})\.jpg', source_results_by_image[key][0]['img_name']).groups()
    source_dpi = int(source_dpi)
    # float() keeps the division working whether the configured value is a
    # number or a numeric string.
    rescale_factor = float(dotenv['LOFI_DPI']) / source_dpi
    target_file_name = f'{ job }.ps2400dpi150lpi.{ filename }.4c_{ dotenv["LOFI_DPI"] }.jpg'

    transformed_results = []
    for r in source_results_by_image[key]:
        # Scale the box edges from source resolution to target resolution.
        left = r['bbox']['x'] * rescale_factor
        top = r['bbox']['y'] * rescale_factor
        right = (r['bbox']['x'] + r['bbox']['width']) * rescale_factor
        bottom = (r['bbox']['y'] + r['bbox']['height']) * rescale_factor

        transformed_results.append({
            # Corners in order: top-left, top-right, bottom-right, bottom-left.
            'points' : [
                [left, top],
                [right, top],
                [right, bottom],
                [left, bottom]
            ],
            'labels' : r['labels']
        })

    # NOTE(review): this is a substring match on data_path; confirm that no
    # other task path can contain the same file name.
    tasks = [t for t in target_project_tasks if target_file_name in t['data_path']]

    if len(tasks) > 0:
        task = tasks[0]

        print( task )
        create_annotation(
            task,
            transformed_results
        )

  0%|          | 0/297 [00:00<?, ?it/s]

In [None]:
# NOTE(review): `images_to_process` is not defined anywhere in the visible part
# of this notebook — confirm where it comes from; on a fresh kernel this cell
# would raise NameError.
images_to_process.file_transferred.sum()

In [None]:
# NOTE(review): `images_to_process` is not defined anywhere in the visible part
# of this notebook — confirm where it comes from; on a fresh kernel this cell
# would raise NameError.
images_to_process.labels_transferred.sum()

In [None]:
# NOTE(review): `images_to_process` is not defined anywhere in the visible part
# of this notebook — confirm where it comes from; on a fresh kernel this cell
# would raise NameError.
images_to_process.shape[0]

# Übertragene Blobs löschen

In [81]:
def delete_blob( blob_name ):
    """Delete the blob `blob_name` from the configured Azure container.

    Connection string and container name are read from `dotenv`
    (AZURE_CONNECTION_STRING, AZURE_CONTAINER_NAME). A new service client is
    created on every call.
    """
    service = BlobServiceClient.from_connection_string(dotenv['AZURE_CONNECTION_STRING'])

    service.get_blob_client(
        container=dotenv['AZURE_CONTAINER_NAME'],
        blob=blob_name
    ).delete_blob()

In [111]:
def delete_task( target_task ):
    """Delete a task from the Label Studio instance on localhost:8080.

    Parameters
    ----------
    target_task : dict
        Task record; only the ``id`` key is read.

    Returns
    -------
    requests.Response
        The response of the DELETE request.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Raising here stops a
        caller from going on to delete the task's blob although the task
        itself still exists.
    """
    headers = {'Authorization': f'Token { dotenv["LABEL_STUDIO_TOKEN"] }'}

    response = requests.delete(
        f'http://localhost:8080/api/tasks/{ target_task["id"] }',
        headers=headers,
        # Without a timeout a stalled server would block the notebook forever.
        timeout=60
    )
    response.raise_for_status()

    return response

In [133]:
# Fetch the tasks of project 3 once so both selections work on the same
# snapshot (previously the list was requested twice).
project_tasks = get_tasks(3)

# Deletion candidates: Azure-stored 'vps2400dpi150lpi' images at 300 dpi.
source_tasks = [
    t for t in project_tasks
    if (t['storage_type'] == 'azure') and ('vps2400dpi150lpi' in t['data_path']) and (t['data_path'].endswith('.4c_300.jpg'))
]

# Replacements: Azure-stored 600 dpi images that already carry annotations.
# Note that 'ps2400dpi150lpi' is also a substring of 'vps2400dpi150lpi', so
# this filter does not exclude the 'vps…' variant by itself.
target_tasks = [
    t for t in project_tasks
    if (t['storage_type'] == 'azure') and ('ps2400dpi150lpi' in t['data_path']) and (t['data_path'].endswith('.4c_600.jpg')) and t['total_annotations'] > 0
]

In [134]:
# Sanity check: number of deletion candidates vs. number of annotated target tasks.
len(source_tasks), len(target_tasks)

(3647, 38)

In [132]:
# For every annotated 600 dpi target task, remove the corresponding 300 dpi
# 'vps2400dpi150lpi' task and its blob.
has_deleted = False

for tt in tqdm(target_tasks):
    # Path layout: <dirs>/<job>.<variant>.<filename>.4c_600.jpg
    job, _, filename = re.match(r'.+/(.+?)\.(.+?)\.(.+)\.4c_600\.jpg', tt['data_path']).groups()
    source_filename = f"{ job }.vps2400dpi150lpi.{ filename }.4c_300.jpg"

    # Match the complete file name at the end of the path. A plain substring
    # test would also hit a different job whose name merely ends with `job`
    # (e.g. '12' inside '112') and delete the wrong task and blob.
    relevant_source_tasks = [
        t for t in source_tasks
        if t['data_path'].endswith('/' + source_filename)
    ]

    for t in relevant_source_tasks:
        # Task first, then the blob it points to.
        delete_task( t )
        delete_blob( t['data_path'] )
        has_deleted = True

  0%|          | 0/38 [00:00<?, ?it/s]

# Related Files nachtragen

In [None]:
from helper import get_pdf_page_processing_status

In [None]:
# NOTE(review): the two arguments look like a variant name and a type/channel
# code — confirm against helper.get_pdf_page_processing_status.
data_overview = get_pdf_page_processing_status( 'ps2400dpi150lpi', 'K' )

In [None]:
# Shape of the subset of rows whose file_available flag equals False.
file_missing = data_overview.file_available == False
data_overview.loc[file_missing].shape

In [None]:
# Count, per job, the rows whose file_available flag equals False.
file_missing = data_overview.file_available == False
data_overview.loc[file_missing].job.value_counts()

In [None]:
# Preview the first 50 rows whose file_available flag equals False.
file_missing = data_overview.file_available == False
data_overview.loc[file_missing][:50]

In [None]:
# First 50 rows whose file_available flag equals False.
# NOTE(review): this cell repeats the expression of the cell directly above it;
# one of the two can probably be removed.
data_overview.loc[
    data_overview.file_available == False
][:50]

In [None]:
# Distribution of the file_available flag over all rows.
data_overview.file_available.value_counts()

In [None]:
from file_interaction import get_data_files

In [None]:
from add_data_to_db import add_variants

In [None]:
# List the available data files. add_related_files unpacks each entry as a
# (filepath, storage_type) pair.
all_files = get_data_files()

In [None]:
# Drop every entry whose path contains '.DS_Store'.
all_files = [
    file_entry for file_entry in all_files
    if str(file_entry[0]).find('.DS_Store') == -1
]

In [None]:
# NOTE(review): add_related_files is defined in a cell further down in this
# notebook; on a fresh kernel that cell has to be executed before this one,
# otherwise this call raises NameError.
add_related_files( all_files, con )

In [None]:
def add_related_files( all_files, con ):
    """Insert data files that are not yet registered into the ``related_file`` table.

    Parameters
    ----------
    all_files : iterable of (path, storage_type)
        ``path`` is converted with ``str()`` and cut at the first occurrence of
        ``'data'``. The remainder must look like
        ``data/<job>/<variant_name>/<filename>``; shallower paths are skipped.
        ``storage_type`` is not stored.
    con : sqlite3.Connection
        Open connection to the database containing ``related_file``.

    Notes
    -----
    * Files of the variant ``pdf`` and files named ``.DS_Store`` are ignored.
    * ``<filename>`` is split at its last two dots: the part before them is
      stored as ``pdf_filename``, the part between them as ``type``. File names
      with fewer than two dots are ignored.
    * All variant names found are passed to ``add_variants`` first.
    * A row is inserted only if no identical row exists yet; repeated entries
      within ``all_files`` are inserted once.

    Raises
    ------
    ValueError
        If a path does not contain ``'data'``.
    """
    related_files = []

    for filepath, _storage_type in all_files:
        filepath = str(filepath)
        filepath = filepath[filepath.index('data'):]

        parts = filepath.split('/')
        if len(parts) < 4:
            # Not deep enough for data/<job>/<variant>/<filename>; skip it.
            continue

        job, variant_name, filename = parts[1:4]

        if variant_name != 'pdf' and filename != '.DS_Store':
            related_files.append({
                'job' : job,
                'variant_name' : variant_name,
                'filename' : filename
            })

    # Register the variants.
    variants = list(set([rf['variant_name'] for rf in related_files]))
    add_variants( variants, con )

    # Parameterised statements: values are bound by the driver, so quotes in
    # file names can neither break nor alter the SQL.
    select_sql = '''
        SELECT 1 FROM related_file
        WHERE
            variant_name=? AND
            pdf_filename=? AND
            job=? AND
            "type"=? AND
            filename=?
    '''
    insert_sql = '''
        INSERT INTO related_file (variant_name,pdf_filename,job,type,filename)
        VALUES (?,?,?,?,?)
    '''

    # Filter out duplicates.
    data_to_add = []
    seen_entries = set()
    c = con.cursor()

    try:
        for rf in related_files:
            res = re.match(r'(.+)\.(.+?)\.(.+?)$', rf['filename'])

            if not res:
                continue

            file_entry = (
                rf['variant_name'],
                res.groups()[0],
                rf['job'],
                res.groups()[1],
                rf['filename']
            )

            # The same file may be listed more than once (e.g. for several
            # storage types); only consider it once.
            if file_entry in seen_entries:
                continue
            seen_entries.add(file_entry)

            c.execute(select_sql, file_entry)

            # fetchone() returns None when no matching row exists. The former
            # check compared the regex match object with the row, which was
            # always true, so nothing was ever inserted.
            if c.fetchone() is None:
                data_to_add.append(file_entry)

        if len(data_to_add) > 0:
            c.executemany(insert_sql, data_to_add)
            con.commit()
    finally:
        c.close()