In [1]:
from helper import load_dotenv
import sqlite3
import pandas as pd
import re

In [2]:
# Load the notebook configuration through the project helper; it is indexed
# below with the keys 'DB_PATH' and 'LABEL_STUDIO_DB_PATH' (and 'LOFI_DPI'
# further down in the notebook).
config = load_dotenv()
# SQLite handle used for the `pdf_page` and `related_file` queries.
con = sqlite3.connect(config['DB_PATH'])
# SQLite handle used for the Label Studio `task` / storage-link query.
con_ls = sqlite3.connect(config['LABEL_STUDIO_DB_PATH'])

In [3]:
# Read the whole `pdf_page` table, derive `variant_name` from the screen
# ruling, and key the frame by (job, filename, variant_name) so that later
# cells can attach per-page columns by index alignment.
pdfs = pd.read_sql(
    sql='''
        SELECT * FROM pdf_page
    ''',
    con=con,
)

pdfs = pdfs.assign(
    variant_name=pdfs.screen_ruling.map(lambda val: f'vps2400dpi{val}lpi')
).set_index(['job', 'filename', 'variant_name'])

In [4]:
# Pull the complete `related_file` table; the following cells filter it by
# its `type` column and group it per (job, pdf_filename, variant_name).
related_files = pd.read_sql(
    sql='''
        SELECT * FROM related_file
    ''',
    con=con,
)

In [5]:
# Preview: number of related files per (job, filename, variant_name) whose
# type does not contain '4c'. Rows with a missing type are excluded too
# (na=True marks them as "contains", and the mask is then inverted).
(
    related_files.loc[
        ~related_files['type'].str.contains('4c', na=True),
        ['pdf_filename', 'job', 'variant_name', 'type'],
    ]
    .rename(columns={'pdf_filename': 'filename'})
    .groupby(['job', 'filename', 'variant_name'])
    .agg(vps_count=('type', 'count'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,vps_count
job,filename,variant_name,Unnamed: 3_level_1
148903,133,vps2400dpi150lpi,4
148903,133,vps2400dpi175lpi,4
148903,140,vps2400dpi150lpi,4
148903,140,vps2400dpi175lpi,4
148903,151,vps2400dpi150lpi,4
...,...,...,...
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p95,vps2400dpi150lpi,4
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p96,vps2400dpi150lpi,4
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p97,vps2400dpi150lpi,4
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p98,vps2400dpi150lpi,4


In [6]:
# Attach to every PDF page the number of related files whose type does not
# contain '4c'. Rows with a missing type are excluded as well: na=True marks
# them as "contains" before the mask is inverted, which matches the previous
# `== False` comparison.
vps_counts = (
    related_files.loc[
        ~related_files['type'].str.contains('4c', na=True),
        ['pdf_filename', 'job', 'variant_name', 'type'],
    ]
    .rename(columns={'pdf_filename': 'filename'})
    .groupby(['job', 'filename', 'variant_name'])['type']
    .count()
)

# Column assignment aligns the counts on the (job, filename, variant_name)
# index of `pdfs`; pages without any matching related file come out as NaN.
pdfs['vps_count'] = vps_counts

# Assign the filled column back instead of calling
# `pdfs.vps_count.fillna(0, inplace=True)`: that form operates on a temporary
# Series, triggers a chained-assignment warning in recent pandas and leaves
# `pdfs` unchanged once Copy-on-Write is active.
pdfs['vps_count'] = pdfs['vps_count'].fillna(0)

In [7]:
# Flag every PDF page that has at least one related file of type '4c'.
four_c_counts = (
    related_files.loc[
        related_files['type'] == '4c',
        ['pdf_filename', 'job', 'variant_name', 'type'],
    ]
    .rename(columns={'pdf_filename': 'filename'})
    .groupby(['job', 'filename', 'variant_name'])['type']
    .count()
)

# Column assignment aligns the counts on the (job, filename, variant_name)
# index of `pdfs`; pages without a '4c' file come out as NaN.
pdfs['4c'] = four_c_counts

# Fill and compare in one assignment instead of `pdfs['4c'].fillna(0,
# inplace=True)`: the in-place call works on a temporary Series, warns about
# chained assignment in recent pandas and is a no-op under Copy-on-Write.
pdfs['4c'] = pdfs['4c'].fillna(0) > 0

In [8]:
# Flag every PDF page that has at least one related file of the low-fidelity
# 4c type. The type / column name is built once from config["LOFI_DPI"]
# instead of repeating the f-string in every statement.
lofi_key = f'4c_{ config["LOFI_DPI"] }'

lofi_counts = (
    related_files.loc[
        related_files['type'] == lofi_key,
        ['pdf_filename', 'job', 'variant_name', 'type'],
    ]
    .rename(columns={'pdf_filename': 'filename'})
    .groupby(['job', 'filename', 'variant_name'])['type']
    .count()
)

# Column assignment aligns the counts on the (job, filename, variant_name)
# index of `pdfs`; pages without such a file come out as NaN.
pdfs[lofi_key] = lofi_counts

# Fill and compare in one assignment instead of `fillna(0, inplace=True)` on
# a column selection: the in-place call works on a temporary Series, warns
# about chained assignment in recent pandas and is a no-op under
# Copy-on-Write.
pdfs[lofi_key] = pdfs[lofi_key].fillna(0) > 0

In [9]:
# From the Label Studio database: one row per task that has a local-files
# import storage link, carrying the task id, project id, is_labeled flag and
# the storage key of the linked file.
annotations = pd.read_sql(
    sql='''
        SELECT t.id, t.project_id, t.is_labeled, isl.key FROM task t 
        JOIN io_storages_localfilesimportstoragelink isl
        ON t.id = isl.task_id
    ''',
    con=con_ls,
)

In [10]:
def process_key( key ):
    """Break the file name of a storage key into its naming components.

    Only the part of ``key`` after the last ``/`` is looked at. It is matched
    against five dot-separated groups: the first two are taken as short as
    possible, the remaining three greedily, so any additional dots end up in
    the third group.

    Args:
        key: Storage key (path-like string).

    Returns:
        A dict with the keys ``'job'``, ``'variant_name'``, ``'filename'`` and
        ``'rf_type'`` (groups 1, 2, 3 and 4 of the match; the fifth group, the
        extension, is discarded), or ``None`` when the name does not match.
    """
    basename = key.rsplit('/', 1)[-1]
    match = re.match(r'^(.+?)\.(.+?)\.(.+)\.(.+)\.(.+)$', basename)

    if match is None:
        return None

    # zip() stops after the four field names, which drops the extension group.
    field_names = ('job', 'variant_name', 'filename', 'rf_type')
    return dict(zip(field_names, match.groups()))

In [11]:
# Split every storage key into job / variant_name / filename / rf_type
# columns (process_key yields these four fields for a matching key).
annotations.loc[
    :,
    ["job", "variant_name", "filename", "rf_type"]
] = annotations.apply( lambda row: process_key(row.key), axis='columns', result_type='expand' )

# Index the tasks the same way `pdfs` is indexed so both can be aligned.
page_key = ['job', 'filename', 'variant_name']
annotations = annotations.set_index(page_key)

# Reduce to one flag per page before aligning. `rf_type` is not part of the
# index, so several tasks can share one (job, filename, variant_name) label,
# and a Series with repeated labels cannot be aligned onto `pdfs` (pandas
# raises on reindexing duplicate labels). A page counts as labeled when any
# of its tasks is labeled; with unique labels this is the unchanged value.
labeled_by_page = annotations.groupby(level=page_key)['is_labeled'].max()

# Pages without any task get NaN here, which the comparison turns into False.
pdfs['is_labeled'] = labeled_by_page
pdfs['is_labeled'] = pdfs['is_labeled'] == 1

In [12]:
# Display the per-page frame with the vps_count, 4c, 4c_<LOFI_DPI> and
# is_labeled columns added by the preceding cells.
pdfs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,screen_ruling,import_timestamp,origin,vps_count,4c,4c_300,is_labeled
job,filename,variant_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
613233,tg46_UP_016_001_MRZ23_UPSUBHMRZ_SU_AS001_DE_DE_002_kor.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_006_007_MRZ23_UPSUBHMRZ_SU_AS004_DE_DE_002.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_004_005_MRZ23_UPSUBHMRZ_SU_AS003_DE_DE_002.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_002_003_MRZ23_UPSUBHMRZ_SU_AS002_DE_DE_001.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_004_005_MRZ23_UPSUBHMRZ_SU_AS003_NL_NL_001.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
...,...,...,...,...,...,...,...,...,...
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p73,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p72,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p66,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p99,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True


In [13]:
# Display the per-page frame once more (same expression as the preceding
# cell).
pdfs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,screen_ruling,import_timestamp,origin,vps_count,4c,4c_300,is_labeled
job,filename,variant_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
613233,tg46_UP_016_001_MRZ23_UPSUBHMRZ_SU_AS001_DE_DE_002_kor.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_006_007_MRZ23_UPSUBHMRZ_SU_AS004_DE_DE_002.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_004_005_MRZ23_UPSUBHMRZ_SU_AS003_DE_DE_002.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_002_003_MRZ23_UPSUBHMRZ_SU_AS002_DE_DE_001.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
613233,tg46_UP_004_005_MRZ23_UPSUBHMRZ_SU_AS003_NL_NL_001.p1,vps2400dpi150lpi,150,,,4.0,True,True,True
...,...,...,...,...,...,...,...,...,...
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p73,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p72,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p66,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True
616466,tg45_000_INM0323_DE_MEN_KERN_TAUSCH_kor1.p99,vps2400dpi150lpi,150,2023-08-09 08:59:22.900893,,4.0,True,True,True


In [14]:
lofi_key = f'4c_{ config["LOFI_DPI"] }'

# Each line reports "<pages for which the condition holds>/<all pages>".
total_pages = pdfs.shape[0]
vps_pages = int((pdfs.vps_count > 0).sum())
four_c_pages = int(pdfs['4c'].sum())
lofi_pages = int(pdfs[lofi_key].sum())
labeled_pages = int(pdfs.is_labeled.sum())

print( f"VPS erzeugt:\t{vps_pages}/{ total_pages }" )
print( f"4c erzeugt:\t{four_c_pages}/{ total_pages }" )
print( f"{ lofi_key } erzeugt:\t{lofi_pages}/{ total_pages }" )

print( f"gelabelt:\t{labeled_pages}/{ total_pages }" )

VPS erzeugt:	1292/1436
4c erzeugt:	1291/1436
4c_300 erzeugt:	1287/1436
gelabelt:	1287/1436
