# Загрузка библиотек

In [1]:
import os
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
# Single seed shared by the random number generators used in this notebook.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
# NumPy keeps its own global generator that random.seed() does not touch;
# seed it too so NumPy-based randomness is reproducible after a kernel restart.
np.random.seed(RANDOM_STATE)

# Формирование датасета с метками

In [3]:
def walk_through_dir(dir_path):
    """Print a one-line summary for every directory found under ``dir_path``.

    The tree is traversed with ``os.walk``; for each visited directory the
    number of immediate subdirectories and the number of files it contains
    are printed together with the directory path.

    Args:
        dir_path: Root directory to traverse (anything ``os.walk`` accepts).

    Returns:
        None. The summary is written to standard output only.
    """
    for current_dir, subdirs, files in os.walk(dir_path):
        n_dirs, n_files = len(subdirs), len(files)
        print(f"There are {n_dirs} directories and {n_files} unknown files\tin '{current_dir}'.")

In [4]:
# Root directory that the rest of this notebook walks to build the label table.
# The call below prints how many subdirectories and files each directory holds.
data_path = Path("../data/sorted_data_merged")
walk_through_dir(data_path)

There are 10 directories and 0 unknown files	in '..\data\sorted_data_merged'.
There are 0 directories and 385 unknown files	in '..\data\sorted_data_merged\anticlass'.
There are 0 directories and 3080 unknown files	in '..\data\sorted_data_merged\anticlass_augm'.
There are 0 directories and 1222 unknown files	in '..\data\sorted_data_merged\damaged'.
There are 0 directories and 557 unknown files	in '..\data\sorted_data_merged\garbage'.
There are 0 directories and 302 unknown files	in '..\data\sorted_data_merged\other'.
There are 0 directories and 3308 unknown files	in '..\data\sorted_data_merged\receipts'.
There are 0 directories and 100 unknown files	in '..\data\sorted_data_merged\receipt_damaged'.
There are 0 directories and 577 unknown files	in '..\data\sorted_data_merged\receipt_undamaged'.
There are 0 directories and 267 unknown files	in '..\data\sorted_data_merged\terminal'.
There are 0 directories and 2472 unknown files	in '..\data\sorted_data_merged\undamaged'.


In [5]:
# NOTE(review): no output is recorded for this call. os.walk() ignores listing
# errors by default, so a missing directory prints nothing — confirm the path exists.
walk_through_dir(Path("../data/sorted_data_zharikov"))

In [6]:
# NOTE(review): no output is recorded for this call. os.walk() ignores listing
# errors by default, so a missing directory prints nothing — confirm the path exists.
walk_through_dir(Path("../data/sorted_data_smirnov"))

In [7]:
# Dataset with classes: one row per file found under `data_path`, with binary
# label columns derived from the name of the directory that contains the file.
label_columns = [
    'quality_photo', 'terminal',
    'receipt', 'terminal_damaged', 'terminal_undamaged',
    'terminal_unrecognized_defect', 'other', 'anticlass'
]
markup_columns = ['file_name'] + label_columns
# Not needed by this cell any more; kept in case a later cell still reads it.
row_size = len(markup_columns)

# Labels switched to 1 for each known directory name. `quality_photo` is
# handled separately: it is 1 everywhere except in `garbage`.
# A directory that is not listed here gets quality_photo=1 and no other label.
dir_labels = {
    'anticlass': ('anticlass',),
    'anticlass_augm': ('anticlass',),
    'garbage': (),
    'other': ('other',),
    'damaged': ('terminal', 'terminal_damaged'),
    'receipt_damaged': ('terminal', 'receipt', 'terminal_damaged'),
    'receipt_undamaged': ('terminal', 'receipt', 'terminal_undamaged'),
    'terminal': ('terminal', 'terminal_unrecognized_defect'),
    'undamaged': ('terminal', 'terminal_undamaged'),
    'receipts': ('receipt',),
}

markup_records = []
for dirpath, dirnames, filenames in tqdm(os.walk(data_path)):
    print(dirpath)
    if not filenames:
        continue
    # Path(...).name and os.path.join use the separator of the current OS;
    # a hard-coded '\\' only matches Windows paths, and on any other system
    # no directory name would be recognised (all labels would stay 0).
    dir_name = Path(dirpath).name
    row_labels = dict.fromkeys(label_columns, 0)
    row_labels['quality_photo'] = 0 if dir_name == 'garbage' else 1
    for label in dir_labels.get(dir_name, ()):
        row_labels[label] = 1
    markup_records.extend(
        {'file_name': os.path.join(dirpath, file_name), **row_labels}
        for file_name in filenames
    )

# Build the frame once from plain records: labels are real integers (not the
# '0.0' strings produced by mixing file names and zeros in one NumPy array),
# the index is already 0..n-1, and there is no repeated concat inside the loop.
# With no files at all this yields an empty frame with the same columns.
df_sorted_data = pd.DataFrame(markup_records, columns=markup_columns)

0it [00:00, ?it/s]

..\data\sorted_data_merged
..\data\sorted_data_merged\anticlass
..\data\sorted_data_merged\anticlass_augm
..\data\sorted_data_merged\damaged
..\data\sorted_data_merged\garbage
..\data\sorted_data_merged\other
..\data\sorted_data_merged\receipts
..\data\sorted_data_merged\receipt_damaged
..\data\sorted_data_merged\receipt_undamaged
..\data\sorted_data_merged\terminal
..\data\sorted_data_merged\undamaged


In [8]:
# Cast every label column (all columns except the leading file_name) to int.
# The values can be strings such as '0.0' at this point, which cannot be
# converted to int directly, hence the intermediate float cast.
label_cols = df_sorted_data.columns[1:]
df_sorted_data[label_cols] = df_sorted_data[label_cols].astype(float).astype(int)

## Проверки нелогичных сочетаний

In [9]:
# A terminal must not be labelled damaged and undamaged at the same time:
# the selection below is expected to be empty (0 rows).
df_sorted_data[
    df_sorted_data['terminal_damaged'].eq(1)
    & df_sorted_data['terminal_undamaged'].eq(1)
].shape

(0, 9)

In [10]:
# Column totals over the rows labelled `other`; apart from quality_photo and
# other itself, every label total is expected to be 0.
# NOTE(review): file_name is summed as well, which concatenates all the path
# strings; slicing with .iloc[:, 1:] before .sum() would skip it.
df_sorted_data[df_sorted_data['other'].eq(1)].sum(axis=0)

file_name                       ..\data\sorted_data_merged\other\Без поврежден...
quality_photo                                                                 302
terminal                                                                        0
receipt                                                                         0
terminal_damaged                                                                0
terminal_undamaged                                                              0
terminal_unrecognized_defect                                                    0
other                                                                         302
anticlass                                                                       0
dtype: object

In [11]:
# Rows marked as terminal must carry at least one terminal state label
# (damaged, undamaged or unrecognized defect); the selection of rows that
# have none of them is expected to be empty.
df_sorted_data[
    df_sorted_data['terminal'].eq(1)
    & df_sorted_data['terminal_damaged'].eq(0)
    & df_sorted_data['terminal_undamaged'].eq(0)
    & df_sorted_data['terminal_unrecognized_defect'].eq(0)
].shape

(0, 9)

In [12]:
# Label totals over the rows with quality_photo == 0 (file_name is skipped
# via iloc); every total is expected to be 0.
df_sorted_data[df_sorted_data['quality_photo'].eq(0)].iloc[:, 1:].sum(axis=0)

quality_photo                   0
terminal                        0
receipt                         0
terminal_damaged                0
terminal_undamaged              0
terminal_unrecognized_defect    0
other                           0
anticlass                       0
dtype: int64

Проверки пройдены

In [13]:
# Persist the markup table as a pickle file.
# NOTE(review): file_name holds paths relative to "../data/...", so the saved
# table is only usable from a matching working directory — confirm consumers do so.
df_sorted_data.to_pickle('../data/common_files/df_markup.pkl')

In [14]:
# Display the final markup table (pandas shows only the first and last rows).
# NOTE(review): in the recorded output the file names under `undamaged` look
# mis-encoded compared with those under `anticlass` — presumably an encoding
# problem in the files on disk; verify.
df_sorted_data

Unnamed: 0,file_name,quality_photo,terminal,receipt,terminal_damaged,terminal_undamaged,terminal_unrecognized_defect,other,anticlass
0,..\data\sorted_data_merged\anticlass\ЗНО030127...,1,0,0,0,0,0,0,1
1,..\data\sorted_data_merged\anticlass\ЗНО030127...,1,0,0,0,0,0,0,1
2,..\data\sorted_data_merged\anticlass\ЗНО030127...,1,0,0,0,0,0,0,1
3,..\data\sorted_data_merged\anticlass\ЗНО030128...,1,0,0,0,0,0,0,1
4,..\data\sorted_data_merged\anticlass\ЗНО030128...,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
12265,..\data\sorted_data_merged\undamaged\╨Ч╨Э╨Ю030...,1,1,0,0,1,0,0,0
12266,..\data\sorted_data_merged\undamaged\╨Ч╨Э╨Ю030...,1,1,0,0,1,0,0,0
12267,..\data\sorted_data_merged\undamaged\╨Ч╨Э╨Ю030...,1,1,0,0,1,0,0,0
12268,..\data\sorted_data_merged\undamaged\╨Ч╨Э╨Ю030...,1,1,0,0,1,0,0,0
