In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local helper modules are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
#1. import dsml base module
from dsml_s8e.notebook import CurrentComponent, ResourceComponent, RunResult
from dsml_s8e.module import DSMLModule, DSMLModuleRunResult

In [None]:
#2. specify parameters

# Parameters
# Run-level settings; a later cell passes them to DSMLModule together with `parameters`.
run_parameters = {
    "env_name": "user",
    "product_name": "cv_example",
    "stand_name": "YOLOX_mmdet",
    # NOTE(review): the image tag says "no-gpu" while conda_env below is "gpu" — confirm the pairing is intended.
    "docker_image": "cv-no-gpu:latest",
    "conda_env": "gpu",
    "business_report_repo": "",
    "infra": {},
    "comment": {},
}

# Component-level parameters; empty in this notebook.
parameters = {}

In [None]:
import json

# Show the component parameters in effect for this run, pretty-printed as JSON.
print(json.dumps(parameters, indent=4))

In [None]:
from timeit import default_timer as timer

import os
import csv
import json
import random
import cv2
from tqdm import tqdm
import numpy as np
import random

# Seed both random number generators used in this notebook with the same fixed value.
np.random.seed(42)
random.seed(42)

module = DSMLModule(parameters, run_parameters)

# Temporary fast cache (per the original note: SSD-like, ~100 GB) holding the
# unpacked data used for train/test; it is removed when the job ends.
cache_urls = module.make_cache_urls(
    new_cache_entity_names=[],
    last_cache_entity_names=["cache_data"],
)

# Resource URLs published by the "1_data_import" component; only the split config is needed here.
resource_urls = module.make_component_resource_urls(
    "1_data_import",
    entity_names=["split_config"],
)

In [None]:
from dsml_s8e.spark import SparkEnvironment

# stopSparkSession() is called before runSparkSession(); presumably this clears a
# session left over from an earlier run of this cell — TODO confirm.
SparkEnvironment.stopSparkSession()
# NOTE(review): the meaning of the positional argument 0 is not visible in this notebook — confirm against dsml_s8e.spark.
spark = SparkEnvironment.runSparkSession(0)
SparkEnvironment.showSparkUI()

import atexit
# Also stop the session at interpreter exit. atexit.register returns the function
# it was given; binding it to `_` keeps that value out of the cell output.
_=atexit.register(SparkEnvironment.stopSparkSession)

In [None]:
# List the local cache directory (long format, hidden entries included, human-readable sizes).
!ls -lah {cache_urls.cache_data}

In [None]:
def get_files(path, extensions, relative=True):
    from pathlib import Path

    if type(extensions) is str:
        extensions = [extensions]

    all_files = []
    for ext in extensions:
        if relative:
            all_files.extend(Path(path).rglob(ext))
        else:
            all_files.extend(Path(path).glob(ext))

    for i in range(len(all_files)):
        all_files[i] = str(all_files[i])

    return all_files

Для того, чтобы загрузить один файл из *HDFS* в локальный кэш, нужно использовать функцию:
```
DSMLStore.copy_file_to_cache(resource_hdfs_path, target_cache_path)
```

In [None]:
from dsml_s8e.store import DSMLStore

# Copy split_config.json from the "split_config" resource of the "1_data_import"
# component into the local cache directory, where the next cell opens it.
DSMLStore.copy_file_to_cache(
    os.path.join(resource_urls.split_config, 'split_config.json'),
    os.path.join(cache_urls.cache_data, 'split_config.json')
)

In [None]:
# Location of the split configuration inside the local cache directory.
split_config_path = os.path.join(cache_urls.cache_data, 'split_config.json')

# Pin the encoding: without it open() falls back to the locale default,
# which makes reading the JSON file environment-dependent.
with open(split_config_path, encoding='utf-8') as f:
    SPLIT_CONFIG = json.load(f)

In [None]:
# Cache-directory paths of the dataset folders named in the split config.
CONFIG = {
    'TRAIN_DATASET_DIRS': [
        os.path.join(cache_urls.cache_data, dataset_name)
        for dataset_name in SPLIT_CONFIG['train_datasets']
    ],
    'EVAL_DATASET_DIRS': [
        os.path.join(cache_urls.cache_data, dataset_name)
        for dataset_name in SPLIT_CONFIG['eval_datasets']
    ],
}

In [None]:
from utils import get_files
from utils.coco import preview_coco_file, load as load_coco, dump as dump_coco

In [None]:
# JSON files found under the training dataset directories
# (later cells load them as COCO annotation files).
real_files = []
for dir_path in CONFIG.get('TRAIN_DATASET_DIRS', []):
    real_files.extend(get_files(dir_path, '*.json'))
print(f"{len(real_files)=}")

# Same search over the evaluation dataset directories.
eval_files = []
for dir_path in CONFIG.get('EVAL_DATASET_DIRS', []):
    eval_files.extend(get_files(dir_path, '*.json'))
print(f"{len(eval_files)=}")

In [None]:
# Preview the first annotation file of each split. The preview stays best-effort,
# but a failure is now reported instead of being silently dropped, and
# KeyboardInterrupt / SystemExit are no longer swallowed by a bare `except`.
for files in [real_files, eval_files]:
    if not files:
        continue
    try:
        preview_coco_file(files[0], max_objects=2000)
    except Exception as exc:
        print(f"Could not preview {files[0]}: {exc!r}")

In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt

# Accumulators over every annotation file (train and eval together).
areas  = []                 # one entry per annotation: its 'area' (None when the key is absent)
counts = []                 # one entry per image: number of annotations attached to it
categories = []             # one entry per annotation: its 'category_id' (None when the key is absent)
categories_annotation = []  # concatenated 'categories' sections of all files, repeats included

for files in [real_files, eval_files]:
    if files:
        for file in tqdm(files):
            coco_data = load_coco(file)

            # Index the annotations by image once per file. Rescanning the whole
            # annotation list for every image costs O(images * annotations).
            anns_by_image = defaultdict(list)
            for ann in coco_data.get('annotations', []):
                anns_by_image[ann['image_id']].append(ann)

            for image in coco_data.get('images', []):
                # .get() so that images without annotations do not insert
                # empty lists into the defaultdict.
                image_anns = anns_by_image.get(image['id'], [])
                counts.append(len(image_anns))

                for ann in image_anns:
                    areas.append(ann.get('area'))
                    categories.append(ann.get('category_id'))

            # .get() for consistency with the 'annotations' and 'images' lookups above.
            categories_annotation += coco_data.get('categories', [])

counts = np.array(counts)
areas  = np.array(areas)

In [None]:
# De-duplicate the category records gathered from all files.
# The key is a canonical JSON dump (sort_keys=True), so two records with the same
# content but different key order count as one. A dict keeps first-seen order,
# so the printed order is the same on every run (iterating a set of strings is not).
unique_categories = {}
for _category in categories_annotation:
    unique_categories.setdefault(json.dumps(_category, sort_keys=True), _category)

for _category in unique_categories.values():
    print(_category)

In [None]:
# Distinct 'category_id' values that occur in the collected annotations.
np.unique(categories)

In [None]:
import plotly.express as px

# Histogram of annotation areas over all collected objects.
# Title typo fixed: 'Площать' -> 'Площадь'.
fig = px.histogram(areas, title='Площадь объектов на фотографиях датасета')
fig.layout.yaxis.title = 'Кол-во объектов'
fig.layout.xaxis.title = 'Площадь'
fig.show()

In [None]:
# Histogram of the number of annotated objects per image.
# The x axis carries the binned values (objects per photo) and the y axis the
# number of photos in each bin; the original axis titles were swapped.
fig = px.histogram(counts, title='Кол-во объектов на фотографиях датасета')
fig.layout.xaxis.title = 'Кол-во объектов'
fig.layout.yaxis.title = 'Кол-во фотографий'
fig.show()

In [None]:
#11. stop the Spark session started earlier in this notebook

SparkEnvironment.stopSparkSession()