In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#1. import dsml base module
from dsml_s8e.notebook import CurrentComponent, ResourceComponent, RunResult
from dsml_s8e.module import DSMLModule, DSMLModuleRunResult

In [None]:
# 2. Specify parameters

# Parameters
# Run-level settings; passed to DSMLModule together with `parameters`.
run_parameters = {
    "env_name": "user",
    "product_name": "cv_example",
    "stand_name": "YOLOX_mmdet",
    "docker_image": "cv-no-gpu:latest",
    "conda_env": "gpu",
    "business_report_repo": "",
    "infra": {},
    "comment": {},
}

# Module-level settings: one entry per dataset location to import.
# NOTE(review): this is an absolute, user-specific path - confirm it is valid
# in the environment where the notebook is executed.
parameters = {
    "all_datasets": [
        "/data/home/lakidon_pm/cv_example/YOLOX_mmdet/all_data",
    ],
}

In [None]:
import json

# Echo the module parameters as pretty-printed JSON so they appear in the
# notebook output.
parameters_json = json.dumps(parameters, indent=4)
print(parameters_json)

In [None]:
from timeit import default_timer as timer

import os
import csv
import json
import random
import cv2
from tqdm import tqdm
import numpy as np
import random

# Seed both the stdlib and the NumPy global RNGs with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Module handle; used below to build the cache, resource and artifact URLs.
module = DSMLModule(parameters, run_parameters)

# Temporary fast cache (e.g. a ~100 GB SSD) for the files unpacked from
# parquet; needed for train/test and removed after the job ends.
cache_urls = module.make_cache_urls(
    new_cache_entity_names=["cache_data"],
    last_cache_entity_names=[],
)

# Independent copy of the configured dataset locations; only its length is
# used below, to generate one resource entity per dataset.
data_custom_list = list(parameters['all_datasets'])

# Resources of the "1_data_import" component, one per dataset, named
# all_data_0, all_data_1, ...
resource_urls = module.make_component_resource_urls(
    "1_data_import",
    entity_names=[f'all_data_{i}' for i in range(len(data_custom_list))],
)

# Artifact location for the train/eval split description.
a7s_urls = module.make_artifacts_urls(
    entity_names=["split_config"],
)

module.print_urls()

In [None]:
from dsml_s8e.spark import SparkEnvironment

# Stop the Spark session before starting a new one (presumably to drop a
# session left over from an earlier run of this cell - TODO confirm).
SparkEnvironment.stopSparkSession()
# NOTE(review): the meaning of the argument 0 is not visible in this notebook;
# check dsml_s8e.spark.SparkEnvironment before changing it.
spark = SparkEnvironment.runSparkSession(0)
SparkEnvironment.showSparkUI()

import atexit
# Also stop the session when the interpreter exits. The return value of
# register() is bound to `_` so that the cell produces no output.
_=atexit.register(SparkEnvironment.stopSparkSession)

In [None]:
# Display the cache directory path; the cells below write files into it and
# list its contents.
cache_urls.cache_data

In [None]:
# CACHE DATA

from timeit import default_timer as timer

from os import path, makedirs

# Start the wall-clock timer; the elapsed time is reported once all datasets
# have been written to the cache.
t_begin = timer()
print("spark read start")

def save_file(file):
    """Write one row of a dataset DataFrame to the cache directory.

    Args:
        file: Row-like object exposing ``file_names`` (path of the file,
            joined onto ``cache_urls.cache_data``) and ``files_binary``
            (the bytes to write).

    Returns:
        None. The file is created (or overwritten) on disk, together with
        any missing parent directories.
    """
    target_path = path.join(cache_urls.cache_data, file.file_names)
    payload = file.files_binary

    # The relative name may contain sub-directories that do not exist yet.
    parent_dir = path.dirname(target_path)
    makedirs(parent_dir, exist_ok=True)

    with open(target_path, 'wb') as out_stream:
        out_stream.write(payload)
        
# Unpack every imported dataset into the cache directory.
for i in range(len(parameters['all_datasets'])):
    # Resolve the URL attribute by name with getattr() instead of eval():
    # same lookup, but no code is executed from a dynamically built string.
    df_spark = spark.read.parquet(getattr(resource_urls, f'all_data_{i}'))
    # save_file is invoked once per row and writes that row's bytes to disk.
    # NOTE(review): foreach() runs on Spark executors, so the cache directory
    # must be reachable from wherever they run - confirm for this environment.
    df_spark.foreach(save_file)

# Report how long filling the cache took.
process_time = timer() - t_begin
print(f"spark data to cache end\nt = {process_time:0.1f} sec\n{'=' * 100}")

In [None]:
# List the entries of the cache directory via the shell.
!ls {cache_urls.cache_data}

In [None]:
# Split train and val

# Top-level entries of the cache directory are assigned to a split by name:
# anything containing 'eval' is an evaluation dataset, the rest is training.
#
# * sorted(): os.listdir() returns entries in arbitrary order, which would
#   make the saved split config differ from run to run.
# * 'split_config.json' is written into this same directory by a later cell;
#   it is skipped so that re-running this cell does not list it as a
#   training dataset.
dataset_names = sorted(
    entry
    for entry in os.listdir(cache_urls.cache_data)
    if entry != 'split_config.json'
)

train_datasets = [name for name in dataset_names if 'eval' not in name]
eval_datasets = [name for name in dataset_names if 'eval' in name]

In [None]:
# Show which datasets ended up in each split.
print(f'train_datasets={train_datasets!r}')
print(f'eval_datasets={eval_datasets!r}')

In [None]:
# Split description: dataset names grouped by split.
SPLIT_CONFIG = dict(
    train_datasets=train_datasets,
    eval_datasets=eval_datasets,
)

# Write the split description as JSON into the cache directory.
split_config_path = os.path.join(cache_urls.cache_data, 'split_config.json')
with open(split_config_path, 'w') as config_file:
    json.dump(SPLIT_CONFIG, config_file)

Для того, чтобы скопировать **только один файл** из кэша на *HDFS*, существует функция
```
DSMLStore.copy_file_to_store(resource_cache_path, target_hdfs_path)
```
Если файлов много, то нужно конвертировать их в формат `parquet` и использовать *Spark* для загрузки. Пример для этого случая будет в модуле `3_aug_dataset_and_save_prepared.ipynb`.

Также существует функция 
```
DSMLStore.copy_file_to_cache(resource_hdfs_path, target_cache_path)
```
но ей мы воспользуемся в модуле `2_preview_dataset.ipynb`.

In [None]:
from dsml_s8e.store import DSMLStore

# Copy the split description from the cache directory to its artifact
# location, keeping the same file name.
split_config_cache_path = os.path.join(cache_urls.cache_data, 'split_config.json')
split_config_store_path = os.path.join(a7s_urls.split_config, 'split_config.json')
DSMLStore.copy_file_to_store(split_config_cache_path, split_config_store_path)

In [None]:
# 11. Stop the Spark session.

SparkEnvironment.stopSparkSession()