In [1]:
# Reload imported modules automatically before each cell runs, so that edits
# to project code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# load pipeline and step parameters - do not edit
from sinara.substep import get_pipeline_params, get_step_params
pipeline_params = get_pipeline_params(pprint=True)
step_params = get_step_params(pprint=True)

**Pipeline params:**


{'X': 'something',
 'env_name': 'user',
 'pipeline_name': 'pipeline',
 'zone_name': 'zone'}




**Step params:**


{'Y': 'something_else'}




In [3]:
# Sub-step parameters: settings specific to this notebook (none are defined).
substep_params = {}

In [4]:
# Define the substep interface: declare the entities this notebook works with.
# The interface info printed below shows one resolved path per declared entity.
from sinara.substep import NotebookSubstep, ENV_NAME, PIPELINE_NAME, ZONE_NAME, STEP_NAME, RUN_ID, ENTITY_NAME, ENTITY_PATH, SUBSTEP_NAME

# Built from the pipeline, step and sub-step parameters loaded in the cells above.
substep = NotebookSubstep(pipeline_params, step_params, substep_params)

substep.interface(
    # Temporary working directories. Only tmp_dir_to_store is used by the
    # cells shown in this notebook; tmp_dir_to_load is declared but unused here.
    tmp_entities =
    [
        { ENTITY_NAME: "tmp_dir_to_store" },
        { ENTITY_NAME: "tmp_dir_to_load" }
    ],
    
    # Disabled example: an input entity declared with an explicit path.
    # custom_inputs = 
    # [
    #     { ENTITY_NAME: "big_file", ENTITY_PATH: "/data/tmp/user/pipeline/zone/big_file" }
    # ],
    
    # Output entity that receives the packed archive in the benchmark cell.
    outputs = 
    [
        { ENTITY_NAME: "stored_files" }
    ]
)

# Prints the step name and the resolved path of every declared entity.
substep.print_interface_info()

# NOTE(review): presumably stops execution here when the notebook is run only
# to visualize the pipeline — confirm against the sinara documentation.
substep.exit_in_visualize_mode()

**STEP NAME:**


'sinara_quick_test'




**OUTPUTS:**


[{'user.pipeline.zone.sinara_quick_test.stored_files': '/data/home/jovyan/pipeline/zone/sinara_quick_test/run-24-06-11-113955/stored_files'}]




**TMP ENTITIES:**


[{'tmp:user.pipeline.zone.sinara_quick_test.tmp_dir_to_store': '/tmp/env/user/pipeline/zone/sinara_quick_test/run-24-06-11-113955/tmp_dir_to_store'},
 {'tmp:user.pipeline.zone.sinara_quick_test.tmp_dir_to_load': '/tmp/env/user/pipeline/zone/sinara_quick_test/run-24-06-11-113955/tmp_dir_to_load'}]




In [5]:
from sinara.spark import SinaraSpark

# Start the Spark session that is later handed to SinaraArchive.
# NOTE(review): the meaning of the argument 0 is not visible in this notebook —
# confirm against SinaraSpark.run_session.
spark = SinaraSpark.run_session(0)
# Last expression of the cell.
# NOTE(review): presumably yields the address of the Spark UI — confirm.
SinaraSpark.ui_url()

Session is run


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 11:39:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
import zlib
import shutil
import os

def crc32(fileName):
    """Return the CRC-32 checksum of a file as an 8-digit uppercase hex string.

    The file is opened in binary mode and consumed in 64 KiB chunks, so the
    whole content is never held in memory at once.

    Args:
        fileName: Path of the file to checksum.

    Returns:
        The checksum formatted as eight uppercase hexadecimal digits,
        zero-padded on the left (an empty file gives "00000000").

    Raises:
        OSError: If the file cannot be opened or read.
    """
    checksum = 0
    with open(fileName, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''
        for chunk in iter(lambda: stream.read(65536), b''):
            checksum = zlib.crc32(chunk, checksum)
    return format(checksum & 0xFFFFFFFF, '08X')

def rm_rf(folder):
    """Delete everything inside ``folder`` while keeping the folder itself.

    Deletion is best-effort: a failure on one entry is reported with a printed
    message and the remaining entries are still processed.

    Args:
        folder: Path of the directory to empty. A path that does not exist is
            treated as already empty and nothing happens.

    Returns:
        None.
    """
    try:
        entries = os.listdir(folder)
    except FileNotFoundError:
        # Nothing to clean up; same outcome as `rm -rf` on a missing path
        return

    for filename in entries:
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                shutil.rmtree(file_path)
            else:
                # Regular files, symlinks (including links to directories and
                # broken links) and special files such as FIFOs or sockets
                os.unlink(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

In [7]:
# create test data (disabled: the benchmark cell below generates its own files)
# import os, random, string
# from pathlib import Path
# tmp_entities = substep.tmp_entities()


# with open(f'{tmp_entities.tmp_dir_to_store}/big_file.bin', 'wb') as f:
#     f.write(os.urandom(1000000))    # generate random content file larger than ROW_SIZE
# crc1 = crc32(f'{tmp_entities.tmp_dir_to_store}/big_file.bin')

# Path(tmp_entities.tmp_dir_to_store, "subdir").mkdir(parents=True, exist_ok=True)

# with open(f'{tmp_entities.tmp_dir_to_store}/subdir/sub_big_file.bin', 'wb') as f:
#     f.write(os.urandom(1000000))    # generate random content file larger than ROW_SIZE in subdir
# crc1 = crc32(f'{tmp_entities.tmp_dir_to_store}/subdir/sub_big_file.bin')

In [8]:
from sinara.archive import SinaraArchive
# Archive helper bound to the Spark session. The variable is spelled "arhive"
# and later cells refer to it under that name.
arhive = SinaraArchive(spark)

In [9]:
import glob
import os, string, time, random
from pathlib import Path

# Resolve the paths of the entities declared in the substep interface.
tmp_entities = substep.tmp_entities()
outputs = substep.outputs()

# Show what is already in the temporary source directory before the benchmark.
# An empty directory prints [''] because the '/**' pattern also matches the
# directory itself, whose basename is the empty string.
stored_files = [os.path.basename(x) for x in glob.glob(tmp_entities.tmp_dir_to_store + '/**', recursive=True)]
print(stored_files)

# Volume of data generated for every benchmark round, in bytes.
total_size = 5000000000
# Per the notes at the end of this notebook, BLOCK_SIZE defines the number of
# partitions and ROW_SIZE the file chunk size.
arhive.BLOCK_SIZE = 100 * 1024 * 1024
#arhive.ROW_SIZE = 100 * 1024

# 100000 is listed twice.
# NOTE(review): presumably the first round serves as a warm-up — confirm.
for file_size in [100000, 100000, 10000000, 100000000]:
    files_count = total_size // file_size
    # Write exactly files_count files so the reported count matches what is
    # actually packed (the range previously stopped one file short).
    for i in range(1, files_count + 1):
        with open(f'{tmp_entities.tmp_dir_to_store}/big_file_{i}.bin', 'wb') as f:
            f.write(os.urandom(file_size))

    # Time only the packing step, not the data generation or the cleanup.
    t_start = time.perf_counter()
    arhive.pack_files_from_tmp_to_store(tmp_entities.tmp_dir_to_store, outputs.stored_files)
    all_time = time.perf_counter() - t_start

    print(f"size: {file_size}; count: {files_count}; time: {all_time}")

    # Empty both directories so the next round starts from a clean state.
    rm_rf(tmp_entities.tmp_dir_to_store)
    rm_rf(outputs.stored_files)
    

['']


                                                                                

size: 100000; count: 50000; time: 18.785607662051916


                                                                                

size: 100000; count: 50000; time: 12.21726895030588


                                                                                

size: 10000000; count: 500; time: 25.16897779982537


                                                                                

size: 100000000; count: 50; time: 9.512062037363648


In [10]:
# Stop the Spark session that was started earlier in this notebook.
SinaraSpark.stop_session()

# 10
```
24/06/11 08:15:24 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 10000; count: 500000; time: 54.528294269926846
size: 10000; count: 500000; time: 50.58823824673891
size: 100000; count: 50000; time: 10.615476909093559
size: 10000000; count: 500; time: 10.243693131022155
size: 100000000; count: 50; time: 9.453986265696585

size: 10000; count: 500000; time: 54.31516759702936
size: 10000; count: 500000; time: 51.02222422324121
size: 100000; count: 50000; time: 9.990651411004364
size: 10000000; count: 500; time: 9.846800909843296
size: 100000000; count: 50; time: 9.265271169133484

size: 100000; count: 50000; time: 16.17284558620304
24/06/11 08:52:51 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 100000; count: 50000; time: 9.992413585074246
size: 10000000; count: 500; time: 9.9931136877276
size: 100000000; count: 50; time: 9.151462461333722
```

# 20
```
size: 100000; count: 50000; time: 15.445336972828954
size: 100000; count: 50000; time: 9.1085306070745
size: 10000000; count: 500; time: 10.776335041038692
size: 100000000; count: 50; time: 8.808542616665363

24/06/11 09:09:24 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 10000; count: 500000; time: 53.49461039388552
size: 10000; count: 500000; time: 49.78237945586443
size: 100000; count: 50000; time: 9.234182722866535
size: 100000; count: 50000; time: 8.735903637949377
size: 10000000; count: 500; time: 10.782224525231868
size: 100000000; count: 50; time: 8.998356034047902
```

# 50
```
24/06/11 09:17:50 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 10000; count: 500000; time: 68.61532019311562
size: 10000; count: 500000; time: 65.46855010604486
size: 100000; count: 50000; time: 11.48712065583095
size: 100000; count: 50000; time: 10.554944942239672
size: 10000000; count: 500; time: 15.843907570000738

```

# 100
```
24/06/11 09:24:00 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 10000; count: 500000; time: 68.16287109581754
size: 10000; count: 500000; time: 65.88318243110552
size: 100000; count: 50000; time: 12.077333530876786
size: 100000; count: 50000; time: 10.022179109975696
size: 10000000; count: 500; time: 25.557263953145593
size: 100000000; count: 50; time: 9.462536289356649
```

# 10
BLOCK_SIZE = 10 * 1024 * 1024 — defines the number of partitions<br>
ROW_SIZE = 100 * 1024 — defines the file chunk size
```
size: 1000000; count: 5000; time: 54.4906174539974
size: 1000000; count: 5000; time: 46.96132314600254
size: 10000000; count: 500; time: 65.37026316500123
size: 100000000; count: 50; time: 67.36531332600134

size: 100000; count: 50000; time: 59.05600920800134
size: 1000000; count: 5000; time: 48.675525227001344
size: 10000000; count: 500; time: 66.57178226700125
size: 100000000; count: 50; time: 66.9379985839987

size: 100000; count: 50000; time: 57.292460402997676
size: 100000; count: 50000; time: 54.66207834600209
size: 10000000; count: 500; time: 68.61515304999921
size: 100000000; count: 50; time: 64.979802793001


24/06/10 15:51:49 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 10000; count: 500000; time: 161.07436843399773
size: 10000; count: 500000; time: 142.60217995700077
size: 100000; count: 50000; time: 54.30553206100012
size: 10000000; count: 500; time: 65.6778254469973
size: 100000000; count: 50; time: 62.57982315299887
```

# 20
arhive.BLOCK_SIZE = 20 * 1024 * 1024<br>
arhive.ROW_SIZE = 100 * 1024
```
24/06/10 15:14:08 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 10000; count: 500000; time: 150.41656158399564
size: 10000; count: 500000; time: 144.28740993400424
size: 100000; count: 50000; time: 51.9730229850029
size: 10000000; count: 500; time: 68.41051908200461
size: 100000000; count: 50; time: 63.28373109500535
```

# 30
BLOCK_SIZE = 30 * 1024 * 1024<br>
ROW_SIZE = 100 * 1024
```
24/06/10 15:14:08 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
size: 10000; count: 500000; time: 156.74156803900405
size: 10000; count: 500000; time: 165.01108286800445
size: 100000; count: 50000; time: 51.56755971300299
size: 10000000; count: 500; time: 70.55579972799751
size: 100000000; count: 50; time: 64.03572041999723
```