In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# load pipeline and step parameters - do not edit
# Both getters are called with pprint=True; the cell output shows the loaded
# values under the "Pipeline params" and "Step params" headings.
from sinara.substep import get_pipeline_params, get_step_params
pipeline_params = get_pipeline_params(pprint=True)
step_params = get_step_params(pprint=True)

**Pipeline params:**


{'X': 'something',
 'env_name': 'user',
 'pipeline_name': 'pipeline',
 'zone_name': 'zone'}




**Step params:**


{'Y': 'something_else'}




In [3]:
# Specify sub_step parameters
# This notebook defines none, so the mapping is left empty; it is passed to
# NotebookSubstep together with the pipeline and step parameters.
substep_params = {}

In [4]:
# define substep interface
from sinara.substep import NotebookSubstep, ENV_NAME, PIPELINE_NAME, ZONE_NAME, STEP_NAME, RUN_ID, ENTITY_NAME, ENTITY_PATH, SUBSTEP_NAME

# Build the substep from the pipeline-, step- and substep-level parameters.
substep = NotebookSubstep(pipeline_params, step_params, substep_params)

# Temporary entities: one directory whose content is packed into the store and
# one directory that receives the unpacked copy later in this notebook.
tmp_entity_specs = [
    {ENTITY_NAME: entity_name}
    for entity_name in ("tmp_dir_to_store", "tmp_dir_to_load")
]

# Output entity that the packed files are written to.
output_specs = [{ENTITY_NAME: "stored_files"}]

substep.interface(
    tmp_entities=tmp_entity_specs,

    # Template for declaring an input by explicit path (kept disabled):
    # custom_inputs =
    # [
    #     { ENTITY_NAME: "big_file", ENTITY_PATH: "/data/tmp/user/pipeline/zone/big_file" }
    # ],

    outputs=output_specs,
)

# Show the resolved entity names and paths in the cell output.
substep.print_interface_info()

substep.exit_in_visualize_mode()

**STEP NAME:**


'sinara_quick_test'




**OUTPUTS:**


[{'user.pipeline.zone.sinara_quick_test.stored_files': '/data/home/jovyan/pipeline/zone/sinara_quick_test/run-24-07-22-110101/stored_files'}]




**TMP ENTITIES:**


[{'tmp:user.pipeline.zone.sinara_quick_test.tmp_dir_to_store': '/tmp/env/user/pipeline/zone/sinara_quick_test/run-24-07-22-110101/tmp_dir_to_store'},
 {'tmp:user.pipeline.zone.sinara_quick_test.tmp_dir_to_load': '/tmp/env/user/pipeline/zone/sinara_quick_test/run-24-07-22-110101/tmp_dir_to_load'}]




In [5]:
from sinara.spark import SinaraSpark

# Start the Spark session used by the archive helper below; the cell output
# reports "Session is run".
# NOTE(review): the meaning of the 0 argument is not visible in this notebook - confirm against SinaraSpark.run_session.
spark = SinaraSpark.run_session(0)
# Last expression of the cell; presumably yields the Spark UI address - TODO confirm.
SinaraSpark.ui_url()

Session is run


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/22 11:01:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/07/22 11:01:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
import zlib

def crc32(fileName):
    """Return the CRC-32 checksum of a file as an 8-digit uppercase hex string.

    The file is opened in binary mode and consumed in 65536-byte chunks, so the
    whole content is never held in memory at once.

    Args:
        fileName: Path of the file to checksum.

    Returns:
        The checksum formatted as eight uppercase hexadecimal digits; an empty
        file yields "00000000".
    """
    checksum = 0
    with open(fileName, 'rb') as stream:
        # The two-argument iter() keeps calling read() until it returns b'' (EOF).
        for chunk in iter(lambda: stream.read(65536), b''):
            checksum = zlib.crc32(chunk, checksum)
    return f"{checksum & 0xFFFFFFFF:08X}"

In [7]:
# create test data
import os, random, string
from pathlib import Path
tmp_entities = substep.tmp_entities()

# Nine small text files: file<i>.txt holds i*10 random lowercase letters.
for i in range(1, 10):
    with open(f'{tmp_entities.tmp_dir_to_store}/file{i}.txt', 'w') as f:
        f.write(''.join(random.choices(string.ascii_lowercase, k=i*10))) # generate random content file size of i*10

# One 1,000,000-byte binary file in the root of the directory.
# NOTE(review): ROW_SIZE is not defined in this notebook; presumably a size limit inside the archive code - TODO confirm.
with open(f'{tmp_entities.tmp_dir_to_store}/big_file.bin', 'wb') as f:
    f.write(os.urandom(1000000))    # generate random content file larger than ROW_SIZE
crc1 = crc32(f'{tmp_entities.tmp_dir_to_store}/big_file.bin')

Path(tmp_entities.tmp_dir_to_store, "subdir").mkdir(parents=True, exist_ok=True)

# A second large binary file, placed in a subdirectory.
with open(f'{tmp_entities.tmp_dir_to_store}/subdir/sub_big_file.bin', 'wb') as f:
    f.write(os.urandom(1000000))    # generate random content file larger than ROW_SIZE in subdir
# Stored under its own name: assigning it to crc1 would discard the checksum of
# big_file.bin computed above.
crc1_sub = crc32(f'{tmp_entities.tmp_dir_to_store}/subdir/sub_big_file.bin')

In [8]:
# Archive helper constructed from the Spark session created earlier in this notebook.
# NOTE(review): the variable is spelled `arhive`; later cells refer to it by
# this exact name, so any rename has to be applied to all of them together.
from sinara.archive import SinaraArchive
arhive = SinaraArchive(spark)

In [9]:
import glob
import os

outputs = substep.outputs()

# Base names of everything found recursively under the source directory.
# The directory itself matches the pattern too and shows up as ''.
stored_files = list(
    map(os.path.basename, glob.glob(f'{tmp_entities.tmp_dir_to_store}/**', recursive=True))
)
print(stored_files)

# Pack the content of the source tmp directory into the "stored_files" output entity.
arhive.pack_files_from_tmp_to_store(tmp_entities.tmp_dir_to_store, outputs.stored_files)

['', 'subdir', 'sub_big_file.bin', 'file4.txt', 'file3.txt', 'file6.txt', 'file8.txt', 'file9.txt', 'file2.txt', 'file5.txt', 'file7.txt', 'big_file.bin', 'file1.txt']


                                                                                

In [10]:
# Restore the packed files from the output entity into the second tmp directory.
arhive.unpack_files_from_store_to_tmp(outputs.stored_files, tmp_entities.tmp_dir_to_load)

# Base names of everything found recursively under the restored directory
# (the directory itself appears as '').
loaded_files = list(
    map(os.path.basename, glob.glob(f'{tmp_entities.tmp_dir_to_load}/**', recursive=True))
)
print(loaded_files)

[Stage 4:>                                                          (0 + 3) / 3]

['', 'subdir', 'sub_big_file.bin', 'file4.txt', 'file3.txt', 'file6.txt', 'file8.txt', 'file9.txt', 'file2.txt', 'file5.txt', 'file7.txt', 'big_file.bin', 'file1.txt']


                                                                                

In [11]:
# The same set of names must be present before packing and after unpacking.
assert set(stored_files) == set(loaded_files), "stored and loaded files are not equal"

# Compare the checksum of each large binary file in the source directory with
# the checksum of its restored copy. Both values are computed here so the check
# does not depend on checksum variables left over from earlier cells.
for rel_path in ("big_file.bin", "subdir/sub_big_file.bin"):
    crc1 = crc32(f'{tmp_entities.tmp_dir_to_store}/{rel_path}')
    crc2 = crc32(f'{tmp_entities.tmp_dir_to_load}/{rel_path}')
    assert crc1 == crc2, f"stored and loaded big files are not equal: {rel_path} ({crc1} != {crc2})"

In [12]:
from pyspark.sql.functions import col,lit
# Pack the restored files into a Spark DataFrame and add a constant string
# column "my_col" to every row.
df = (
    arhive
    .pack_files_from_tmp_to_spark_df(tmp_entities.tmp_dir_to_load)
    .withColumn("my_col", lit('myValue'))
)
df.printSchema()

root
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- relPath: string (nullable = true)
 |-- my_col: string (nullable = false)



In [13]:
# Print the first rows of the DataFrame as a text table; long values such as
# the binary content are shown truncated in the output.
df.show()

+--------------------+-------+--------------------+--------------------+-------+
|    modificationTime| length|             content|             relPath| my_col|
+--------------------+-------+--------------------+--------------------+-------+
|2024-07-22 11:01:...|     90|[75 64 66 75 66 6...|          /file9.txt|myValue|
|2024-07-22 11:01:...|     40|[74 6A 64 6A 7A 6...|          /file4.txt|myValue|
|2024-07-22 11:01:...|     70|[68 76 6E 62 6A 6...|          /file7.txt|myValue|
|2024-07-22 11:01:...|     60|[61 69 73 75 70 7...|          /file6.txt|myValue|
|2024-07-22 11:01:...|     50|[66 7A 73 63 6A 7...|          /file5.txt|myValue|
|2024-07-22 11:01:...|     20|[76 64 73 77 79 6...|          /file2.txt|myValue|
|2024-07-22 11:01:...|     80|[6B 78 76 64 6C 7...|          /file8.txt|myValue|
|2024-07-22 11:01:...|     10|[65 70 78 68 74 7...|          /file1.txt|myValue|
|2024-07-22 11:01:...|1000000|[C2 4E F0 65 AB 0...|       /big_file.bin|myValue|
|2024-07-22 11:01:...|     3

In [14]:
# Stop the Spark session that was started earlier in this notebook.
SinaraSpark.stop_session()