In [None]:
import findspark
findspark.init()
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType,IntegerType
from petastorm.unischema import Unischema,UnischemaField
from petastorm.codecs import ScalarCodec,NdarrayCodec
import psutil
import os
import sys
sys.path.append(os.path.join(os.getcwd(), ".."))
from operations.dataset_manager import get_dataset,get_name_files,save_to_parquet_petastorm

## Get your dataset 

In [None]:
# Dataset identifier; it is also passed to the parquet export further down.
name = "mit-bih-noise-stress-test-database-1.0.0"
# One switch forwarded to both helper calls below.
ignore_folder = False
files_name = get_name_files(
    name,
    ignore_inner_folder=ignore_folder,
)
# NOTE(review): this keyword is spelled `ignore_subdfolder`, unlike
# `ignore_inner_folder` above -- confirm it matches get_dataset's signature.
dataset = get_dataset(
    name,
    fs=500,
    time_window=10,
    ignore_subdfolder=ignore_folder,
)

## Create the parquet data file (the parquet schema is provided below)

### Checking your CPU core count and memory usage

In [None]:
## Check number of CPU cores and memory
print(os.cpu_count())
# Take a single snapshot so both figures below describe the same instant
# (two separate calls could report values from different moments).
memory = psutil.virtual_memory()
# Percentage of virtual memory in use (named field instead of index 2)
print('RAM memory % used:', memory.percent)
# Virtual memory in use, converted from bytes to GB (named field instead of index 3)
print('RAM Used (GB):', memory.used/1000000000)

### Creating a Spark session

In [None]:
# Build (or reuse) the Spark session: local master with 2 threads and
# 4 GB of driver memory.
spark = (
    SparkSession.builder
    .appName('MyfirstSpark')
    .master('local[2]')
    .config('spark.driver.memory', '4g')
    .getOrCreate()
)

### Unischema definition (this is the one we use; please provide your own Unischema if your data differs)

In [None]:
# Petastorm schema describing one row of the ECG dataset.
# Each field is (name, numpy dtype, shape, codec, nullable):
#   - shape () marks a scalar stored through a Spark ScalarCodec;
#   - a shape containing None marks an array dimension of unknown size,
#     stored through NdarrayCodec.
# np.bytes_ is used instead of np.string_: on NumPy 1.x they are the same
# type, and the np.string_ alias was removed in NumPy 2.0.
ECGSchemaPhysio = Unischema(
    "ECGSchema",
    [
        UnischemaField("noun_id", np.bytes_, (), ScalarCodec(StringType()), False),
        UnischemaField("signal", np.float64, (None,None,None), NdarrayCodec(), False),
        UnischemaField("fs", np.int_, (), ScalarCodec(IntegerType()), False),
        UnischemaField("sig_len", np.int_, (), ScalarCodec(IntegerType()), False),
        UnischemaField("sig_name", np.bytes_, (None,), NdarrayCodec(), False),
        UnischemaField("n_sig", np.int_, (), ScalarCodec(IntegerType()), False),
        # base_date and base_time are the only nullable fields.
        UnischemaField("base_date", np.bytes_, (), ScalarCodec(StringType()), True),
        UnischemaField("base_time", np.bytes_, (), ScalarCodec(StringType()), True),
        UnischemaField("units", np.bytes_, (None,), NdarrayCodec(), False),
        UnischemaField("comments", np.bytes_, (None,), NdarrayCodec(), False),
        UnischemaField("nb_time_window", np.int_, (), ScalarCodec(IntegerType()), False),
    ],
)

### Save dataset into parquet file (the folder containing the files will be in your Physionet folder)

#### Define a row generator (returns a single entry of the dataset; this is the one we use, and you can modify it if needed)

In [None]:
def row_generator(x):
    """Look up entry ``x`` in the notebook-level ``dataset`` and return it.

    ``x`` is used directly as the subscript into ``dataset``; the entry is
    returned unchanged.
    """
    entry = dataset[x]
    return entry

### Save the dataset in parquet file (Warning: Depending on the size of your dataset, this process can take time)

In [None]:
# Export to parquet. Arguments, in order: the loaded dataset, its name,
# the Spark session, the Unischema and the per-row generator.
save_to_parquet_petastorm(
    dataset,
    name,
    spark,
    ECGSchemaPhysio,
    row_generator,
)