In [1]:
import joblib 
from Parameters import Parameters, Data


In [2]:
# Instantiate the settings object and the data handle for the 'Testing_Zarr_Data' path.
params = Parameters()
data = Data('Testing_Zarr_Data')

# Override one setting so the attribute dump below shows a user-supplied value.
params.analysis_name = 'analysis_name'

# First the str() form of each object, then each object's raw attribute dictionary.
print(params, data, sep='\n')
print(vars(params), vars(data), sep='\n')



<Parameters.Parameters object at 0x00000193C9CC0080>
Data(zarr_path=Testing_Zarr_Data, loaded=True, dataset_keys=['df_test', 'large_array'])
{'voxel_size_yx': 130, 'voxel_size_z': 500, 'spot_z': 500, 'spot_yx': 360, 'local_dataset_location': None, 'clear_after_error': True, 'analysis_name': 'analysis_name', 'number_of_cores': 4, 'num_chunks_to_run': 100000, 'connection_config_location': '', 'display_plots': True, 'load_in_mask': True, 'order': 'pt', 'state': 'global', 'share_name': 'share', 'log_location': 'Users\\Jack\\All_Analysis', 'initial_data_location': None, 'nucChannel': None, 'cytoChannel': None, 'FISHChannel': None, 'experimental_params': [{}], 'timestep_s': None}
{'_zarr_path': WindowsPath('Testing_Zarr_Data'), '_ds': <zarr.hierarchy.Group '/'>, '_loaded': True}


In [3]:
from joblib import Parallel, delayed
import time

# Toy workload that reads a single attribute from the object it is given.
def process_data(param):
    """Square ``param.voxel_size_yx`` after a fixed 0.1 s pause.

    The pause stands in for real computation so that the parallel and
    sequential timings have something to measure.

    Args:
        param: Any object exposing a ``voxel_size_yx`` attribute that
            supports ``** 2``.

    Returns:
        ``param.voxel_size_yx ** 2``.
    """
    time.sleep(0.1)
    pixel_size = param.voxel_size_yx
    return pixel_size ** 2

# One settings object is shared by every task in both runs.
parameters = Parameters()

# Time the parallel computation.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can be coarse or jump.
# `delayed` must be joblib's here (imported at the top of this cell); a later
# cell rebinds the name to dask's `delayed`.
# NOTE(review): the parallel figure includes whatever start-up cost joblib
# incurs for its workers - presumably why it can exceed the sequential time
# for such a small workload; confirm.
start_parallel = time.perf_counter()
results_parallel = Parallel(n_jobs=5)(delayed(process_data)(parameters) for _ in range(5))
end_parallel = time.perf_counter()

# Time the non-parallel computation on the same five calls.
start_non_parallel = time.perf_counter()
results_non_parallel = [process_data(parameters) for _ in range(5)]
end_non_parallel = time.perf_counter()

# Print results and timing
print("Results (Parallel):", results_parallel)
print("Time taken (Parallel):", end_parallel - start_parallel, "seconds")
print("Results (Non-Parallel):", results_non_parallel)
print("Time taken (Non-Parallel):", end_non_parallel - start_non_parallel, "seconds")

Results (Parallel): [16900, 16900, 16900, 16900, 16900]
Time taken (Parallel): 1.5078895092010498 seconds
Results (Non-Parallel): [16900, 16900, 16900, 16900, 16900]
Time taken (Non-Parallel): 0.5018739700317383 seconds


In [4]:
from dask import delayed

import dask.array as da

# Create a large Dask array with dimensions (position, time, channel, z, y, x):
# 2 positions, 3 time points, 2 channels, 5 z-slices, and 100x100 yx planes.
shape = (2, 3, 2, 5, 100, 100)

# Fixed seed so the random values (and every mean printed from them) are the
# same after a kernel restart; the unseeded generator gave new data each session.
SEED = 42

# One chunk per yx plane: every leading index combination is its own block.
large_array = da.random.RandomState(SEED).random_sample(shape, chunks=(1, 1, 1, 1, 100, 100))

# Per-slice reduction shared by the parallel and sequential runs.
def process_slice(slice_data):
    """Return the evaluated mean of ``slice_data``.

    Args:
        slice_data: Object whose ``mean()`` result exposes ``compute()``
            (in this notebook, a slice of a Dask array).

    Returns:
        Whatever ``slice_data.mean().compute()`` yields.
    """
    lazy_mean = slice_data.mean()
    # compute() forces evaluation here, so the caller receives a concrete value.
    return lazy_mean.compute()

# Parallel variant: one Dask delayed task per (pos, time, channel, z) plane.
results_parallel_dask = list()  # placeholder; rebound when the tasks are computed

@delayed
def process_slice_delayed(pos, time, channel, z):
    """Compute the mean of one yx plane of the module-level ``large_array``.

    Args:
        pos: Index along the first axis.
        time: Index along the second axis (shadows the ``time`` module inside
            this function only).
        channel: Index along the third axis.
        z: Index along the fourth axis.

    Returns:
        Tuple ``(pos, time, channel, z, mean)``. Because of ``@delayed``,
        calling this function yields a lazy task rather than the tuple itself.
    """
    # NOTE(review): process_slice() calls .compute() itself, so each task runs
    # a nested Dask computation - consider returning the lazy mean instead.
    leading_index = (pos, time, channel, z)
    plane = large_array[leading_index + (slice(None), slice(None))]
    return (*leading_index, process_slice(plane))

# Build one delayed task per (pos, t, channel, z) combination.
tasks = [
    process_slice_delayed(pos, t, channel, z)
    for pos in range(shape[0])
    for t in range(shape[1])
    for channel in range(shape[2])
    for z in range(shape[3])
]

# Time the parallel computation with Dask.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can be coarse or jump.
start_parallel_dask = time.perf_counter()
results_parallel_dask = list(da.compute(*tasks))
end_parallel_dask = time.perf_counter()

# Process each z, y, x slice sequentially (non-parallel)
results_non_parallel_dask = []

start_non_parallel_dask = time.perf_counter()
for pos in range(shape[0]):
    for t in range(shape[1]):
        for channel in range(shape[2]):
            for z in range(shape[3]):
                slice_data = large_array[pos, t, channel, z, :, :]
                result = process_slice(slice_data)
                results_non_parallel_dask.append((pos, t, channel, z, result))
end_non_parallel_dask = time.perf_counter()

# Print results and timing
print("Results (Parallel with Dask):", results_parallel_dask)
print("Time taken (Parallel with Dask):", end_parallel_dask - start_parallel_dask, "seconds")
print("Results (Non-Parallel with Dask):", results_non_parallel_dask)
print("Time taken (Non-Parallel with Dask):", end_non_parallel_dask - start_non_parallel_dask, "seconds")

Results (Parallel with Dask): [(0, 0, 0, 0, 0.5032544864951622), (0, 0, 0, 1, 0.5017434452197618), (0, 0, 0, 2, 0.499485056207333), (0, 0, 0, 3, 0.4990280758178719), (0, 0, 0, 4, 0.4995080166227442), (0, 0, 1, 0, 0.4976006162983698), (0, 0, 1, 1, 0.503976821362141), (0, 0, 1, 2, 0.503053756287268), (0, 0, 1, 3, 0.5030243222424299), (0, 0, 1, 4, 0.5037906020858618), (0, 1, 0, 0, 0.49606832888313795), (0, 1, 0, 1, 0.5001357713241376), (0, 1, 0, 2, 0.4973797589443111), (0, 1, 0, 3, 0.4989699894295459), (0, 1, 0, 4, 0.4964646516890511), (0, 1, 1, 0, 0.5009779480384454), (0, 1, 1, 1, 0.4986610856353986), (0, 1, 1, 2, 0.5020718424318736), (0, 1, 1, 3, 0.4942228261307645), (0, 1, 1, 4, 0.4991573917934207), (0, 2, 0, 0, 0.500942210256239), (0, 2, 0, 1, 0.5045896662894759), (0, 2, 0, 2, 0.49956605965208717), (0, 2, 0, 3, 0.5040931250749611), (0, 2, 0, 4, 0.4966006082338183), (0, 2, 1, 0, 0.49603426428511066), (0, 2, 1, 1, 0.49948941427501586), (0, 2, 1, 2, 0.49679667157508967), (0, 2, 1, 3, 0.5

In [5]:
# Open a second Data handle on the same 'Testing_Zarr_Data' path used for `data` earlier.
data_instance = Data('Testing_Zarr_Data')

# Attach the Dask array built in the previous cell under the attribute name `large_array`.
# NOTE(review): Data's repr lists 'large_array' among its dataset_keys, so the assignment
# presumably stores the array in the Zarr group rather than just rebinding - confirm in Parameters.py.
data_instance.large_array = large_array  # the attribute name becomes the key used to read it back

# Per-slice reduction used by the Data-backed run (same computation as process_slice).
def process_slice_with_data(slice_data):
    """Evaluate and return the mean of ``slice_data``.

    Args:
        slice_data: Object whose ``mean()`` result exposes ``compute()``.

    Returns:
        The value produced by ``slice_data.mean().compute()``.
    """
    mean_value = slice_data.mean().compute()
    return mean_value

# Parallel variant that reads the array through the Data object instead of the global.
results_parallel_dask_with_data = list()  # placeholder; rebound when the tasks are computed

@delayed
def process_slice_delayed_with_data(pos, time, channel, z):
    """Compute the mean of one yx plane of ``data_instance.large_array``.

    The attribute is looked up when the task executes, not when it is created,
    so the task sees whatever ``data_instance.large_array`` holds at that moment.

    Args:
        pos: Index along the first axis.
        time: Index along the second axis (shadows the ``time`` module inside
            this function only).
        channel: Index along the third axis.
        z: Index along the fourth axis.

    Returns:
        Tuple ``(pos, time, channel, z, mean)``. Because of ``@delayed``,
        calling this function yields a lazy task rather than the tuple itself.
    """
    stored_array = data_instance.large_array
    plane = stored_array[pos, time, channel, z, :, :]
    plane_mean = process_slice_with_data(plane)
    return (pos, time, channel, z, plane_mean)

# Build one delayed task per (pos, t, channel, z) combination.
tasks_with_data = [
    process_slice_delayed_with_data(pos, t, channel, z)
    for pos in range(shape[0])
    for t in range(shape[1])
    for channel in range(shape[2])
    for z in range(shape[3])
]

# Time the parallel computation with Dask using Data class.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can be coarse or jump.
start_parallel_dask_with_data = time.perf_counter()
results_parallel_dask_with_data = list(da.compute(*tasks_with_data))
end_parallel_dask_with_data = time.perf_counter()

# Print results and timing
print("Results (Parallel with Dask using Data):", results_parallel_dask_with_data)
print("Time taken (Parallel with Dask using Data):", end_parallel_dask_with_data - start_parallel_dask_with_data, "seconds")

Results (Parallel with Dask using Data): [(0, 0, 0, 0, 0.5032544864951622), (0, 0, 0, 1, 0.5017434452197618), (0, 0, 0, 2, 0.499485056207333), (0, 0, 0, 3, 0.4990280758178719), (0, 0, 0, 4, 0.4995080166227442), (0, 0, 1, 0, 0.4976006162983698), (0, 0, 1, 1, 0.503976821362141), (0, 0, 1, 2, 0.503053756287268), (0, 0, 1, 3, 0.5030243222424299), (0, 0, 1, 4, 0.5037906020858618), (0, 1, 0, 0, 0.49606832888313795), (0, 1, 0, 1, 0.5001357713241376), (0, 1, 0, 2, 0.4973797589443111), (0, 1, 0, 3, 0.4989699894295459), (0, 1, 0, 4, 0.4964646516890511), (0, 1, 1, 0, 0.5009779480384454), (0, 1, 1, 1, 0.4986610856353986), (0, 1, 1, 2, 0.5020718424318736), (0, 1, 1, 3, 0.4942228261307645), (0, 1, 1, 4, 0.4991573917934207), (0, 2, 0, 0, 0.500942210256239), (0, 2, 0, 1, 0.5045896662894759), (0, 2, 0, 2, 0.49956605965208717), (0, 2, 0, 3, 0.5040931250749611), (0, 2, 0, 4, 0.4966006082338183), (0, 2, 1, 0, 0.49603426428511066), (0, 2, 1, 1, 0.49948941427501586), (0, 2, 1, 2, 0.49679667157508967), (0, 2

In [6]:
# Spot-check one element of the array as read back through the Data object.
data_instance.large_array[0, 0, 0, 0, 0, 0].compute()

array(0.16669672)

In [7]:
# Example: Multiply all elements in the large_array by 2.
# The product is built from the module-level `large_array`, not from
# data_instance.large_array, so re-running this cell does not compound the doubling.
altered_array = large_array * 2

# Assign the altered array back to the data_instance.
# NOTE(review): what this assignment does beyond rebinding the attribute depends on
# how Data handles attribute assignment - confirm in Parameters.py.
data_instance.large_array = altered_array

# Print a confirmation message
print("The large_array has been altered and updated in the data_instance.")

The large_array has been altered and updated in the data_instance.


In [8]:
# Re-read the same element after the doubled array was assigned; it should be twice the earlier value.
data_instance.large_array[0, 0, 0, 0, 0, 0].compute()

array(0.33339344)

In [9]:
# Example: overwrite a single element - index (0, 0, 0, 0, 0, 0) - with 3.1415.
# The array is fetched from the Data object, changed via item assignment, and
# then assigned back to the same attribute.
a = data_instance.large_array
a[0, 0, 0, 0, 0, 0] = 3.1415  # only this one element is written; all others keep their values
data_instance.large_array = a

# Print a confirmation message
print("A portion of the large_array has been altered.")

A portion of the large_array has been altered.


In [10]:
# Control read: an element other than the one overwritten in the previous cell.
data_instance.large_array[0, 0, 0, 1, 1, 1].compute()

array(0.34713467)

In [11]:
# Read back the element that was overwritten with 3.1415.
data_instance.large_array[0, 0, 0, 0, 0, 0].compute()

array(3.1415)

In [12]:
import pandas as pd

# Small demo frame: two integer columns, three rows.
df_test = pd.DataFrame(
    {
        'Column1': [1, 2, 3],
        'Column2': [4, 5, 6],
    }
)

# Hand the frame to the Data object under the attribute name `df_test`.
data.df_test = df_test

In [13]:
# Read the frame back through the Data object to check that the assignment in the previous cell is retrievable.
data.df_test

Unnamed: 0,Column1,Column2
0,1,4
1,2,5
2,3,6


In [14]:
# Show the Data repr, which includes the zarr path, the loaded flag and the dataset keys.
print(data)

Data(zarr_path=Testing_Zarr_Data, loaded=True, dataset_keys=['df_test', 'large_array'])


In [17]:
from joblib import Parallel, delayed
import pandas as pd

# Factory for the small frames built in parallel below.
def generate_dataframe(index):
    """Build a two-row, two-column frame whose values are offsets of ``index``.

    Args:
        index: Base value; must support ``+`` with an integer.

    Returns:
        A ``pandas.DataFrame`` with ``Column1 = [index, index + 1]`` and
        ``Column2 = [index + 2, index + 3]``.
    """
    columns = {
        'Column1': [index, index + 1],
        'Column2': [index + 2, index + 3],
    }
    return pd.DataFrame(columns)

# Fan the frame construction out over joblib workers, one job per index.
# `delayed` must be joblib's here; the import at the top of this cell rebinds it
# after an earlier cell pointed the name at dask's `delayed`.
num_dataframes = 5
dataframes = Parallel(n_jobs=5)(map(delayed(generate_dataframe), range(num_dataframes)))

# Stack the per-job frames into a single frame with a fresh 0..n-1 row index.
concatenated_df = pd.concat(dataframes, ignore_index=True)

# Store the result on the data object, then print what reading the attribute back returns.
data.df_combined = concatenated_df
print(data.df_combined)

   Column1  Column2
0        0        2
1        1        3
2        1        3
3        2        4
4        2        4
5        3        5
6        3        5
7        4        6
8        4        6
9        5        7
