In [2]:
from phi.tf.flow import * 
import tensorflow as tf



import numpy as np
import os
from scipy.signal import convolve
from phi.tf.flow import *

import tensorflow as tf
from tensorflow.keras import layers, models
import concurrent.futures

import math
import random
import pickle

import json

class DataLoader:
    def __init__(self, simulation_path, k):
        self.simulation_path = simulation_path
        self.k = k
        self.num_simulations, self.num_timesteps = self._count_simulations_and_timesteps()
        self.data = self._load_velocity_data()
        self.batched_data = None
        self.batched_std = None
        

    def _count_simulations_and_timesteps(self):
        # Count the number of simulation directories
        simulation_dirs = [d for d in os.listdir(self.simulation_path) if os.path.isdir(os.path.join(self.simulation_path, d))]
        num_simulations = len(simulation_dirs)

        # Count the number of velocity files in the first simulation directory
        first_sim_dir = os.path.join(self.simulation_path, simulation_dirs[0])
        velocity_files = [f for f in os.listdir(first_sim_dir) if f.startswith('velocity') and f.endswith('.npz')]
        num_timesteps = len(velocity_files)
        print(f"Number of simulations: {num_simulations}, Number of timesteps: {num_timesteps}")
        return num_simulations, num_timesteps
    

    def _load_velocity_data(self):
            def load_data_with_progress(sim):
                result = self._load_simulation_data(sim)
                progress = (sim + 1) / self.num_simulations * 100  # Calculate progress
                print(f"Progress: {progress:.2f}%")
                return result

            with concurrent.futures.ThreadPoolExecutor() as executor:
                data = list(executor.map(load_data_with_progress, range(self.num_simulations)))
            # Filter out empty arrays from each array in data before flattening
            return [element for array in data if array for element in array]                

    def _load_simulation_data(self, sim):

        sim_dir = os.path.join(self.simulation_path, f'sim_{sim:06d}')
        print(f"Loading simulation {sim + 1}/{self.num_simulations}")

        if not os.path.exists(sim_dir):
            print(f"Simulation directory {sim_dir} not found, skipping to the next simulation.")
            return []  # Return empty list for missing simulations
        sim_data = []
        for t in range(self.num_timesteps):
            try:
                velocity_data = np.load(os.path.join(sim_dir, f'velocity_{(t):06d}.npz'))['data']
                advection_diffusion_data = np.load(os.path.join(sim_dir, f'advection_diffusion_sum_{(t):06d}.npz'))['data']


                sim_data.append((velocity_data, advection_diffusion_data))
    

            except FileNotFoundError:
                print(f"File not found for simulation {sim}, timestep {t}, skipping to the next timestep.")


        print(f"Loaded simulation {sim + 1}/{self.num_simulations}")
        return sim_data
    
    def _compute_batch_std(self, batch_data):

        velocity_data_list = []
        advection_diffusion_data_list = []

        for sim, velocity_data, advection_diffusion_data, _ , _ in batch_data:
            velocity_data_list.append(velocity_data)
            advection_diffusion_data_list.append(advection_diffusion_data)

        # Convert lists to numpy arrays
        velocity_data_array = np.array(velocity_data_list)
        advection_diffusion_data_array = np.array(advection_diffusion_data_list)


        velocity_std = np.std(velocity_data_array.flatten())
        advection_diffusion_std = np.std(advection_diffusion_data_array.flatten())

        return velocity_std, advection_diffusion_std
    def prepare_batches(self, batch_size):
        self.batched_data = []
        self.batched_std = []

        total_batches = int(math.ceil(len(self.data) / batch_size))

        for batch_idx in range(total_batches):
            start_idx = batch_idx * batch_size
            end_idx = start_idx + batch_size
            batch_data = self.data[start_idx:end_idx]

            self.batched_data.append(batch_data)
            self.batched_std.append(self._compute_batch_std(batch_data))

        random.shuffle(self.batched_data)  # Shuffle the batches
        random.shuffle(self.batched_std)   # Shuffle the corresponding std deviations

        return self.batched_data
    
    def save_data_as_pickle(self, data, filename):
        with open(filename, 'wb') as file:
            pickle.dump(data, file)

    def load_from_pickle(self, filename):
        with open(filename, 'rb') as file:
            self.data = pickle.load(file)
        return self.data




In [3]:
dataloader = DataLoader('./simulation_output', 3)

Number of simulations: 22, Number of timesteps: 1001
Loading simulation 1/22
Loading simulation 2/22
Loading simulation 3/22
Loading simulation 4/22
Loading simulation 5/22
Loading simulation 6/22
Loading simulation 7/22
Loading simulation 8/22
Loading simulation 9/22
Loading simulation 10/22
Loading simulation 11/22
Loading simulation 12/22
Loading simulation 13/22
Loading simulation 14/22
Loading simulation 15/22
Loading simulation 16/22
Loading simulation 17/22
Loading simulation 18/22
Loading simulation 19/22
Loading simulation 20/22
Loading simulation 21/22
Loading simulation 22/22
Loaded simulation 17/22
Progress: 77.27%
Loaded simulation 12/22
Progress: 54.55%
Loaded simulation 13/22
Progress: 59.09%
Loaded simulation 7/22
Progress: 31.82%
Loaded simulation 5/22
Progress: 22.73%
Loaded simulation 16/22
Progress: 72.73%
Loaded simulation 15/22
Progress: 68.18%
Loaded simulation 11/22
Progress: 50.00%
Loaded simulation 3/22
Progress: 13.64%
Loaded simulation 21/22
Progress: 95.45%

In [4]:
dataloader.save_data_as_pickle(dataloader.data, 'data.pkl')

: 