# Working with data that doesn't fit in memory

In this notebook we will explore the necessary steps to do an arbitrary computation based on data from a dfs file, without reading the entire file in memory first.

In [58]:

import numpy as np
import dask.array as da
import dask

import mikeio
from mikecore.DfsFileFactory import DfsFileFactory


def my_dfs_reader(filename, item, t):
    dfs = DfsFileFactory.DfsGenericOpenEdit(str(filename))
    data = dfs.ReadItemTimeStep(itemNumber = item+1, timestepIndex=t)
    return data.Data

reader = dask.delayed(my_dfs_reader,pure=True)


item = 0
testfile = "../tests/testdata/wind_north_sea.dfsu"
filename = testfile
dfs = mikeio.open(filename)
shape = (dfs.n_elements,)
n_timesteps = dfs.n_timesteps

lazy_arrays = [reader(testfile,item,t) for t in range(n_timesteps)]

arrays = [da.from_delayed(a, dtype=np.float64, shape=shape) for a in lazy_arrays]

stack = da.stack(arrays, axis=0)
stack

Unnamed: 0,Array,Chunk
Bytes,44.91 kiB,7.48 kiB
Shape,"(6, 958)","(1, 958)"
Count,13 Graph Layers,6 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 44.91 kiB 7.48 kiB Shape (6, 958) (1, 958) Count 13 Graph Layers 6 Chunks Type float64 numpy.ndarray",958  6,

Unnamed: 0,Array,Chunk
Bytes,44.91 kiB,7.48 kiB
Shape,"(6, 958)","(1, 958)"
Count,13 Graph Layers,6 Chunks
Type,float64,numpy.ndarray


Calling e.g. `.mean()` on the `stack` only creates a computational graph, it doesn't exectute it.

In [59]:
stack.mean()

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,(),()
Count,17 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
Array Chunk Bytes 8 B 8 B Shape () () Count 17 Graph Layers 1 Chunks Type float64 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,(),()
Count,17 Graph Layers,1 Chunks
Type,float64,numpy.ndarray


In [62]:
stack.mean(axis=1).compute()

array([10.23455371, 10.26429205, 10.53168586, 10.79467749, 10.85831951,
       10.90603493])

As an example of a computation we choose the trimmed mean in each timestep, i.e. ignoring a fraction from the tails (e.g. caused by to numerical instability)
<https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.trim_mean.html>

In [61]:
from scipy import stats

tm = stats.trim_mean(stack, 0.1, axis=1)
tm

array([10.331429, 10.351207, 10.670218, 10.930687, 11.019432, 11.058955],
      dtype=float32)