In [127]:
import iris
from distributed import Client

c = Client('172.31.18.5:8786')
c

<Client: scheduler="172.31.18.5:8786" processes=10 cores=10>

In [8]:
# one day of data is 2.7k files and approx 100GB
# iris loads about 1 file / sec / core
month = '01'
day = '01'
prefix = 'prods_op_mogreps-g_2016{}{}_00'.format(month, day)
prefix

'prods_op_mogreps-g_20160101_00'

In [9]:
from boto.s3.connection import S3Connection
import os

os.environ['S3_USE_SIGV4'] = 'True'

def list_files(bucket, prefix='prods'):
    conn = S3Connection(host='s3.eu-west-2.amazonaws.com')
    bucket = conn.get_bucket(bucket)
    results = []
    keys = iter(bucket.list(prefix=prefix))
    return ['/usr/local/share/notebooks/data/mogreps-g/{}'.format(k.key) for k in keys]


in_files = list_files('mogreps-g', prefix)
print(len(in_files))
in_files[:10]

696


['/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_003.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_006.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_009.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_012.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_015.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_018.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_021.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_024.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_027.nc',
 '/usr/local/share/notebooks/data/mogreps-g/prods_op_mogreps-g_20160101_00_00_030.nc']

In [10]:
# create a dask bag (db). 
# What we end up with is a list of instructions to run the 'load_cubes' function on each input file.
from iris.cube import CubeList
from dask import delayed
import dask.bag as db

@delayed
def load_cubes(address):
    def add_realization(cube, field, filename):
        if not cube.coords('realization'):
            realization = int(filename.split('_')[-2])
            realization_coord = iris.coords.AuxCoord(realization, standard_name='realization')
            cube.add_aux_coord(realization_coord)
    return iris.load(address, 'visibility_in_air', callback=add_realization)

delayed_cubes = db.from_delayed([load_cubes(f) for f in in_files])
delayed_cubes

dask.bag<bag-fro..., npartitions=696>

In [62]:
delayed_cubes.take(10)

(<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,)

In [63]:
p_cubes = c.persist(delayed_cubes)

In [67]:
p_cubes.take(1)

(<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,)

Working with metadata only is not very expensive, so we can pull the cubes back locally:

In [111]:
pcl = iris.cube.CubeList(p_cubes)
len(pcl)

696

In [113]:
sample = iris.cube.CubeList(filter(lambda x: x.coord('realization').points == [0], pcl))
print(len(sample))
sample

58


[<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (

In [72]:
sample[0].coord('time').var_name = 'time'
sample[0].coord('forecast_period').var_name = 'forecast_period'

In [73]:
sample.merge_cube()

<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>

In [74]:
realisations = range(11)

In [77]:
new_cl = iris.cube.CubeList([])
for r in realisations:
    cubes = iris.cube.CubeList(filter(lambda x: x.coord('realization').points == [r], pcl))
    cubes[0].coord('time').var_name = 'time'
    cubes[0].coord('forecast_period').var_name = 'forecast_period'
    cube = cubes.merge_cube()
    new_cl.append(cube)
    
new_cl

0
1
2
3
4
5
6
7
8
9
10


[<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (time: 58; latitude: 600; longitude: 800)>]

In [84]:
new_cl[0].coord('realization').points.dtype = 'int32'
new_cl[0].coord('realization').var_name = 'realization'

In [86]:
new_cl[0].coord('realization')

DimCoord(array([0]), standard_name='realization', units=Unit('1'), var_name='realization')

In [87]:
new_cl.merge_cube()

<iris 'Cube' of visibility_in_air / (m) (realization: 11; time: 58; latitude: 600; longitude: 800)>

We should be able to merge chunks at a time then concatenate:

In [107]:
a = iris.cube.CubeList(pcl[1:10]).merge_cube()

In [108]:
b = iris.cube.CubeList(pcl[10:20]).merge_cube()

In [109]:
iris.cube.CubeList([a, b]).concatenate()

[<iris 'Cube' of visibility_in_air / (m) (time: 19; latitude: 600; longitude: 800)>]

In [115]:
pcl

[<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (m) (latitude: 600; longitude: 800)>,
<iris 'Cube' of visibility_in_air / (

In [116]:
p_cubes

dask.bag<bag-fro..., npartitions=696>

In [121]:
b_cubes = db.from_sequence(pcl)

In [122]:
b_cubes

dask.bag<from_se..., npartitions=116>

In [132]:
grouped_cubes = b_cubes.groupby(lambda x: x.name())

In [133]:
grouped_cubes.take(1)

distributed.utils - ERROR - ('take-d6dbef9ea7b37ee0e5b385b5c097d053', 0)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/distributed/client.py", line 893, in _gather
    st = self.futures[key]
KeyError: "('take-d6dbef9ea7b37ee0e5b385b5c097d053', 0)"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/distributed/utils.py", line 149, in f
    result[0] = yield gen.maybe_future(func(*args, **kwargs))
  File "/opt/conda/lib/python3.5/site-packages/tornado/gen.py", line 1015, in run
    value = future.result()
  File "/opt/conda/lib/python3.5/site-packages/tornado/concurrent.py", line 237, in result
    raise_exc_info(self._exc_info)
  File "<string>", line 3, in raise_exc_info
  File "/opt/conda/lib/python3.5/site-packages/tornado/gen.py", line 1021, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.5/site-packages/distributed/cl

KeyboardInterrupt: 

In [134]:
b = db.from_sequence(range(10))

In [135]:
b

dask.bag<from_se..., npartitions=10>

In [150]:
iseven = lambda x: x % 2 == 0
add = lambda x, y: x + y
dict(b.foldby(iseven, add)) 

distributed.utils - ERROR - ('foldby-b-a2454c281b16eeae5ab95fc641463ab6', 0)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/distributed/client.py", line 893, in _gather
    st = self.futures[key]
KeyError: "('foldby-b-a2454c281b16eeae5ab95fc641463ab6', 0)"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/distributed/utils.py", line 149, in f
    result[0] = yield gen.maybe_future(func(*args, **kwargs))
  File "/opt/conda/lib/python3.5/site-packages/tornado/gen.py", line 1015, in run
    value = future.result()
  File "/opt/conda/lib/python3.5/site-packages/tornado/concurrent.py", line 237, in result
    raise_exc_info(self._exc_info)
  File "<string>", line 3, in raise_exc_info
  File "/opt/conda/lib/python3.5/site-packages/tornado/gen.py", line 1021, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.5/site-packages/distri

KeyboardInterrupt: 

In [147]:
dict(b.groupby(iseven, method = 'disk', max_branch = 3))

distributed.utils - ERROR - ('groupby-collect-4c48099f637655e61cfe3e6251a8b537', 4)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/distributed/client.py", line 893, in _gather
    st = self.futures[key]
KeyError: "('groupby-collect-4c48099f637655e61cfe3e6251a8b537', 4)"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/distributed/utils.py", line 149, in f
    result[0] = yield gen.maybe_future(func(*args, **kwargs))
  File "/opt/conda/lib/python3.5/site-packages/tornado/gen.py", line 1015, in run
    value = future.result()
  File "/opt/conda/lib/python3.5/site-packages/tornado/concurrent.py", line 237, in result
    raise_exc_info(self._exc_info)
  File "<string>", line 3, in raise_exc_info
  File "/opt/conda/lib/python3.5/site-packages/tornado/gen.py", line 1021, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.5/site-p

KeyboardInterrupt: 

In [148]:
c.restart()

<Client: scheduler="172.31.18.5:8786" processes=10 cores=10>