## Notebook to find the total floc value over time

In [None]:
import pandas as pd
import json

In [None]:
from dask_kubernetes import KubeCluster
# Create a KubeCluster asking for 20 workers; a distributed Client is attached
# to this object in a later cell.
cluster = KubeCluster(n_workers=20)
cluster  # last expression: display the cluster as the cell output

In [None]:
# Parallel lists, one entry per selected region; they become the columns of
# the scene_windows table in the next cell.
url = []
filename = []
scene_tag = []
deployment = []
start_frame = []
end_frame = []

with open('/home/jovyan/floc_gsa_2019/region_listed.txt') as f:
    # Each non-empty line of the listing is the path of one region JSON file.
    for line in f:
        region_path = line.strip()
        if not region_path:
            continue  # tolerate blank lines in the listing
        with open(region_path) as region_file:
            data = json.load(region_file)
        for region in data['regions']:
            if region['type'] != 'static':
                continue
            try:
                tag = region['sceneTag']
                if '_p2_z0' not in tag:
                    continue
                movie_url = data['movie']['URL']
                # Build the complete record BEFORE touching any list, so a
                # malformed field can never leave the lists with different
                # lengths (which would break the DataFrame construction).
                record = (
                    'https://rawdata.oceanobservatories.org/files' + movie_url,
                    movie_url.split('/')[-1],
                    tag,
                    # second character of the first '_'-separated token
                    int(tag.split('_')[0][1]),
                    int(region['startFrame']),
                    int(region['endFrame']),
                )
            except (KeyError, IndexError, TypeError, ValueError, AttributeError):
                # Best-effort: skip regions with missing or malformed fields.
                continue
            for column, value in zip(
                (url, filename, scene_tag, deployment, start_frame, end_frame),
                record,
            ):
                column.append(value)

In [None]:
# Gather the parallel lists into one table: one row per selected scene window.
scene_windows = pd.DataFrame(
    dict(
        filename=filename,
        url=url,
        scene_tag=scene_tag,
        deployment=deployment,
        start_frame=start_frame,
        end_frame=end_frame,
    )
)
scene_windows.tail()

In [None]:
frame_interval = 150  # step, in frames, between sampled frames
window_buffer = 20    # frames trimmed from both ends of every scene window

# One list of frame numbers per scene window. The comprehension keeps its loop
# variables local, so the module-level `start_frame` / `end_frame` lists used
# to build scene_windows are not overwritten, and zip() pairs the two columns
# positionally instead of relying on the index labels being 0..N-1.
frame_lists = [
    list(range(first + window_buffer, last - window_buffer, frame_interval))
    for first, last in zip(scene_windows.start_frame, scene_windows.end_frame)
]
scene_windows['frame_list'] = frame_lists
scene_windows.head()

In [None]:
# One frame list was built per row of scene_windows, so this is the number of
# scene windows.
len(frame_lists)

#### Wrappers to deal with potential get_moov_atom and get_frame timeouts

In [None]:
import pycamhd as camhd
import numpy as np

In [None]:
def get_moov_atom_timeout(filename):
    """Fetch a video's moov atom, returning ``False`` instead of raising.

    Parameters
    ----------
    filename
        Passed unchanged to ``camhd.get_moov_atom`` (the notebook passes the
        video URL).

    Returns
    -------
    The value returned by ``camhd.get_moov_atom``, or ``False`` when that call
    raised an exception.
    """
    try:
        return camhd.get_moov_atom(filename)
    except Exception:
        # Best-effort wrapper: any ordinary failure becomes a falsy sentinel.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a run can still be interrupted.
        return False

In [None]:
def get_frame_timeout(filename, frame_number, pix_fmt, moov_atom):
    """Fetch one frame, substituting an all-zero image when that is not possible.

    Parameters
    ----------
    filename, frame_number, pix_fmt, moov_atom
        Passed unchanged to ``camhd.get_frame``. ``moov_atom`` is also tested
        for truthiness first: a falsy value (such as the ``False`` returned by
        ``get_moov_atom_timeout`` on failure) skips the fetch entirely.

    Returns
    -------
    The value returned by ``camhd.get_frame``, or a ``(1080, 1920)`` array of
    ``uint16`` zeros when ``moov_atom`` is falsy or the fetch raised.
    """
    if moov_atom:
        try:
            return camhd.get_frame(filename, frame_number, pix_fmt, moov_atom)
        except Exception:
            # Best-effort: fall through to the placeholder frame below.
            # `Exception` (not a bare except) keeps KeyboardInterrupt and
            # SystemExit propagating.
            pass
    return np.zeros((1080, 1920), dtype=np.uint16)

#### Set up a delayed Dask array of images

In [None]:
from dask import delayed
import dask.array as dsa

In [None]:
# Build one lazy (1080, 1920) uint16 Dask array per sampled frame of the
# deployment-5 scene windows, then stack them into a single array.
delayed_frame_list = []
for i, row in scene_windows[scene_windows.deployment == 5].iterrows():
    # Use a dedicated name here: assigning to `filename` would overwrite the
    # module-level `filename` list that scene_windows was built from.
    video_url = row.url
    # One moov-atom task per video, shared by all of that video's frame tasks.
    delayed_moov_atom = delayed(get_moov_atom_timeout)(video_url)
    for frame_number in row.frame_list:
        delayed_frame = delayed(get_frame_timeout)(video_url, frame_number, 'gray16le', delayed_moov_atom)
        delayed_frame_list.append(dsa.from_delayed(delayed_frame, (1080, 1920), np.uint16))
delayed_frame_array = dsa.stack(delayed_frame_list)
delayed_frame_array

In [None]:
# Length of the stacked array's first axis, i.e. how many frames were stacked.
len(delayed_frame_array)

In [None]:
# Materialize only the first frame as a quick sanity check.
# NOTE(review): the distributed Client is created in a later cell, so on a
# top-to-bottom run this compute presumably uses Dask's default scheduler
# rather than the cluster — confirm.
frame = delayed_frame_array[0].compute()
frame.shape

A dask array is in many ways like a numpy array, except in this case it holds a set of instructions for how to acquire each chunk of the array, which makes it easy to farm this array out to workers in the cloud using the [distributed](http://distributed.readthedocs.io/en/latest/#) scheduler.

#### Show the filter that will be used to filter images in the frequency domain
To deal with variations in lighting and high-frequency noise, we filter each subimage using a Butterworth bandpass filter.

In [None]:
# Band-pass settings reused by the filtering cells further down.
d1, d2 = 20, 400  # low cut wavenumber, high cut wavenumber
n = 4  # the filter response raises (d/d1) and (d/d2) to the power 2*n
# NOTE: butterworth() is defined in the following cell, so on a fresh kernel
# that cell has to be executed before this line can run.
bff = butterworth(d1, d2, n)

In [None]:
def butterworth(d1, d2, n, size=1024):
    """Build a centred, square 2-D Butterworth band-pass response.

    Parameters
    ----------
    d1 : low cut wavenumber (radius below which the response rolls off).
    d2 : high cut wavenumber (radius above which the response rolls off).
    n : the response uses the exponent ``2 * n``.
    size : side length of the square grid. Defaults to 1024, which reproduces
        the grid this notebook has always used.

    Returns
    -------
    numpy.ndarray of shape ``(size, size)`` holding the product of a
    high-pass term (cut at ``d1``) and a low-pass term (cut at ``d2``),
    evaluated on the distance from the grid centre.
    """
    # Coordinates symmetric about zero with a half-sample offset; for
    # size=1024 this is -511.5, -510.5, ..., 511.5.
    x = np.arange(size) - (size - 1) / 2
    xx, yy = np.meshgrid(x, x)
    d = np.sqrt(xx**2 + yy**2)
    high_pass = 1 - (1. / (1 + (d / d1)**(2 * n)))
    low_pass = 1 / (1 + (d / d2)**(2 * n))
    return high_pass * low_pass

#### Define the floc proxy function
The floc proxy is simply the number of pixels in each filtered subimage whose magnitude (absolute value) is greater than 4000.

In [None]:
def frame_filter(frame, d1, d2, n):
    """Band-pass filter the top-left 1024x1024 subimage of a frame via the FFT.

    ``frame`` may be 2-D, or 3-D with a leading axis of length 1 (that single
    plane is then used). ``d1``, ``d2`` and ``n`` are forwarded to
    ``butterworth``.

    Returns the inverse 2-D FFT of the filtered spectrum; callers in this
    notebook take ``np.absolute`` of it.
    """
    has_singleton_lead = frame.ndim == 3 and frame.shape[0] == 1
    if has_singleton_lead:
        subimage = np.squeeze(frame[0, 0:1024, 0:1024])
    else:
        subimage = frame[0:1024, 0:1024]
    response = butterworth(d1, d2, n)
    # Shift the spectrum before multiplying by the response, then undo the
    # shift before transforming back.
    centred_spectrum = np.fft.fftshift(np.fft.fft2(subimage))
    filtered_spectrum = np.fft.ifftshift(centred_spectrum * response)
    return np.fft.ifft2(filtered_spectrum)

In [None]:
# this is an arbitrary threshold that seems to work
threshold = 4000


def frame_thresh(frame, d1, d2, n, threshold):
    """Return a boolean mask of filtered pixels whose magnitude exceeds ``threshold``.

    ``frame``, ``d1``, ``d2`` and ``n`` are forwarded to ``frame_filter``; the
    mask is True where the absolute value of the filtered result is strictly
    greater than ``threshold``.
    """
    magnitude = np.absolute(frame_filter(frame, d1, d2, n))
    return np.array(magnitude > threshold)

In [None]:
def calc_floc_proxy(frame, d1, d2, n, threshold=4000):
    """Count the filtered-subimage pixels whose magnitude exceeds ``threshold``.

    ``frame``, ``d1``, ``d2`` and ``n`` are forwarded to ``frame_filter``.
    Returns a one-element numpy array holding the count.

    This body replaces a placeholder that called ``np.array()`` with no
    argument, which raises TypeError.
    """
    # NOTE: a cell further down defines calc_floc_proxy again; after running
    # the notebook top to bottom, that later definition is the one in effect.
    I_filt = frame_filter(frame, d1, d2, n)
    return np.array([(np.absolute(I_filt) > threshold).sum()])

In [None]:
def calc_floc_proxy(frame, d1, d2, n, threshold=4000):
    """Count the filtered-subimage pixels whose magnitude exceeds ``threshold``.

    Parameters
    ----------
    frame, d1, d2, n
        Forwarded unchanged to ``frame_filter``.
    threshold
        Strict lower bound on the absolute value of a filtered pixel for it to
        be counted. Defaults to 4000, the value previously hard-coded here.

    Returns
    -------
    numpy.ndarray of shape ``(1,)`` containing the pixel count.
    """
    I_filt = frame_filter(frame, d1, d2, n)
    return np.array([(np.absolute(I_filt) > threshold).sum()])

## Working backwards

In [None]:
# Build one delayed floc-proxy task per sampled frame of the deployment-5
# scene windows.
floc_proxy = []
for i, row in scene_windows[scene_windows.deployment == 5].iterrows():
    # Use a dedicated name here: assigning to `filename` would overwrite the
    # module-level `filename` list that scene_windows was built from.
    video_url = row.url
    # One moov-atom task per video, shared by all of that video's frame tasks.
    delayed_moov_atom = delayed(get_moov_atom_timeout)(video_url)
    for frame_number in row.frame_list:
        delayed_frame = delayed(get_frame_timeout)(video_url, frame_number, 'gray16le', delayed_moov_atom)
        # calc_floc_proxy requires (frame, d1, d2, n) and does its own
        # filtering and thresholding, so it is given the raw frame. Passing it
        # a single pre-thresholded mask, as before, fails with a TypeError
        # (missing arguments) when the task is computed.
        floc_proxy.append(delayed(calc_floc_proxy)(delayed_frame, d1, d2, n))
floc_proxy[0]

In [None]:
from dask.distributed import Client
# Connect a distributed client to the cluster object created near the top of
# the notebook.
client = Client(cluster)
client  # last expression: display the client as the cell output

In [None]:
from dask import compute

In [None]:
%%time
# Evaluate every delayed floc-proxy task in one call; compute() returns a
# tuple with one result per task, in the order of floc_proxy.
floc_volume = compute(*floc_proxy)

In [None]:
import datetime, math
import matplotlib.dates as dates

In [None]:
# Parallel lists, one entry per sampled frame, in the same row/frame order
# used when the floc-proxy tasks were built.
url = []
frame_datenum = []
frame_datetime = []
frame_numbers = []

# Divisor that converts a frame number into seconds elapsed since the start
# of its video.
FRAME_RATE = 29.97

dbcamhd = pd.read_json('/home/jovyan/floc_gsa_2019/dbcamhd.json', orient="records", lines=True)
for i, row in scene_windows[scene_windows.deployment == 5].iterrows():
    if len(row.frame_list) == 0:
        continue  # nothing to timestamp, so skip the lookup as well
    # The video's start timestamp is the same for every frame of the row, so
    # look it up once per row instead of once per frame.
    video_start = dbcamhd['timestamp'][dbcamhd.filename == row.url].iloc[0]
    for frame_number in row.frame_list:
        url.append(row.url)
        frame_epoch_seconds = video_start + frame_number / FRAME_RATE
        # NOTE(review): fromtimestamp() without a tz argument yields a naive
        # datetime in the machine's local timezone — confirm that is intended
        # rather than UTC.
        frame_datetime.append(datetime.datetime.fromtimestamp(frame_epoch_seconds))
        frame_datenum.append(dates.date2num(frame_datetime[-1]))
        frame_numbers.append(frame_number)

In [None]:
# One row per sampled frame: where it came from, when it was recorded, and its
# computed floc value.
regions_results = pd.DataFrame(
    dict(
        url=url,
        frame_number=frame_numbers,
        timestamp=frame_datenum,
        datetime=frame_datetime,
        floc=floc_volume,
    )
)
regions_results.head()

# Save Data as a pickle file

In [None]:
import pickle

In [None]:
# Persist the results table next to the notebook as a binary pickle.
with open('total_floc_for_dep_5.pickle', 'wb') as pickle_file:
    pickle.dump(regions_results, pickle_file)