# Tutorial: Data class

In [None]:
import sys
from datetime import datetime
import numpy as np
import dask
import xarray as xr

In [None]:
# Connect to a Dask distributed scheduler.
from dask.distributed import Client

# NOTE(review): the address is hard-coded to localhost port 49686; presumably
# this was the scheduler of the author's own session -- replace it with the
# address of your running scheduler before executing this cell.
client = Client("tcp://127.0.0.1:49686")
client  # last expression of the cell, so the notebook displays it

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2
sys.path.insert(0, '../')
from clouddrift.data import erddap, local

# Remote ERDDAP server [(link)](https://data.pmel.noaa.gov/generic/erddap/tabledap/gdp_hourly_velocities.html)

## Unique drifter

In [None]:
# Get a single drifter from ERDDAP, selected by its ID.
e = erddap()
e.retrieve_drifter(10050130)
e.print_constraints()

In [None]:
%%time
ds = e.to_xarray()
ds

## List of drifters

In [None]:
# Get several drifters from ERDDAP, one drifter ID at a time.
e = erddap()

drifter_ids = [10050130, 10051120]

# Map each drifter ID to the result of e.to_xarray() for that drifter.
dfs = {}
for i in drifter_ids:
    e.retrieve_drifter(i)
    dfs[i] = e.to_xarray()

In [None]:
dfs[10050130]

In [None]:
dfs[10051120]

## Region

In [None]:
e2 = erddap()

# Region of interest: two-element longitude and latitude lists
# (presumably [min, max] bounds in degrees -- TODO confirm against retrieve_region).
lon = [-98, -78]
lat = [18, 31]
# Time window as ISO-8601 strings with a trailing "Z": 2015-01-01 to 2020-12-31.
day0 = "%sZ" % datetime(2015,1,1).isoformat()
day1 = "%sZ" % datetime(2020,12,31).isoformat()
time = [day0, day1]

e2.retrieve_region(lon, lat, time)

e2.print_constraints()

In [None]:
%%time
ds2 = e2.to_xarray()
ds2

# Local

In [None]:
path_clouddrift = '../data/process/gdp_v2.00_obs.nc'
path_traj_clouddrift = '../data/process/gdp_v2.00_traj.nc'

In [None]:
%%time
l = local(path_clouddrift, path_traj_clouddrift)

In [None]:
l.ds_obs.ve

In [None]:
%%time
l.ds_obs.ve.mean().compute()

## Unique drifter

In [None]:
%%time
ds = l.retrieve_drifter(101509)

In [None]:
ds

## List of drifters

In [None]:
%%time
ds = l.retrieve_drifter([101509, 9927907])

In [None]:
ds

In [None]:
%%time
# Retrieve 50 random drifters.
# Sample trajectory positions WITHOUT replacement so no drifter is picked twice
# (np.random.randint may repeat an index and yield fewer than 50 distinct IDs).
n_random = min(50, l.number_traj)  # cannot draw more distinct drifters than exist
random_rows = np.random.choice(l.number_traj, size=n_random, replace=False)
idx_to_retrieve = l.ds_traj['ID'][random_rows].values
ds = l.retrieve_drifter(idx_to_retrieve)

In [None]:
ds

## Region

In [None]:
%%time

# Same longitude/latitude lists and time window as in the ERDDAP region
# example, this time passed to the local dataset object.
lon = [-98, -78]
lat = [18, 31]
# ISO-8601 strings with a trailing "Z": 2015-01-01 to 2020-12-31.
day0 = "%sZ" % datetime(2015,1,1).isoformat()
day1 = "%sZ" % datetime(2020,12,31).isoformat()
time = [day0, day1]
ds2 = l.retrieve_region(lon, lat, time)
ds2

# Testing

## groupby and map()

In [None]:
%%time
# Grouping the full observation dataset is left disabled for now:
#grouped_traj = l.ds_obs.groupby('ids')

# Use the smaller regional subset (ds2) instead, grouped by 'ids'.
grouped_traj = ds2.groupby('ids')

In [None]:
grouped_traj

In [None]:
keys = list(grouped_traj.groups.keys())

In [None]:
grouped_traj[keys[150]]

In [None]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : scalar or array-like
        Longitude and latitude of the second point(s), in decimal degrees.

    All args must be of equal length.

    Returns
    -------
    Distance(s) in kilometres, computed on a sphere of radius 6371 km.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points, where sin^2 + cos^2 may evaluate to 1 + 2e-16); sqrt /
    # arcsin would then return NaN, so clamp it back into the valid range.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    # distance
    earth_radius = 6371  # km
    return 2 * np.arcsin(np.sqrt(a)) * earth_radius  # km

In [None]:
def distance(ds):
    """Return the haversine distance between consecutive positions of `ds`.

    Element k of `ds.longitude` / `ds.latitude` is paired with element k + 1,
    so the inputs handed to `haversine` are one element shorter than the
    coordinate arrays.
    """
    # NOTE(review): if longitude/latitude are xarray objects carrying an index
    # on their dimension, the subtraction inside haversine aligns by label
    # rather than by position -- confirm this is what is wanted.
    lon, lat = ds.longitude, ds.latitude
    start_lon, start_lat = lon[:-1], lat[:-1]
    end_lon, end_lat = lon[1:], lat[1:]
    return haversine(start_lon, start_lat, end_lon, end_lat)

In [None]:
grouped_traj

In [None]:
grouped_traj.mean()

In [None]:
distance(ds2)

In [None]:
grouped_traj.map(distance)

In [None]:
def mean(obj, dim):
    """Average `obj` along dimension `dim` by running np.mean through xr.apply_ufunc."""
    # apply_ufunc always moves core dimensions to the end, so np.mean has to
    # reduce over the last axis.
    core_dims = [[dim]]
    reduce_last_axis = {"axis": -1}
    return xr.apply_ufunc(
        np.mean,
        obj,
        input_core_dims=core_dims,
        kwargs=reduce_last_axis,
    )

In [None]:
ds2['ve']

In [None]:
# Pass the dimension *name*: ds2.dims['obs'] is the dimension's length (an
# int), which is not a valid core dimension for xr.apply_ufunc.
mean(ds2['ve'], 'obs')

In [None]:
def mean_traj(ds):
    """Return the mean of `ds` over its 'obs' dimension.

    Intended to be applied to one trajectory group at a time, e.g.
    ``grouped_traj.apply(mean_traj)``.

    Parameters
    ----------
    ds : object exposing ``.mean(dim)`` (presumably an xarray Dataset or
        DataArray holding the observations of one trajectory -- TODO confirm).

    Returns
    -------
    The result of ``ds.mean("obs")``.
    """
    # The previous body built an unrelated xr.DataArray from undefined names
    # (`temperature`, `reference_time`), ignored `ds` and returned None.
    return ds.mean("obs")

In [None]:
grouped_traj

In [None]:
grouped_traj.apply(mean_traj)

In [None]:
# Mean longitude of each trajectory.
value = np.zeros(l.number_traj)

# l.traj_idx[i] and l.traj_idx[i + 1] presumably delimit the observations of
# trajectory i (the same bounds are used with isel(obs=slice(...)) elsewhere in
# this notebook), so average over that whole slice. Indexing with
# l.traj_idx[i:i+1] would select a single observation only.
for i in range(len(l.traj_idx) - 1):
    r = slice(l.traj_idx[i], l.traj_idx[i + 1])
    value[i] = l.ds_obs['longitude'][r].mean()

In [None]:
%%time
mean_traj = grouped_traj.mean('obs')

In [None]:
def midpoint_trajectory():
    """Return the mean of `ds.longitude`."""
    # `ds` is not a parameter: it is looked up in the enclosing (notebook) scope
    # when the function is called.
    mean_longitude = ds.longitude.mean()
    return mean_longitude

In [None]:
%%time
grouped_traj_longitude = l.ds_obs['longitude'].groupby('ids')

In [None]:
grouped_traj_longitude

In [None]:
grouped_traj_longitude[0]

In [None]:
%%time
grouped_traj_longitude.mean()

In [None]:
%%time
grouped_traj.mean('longitude')

In [None]:
%%time
grouped_traj = l.ds_obs[["ve", "vn"]].groupby('ids')

In [None]:
grouped_traj

In [None]:
grouped_traj[list(grouped_traj.groups.keys())[150]]

In [None]:
%%time
grouped_traj.map(lambda ve: ve - ve.mean(), shortcut=False)

In [None]:
grouped_traj[0].isel(traj=[100])

In [None]:
grouped_traj[1]

In [None]:
grouped_traj[100].isel(traj=[100])

# Manually apply a function to each trajectory

In [None]:
def distance(i):
    """Return the great-circle distance (km) between the first and last
    positions of trajectory `i`.

    Parameters
    ----------
    i : int
        Position of the trajectory; observations l.traj_idx[i] up to (but not
        including) l.traj_idx[i + 1] are selected.

    Returns
    -------
    The value returned by `haversine` for the first and last points.

    Notes
    -----
    This redefines the earlier `distance(ds)` helper of the notebook.
    The haversine formula is delegated to the shared `haversine` function so
    that both code paths use the same Earth radius (6371 km); this function
    previously re-implemented the formula with 6367 km.
    """
    # NOTE(review): other cells use l.ds_obs / l.ds_traj -- confirm that the
    # `local` object really exposes a combined dataset as `l.ds`.
    ds_subset = l.ds.isel(traj=[i], obs=slice(l.traj_idx[i], l.traj_idx[i + 1]))

    # get first and last point
    lon, lat = ds_subset.longitude, ds_subset.latitude
    return haversine(lon[0], lat[0], lon[-1], lat[-1])

In [None]:
%%time
a = distance(10)

In [None]:
a.compute()

In [None]:
import dask.bag as db
b = db.from_sequence(range(10), npartitions=10).map(distance)

In [None]:
b

In [None]:
%%time
d = b.compute()

In [None]:
d

In [None]:
import matplotlib.pyplot as plt 
plt.plot(d)