# Preparation of the growing season length dataset

The data is described in Donat et al. ([2013](http://onlinelibrary.wiley.com/doi/10.1002/jgrd.50150/abstract)) and was obtained from http://www.climdex.org/.

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import xarray as xr

In [None]:
# absolute local path of the raw HadEX2 file; adjust it to where the file is stored
fN = "/home/mathause/teaching/2018_PyVisWorkshop/rawdata/HadEX2_GSL.nc"

# open the netCDF file as an xarray Dataset
ds = xr.open_dataset(fN)

In [None]:
# show the dataset as read from the raw file
ds

In [None]:
# the variable is called "Ann" in the raw file; give it the descriptive name "GSL"
ds = ds.rename({"Ann": "GSL"})
ds

In [None]:
# the time axis is not CF-compliant...
ds.time[:10]

In [None]:
# use pandas to convert the time axis (the values are parsed as YYYYMMDD)
time = pd.to_datetime(ds.time, format="%Y%m%d")

# alternatively: do it manually
# time = pd.date_range('01.01.1901', '01.01.2010', freq='AS')

# replace the original time coordinate with the converted one
ds["time"] = time

# check the first ten converted time steps
ds.time[:10]

In [None]:
# select the years 1956 to 2005 (inclusive)
ds = ds.sel(time=slice("1956", "2005"))

# show the dataset reduced to the selected period
ds

Create a function that calculates the regression slope for one pixel. Then apply the function to each lon/lat pixel. Based on an [example](https://gist.github.com/rabernat/bc4c6990eb20942246ce967e6c9c3dbe) by [Ryan Abernathey](https://github.com/rabernat).

In [None]:
# the linear regression can be very unstable if we use large x values
# (i.e. the years 1956 to 2005), so we regress on a zero-based index instead.
# its length is derived from the data so that it always matches the selected period
time_ind = np.arange(ds.time.size)


def xr_regression(y):
    """Fit a linear trend to the time series of a single grid point.

    ``y`` is regressed on the module-level ``time_ind``; missing values are
    dropped before fitting.

    Parameters
    ----------
    y : object with a ``.values`` array (presumably an xr.DataArray holding the
        values of one grid point along time -- confirm against the caller)

    Returns
    -------
    xr.DataArray
        Two elements: the slope and its p-value. Both are NaN if ``y``
        contains only NaN.
    """
    values = y.values

    # nothing to fit where every value is missing (e.g. over the ocean)
    if np.isnan(values).all():
        return xr.DataArray([np.nan, np.nan])

    # smf.glm expects a pd DataFrame: one row per time step, one column per variable
    data = pd.DataFrame([values, time_ind], index=["GSL", "time"]).T

    # linear regression using R-like syntax
    result = smf.glm("GSL ~ time", data, missing="drop").fit()

    slope = result.params["time"]
    p_value = result.pvalues["time"]
    return xr.DataArray([slope, p_value])

In [None]:
# flatten the lat/lon grid into a single "allpoints" dimension and drop the
# remaining non-index coordinates
stacked = ds.GSL.stack(allpoints=("lat", "lon")).reset_coords(drop=True)

# run the regression separately for every grid point
coefs = stacked.groupby("allpoints").apply(xr_regression)
# restore the lat/ lon grid
coefs_unstacked = coefs.unstack("allpoints")

In [None]:
# store the slope (dim_0=0) as "trend" and the p-value (dim_0=1) as "p_val"
ds = ds.assign(
    trend=coefs_unstacked.sel(dim_0=0),
    p_val=coefs_unstacked.sel(dim_0=1),
)

In [None]:
# global attributes describing the content and origin of the dataset
ds.attrs = {
    "data": "Growing season length",
    "source": "HadEX2 (http://www.climdex.org/)",
    "reference": "Donat et al., 2013",
}
ds.attrs

In [None]:
# show the final dataset
ds

In [None]:
# save the processed dataset to the current working directory
ds.to_netcdf("./HadEX2_GSL.nc", format="NETCDF4_CLASSIC")