### In this notebook we explore the data lightly and create some preliminary plots

In [None]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import collections
import os

# Folders holding the processed and unprocessed datasets, expressed relative
# to the current working directory.
processed_folder_path, unprocessed_folder_path = (
    os.path.join("..", "data", stage) for stage in ("processed", "unprocessed")
)

## Plot data density

### NDSI and NDVI data density over time

In [None]:
# Load the processed NDSI and NDVI tables with identical options: the first
# CSV column becomes the index, "date" is parsed as a datetime and the
# watershed code is read as a string.
df_NDSI, df_NDVI = (
    pd.read_csv(
        os.path.join(processed_folder_path, file_name),
        index_col=0,
        parse_dates=["date"],
        dtype={"Subsubwatershed": str},
    )
    for file_name in ("NDSI.csv", "NDVI.csv")
)

In [None]:
def plot_data_density(dfs, labels):
    """
    plot_data_density
    Draws two line charts: the number of rows per calendar year and the
    number of rows per calendar month, with one line per dataframe.
    Args:
        dfs: dataframes to plot, each with a datetime column named "date"
        labels: legend labels, paired with dfs by position
    """
    # Rows per year
    plt.figure(figsize=(14, 8))
    for df, label in zip(dfs, labels):
        plt.plot(df.groupby(df["date"].dt.year).size(), label=label)

    plt.title("Frequency of years in data")
    plt.xlabel("Year")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

    # Rows per month
    plt.figure(figsize=(14, 8))
    for df, label in zip(dfs, labels):
        plt.plot(df.groupby(df["date"].dt.month).size(), label=label)

    # The counts are indexed by month number (1-12), so the ticks have to sit
    # at 1..12. Placing them at 0..11 with labels 1..12 would show every
    # month under the label of the following month.
    plt.xticks(range(1, 13))
    plt.title("Frequency of months in data")
    plt.xlabel("Month")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

In [None]:
# Compare how many NDSI and NDVI rows exist per year and per month.
plot_data_density(
    dfs=[df_NDSI, df_NDVI],
    labels=["NDSI", "NDVI"],
)

## River Flow Data
### Data Density

In [None]:
# Load the river flow table and keep only the rows dated 1965 or later.
df_DGA = pd.read_csv(
    os.path.join(processed_folder_path, "DGA.csv"),
    index_col=0,
    parse_dates=["date"],
).loc[lambda frame: frame["date"].dt.year >= 1965]

plot_data_density(dfs=[df_DGA], labels=["DGA"])

### Monthly Flow Data

We convert the data from a daily scale to a monthly scale by calculating the mean and the median per month.

In [None]:
# Month of every measurement, used as the grouping key for both aggregates.
_flow_months = pd.PeriodIndex(df_DGA['date'], freq="M")

# Mean flow and height per month, and median flow per month.
monthly_flow_data_mean = df_DGA.groupby(_flow_months)[['river_flow', 'river_height']].mean()
monthly_flow_data_median = df_DGA.groupby(_flow_months)['river_flow'].median()

# Same means with the month as a regular column instead of the index.
flow_mean_df = monthly_flow_data_mean.reset_index()

In [None]:
# Sub-sub-watershed codes that are kept for the analysis.
station_list = ["03400", "03401", "03402",
                "03403", "03404", "03410",
                "03411", "03412", "03413",
                "03414", "03420", "03421"]

# Filter with the boolean mask directly. Going through the index labels
# (index.isin(labels of matching rows)) would also keep rows of other
# watersheds whenever an index label occurs more than once.
df_NDSI = df_NDSI[df_NDSI.Subsubwatershed.isin(station_list)]
df_NDVI = df_NDVI[df_NDVI.Subsubwatershed.isin(station_list)]

# Index labels of the retained rows, kept under their original names.
keep_rows_ndsi = df_NDSI.index
keep_rows_ndvi = df_NDVI.index

In [None]:
# List the column names of the NDSI dataframe.
print(df_NDSI.columns)

### Below we define the functions that merge the NDSI and NDVI dataframes into the monthly mean river flow data

In [None]:
def generate_lags(df, values, n_lags):
    """
    generate_lags
    Returns a copy of df extended with lagged versions of the given columns.
    For every column c in values and every n from 1 to n_lags a column
    "c_n" is added that holds c shifted down by n rows. The first n_lags
    rows are removed from the result.
    Args:
        df: dataframe to lag (left unmodified)
        values: names of the columns to lag
        n_lags: number of rows to lag by
    """
    lagged = df.copy()
    lag_steps = range(1, n_lags + 1)

    for column in values:
        source_name = f"{column}"
        for step in lag_steps:
            lagged[f"{column}_{step}"] = lagged[source_name].shift(step)

    # Drop the leading rows, whose lagged columns were filled by the shift
    return lagged.iloc[n_lags:]

def generate_cyclical_features(df, col_name, period, start_num=0):
    """
    generate_cyclical_features
    Replaces a column by its sine and cosine encoding. The columns
    "sin_<col_name>" and "cos_<col_name>" are computed from the angle
    2 * pi * (value - start_num) / period, after which col_name is dropped.
    Args:
        df: dataframe containing col_name (left unmodified)
        col_name: name of the column to encode
        period: divisor of the angle formula above
        start_num: value subtracted from the column before scaling
    """
    angle = 2 * np.pi * (df[col_name] - start_num) / period

    encoded = df.assign(**{
        f"sin_{col_name}": np.sin(angle),
        f"cos_{col_name}": np.cos(angle),
    })

    return encoded.drop(columns=[col_name])


def gather_ndsi_ndvi_data(watersheds=None):
    """
    Loads the processed NDSI and NDVI tables and keeps only the rows that
    belong to the requested sub-sub-watersheds.
    Args:
        watersheds: list of strings denoting what watersheds to use from
            data; when None the default list of twelve codes below is used
    Returns:
        tuple (df_NDSI, df_NDVI) of filtered pd.DataFrames
    """
    processed_folder_path = os.path.join("..", "data", "processed")

    # Identity check: "== None" would be evaluated element-wise if an array
    # were passed.
    if watersheds is None:
        watersheds = ["03400", "03401", "03402",
                      "03403", "03404", "03410",
                      "03411", "03412", "03413",
                      "03414", "03420", "03421"]

    filtered = []
    for file_name in ("NDSI.csv", "NDVI.csv"):
        # The watershed code is read as a string so it is not parsed as a number
        df = pd.read_csv(os.path.join(processed_folder_path, file_name),
                         index_col=0, parse_dates=["date"],
                         dtype={"Subsubwatershed": str})

        # Only preserve rows inside subsubwatershed list. The mask is applied
        # directly: filtering through the index labels of the matching rows
        # would also keep non-matching rows that share a duplicated label.
        filtered.append(df[df["Subsubwatershed"].isin(watersheds)])

    df_NDSI, df_NDVI = filtered

    return df_NDSI, df_NDVI


def aggregate_ndsi_ndvi_area_data(df_NDSI, df_NDVI, column):
    """
    This function aggregates an area column to a monthly scale: the column
    is summed per day and the daily sums are averaged per month, separately
    for NDSI and NDVI. Both results are merged on their shared "date" column.
    Args:
        df_NDSI: dataframe containing filtered NDSI values
        df_NDVI: dataframe containing filtered NDVI values
        column: column name to aggregate, must contain 'Surf'
    Returns:
        pd.DataFrame with the columns "date" (monthly period),
        "ndsi_<column>" and "ndvi_<column>"
    Raises:
        ValueError: if 'Surf' does not occur in column
    """

    if "Surf" not in column:
        # ValueError is a builtin; the previously used InputError is not
        # defined anywhere, so this branch ended in a NameError instead.
        raise ValueError("'Surf' must be found in column name, otherwise it is not an area column")

    def monthly_mean_of_daily_sums(df, prefix):
        # Take sum of each day and average over the months to aggregate area data
        daily_sum = df.groupby(
                        pd.PeriodIndex(df.date, freq="D")
                    )[[column]].sum()
        monthly_mean = daily_sum.groupby(
                        pd.PeriodIndex(daily_sum.index, freq="M")
                    )[[column]].mean()

        # Prefix the value column so both sources can live in one dataframe
        return monthly_mean.reset_index().rename(
            {column: f"{prefix}_{column}"}, axis="columns")

    surf_ndsi_mean_df = monthly_mean_of_daily_sums(df_NDSI, "ndsi")
    surf_ndvi_mean_df = monthly_mean_of_daily_sums(df_NDVI, "ndvi")

    # Inner merge on the only shared column, "date"
    surf_ndsi_ndvi_df = pd.merge(surf_ndsi_mean_df, surf_ndvi_mean_df)

    return surf_ndsi_ndvi_df

def aggregate_ndsi_ndvi_data(df_NDSI, df_NDVI, area=False, cloud=False):
    """
    Returns the NDSI and NDVI data aggregated to one row per month. The
    "avg" column of each source is averaged per month and both results are
    merged into a single dataframe. A summary of the result is printed.
    Args:
        df_NDSI: dataframe containing filtered NDSI values
        df_NDVI: dataframe containing filtered NDVI values
        area: if True, the aggregated "Surfavg" columns are merged in
        cloud: if True, the aggregated "Surfcloudavg" columns are merged in
    """

    def monthly_average(frame, new_name):
        # Mean of "avg" per month, renamed so the two sources do not clash
        month_key = pd.PeriodIndex(frame.date, freq="M")
        averaged = frame.groupby(month_key)[["avg"]].mean().reset_index()
        return averaged.rename({"avg": new_name}, axis="columns")

    # Merge ndvi and ndsi monthly averages into one dataframe
    combined = pd.merge(monthly_average(df_NDSI, "ndsi_avg"),
                        monthly_average(df_NDVI, "ndvi_avg"))

    # Optional area columns, merged in the order area first, cloud second
    extra_columns = []
    if area:
        extra_columns.append("Surfavg")
    if cloud:
        extra_columns.append("Surfcloudavg")

    for extra in extra_columns:
        extra_df = aggregate_ndsi_ndvi_area_data(df_NDSI, df_NDVI, extra)
        combined = pd.merge(combined, extra_df)

    print(combined.describe())

    return combined


def merge_flow_ndsi_ndvi_df(df_features, area=False, cloud=False):
    """
    Loads and aggregates the NDSI/NDVI data for a fixed list of watersheds
    and left-merges it into df_features. Rows without a "river_flow" value
    are dropped and every remaining missing value is replaced by -1.
    Args:
        df_features: dataframe to merge the aggregated NDSI/NDVI data into
        area: passed on to aggregate_ndsi_ndvi_data
        cloud: passed on to aggregate_ndsi_ndvi_data
    """
    watersheds = [
        "03400", "03401", "03402",
        "03403", "03404", "03410",
        "03411", "03412", "03413",
        "03414", "03420", "03421",
    ]
    ndsi_rows, ndvi_rows = gather_ndsi_ndvi_data(watersheds=watersheds)
    monthly_indices = aggregate_ndsi_ndvi_data(ndsi_rows, ndvi_rows, area=area, cloud=cloud)

    # NOTE(review): the `downcast` keyword of fillna appears to be deprecated
    # in recent pandas releases - confirm against the installed version.
    return (
        df_features
        .merge(monthly_indices, how="left")
        .dropna(subset=["river_flow"])
        .fillna(-1, downcast="infer")
    )

def gather_river_flow_data(lag=6, time_features=False, index_features=False, index_area_features=False, index_cloud_features=False):
    """
    This function builds the feature dataframe from the processed river
    flow data: the monthly mean river flow from 1965 onwards together with
    its lagged values, optionally extended with time and NDSI/NDVI features.
    A summary of the monthly mean flow is printed.

    Args:
        int lag: amount of time lag to be used as features
        bool time_features: use (cyclical) time as a feature
        bool index_features: merge in the monthly NDSI/NDVI averages and
            replace them by their lagged values
        bool index_area_features: also include the lagged "Surfavg"
            columns (only evaluated when index_features is True)
        bool index_cloud_features: also include the lagged "Surfcloudavg"
            columns (only evaluated when index_features is True)
    """
    processed_folder_path = os.path.join("..", "data", "processed")

    # Import river flow data and only preserve datapoints after 1965
    dga_csv = os.path.join(processed_folder_path, "DGA.csv")
    df_DGA = pd.read_csv(dga_csv, index_col=0, parse_dates=["date"])
    from_1965 = df_DGA["date"].dt.year >= 1965
    df_DGA = df_DGA.loc[from_1965]

    # Extract average monthly river flow
    month_key = pd.PeriodIndex(df_DGA['date'], freq="M")
    flow_mean_df = df_DGA.groupby(month_key)['river_flow'].mean().reset_index()

    print(flow_mean_df.describe())

    df_features = generate_lags(flow_mean_df, ["river_flow"], lag)

    # Add time as feature if boolean is True
    if time_features:
        df_features = df_features.assign(month=df_features.date.dt.month)
        df_features = generate_cyclical_features(df_features, "month", 12, 1)

    if not index_features:
        return df_features

    df_features = merge_flow_ndsi_ndvi_df(df_features, area=index_area_features, cloud=index_cloud_features)

    # Column pairs to lag, handled one pair after the other
    lag_groups = [["ndsi_avg", "ndvi_avg"]]
    if index_area_features:
        lag_groups.append(["ndsi_Surfavg", "ndvi_Surfavg"])
    if index_cloud_features:
        lag_groups.append(["ndsi_Surfcloudavg", "ndvi_Surfcloudavg"])

    # Lag every pair according to the same lag parameter and remove the
    # current month, as this cannot be used as a feature
    for pair in lag_groups:
        df_features = generate_lags(df_features, pair, lag)
        df_features = df_features.drop(columns=pair)

    return df_features

### Below we show a description of the aggregated data

In [None]:
# Build the feature table with a lag of 6 and every optional feature group
# switched on.
df_features = gather_river_flow_data(
    lag=6,
    time_features=True,
    index_features=True,
    index_area_features=True,
    index_cloud_features=True,
)