### In this notebook we explore the data lightly and create some preliminary plots

In [None]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import collections
import os

# Data folders, located relative to the directory this notebook runs from.
data_root_path = os.path.join("..", "data")
processed_folder_path = os.path.join(data_root_path, "processed")
unprocessed_folder_path = os.path.join(data_root_path, "unprocessed")

## Plot data density

### NDSI and NDVI data density over time

In [None]:
# Both index tables are read the same way: the first CSV column becomes the
# index and the "date" column is parsed into datetimes.
ndsi_csv_path = os.path.join(processed_folder_path, "NDSI.csv")
ndvi_csv_path = os.path.join(processed_folder_path, "NDVI.csv")
index_csv_options = dict(index_col=0, parse_dates=["date"])

df_NDSI = pd.read_csv(ndsi_csv_path, **index_csv_options)
df_NDVI = pd.read_csv(ndvi_csv_path, **index_csv_options)

In [None]:
def plot_data_density(dfs, labels):
    """Plot the number of rows per calendar year and per calendar month.

    Two figures are shown, one after the other: row counts grouped by the
    year of the "date" column, then row counts grouped by its month. Each
    dataframe is drawn as one labelled line in both figures.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to summarise. Each needs a datetime-typed "date" column,
        because it is accessed through the ``.dt`` accessor.
    labels : iterable of str
        Legend labels, paired with ``dfs`` by position (extra items on
        either side are ignored by ``zip``).
    """
    # Figure 1: rows per year.
    plt.figure(figsize=(14, 8))
    for df, label in zip(dfs, labels):
        plt.plot(df.groupby(df["date"].dt.year).size(), label=label)

    plt.title("Frequency of years in data")
    plt.xlabel("Year")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

    # Figure 2: rows per month.
    plt.figure(figsize=(14, 8))
    for df, label in zip(dfs, labels):
        plt.plot(df.groupby(df["date"].dt.month).size(), label=label)

    # The grouped counts are indexed by month number (1-12) and that index is
    # used as the x coordinate, so the ticks must sit on 1..12. Placing them on
    # 0..11 with labels 1..12 would shift every label one month to the left.
    plt.xticks(range(1, 13))
    plt.title("Frequency of months in data")
    plt.xlabel("Month")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

In [None]:
# Compare how densely the two index datasets cover each year and month.
plot_data_density(dfs=[df_NDSI, df_NDVI], labels=["NDSI", "NDVI"])

## River Flow Data
### Data Density

In [None]:
# Load the DGA table and keep only the rows dated 1965 or later.
df_DGA = (
    pd.read_csv(
        os.path.join(processed_folder_path, "DGA.csv"),
        index_col=0,
        parse_dates=["date"],
    )
    .loc[lambda dga: dga["date"].dt.year >= 1965]
)

plot_data_density(dfs=[df_DGA], labels=["DGA"])

### Monthly Flow Data

We convert the data from a daily scale to a monthly scale by calculating the mean and the median per month.

In [None]:
# Group the DGA rows by calendar month (year-month period) once, then derive
# both monthly aggregates from the same grouping.
dga_by_month = df_DGA.groupby(pd.PeriodIndex(df_DGA['date'], freq="M"))

monthly_flow_data_mean = dga_by_month[['river_flow', 'river_height']].mean()
monthly_flow_data_median = dga_by_month['river_flow'].median()

# Move the month index back into an ordinary column.
flow_mean_df = monthly_flow_data_mean.reset_index()

#### Below we plot the yearly average measurements of all water years, distinguishing between the types of years (dry, normal or wet).

In [None]:
# Split the measurements by the year_type code used in this cell:
# -1 for dry, 0 for normal and 1 for wet years.
dry_year_rows = df_DGA.loc[df_DGA["year_type"] == -1]
normal_year_rows = df_DGA.loc[df_DGA["year_type"] == 0]
wet_year_rows = df_DGA.loc[df_DGA["year_type"] == 1]

# Overall mean river flow per year type, rounded to three decimals.
print("Average River Flow Measurements:")

print("Dry Years:   ", round(dry_year_rows["river_flow"].mean(), 3))
print("Normal Years:", round(normal_year_rows["river_flow"].mean(), 3))
print("Wet Years:   ", round(wet_year_rows["river_flow"].mean(), 3))

# Options shared by the three bar charts.
# NOTE(review): the y label states cubic feet / sec - confirm this matches the
# units of the river_flow column.
discharge_bar_options = dict(
    figsize=(8, 6),
    xlabel='Water Year',
    ylabel='Discharge (cubic feet / sec)',
    legend=False,
)

# One bar per water year, showing that year's mean river flow.
dry_discharge_rate = dry_year_rows.groupby("water_year")[["river_flow"]].mean()
dry_discharge_rate.plot.bar(title="Dry Years", **discharge_bar_options)

normal_discharge_rate = normal_year_rows.groupby("water_year")[["river_flow"]].mean()
normal_discharge_rate.plot.bar(title="Normal Years", **discharge_bar_options)

wet_discharge_rate = wet_year_rows.groupby("water_year")[["river_flow"]].mean()
wet_discharge_rate.plot.bar(title="Wet Years", **discharge_bar_options)

plt.show()

### Below we merge the monthly NDSI and NDVI averages into the `monthly_flow_data_mean` dataframe

In [None]:
# Month keys (year-month periods) for each of the three datasets.
flow_month_key = pd.PeriodIndex(df_DGA['date'], freq="M")
ndsi_month_key = pd.PeriodIndex(df_NDSI.date, freq="M")
ndvi_month_key = pd.PeriodIndex(df_NDVI.date, freq="M")

# Monthly averages: river measurements, and the "avg" column of each index table.
monthly_flow_data_mean = df_DGA.groupby(flow_month_key)[['river_flow', 'river_height']].mean()
monthly_ndsi_data_mean = df_NDSI.groupby(ndsi_month_key)["avg"].mean()
monthly_ndvi_data_mean = df_NDVI.groupby(ndvi_month_key)["avg"].mean()

# Move the month index back into a column; give each "avg" a distinct name so
# the two index tables do not collide when merged.
flow_mean_df = monthly_flow_data_mean.reset_index()
ndsi_mean_df = monthly_ndsi_data_mean.reset_index().rename(columns={"avg": "ndsi_avg"})
ndvi_mean_df = monthly_ndvi_data_mean.reset_index().rename(columns={"avg": "ndvi_avg"})

# No key is passed, so pandas joins on the columns the frames share
# (presumably only the month column - verify) using its default inner join.
ndsi_ndvi_mean_df = ndsi_mean_df.merge(ndvi_mean_df)

# Attach the index averages to the monthly river flow / height table.
monthly_flow_data_mean = flow_mean_df.merge(ndsi_ndvi_mean_df)