## In this notebook we implement some simple baselines

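Including a small multilayer perceptron (MLP), implemented in PyTorch, that predicts the mean river flow over the second half of a water year from the six monthly mean flows of the first half.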

In [None]:
# Standard imports

%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import collections
import os
from tqdm.notebook import tqdm

### First we load the NDSI, NDVI and DGA data. For NDSI and NDVI we only keep the years from 1965 onwards, as there is insufficient data before 1965.

In [None]:
# Relative path (from the current working directory) to the folder holding the processed CSV files.
processed_folder_path = os.path.join(os.pardir, "data", "processed")

In [None]:
# First calendar year kept for the NDSI / NDVI tables.
FIRST_YEAR = 1965

# Load the three processed tables. Column 0 of each CSV is used as the index
# and the "date" column is parsed to datetime.
df_NDSI = pd.read_csv(os.path.join(processed_folder_path, "NDSI.csv"), index_col=0, parse_dates=["date"])
df_DGA = pd.read_csv(os.path.join(processed_folder_path, "DGA.csv"), index_col=0, parse_dates=["date"])
# Fixed: this previously re-read "NDSI.csv", which made df_NDVI a copy of df_NDSI.
df_NDVI = pd.read_csv(os.path.join(processed_folder_path, "NDVI.csv"), index_col=0, parse_dates=["date"])

# Keep only the rows dated FIRST_YEAR or later.
df_NDSI = df_NDSI.loc[df_NDSI["date"].dt.year >= FIRST_YEAR]
df_NDVI = df_NDVI.loc[df_NDVI["date"].dt.year >= FIRST_YEAR]
# NOTE(review): the same cut-off is left disabled for DGA, so df_DGA keeps all of its years — confirm this is intended.
# df_DGA = df_DGA.loc[df_DGA["date"].dt.year >= 1965]


#### We will use PyTorch to implement some simple neural networks

https://pytorch.org/

First we refine the data and put it into a DataLoader object.

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Bucket the DGA river-flow readings by calendar month and aggregate each bucket.
month_periods = pd.PeriodIndex(df_DGA['date'], freq="M")
flow_by_month = df_DGA.groupby(month_periods)['river_flow']
monthly_flow_data_mean = flow_by_month.mean()
monthly_flow_data_median = flow_by_month.median()

# Turn the monthly means into a flat frame with a datetime "date" column.
mean_df = monthly_flow_data_mean.reset_index()
mean_df["date"] = pd.to_datetime(mean_df["date"].astype("str"))

# Shift every date back by 3 months so that 1 April lands on 1 January, i.e. the
# shifted calendar year coincides with the water year.
# (A ">= 1964" cut-off on the shifted dates is currently disabled.)
mean_df["date"] = mean_df["date"] + pd.DateOffset(months=-3)

X = []
y = []

# Only complete water years (exactly 12 monthly values) are used; 1987, for
# example, appears to be incomplete and is therefore skipped.
water_years = mean_df["date"].dt.year
for year in water_years.unique():
    year_rows = mean_df.loc[water_years == year, "river_flow"]

    if len(year_rows) != 12:
        continue

    # Features: the first six monthly means. Target: the mean of the last six.
    X.append(year_rows.iloc[:6])
    y.append(year_rows.iloc[6:].mean())

X = np.array(X)
y = np.array(y)

In [None]:
class RiverFlowDataset(Dataset):
    """Map-style dataset that pairs each feature row of ``X`` with its target in ``y``.

    Both inputs are converted to float32 NumPy data on construction.
    """

    def __init__(self, X, y):
        # Convert once up front so every item handed out is already single precision.
        self.X, self.y = np.float32(X), np.float32(y)

    def __len__(self):
        """Number of samples, taken from the number of targets."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [None]:
class MLP(nn.Module):
    """Fully connected regression network with ReLU activations between layers.

    With the default arguments this is the 6 -> 64 -> 32 -> 1 network, with
    the same module order and ``state_dict`` keys (``layers.0`` ... ``layers.4``).

    Args:
        in_features: Size of each input sample (default 6).
        hidden_sizes: Width of every hidden layer, in order (default ``(64, 32)``).
            Each hidden ``nn.Linear`` is followed by an ``nn.ReLU``.
        out_features: Size of each output sample (default 1).
    """

    def __init__(self, in_features=6, hidden_sizes=(64, 32), out_features=1):
        super().__init__()

        modules = []
        width = in_features
        for hidden in hidden_sizes:
            modules.append(nn.Linear(width, hidden))
            modules.append(nn.ReLU())
            width = hidden
        # Final projection has no activation: the network outputs raw regression values.
        modules.append(nn.Linear(width, out_features))

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to ``x`` (last dimension must equal ``in_features``)."""
        return self.layers(x)

In [None]:
# Seed the global RNG before anything random happens. random_split draws from
# it when no generator is passed, so seeding afterwards (as this cell used to)
# left the train/validation split different on every run.
torch.manual_seed(42)

dataset = RiverFlowDataset(X, y)

# 80/20 split. The validation size is the remainder, so the two lengths always
# add up to len(dataset), which random_split requires.
n_train = round(len(dataset) * 0.8)
n_val = len(dataset) - n_train
train_set, val_set = torch.utils.data.random_split(dataset, [n_train, n_val])

dataloader = DataLoader(train_set, batch_size=2, shuffle=True, num_workers=2)

model = MLP().to(device)

loss_fn = nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), lr=1e-4)

# Mean training loss per epoch, kept so the training curve can be inspected afterwards.
epoch_losses = []

for epoch in tqdm(range(0, 200), desc="Epoch"):
    # Switching to training mode once per epoch is enough.
    model.train()

    current_loss = 0.0
    n = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optim.zero_grad()
        outputs = model(inputs)

        # Targets arrive as shape (batch,); add a trailing axis to match the (batch, 1) outputs.
        loss = loss_fn(outputs, targets.unsqueeze(1))

        loss.backward()

        optim.step()

        current_loss += loss.item()
        n += 1

    if n:
        epoch_losses.append(current_loss / n)

In [None]:
def test(model, validation_set):
    """Evaluate ``model`` on ``validation_set`` and print its mean squared error.

    The squared errors of all samples are summed and divided by the number of
    target values, so the printed figure is the per-sample MSE regardless of
    how the samples fall into batches.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        validation_set: Dataset yielding ``(inputs, target)`` pairs.
    """
    testloader = DataLoader(validation_set, batch_size=2, num_workers=2)
    # Sum (rather than average) inside each batch so the final division by the
    # sample count is correct even when the last batch is smaller.
    metric = nn.MSELoss(reduction="sum")

    model.eval()

    total_loss = 0.0
    n_samples = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, targets in tqdm(testloader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)

            total_loss += metric(outputs, targets.unsqueeze(1)).item()
            n_samples += targets.numel()

    # An empty validation set has no defined loss; report NaN instead of crashing.
    mean_loss = total_loss / n_samples if n_samples else float("nan")

    # The value is now interpolated into the message; before, the raw format string was printed.
    print("Test Loss: %.3f" % mean_loss)
    
# Report the trained model's loss on the held-out validation split.
test(model, val_set)