# Forecasting of daily River Discharge (RD) based on temperature and previous precipitation levels

### Import of libraries

In [155]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import matplotlib.dates as mdates
from sklearn import linear_model,preprocessing
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_squared_error,r2_score
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.model_selection import train_test_split

### LoadingData

In [156]:
def loadData(river,rdDaysBefore,precDaysBefore=0):
    data=pd.read_csv(river).values
    Xbefore = data[:,1:4]
    rd_column = Xbefore[:,0]
    rd_column = np.roll(rd_column,rdDaysBefore)
    Xbefore[:,0] = rd_column
    if precDaysBefore != 0:
        prec_colum = Xbefore[:,2]
        prec_colum = np.roll(prec_colum,precDaysBefore)
        new_column = np.expand_dims(prec_colum, axis=1)
        Xbefore = np.hstack((Xbefore, new_column))
    cut = rdDaysBefore if rdDaysBefore > precDaysBefore else precDaysBefore
    X = Xbefore[cut:,:]
    y = data[cut:,1]
    return X,y

### Prep of Data

In [157]:
def apply_boxcox(y, lambda_val=0):
    y_float64 = np.array(y, dtype=np.float64)
    if lambda_val == 0:
        res = boxcox(y_float64)
    else:
        res = boxcox(y_float64, lambda_val)
    y = res[0]
    lambda_val = res[1]
    return y, lambda_val

In [158]:
def preprocess_data(X, y):
    scaler = preprocessing.StandardScaler()

    X_trainN, X_testN, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

    #plot_learning_curve(X_trainN, y_train)

    X_train_scaled = scaler.fit_transform(X_trainN)
    y_train_boxcox, lambda_val = apply_boxcox(y_train)
    
    X_test_scaled = scaler.transform(X_testN)
    y_test_boxcox, _ = apply_boxcox(y_test, lambda_val)
    
    return X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val

### Cost Function

In [159]:
def fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val):
    
    alphas = [0.01,0.1,1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100]
    ridge_cv = linear_model.RidgeCV(alphas=alphas)

    ridge_cv.fit(X_train_scaled,y_train_boxcox)
    best_alpha = ridge_cv.alpha_
    y_predict = inv_boxcox(ridge_cv.predict(X_test_scaled), lambda_val)

    r2_training = ridge_cv.score(X_train_scaled, y_train_boxcox)
    rmse_training = np.sqrt(-1*ridge_cv.best_score_)
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))

    """ plt.plot(y_test, 'r', label='Original')
    plt.plot(y_predict, 'b', label='Predicted')
    ax = plt.gca()
    ax.set_ylim([0, ax.get_ylim()[1]*1.1])
    plt.legend()
    plt.show() """

    return r2, rmse, r2_training, rmse_training, best_alpha

In [160]:
def test_for_river(river):

    best_r2 = (-np.inf,0)
    best_rmse = (np.inf,0)

    memory = [x for x in range(1,11)] + [x for x in range(10,330,30)]

    for i in memory:
        for j in memory:
            X, y = loadData(river, i,j)
            X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val = preprocess_data(X, y)
            r2, rmse, _, _, _ = fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val)
            if r2 > best_r2[0]:
                best_r2 = (r2, i)
            if rmse < best_rmse[0]:
                best_rmse = (rmse, i)
            
    print("\n\n ----- TESTING FOR RIVER:", river + " ------ \n\n")
    print("Best i for best R2:", best_r2[1])
    print("Best i for best RMSE:", best_rmse[1])
    X, y = loadData(river, best_rmse[1])
    X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val = preprocess_data(X, y)
    r2, rmse, r2_train, rmse_train, best_alpha = fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val)
    print("RMSE training:", rmse_train)
    print("R2 training:", r2_train)
    print("RMSE test:", rmse)
    print("R2 test:", r2)
    print("Best alpha:", best_alpha)

In [161]:
test_for_river("RD_data/RD_AntuaR_pg.csv")
test_for_river("RD_data/RD_MondegoR_pg.csv")
test_for_river("RD_data/RD_NeivaR_pg.csv")
test_for_river("RD_data/RD_VougaR_pg.csv")



 ----- TESTING FOR RIVER: RD_data/RD_AntuaR_pg.csv ------ 


Best i for best R2: 100
Best i for best RMSE: 70
RMSE training: 0.6485162057984086
R2 training: 0.7419885283591812
RMSE test: 0.8445613612879315
R2 test: 0.7188937103599096
Best alpha: 20.0


 ----- TESTING FOR RIVER: RD_data/RD_MondegoR_pg.csv ------ 


Best i for best R2: 310
Best i for best RMSE: 1
RMSE training: 1.0574419168295757
R2 training: 0.702263015940394
RMSE test: 10.066360652990056
R2 test: 0.20536982853123076
Best alpha: 30.0


 ----- TESTING FOR RIVER: RD_data/RD_NeivaR_pg.csv ------ 


Best i for best R2: 100
Best i for best RMSE: 100
RMSE training: 0.328942633420975
R2 training: 0.9164559602748673
RMSE test: 0.2162493102301979
R2 test: 0.9174189387111072
Best alpha: 5.0


 ----- TESTING FOR RIVER: RD_data/RD_VougaR_pg.csv ------ 


Best i for best R2: 4
Best i for best RMSE: 1
RMSE training: 1.0671150188543783
R2 training: 0.7793850995084345
RMSE test: 20.010112372887225
R2 test: 0.527625113313861
Best alph