# Forecasting of daily River Discharge (RD) based on temperature and previous precipitation levels

### Import of libraries

In [386]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import matplotlib.dates as mdates
from sklearn import linear_model,preprocessing
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_squared_error,r2_score
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.model_selection import train_test_split

### LoadingData

In [387]:
def loadData(river,daysBefore,daysBefore2):
    data=pd.read_csv(river).values
    Xbefore = data[:,2:4]
    second_colum = Xbefore[:,1]
    second_colum2 = Xbefore[:,1]
    
    second_colum = np.roll(second_colum,daysBefore)
    second_colum2 = np.roll(second_colum,daysBefore2)
    new_column = np.expand_dims(second_colum, axis=1)
    Xbefore = np.hstack((Xbefore, new_column))
    new_column = np.expand_dims(second_colum2, axis=1)
    Xbefore = np.hstack((Xbefore, new_column))
    split = daysBefore if daysBefore > daysBefore2 else daysBefore2

    X = Xbefore[split:,:]
    y = data[split:,1]

    return X,y


### Prep of Data

In [388]:
def apply_boxcox(y, lambda_val=0):
    y_float64 = np.array(y, dtype=np.float64)
    if lambda_val == 0:
        res = boxcox(y_float64)
    else:
        res = boxcox(y_float64, lambda_val)
    y = res[0]
    lambda_val = res[1]
    return y, lambda_val

In [389]:
def plot_learning_curve(X_trainN,y_train):

    # Define a range of alpha values to test
    alphas = [0.01,0.1,1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100]
    ridge_cv = linear_model.RidgeCV(alphas=alphas, scoring=None)

    # Create learning curve
    train_sizes, train_errors, valid_errors = learning_curve(
        ridge_cv, X_trainN, y_train, train_sizes=np.linspace(0.1, 1.0, 10),scoring=lambda estimator, X, y: mean_squared_error(y, estimator.predict(X)))

    # Calculate mean and standard deviation of training and validation errors
    train_errors_mean = np.mean(train_errors, axis=1)
    train_errors_std = np.std(train_errors, axis=1)
    valid_errors_mean = np.mean(valid_errors, axis=1)
    valid_errors_std = np.std(valid_errors, axis=1)

    # Plot learning curve
    plt.figure()
    plt.title("Learning Curve for RidgeCV")
    plt.xlabel("Training Examples")
    plt.ylabel("Error")

    plt.grid()

    plt.plot(train_sizes, train_errors_mean, '-', color="r",
            label="Training score")
    plt.plot(train_sizes, valid_errors_mean, '-', color="g",
            label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()

In [390]:
def preprocess_data(X, y):
    scaler = preprocessing.StandardScaler()

    X_trainN, X_testN, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

    #plot_learning_curve(X_trainN, y_train)

    X_train_scaled = scaler.fit_transform(X_trainN)
    y_train_boxcox, lambda_val = apply_boxcox(y_train)
    
    X_test_scaled = scaler.transform(X_testN)
    y_test_boxcox, _ = apply_boxcox(y_test, lambda_val)
    
    return X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val


### Cost Function

In [391]:
def fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val):
    
    alphas = [0.01,0.1,1,2,3,4,5,6,7,8,9,10,100,1000]
    ridge_cv = linear_model.RidgeCV(alphas=alphas)

    ridge_cv.fit(X_train_scaled,y_train_boxcox)
    best_alpha = ridge_cv.alpha_
    y_predict = inv_boxcox(ridge_cv.predict(X_test_scaled), lambda_val)

    r2_training = ridge_cv.score(X_train_scaled, y_train_boxcox)
    rmse_training = np.sqrt(-1*ridge_cv.best_score_)

    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))

    """ plt.plot(y_test, 'r', label='Original')
    plt.plot(y_predict, 'b', label='Predicted')
    ax = plt.gca()
    ax.set_ylim([0, ax.get_ylim()[1]*1.1])
    plt.legend()
    plt.show() """

    return r2, rmse, r2_training, rmse_training, best_alpha

### Test different memories

In [392]:
def test_for_river(river):

    best_r2 = (-np.inf,0,0)
    best_rmse = (np.inf,0,0)

    for i in range(0,11):
        for j in range(0,11):
            X, y = loadData(river, i, j)
            X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val = preprocess_data(X, y)
            r2, rmse, _, _, _ = fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val)
            if r2 > best_r2[0]:
                best_r2 = (r2, i, j)
            if rmse < best_rmse[0]:
                best_rmse = (rmse, i, j)
            
    print("\n\n ----- TESTING FOR RIVER:", river + " ------ \n\n")
    print("Best i and j for best R2:", best_r2[1:])
    print("Best i and j for best RMSE:", best_rmse[1:])
    X, y = loadData(river, best_r2[1], best_r2[2])
    X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val = preprocess_data(X, y)
    r2, rmse, r2_train, rmse_train, best_alpha = fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val)
    print("RMSE training:", rmse_train)
    print("R2 training:", r2_train)
    print("RMSE test:", rmse)
    print("R2 test:", r2)
    print("Best alpha:", best_alpha)

In [393]:
test_for_river("RD_data/RD_AntuaR_pg.csv")
test_for_river("RD_data/RD_MondegoR_pg.csv")
test_for_river("RD_data/RD_NeivaR_pg.csv")
test_for_river("RD_data/RD_VougaR_pg.csv")



 ----- TESTING FOR RIVER: RD_data/RD_AntuaR_pg.csv ------ 


Best i and j for best R2: (2, 6)
Best i and j for best RMSE: (2, 6)
RMSE training: 0.7397902509665072
R2 training: 0.6589631179006177
RMSE test: 2.0167090161015704
R2 test: -0.3416062941809679
Best alpha: 7.0


 ----- TESTING FOR RIVER: RD_data/RD_MondegoR_pg.csv ------ 


Best i and j for best R2: (7, 7)
Best i and j for best RMSE: (7, 7)
RMSE training: 1.2297896286401186
R2 training: 0.5954359471134865
RMSE test: 28.847990667256813
R2 test: -5.456159761461806
Best alpha: 10.0


 ----- TESTING FOR RIVER: RD_data/RD_NeivaR_pg.csv ------ 


Best i and j for best R2: (5, 5)
Best i and j for best RMSE: (5, 5)
RMSE training: 0.6274772790677154
R2 training: 0.6651424970428215
RMSE test: 1.2836456362934148
R2 test: -0.714287715970523
Best alpha: 8.0


 ----- TESTING FOR RIVER: RD_data/RD_VougaR_pg.csv ------ 


Best i and j for best R2: (4, 4)
Best i and j for best RMSE: (4, 4)
RMSE training: 1.1132379766542724
R2 training: 0.756