# Forecasting of daily River Discharge (RD) based on temperature and precipitation of the day, previous RD and previous Precipitation

### Import of libraries

In [93]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import matplotlib.dates as mdates
from sklearn import linear_model,preprocessing
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_squared_error,r2_score
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.model_selection import train_test_split

### LoadingData

In [94]:
def loadData(river,rdDaysBefore,precDaysBefore=0):
    data=pd.read_csv(river).values
    Xbefore = data[:,1:4]
    rd_column = Xbefore[:,0]
    rd_column = np.roll(rd_column,rdDaysBefore)
    Xbefore[:,0] = rd_column
    if precDaysBefore != 0:
        prec_colum = Xbefore[:,2]
        prec_colum = np.roll(prec_colum,precDaysBefore)
        new_column = np.expand_dims(prec_colum, axis=1)
        Xbefore = np.hstack((Xbefore, new_column))
    cut = rdDaysBefore if rdDaysBefore > precDaysBefore else precDaysBefore
    X = Xbefore[cut:,:]
    y = data[cut:,1]
    return X,y

### Prep of Data

In [95]:
def apply_boxcox(y, lambda_val=0):
    y_float64 = np.array(y, dtype=np.float64)
    if lambda_val == 0:
        res = boxcox(y_float64)
    else:
        res = boxcox(y_float64, lambda_val)
    y = res[0]
    lambda_val = res[1]
    return y, lambda_val

In [96]:
def preprocess_data(X,y):
    scaler = preprocessing.StandardScaler()
    poly_features = preprocessing.PolynomialFeatures(degree=2)

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    X_train_scaled = scaler.fit_transform(X_train_poly)
    y_train_boxcox, lambda_val = apply_boxcox(y_train)

    X_test_scaled = scaler.transform(X_test_poly)

    return X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val

### Cost Function

In [97]:
def fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val):
    
    alphas = [0.01,0.1,1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100]
    ridge_cv = linear_model.RidgeCV(alphas=alphas)

    ridge_cv.fit(X_train_scaled,y_train_boxcox)
    best_alpha = ridge_cv.alpha_
    y_predict = inv_boxcox(ridge_cv.predict(X_test_scaled), lambda_val)

    r2_training = ridge_cv.score(X_train_scaled, y_train_boxcox)
    rmse_training = np.sqrt(-1*ridge_cv.best_score_)
    r2 = r2_score(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))

    """ plt.plot(y_test, 'r', label='Original')
    plt.plot(y_predict, 'b', label='Predicted')
    ax = plt.gca()
    ax.set_ylim([0, ax.get_ylim()[1]*1.1])
    plt.legend()
    plt.show() """

    return r2, rmse, r2_training, rmse_training, best_alpha

In [98]:
def test_for_river(river):

    best_r2 = (-np.inf,0)
    best_rmse = (np.inf,0)

    memory = [x for x in range(1,11)] + [x for x in range(10,100,10)]
    memoryprec = [x for x in range(0,11)] + [x for x in range(10,100,10)]

    for i in memory:
        for j in memoryprec:
            X, y = loadData(river, i,j)
            X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val = preprocess_data(X, y)
            r2, rmse, _, _, _ = fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val)
            if r2 > best_r2[0]:
                best_r2 = (r2, i, j)
            if rmse < best_rmse[0]:
                best_rmse = (rmse, i, j)
            
    print("\n\n ----- TESTING FOR RIVER:", river + " ------ \n\n")
    print("Best memoryRD and memoryPrecipitacao for best R2:", best_r2[1:])
    print("Best memoryRD and memoryPrecipitacao for best RMSE:", best_rmse[1:])
    X, y = loadData(river, best_rmse[1], best_rmse[2])
    X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val = preprocess_data(X, y)
    r2, rmse, r2_train, rmse_train, best_alpha = fit(X_train_scaled, y_train_boxcox, X_test_scaled, y_test, lambda_val)
    print("RMSE training:", rmse_train)
    print("R2 training:", r2_train)
    print("RMSE test:", rmse)
    print("R2 test:", r2)
    print("Best alpha:", best_alpha)

In [99]:
test_for_river("RD_data/RD_AntuaR_pg.csv")
test_for_river("RD_data/RD_MondegoR_pg.csv")
test_for_river("RD_data/RD_NeivaR_pg.csv")
test_for_river("RD_data/RD_VougaR_pg.csv")



 ----- TESTING FOR RIVER: RD_data/RD_AntuaR_pg.csv ------ 


Best memoryRD and memoryPrecipitacao for best R2: (4, 40)
Best memoryRD and memoryPrecipitacao for best RMSE: (3, 90)
RMSE training: 0.32456157924489887
R2 training: 0.9348075847180769
RMSE test: 0.4990595633373029
R2 test: 0.905346518016247
Best alpha: 40.0


 ----- TESTING FOR RIVER: RD_data/RD_MondegoR_pg.csv ------ 


Best memoryRD and memoryPrecipitacao for best R2: (1, 90)
Best memoryRD and memoryPrecipitacao for best RMSE: (1, 90)
RMSE training: 0.7649879361916365
R2 training: 0.825404865886699
RMSE test: 5.947171240066045
R2 test: 0.7315402216044378
Best alpha: 10.0


 ----- TESTING FOR RIVER: RD_data/RD_NeivaR_pg.csv ------ 


Best memoryRD and memoryPrecipitacao for best R2: (30, 30)
Best memoryRD and memoryPrecipitacao for best RMSE: (30, 30)
RMSE training: 0.1406892036725693
R2 training: 0.9851151633596298
RMSE test: 0.11183709686313482
R2 test: 0.9839878539060348
Best alpha: 6.0


 ----- TESTING FOR RIVER: RD_d