In [18]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from statsmodels.api import OLS, add_constant

import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'statsmodels'

## Load data and build test/train split

In [2]:
data = pd.read_csv("./data/RRCA_baseflow.csv")

In [3]:
def test_train_split():
    data = data.sample(frac=1)
    split = int(0.8 * len(data))

    train = data.iloc[:split, :]
    test = data.iloc[split:, :]

    train.to_csv("./data/train.csv", index=None)
    test.to_csv("./data/test.csv", index=None)

# test_train_split()

In [4]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

split = int(0.8 * len(train))
train = train.sample(frac=1)

validation = train.iloc[split:, :]
validation_y = validation[["Observed"]]
validation_x = validation.drop(["Observed"], axis=1)

train = train.iloc[:split, :]
train_y = train[["Observed"]]
train_x = train.drop(["Observed"], axis=1)

## Explore data

In [5]:
print(f"Columns: {', '.join(data.columns.values)}")
print()

for header in data.columns:
    print(header)
    print(f"std: {data[[header]].values.flatten().std()}")
    print(f"avg: {data[[header]].values.flatten().mean()}")
    print(f"min: {data[[header]].values.flatten().min()}")
    print(f"max: {data[[header]].values.flatten().max()}")
    print()

Columns: Date, Segment_id, x, y, Evapotranspiration, Precipitation, Irrigation_pumping, Observed

Date
std: 6287.137751579983
avg: 719206.1753575781
min: 708479
max: 730729

Segment_id
std: 52.78791190766203
avg: 126.58989160413059
min: 40
max: 256

x
std: 292489.75358944904
avg: 1169435.2568789686
min: 721870
max: 1941550

y
std: 83307.80760590668
avg: 14570660.457956513
min: 14387040
max: 14767200

Evapotranspiration
std: 3.167133287231281
avg: 2.6874908601116028
min: 0.0
max: 12.1

Precipitation
std: 10.512571361268662
avg: 14.915468539542044
min: 0.0
max: 40.38

Irrigation_pumping
std: 0.27371714995088736
avg: -0.090712565133731
min: -3.1647
max: 0.0

Observed
std: 56.273487440683574
avg: 23.986753837712268
min: -22.0
max: 747.80328



In [10]:
model = LinearRegression(fit_intercept=True, n_jobs=6)
model.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=6, normalize=False)

In [17]:
prediction_y = model.predict(validation_x).flatten()
actual_y = validation_y.to_numpy().flatten()

mean_squared_error = 0.
mean_log_error = 0.
for index in range(n := len(prediction_y)):
    mean_squared_error += (prediction_y[index] - actual_y[index]) ** 2
    mean_log_error += np.log(np.abs(prediction_y[index] - actual_y[index]))
mean_squared_error /= n
mean_log_error /= n

r_squared = model.score(validation_x, validation_y)

print(f"MSE: {mean_squared_error:10.4f}")
print(f"MLE: {mean_log_error:10.4f}")
print(f"R^2: {r_squared:10.4f}")

MSE:  2661.7744
MLE:     2.7882
R^2:     0.2499
