In [202]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# library deprecation and convergence warnings raised by later cells.
warnings.filterwarnings("ignore")

#### 1. Loading in our data

In [203]:
# Read the two pre-processed tables: weather readings (used as model inputs
# further down) and image-derived measurements (used as model targets).
weather_df: pd.DataFrame = pd.read_feather(
    "./training_data/processed_data/preprocessed_weather_hartbeespoort.feather"
)
images_df: pd.DataFrame = pd.read_feather(
    "./training_data/processed_data/preprocessed_image_hartbeespoort.feather"
)

In [204]:
# Preview the weather table: datetime index with windspeed and winddir columns.
weather_df

Unnamed: 0_level_0,windspeed,winddir
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-04,18.4,292.2
2021-01-09,15.4,330.5
2021-01-21,21.7,85.6
2021-02-10,13.6,46.0
2021-02-13,16.6,117.2
...,...,...
2024-05-10,14.8,320.4
2024-05-13,13.9,29.4
2024-05-15,16.4,183.5
2024-05-18,11.6,218.0


In [205]:
# Preview the image table: center_x / center_y / x_axis_length / y_axis_length /
# angle columns, repeated for the suffixes _1, _2 and _3.
images_df

Unnamed: 0_level_0,center_x_1,center_y_1,x_axis_length_1,y_axis_length_1,angle_1,center_x_2,center_y_2,x_axis_length_2,y_axis_length_2,angle_2,center_x_3,center_y_3,x_axis_length_3,y_axis_length_3,angle_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-01-04,704.028564,442.499176,121.826691,451.231567,58.160427,960.999756,336.523529,112.876732,203.835342,117.014908,1092.642456,257.937164,133.023499,147.132126,142.699448
2021-01-09,715.279602,351.457184,100.243324,367.353882,66.100555,807.207031,124.622177,54.882431,117.928635,119.937805,184.712097,572.285339,51.416313,93.895416,133.982315
2021-01-21,325.976746,490.977356,136.785980,500.891998,74.853523,813.122131,146.077484,87.973320,149.937958,163.590485,1233.187134,566.028442,8.314092,24.543709,143.964798
2021-02-10,167.179871,545.132202,91.021637,191.275513,108.308769,881.044556,144.769745,63.307846,291.755676,111.700424,592.362976,408.737030,60.006161,177.342026,67.738052
2021-02-13,280.800171,504.206818,134.417603,405.863342,72.619293,787.453369,126.050583,35.755112,48.203991,129.081055,918.028381,556.878296,14.486405,27.316944,2.405948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-10,777.354004,107.631660,30.748890,90.560562,146.724838,243.170166,542.966003,13.301406,35.620926,173.263412,713.954590,443.647552,9.775264,37.554825,63.395508
2024-05-13,716.543701,305.779480,22.611246,165.610962,61.839939,120.704819,541.091553,13.267200,20.426884,71.220276,797.884949,141.748245,10.134691,28.270313,174.367538
2024-05-15,800.668274,145.144821,26.662622,81.719994,131.096573,756.216187,80.918289,15.954613,37.925011,145.252411,112.768906,556.176453,10.941070,24.187414,82.034576
2024-05-18,774.593811,107.590065,32.553211,89.817474,143.160324,233.447922,540.661499,23.895515,50.522202,166.417343,107.577980,557.288330,9.227895,16.054379,118.356796


#### 2. Preparing our data

In [206]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Hold out 10% of the rows for validation. shuffle=False keeps the original
# (chronological) row order, so the validation rows are the most recent ones.
result: list[pd.DataFrame] = train_test_split(
    weather_df,
    images_df,
    test_size=0.1,
    random_state=69,
    shuffle=False,
)

# For two input frames the split comes back as
# [inputs_train, inputs_valid, targets_train, targets_valid].
X_train, X_valid, y_train, y_valid = result

print(f"Training data rows: {len(X_train)}, Test data rows: {len(X_valid)}")

Training data rows: 287, Test data rows: 32


In [207]:
def print_metrics(actual, prediction, data_type) -> None:
    """Print MSE, RMSE and MAE for a set of predictions.

    Args:
        actual: Ground-truth target values (single- or multi-output).
        prediction: Predicted values with the same shape as ``actual``.
        data_type: Label prefixed to every metric line (e.g. "KNOWN", "NEW").

    Notes:
        The RMSE is computed per output column and then averaged uniformly,
        which is what ``mean_squared_error(..., squared=False)`` used to
        report. That keyword was deprecated in scikit-learn 1.4 and removed
        in 1.6, so the square root is taken explicitly here instead.
    """
    print("Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)")
    print(f"{data_type} DATA: MSE: {mean_squared_error(actual, prediction)}")

    # One MSE per output column -> sqrt each -> uniform average over outputs.
    # (Not sqrt of the averaged MSE: that gives a different number for
    # multi-output targets.)
    per_output_mse = mean_squared_error(actual, prediction, multioutput="raw_values")
    rmse = np.sqrt(per_output_mse).mean()
    print(f"{data_type} DATA: RMSE: {rmse}")

    print(f"{data_type} DATA: MAE: {mean_absolute_error(actual, prediction)}\n")

#### 3. Testing different types of supervised regression models

##### 3.1. Decision Tree Regressor

In [208]:
from sklearn.tree import DecisionTreeRegressor

# Hand-tuned alternative kept for reference:
# tree_model = DecisionTreeRegressor(min_samples_split=10, min_samples_leaf=10, min_weight_fraction_leaf=0.3)

# Seeded so that re-running the notebook reproduces the same tree:
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ between runs. 69 matches the seed used for the data split.
tree_model = DecisionTreeRegressor(random_state=69)

tree_model.fit(X_train, y_train)

# Error on the rows the tree was fitted on ("KNOWN") ...
tree_y_pred: np.ndarray = tree_model.predict(X_train)
print_metrics(y_train, tree_y_pred, "KNOWN")

# ... versus error on the held-out rows ("NEW").
tree_y_pred: np.ndarray = tree_model.predict(X_valid)
print_metrics(y_valid, tree_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 0.0
KNOWN DATA: RMSE: 0.0
KNOWN DATA: MAE: 0.0

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 68636.30911743698
NEW DATA: RMSE: 205.61684443192934
NEW DATA: MAE: 163.33334224050245



##### Use GridSearch for attempt at optimal hyperparameters

In [209]:
# Exhaustive search space: 4 * 5 * 4 * 5 * 4 * 5 * 4 = 32,000 candidates,
# each of which is fitted once per cross-validation fold.
params_tree = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "max_depth": [None, 2, 4, 10, 20],
    "min_samples_split": [2, 4, 5, 10],
    "min_samples_leaf": [1, 2, 4, 5, 10],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3],
    "max_leaf_nodes": [None, 6, 8, 10, 20],
    "min_impurity_decrease": [0.0, 0.1, 0.2, 0.3],
}

# n_jobs=-1 spreads the fits over all available cores; cv and scoring are
# left at their defaults.
grid_tree = GridSearchCV(
    estimator=tree_model,
    param_grid=params_tree,
    n_jobs=-1,
    verbose=True,
)
grid_tree.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_tree.best_params_

Fitting 5 folds for each of 32000 candidates, totalling 160000 fits




KeyboardInterrupt: 

Hyperparameter tuning was found empirically to give a good balance between correctly fitting known data and handling new data.
The problem, however, is that we have very little data for training and validation.

##### Non-default hyperparameters:

- min_samples_split (default=2): The minimum number of samples required to split an internal node: If int, then consider min_samples_split as the minimum number. If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
- min_samples_leaf (default=1): The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

- min_weight_fraction_leaf (default=0.0): The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.


[scikit-learn DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

##### 3.2. Random Forest Regressor

In [200]:
from sklearn.ensemble import RandomForestRegressor

# Hand-tuned alternative kept for reference:
# forest_model = RandomForestRegressor(min_samples_leaf=10, min_weight_fraction_leaf=0.3)

# Seeded so that re-running the notebook reproduces the same forest: the
# bootstrap resampling and per-split feature handling are random otherwise.
# 69 matches the seed used for the data split.
forest_model = RandomForestRegressor(random_state=69)

forest_model.fit(X_train, y_train)

# Error on the rows the forest was fitted on ("KNOWN") ...
forest_y_pred: np.ndarray = forest_model.predict(X_train)
print_metrics(y_train, forest_y_pred, "KNOWN")

# ... versus error on the held-out rows ("NEW").
forest_y_pred: np.ndarray = forest_model.predict(X_valid)
print_metrics(y_valid, forest_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 5164.94117209328
KNOWN DATA: RMSE: 55.605345003855476
KNOWN DATA: MAE: 44.16194339686985

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 41848.94983863062
NEW DATA: RMSE: 156.2440174917567
NEW DATA: MAE: 129.2070686883852



##### Use GridSearch for attempt at optimal hyperparameters

In [201]:
# Exhaustive search space: 5 * 4 * 5 * 4 * 5 * 4 * 5 * 4 = 160,000 candidates,
# each of which is fitted once per cross-validation fold.
params_forest = {
    "n_estimators": [10, 25, 50, 100, 200],
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "max_depth": [None, 2, 4, 10, 20],
    "min_samples_split": [2, 4, 5, 10],
    "min_samples_leaf": [1, 2, 4, 5, 10],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3],
    "max_leaf_nodes": [None, 6, 8, 10, 20],
    "min_impurity_decrease": [0.0, 0.1, 0.2, 0.3],
}

# n_jobs=-1 spreads the fits over all available cores; cv and scoring are
# left at their defaults.
grid_forest = GridSearchCV(
    estimator=forest_model,
    param_grid=params_forest,
    n_jobs=-1,
    verbose=True,
)
grid_forest.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_forest.best_params_

Fitting 5 folds for each of 192000 candidates, totalling 960000 fits


KeyboardInterrupt: 

##### Non-default hyperparameters:

- min_samples_leaf (default=1): The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
- min_weight_fraction_leaf (default=0.0): The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.

[scikit-learn RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

##### 3.3. Linear Regressor

In [None]:
from sklearn.linear_model import LinearRegression

# Constrained alternative kept for reference:
# linear_model = LinearRegression(positive=True)
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# In-sample error, reported under the "KNOWN" label.
print_metrics(y_train, linear_model.predict(X_train), "KNOWN")

# Held-out error; linear_y_pred ends up holding the validation predictions.
linear_y_pred: np.ndarray = linear_model.predict(X_valid)
print_metrics(y_valid, linear_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32412.473880083835
KNOWN DATA: RMSE: 140.13840079242888
KNOWN DATA: MAE: 114.45904001584911

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32201.900449802193
NEW DATA: RMSE: 139.62326538527105
NEW DATA: MAE: 114.78593845659337



##### Use GridSearch for attempt at optimal hyperparameters

In [None]:
# Small search space: 2 * 2 * 2 = 8 candidates.
params_linear = {
    "fit_intercept": [True, False],
    "copy_X": [True, False],
    "positive": [False, True],
}

# n_jobs=-1 spreads the fits over all available cores; cv and scoring are
# left at their defaults.
grid_linear = GridSearchCV(
    estimator=linear_model,
    param_grid=params_linear,
    n_jobs=-1,
    verbose=True,
)
grid_linear.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_linear.best_params_

##### Non-default hyperparameters:

- positive (default=False): When set to True, forces the coefficients to be positive. This option is only supported for dense arrays.

[scikit-learn LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

##### 3.4. Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Constrained alternative kept for reference:
# ridge_model = Ridge(solver="lbfgs", positive=True)
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)

# In-sample error, reported under the "KNOWN" label.
print_metrics(y_train, ridge_model.predict(X_train), "KNOWN")

# Held-out error; ridge_y_pred ends up holding the validation predictions.
ridge_y_pred: np.ndarray = ridge_model.predict(X_valid)
print_metrics(y_valid, ridge_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32412.473880672176
KNOWN DATA: RMSE: 140.13840079471518
KNOWN DATA: MAE: 114.45905276457098

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32201.91389630856
NEW DATA: RMSE: 139.62331318445302
NEW DATA: MAE: 114.78600015441154



##### Use GridSearch for attempt at optimal hyperparameters

In [None]:
# Settings searched identically in both sub-grids below.
_ridge_shared = {
    "alpha": [1.0, 2.0, 2.5, 5.0, 10.0],
    "fit_intercept": [True, False],
    "max_iter": [None, 1000, 15000, 30000],
    "tol": [0.0001, 0.001, 0.01, 0.1, 0.00001],
}

# `solver` and `positive` are not independent: "lbfgs" only works with
# positive=True, and positive=True is only supported by "lbfgs" (which "auto"
# selects in that case). A single Cartesian grid would therefore contain many
# unsupported pairings whose fits fail and are scored as NaN. Two sub-grids
# keep every candidate valid: 200 * (7 + 2) = 1,800 candidates.
params_ridge = [
    {
        **_ridge_shared,
        "positive": [False],
        "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
    },
    {
        **_ridge_shared,
        "positive": [True],
        "solver": ["auto", "lbfgs"],
    },
]

grid_ridge = GridSearchCV(ridge_model, param_grid=params_ridge, n_jobs=-1, verbose=True)

grid_ridge.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_ridge.best_params_

##### Non-default hyperparameters:

- solver (default="auto"): Solver to use in the computational routines: ‘auto’ chooses the solver automatically based on the type of data. ‘svd’ uses a Singular Value Decomposition of X to compute the Ridge coefficients. It is the most stable solver, in particular more stable for singular matrices than ‘cholesky’ at the cost of being slower. ‘cholesky’ uses the standard scipy.linalg.solve function to obtain a closed-form solution. ‘sparse_cg’ uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than ‘cholesky’ for large-scale data (possibility to set tol and max_iter). ‘lsqr’ uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative procedure. ‘sag’ uses a Stochastic Average Gradient descent, and ‘saga’ uses its improved, unbiased version named SAGA. Both methods also use an iterative procedure, and are often faster than other solvers when both n_samples and n_features are large. Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. ‘lbfgs’ uses L-BFGS-B algorithm implemented in scipy.optimize.minimize. It can be used only when positive is True. All solvers except ‘svd’ support both dense and sparse data. However, only ‘lsqr’, ‘sag’, ‘sparse_cg’, and ‘lbfgs’ support sparse input when fit_intercept is True.
- positive (default=False): When set to True, forces the coefficients to be positive. Only ‘lbfgs’ solver is supported in this case.

[scikit-learn Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)

##### 3.5. Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Constrained alternative kept for reference:
# lasso_model = Lasso(positive=True)
lasso_model = Lasso()
lasso_model.fit(X_train, y_train)

# In-sample error, reported under the "KNOWN" label.
print_metrics(y_train, lasso_model.predict(X_train), "KNOWN")

# Held-out error; lasso_y_pred ends up holding the validation predictions.
lasso_y_pred: np.ndarray = lasso_model.predict(X_valid)
print_metrics(y_valid, lasso_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32412.484075229924
KNOWN DATA: RMSE: 140.1385092368401
KNOWN DATA: MAE: 114.46022118650104

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32202.681205668472
NEW DATA: RMSE: 139.62734251695647
NEW DATA: MAE: 114.7960288132233



##### Use GridSearch for attempt at optimal hyperparameters

In [None]:
# Search space: 5 * 2 * 3 * 4 * 2 * 2 = 480 candidates.
params_lasso = {
    "alpha": [1.0, 2.0, 2.5, 5.0, 10.0],
    "fit_intercept": [True, False],
    "max_iter": [1000, 2000, 5000],
    "tol": [0.0001, 0.001, 0.01, 0.1],
    "positive": [False, True],
    "selection": ["cyclic", "random"],
}

# n_jobs=-1 spreads the fits over all available cores; cv and scoring are
# left at their defaults.
grid_lasso = GridSearchCV(
    estimator=lasso_model,
    param_grid=params_lasso,
    n_jobs=-1,
    verbose=True,
)
grid_lasso.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_lasso.best_params_

##### Non-default hyperparameters:

- positive (default=False): When set to True, forces the coefficients to be positive.

[scikit-learn Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)

##### 3.6. Nearest Neighbour Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Hand-tuned alternative kept for reference:
# neighbour_model = KNeighborsRegressor(n_neighbors=25)
neighbour_model = KNeighborsRegressor()
neighbour_model.fit(X_train, y_train)

# In-sample error, reported under the "KNOWN" label.
print_metrics(y_train, neighbour_model.predict(X_train), "KNOWN")

# Held-out error; neighbour_y_pred ends up holding the validation predictions.
neighbour_y_pred: np.ndarray = neighbour_model.predict(X_valid)
print_metrics(y_valid, neighbour_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 30362.396808735983
KNOWN DATA: RMSE: 135.82851850619156
KNOWN DATA: MAE: 109.82000238432865

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 33495.226513995294
NEW DATA: RMSE: 141.74131716314858
NEW DATA: MAE: 116.26705300511918



##### Use GridSearch for attempt at optimal hyperparameters

In [None]:
# Search space: 7 * 2 * 4 * 5 * 5 = 1,400 candidates.
params_neighbour = {
    # The estimator parameter uses the American spelling "n_neighbors"; the
    # previous key "n_neighbours" is not a KNeighborsRegressor parameter and
    # made the search fail with an invalid-parameter error.
    "n_neighbors": [5, 10, 15, 20, 25, 50, 100],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [30, 10, 5, 50, 100],
    "p": [2.0, 3.0, 5.0, 10.0, 1.0],
}

grid_neighbour = GridSearchCV(neighbour_model, param_grid=params_neighbour, n_jobs=-1, verbose=True)

grid_neighbour.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_neighbour.best_params_

##### Non-default hyperparameters:

- n_neighbors (default=5): Number of neighbors to use by default for kneighbors queries.

[scikit-learn KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

##### 3.7. Multi-Layer Perceptron (MLP) Regressor

In [None]:
from sklearn.neural_network import MLPRegressor

# Alternative solver kept for reference:
# mlp_model = MLPRegressor(solver="lbfgs")

# Seeded so that re-running the notebook reproduces the same network: weight
# initialisation (and batch sampling for the stochastic solvers) is random
# otherwise. 69 matches the seed used for the data split.
mlp_model = MLPRegressor(random_state=69)

mlp_model.fit(X_train, y_train)

# Error on the rows the network was fitted on ("KNOWN") ...
mlp_y_pred: np.ndarray = mlp_model.predict(X_train)
print_metrics(y_train, mlp_y_pred, "KNOWN")

# ... versus error on the held-out rows ("NEW").
mlp_y_pred: np.ndarray = mlp_model.predict(X_valid)
print_metrics(y_valid, mlp_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 35837.88473947572
KNOWN DATA: RMSE: 148.61863051041124
KNOWN DATA: MAE: 121.48386304759583

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 34417.703978671394
NEW DATA: RMSE: 148.3825567469038
NEW DATA: MAE: 121.94880738534113



##### Use GridSearch for attempt at optimal hyperparameters

In [None]:
# Search space: 3 * 4 * 3 * 4 * 3 * 4 = 1,728 candidates.
params_mlp = {
    # Each entry lists the width of every hidden layer. A width of 0 is not a
    # valid layer size, so the previous [100, 0]-style entries were rejected
    # by the estimator; a single hidden layer is written as a 1-tuple.
    "hidden_layer_sizes": [(100,), (500,), (1000,)],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "tol": [0.0001, 0.001, 0.01, 0.1],
}

grid_mlp = GridSearchCV(mlp_model, param_grid=params_mlp, n_jobs=-1, verbose=True)

grid_mlp.fit(X_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_mlp.best_params_

##### Non-default hyperparameters:

- solver (default="adam"): The solver for weight optimization. ‘lbfgs’ is an optimizer in the family of quasi-Newton methods. ‘sgd’ refers to stochastic gradient descent. ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba. Note: The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, ‘lbfgs’ can converge faster and perform better.

[scikit-learn MLP Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)

#### 5. Testing timeseries model

##### 5.1 Preparing data for timeseries analysis

In [None]:
# Converting the index to a datetimeindex
# weather_df.index = pd.to_datetime(weather_df.index)
# images_df.index = pd.to_datetime(images_df.index)

In [None]:
# from darts import TimeSeries

# weather_ts: TimeSeries = TimeSeries.from_dataframe(weather_df, fill_missing_dates=True)
# image_ts: TimeSeries = TimeSeries.from_dataframe(images_df)