In [260]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

#### 1. Loading in our data

In [261]:
# Read the two preprocessed Feather files: the weather frame (used later as the
# model features) and the image-derived frame (used later as the targets).
weather_df: pd.DataFrame = pd.read_feather(
    "./training_data/processed_data/preprocessed_weather_hartbeespoort.feather"
)
images_df: pd.DataFrame = pd.read_feather(
    "./training_data/processed_data/preprocessed_image_test.feather"
)

In [262]:
# Preview the weather frame: windspeed and winddir columns indexed by datetime.
weather_df

Unnamed: 0_level_0,windspeed,winddir
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-04,18.4,292.2
2021-01-09,15.4,330.5
2021-01-21,21.7,85.6
2021-02-10,13.6,46.0
2021-02-13,16.6,117.2
...,...,...
2024-04-10,19.6,194.2
2024-04-13,17.2,341.4
2024-04-18,16.2,241.7
2024-04-20,16.9,292.4


In [263]:
# Preview the image-derived frame: three numbered column groups
# (center_x, center_y, x_axis_length, y_axis_length, angle) indexed by datetime.
images_df

Unnamed: 0_level_0,center_x_1,center_y_1,x_axis_length_1,y_axis_length_1,angle_1,center_x_2,center_y_2,x_axis_length_2,y_axis_length_2,angle_2,center_x_3,center_y_3,x_axis_length_3,y_axis_length_3,angle_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-01-04,704.028564,442.499176,121.826691,451.231567,58.160427,960.999756,336.523529,112.876732,203.835342,117.014908,1092.642456,257.937164,133.023499,147.132126,142.699448
2021-01-09,715.279602,351.457184,100.243324,367.353882,66.100555,807.207031,124.622177,54.882431,117.928635,119.937805,184.712097,572.285339,51.416313,93.895416,133.982315
2021-01-21,325.976746,490.977356,136.785980,500.891998,74.853523,813.122131,146.077484,87.973320,149.937958,163.590485,1233.187134,566.028442,8.314092,24.543709,143.964798
2021-02-10,167.179871,545.132202,91.021637,191.275513,108.308769,881.044556,144.769745,63.307846,291.755676,111.700424,592.362976,408.737030,60.006161,177.342026,67.738052
2021-02-13,280.800171,504.206818,134.417603,405.863342,72.619293,787.453369,126.050583,35.755112,48.203991,129.081055,918.028381,556.878296,14.486405,27.316944,2.405948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-10,1324.350586,422.794281,17.715271,51.442062,68.193947,1134.241943,220.593307,11.787232,39.232216,79.759109,1073.128418,207.650436,9.741089,55.922703,114.016281
2024-04-13,752.452454,289.341919,13.216461,32.141590,62.891289,714.009460,306.258728,7.657600,39.223877,66.653549,627.675842,413.542480,11.451272,24.320898,0.623103
2024-04-18,668.551453,445.496552,32.465782,66.018997,27.909737,237.793610,557.887268,6.805774,15.332954,112.110847,360.941437,496.977661,7.062338,13.159757,43.582386
2024-04-20,768.659729,94.758255,21.042046,71.317238,135.777664,832.362061,284.696594,7.747338,49.398838,177.851105,915.509277,569.841309,6.616477,22.889042,163.148788


#### 2. Preparing our data

In [264]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Hold-out split without shuffling: the original row order is kept, so the final
# 10% of rows form the validation set. weather_df supplies the features (X) and
# images_df the targets (y).
# NOTE: random_state only matters when shuffle=True; it has no effect here.
result: list[pd.DataFrame] = train_test_split(
    weather_df,
    images_df,
    test_size=0.1,
    random_state=69,
    shuffle=False,
)
# train_test_split returns [X_train, X_valid, y_train, y_valid] for two inputs.
X_train, X_valid, y_train, y_valid = result

print(f"Training data rows: {len(X_train)}, Test data rows: {len(X_valid)}")

Training data rows: 195, Test data rows: 22


In [265]:
def print_metrics(actual, prediction, data_type) -> None:
    """Print MSE, RMSE and MAE for a set of predictions.

    Args:
        actual: Ground-truth target values (array-like, single- or multi-output).
        prediction: Predicted values with the same shape as ``actual``.
        data_type: Label prefixed to every metric line (e.g. "KNOWN" or "NEW").

    Returns:
        None. The metrics are written to stdout.
    """
    mse = mean_squared_error(actual, prediction)
    # ``mean_squared_error(..., squared=False)`` was deprecated in scikit-learn 1.4
    # and removed in 1.6. Reproduce its result in a version-independent way: take
    # the square root of each output's MSE, then average uniformly over outputs
    # (this is NOT the same as sqrt of the averaged MSE for multi-output targets).
    per_output_mse = mean_squared_error(actual, prediction, multioutput="raw_values")
    rmse = np.sqrt(per_output_mse).mean()
    mae = mean_absolute_error(actual, prediction)

    print("Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)")
    print(f"{data_type} DATA: MSE: {mse}")
    print(f"{data_type} DATA: RMSE: {rmse}")
    print(f"{data_type} DATA: MAE: {mae}\n")

#### 3. Testing different types of regression models

##### 3.1. Decision Tree Regressor

In [266]:
from sklearn.tree import DecisionTreeRegressor

# Regularised tree: every leaf must hold at least 5 samples and at least 20% of
# the total sample weight. fit() returns the estimator itself, so it can be chained.
tree_model = DecisionTreeRegressor(
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.2,
).fit(X_train, y_train)

# Error on the rows the model was fitted on.
print_metrics(y_train, tree_model.predict(X_train), "KNOWN")

# Error on the held-out rows; tree_y_pred ends up holding the validation predictions.
tree_y_pred: np.ndarray = tree_model.predict(X_valid)
print_metrics(y_valid, tree_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 30328.38194337756
KNOWN DATA: RMSE: 136.26920459776045
KNOWN DATA: MAE: 109.17692018613012

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 27091.329001823015
NEW DATA: RMSE: 130.97358770167867
NEW DATA: MAE: 106.9764585669701



The hyperparameters were tuned empirically to strike a good balance between fitting the known (training) data and handling new data.
The problem however is that we have very little data for training and validating.

##### Non-default hyperparameters:

- min_samples_leaf (default=1): The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

- min_weight_fraction_leaf (default=0.0): The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.


[scikit-learn DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

##### 3.2. Random Forest Regressor

In [267]:
from sklearn.ensemble import RandomForestRegressor

# Each tree is trained on a bootstrap sample, so the forest is stochastic.
# random_state pins the sampling so the metrics below are reproducible on a
# fresh kernel (Restart & Run All).
forest_model = RandomForestRegressor(
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.2,
    bootstrap=True,
    random_state=69,
)

forest_model.fit(X_train, y_train)

# Error on the rows the model was fitted on.
forest_y_pred: np.ndarray = forest_model.predict(X_train)
print_metrics(y_train, forest_y_pred, "KNOWN")

# Error on the held-out rows.
forest_y_pred: np.ndarray = forest_model.predict(X_valid)
print_metrics(y_valid, forest_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 29756.166854553787
KNOWN DATA: RMSE: 135.08064005352622
KNOWN DATA: MAE: 108.63445739855385

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 26457.996264826143
NEW DATA: RMSE: 129.79570598325796
NEW DATA: MAE: 105.735142106958



##### Non-default hyperparameters:

- min_samples_leaf (default=1): The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
- min_weight_fraction_leaf (default=0.0): The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.
- bootstrap (default=True, set explicitly here): Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

[scikit-learn RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

##### 3.3. Linear Regressor

In [268]:
from sklearn.linear_model import LinearRegression

# Least-squares fit with positive=True, which forces the coefficients to be positive.
linear_model = LinearRegression(positive=True).fit(X_train, y_train)

# Error on the rows the model was fitted on.
print_metrics(y_train, linear_model.predict(X_train), "KNOWN")

# Error on the held-out rows; linear_y_pred ends up holding the validation predictions.
linear_y_pred: np.ndarray = linear_model.predict(X_valid)
print_metrics(y_valid, linear_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 31984.128734162332
KNOWN DATA: RMSE: 139.22310287897565
KNOWN DATA: MAE: 112.97119338897396

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 25639.46828302515
NEW DATA: RMSE: 128.1039115737395
NEW DATA: MAE: 104.74038338867645



##### Non-default hyperparameters:

- positive (default=False): When set to True, forces the coefficients to be positive. This option is only supported for dense arrays.

[scikit-learn LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

##### 3.4. Nearest Neighbour Regressor

In [269]:
from sklearn.neighbors import KNeighborsRegressor

# Use 8 neighbours per prediction instead of the library default of 5.
neighbour_model = KNeighborsRegressor(n_neighbors=8).fit(X_train, y_train)

# Error on the rows the model was fitted on.
print_metrics(y_train, neighbour_model.predict(X_train), "KNOWN")

# Error on the held-out rows; neighbour_y_pred ends up holding the validation predictions.
neighbour_y_pred: np.ndarray = neighbour_model.predict(X_valid)
print_metrics(y_valid, neighbour_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 27381.96932025689
KNOWN DATA: RMSE: 129.11238810459676
KNOWN DATA: MAE: 102.93008175077603

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 29196.56018815573
NEW DATA: RMSE: 136.704882419424
NEW DATA: MAE: 113.3405628975594



##### Non-default hyperparameters:

- n_neighbors (default=5): Number of neighbors to use by default for kneighbors queries.

[scikit-learn KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

##### 3.5. Multi-Layer Perceptron (MLP) Regressor

In [270]:
from sklearn.neural_network import MLPRegressor

# The network weights are initialised randomly, so an unseeded MLP reports
# different metrics on every run. random_state pins the initialisation so the
# numbers below are reproducible on a fresh kernel.
# "lbfgs" is used instead of the default "adam" solver because the training set is small.
mlp_model = MLPRegressor(solver="lbfgs", random_state=69)

mlp_model.fit(X_train, y_train)

# Error on the rows the model was fitted on.
mlp_y_pred: np.ndarray = mlp_model.predict(X_train)
print_metrics(y_train, mlp_y_pred, "KNOWN")

# Error on the held-out rows.
mlp_y_pred: np.ndarray = mlp_model.predict(X_valid)
print_metrics(y_valid, mlp_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 40173.402057024774
KNOWN DATA: RMSE: 154.7288386838697
KNOWN DATA: MAE: 125.27895480125989

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32793.71447499805
NEW DATA: RMSE: 145.66874306973276
NEW DATA: MAE: 116.01525308698764



In [271]:
# Second MLP fit with the same solver. MLP weight initialisation is random, so
# without a seed the metrics change on every execution; a fixed random_state
# makes this run reproducible while still showing how sensitive the result is
# to the initialisation.
mlp_model = MLPRegressor(solver="lbfgs", random_state=70)

mlp_model.fit(X_train, y_train)

# Error on the rows the model was fitted on.
mlp_y_pred: np.ndarray = mlp_model.predict(X_train)
print_metrics(y_train, mlp_y_pred, "KNOWN")

# Error on the held-out rows.
mlp_y_pred: np.ndarray = mlp_model.predict(X_valid)
print_metrics(y_valid, mlp_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 41635.39144129535
KNOWN DATA: RMSE: 156.24837889454292
KNOWN DATA: MAE: 126.19859863085487

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 33773.9289038194
NEW DATA: RMSE: 146.93868569958218
NEW DATA: MAE: 118.02405869960255



##### Non-default hyperparameters:

- solver (default="adam"): The solver for weight optimization. ‘lbfgs’ is an optimizer in the family of quasi-Newton methods. ‘sgd’ refers to stochastic gradient descent. ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba. Note: The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, ‘lbfgs’ can converge faster and perform better.


In [272]:
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.neuralforecast import NeuralForecastRNN

import prophet

In [273]:
# Construct sktime's NaiveForecaster with the "last" strategy.
# The forecaster is only instantiated here; it is not fitted in this cell.
naive_model = NaiveForecaster(strategy="last")



In [274]:
# Construct sktime's NeuralForecastRNN with its default arguments.
# The model is only instantiated here; it is not fitted in this cell.
rnn_model = NeuralForecastRNN()