In [113]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

#### 1. Loading in our data

In [114]:
# Model inputs: one row per date with the wind measurements
# (windspeed, winddir).
weather_df: pd.DataFrame = pd.read_feather(
    "./training_data/processed_data/preprocessed_weather_hartbeespoort.feather"
)

# Model targets: one row per date with the image-derived shape columns
# (center_x_1, center_y_1, x_axis_length_1, y_axis_length_1, angle_1).
images_df: pd.DataFrame = pd.read_feather(
    "./training_data/processed_data/preprocessed_image_test.feather"
)

In [115]:
# Preview the weather features (windspeed, winddir) indexed by datetime.
weather_df

Unnamed: 0_level_0,windspeed,winddir
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-04,19.1,292.4
2023-01-09,15.4,325.7
2023-01-11,23.8,69.0
2023-01-14,11.6,38.4
2023-01-16,16.3,47.8
...,...,...
2024-04-10,19.6,194.2
2024-04-13,17.2,341.4
2024-04-18,16.2,241.7
2024-04-20,16.9,292.4


In [116]:
# Preview the image-derived target columns indexed by datetime.
images_df

Unnamed: 0_level_0,center_x_1,center_y_1,x_axis_length_1,y_axis_length_1,angle_1
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-04,774.568298,464.119568,291.609772,653.578308,73.510216
2023-01-09,646.504028,420.321869,278.589264,730.667725,82.895424
2023-01-11,863.524353,387.239929,464.276306,597.633362,20.470768
2023-01-14,906.893433,383.717712,466.869843,570.567200,66.743622
2023-01-16,624.036072,339.108948,325.282562,731.776062,57.742481
...,...,...,...,...,...
2024-04-10,1324.350586,422.794281,17.715271,51.442062,68.193947
2024-04-13,752.452454,289.341919,13.216461,32.141590,62.891289
2024-04-18,668.551453,445.496552,32.465782,66.018997,27.909737
2024-04-20,768.659729,94.758255,21.042046,71.317238,135.777664


#### 2. Preparing our data

In [117]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Chronological hold-out: shuffle=False preserves row order, so the last 10%
# of rows become the validation set. The weather frame supplies the inputs
# (X) and the image frame supplies the targets (y).
result: list[pd.DataFrame] = train_test_split(
    weather_df,
    images_df,
    test_size=0.1,
    random_state=69,
    shuffle=False,
)

# train_test_split returns [X_train, X_valid, y_train, y_valid] in that order.
X_train: pd.DataFrame
X_valid: pd.DataFrame
y_train: pd.DataFrame
y_valid: pd.DataFrame
X_train, X_valid, y_train, y_valid = result

print(f"Training data rows: {len(X_train)}, Test data rows: {len(X_valid)}")

Training data rows: 109, Test data rows: 13


In [118]:
def print_metrics(actual, prediction, data_type) -> None:
    """Print MSE, RMSE and MAE for a set of predictions.

    Args:
        actual: Ground-truth target values (array-like; single- or
            multi-output).
        prediction: Predicted values with the same shape as ``actual``.
        data_type: Label prefixed to every metric line, e.g. ``"KNOWN"`` for
            training data or ``"NEW"`` for held-out data.

    Returns:
        None. The three metrics are written to stdout.
    """
    mse = mean_squared_error(actual, prediction)

    # The ``squared=False`` flag was deprecated in scikit-learn 1.4 and removed
    # in 1.6. Taking the root of each output column's MSE and averaging the
    # results gives the same multi-output RMSE that flag used to produce, and
    # works on both old and new scikit-learn versions.
    per_output_mse = mean_squared_error(actual, prediction, multioutput="raw_values")
    rmse = np.sqrt(per_output_mse).mean()

    mae = mean_absolute_error(actual, prediction)

    print("Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)")
    print(f"{data_type} DATA: MSE: {mse}")
    print(f"{data_type} DATA: RMSE: {rmse}")
    print(f"{data_type} DATA: MAE: {mae}\n")

#### 3. Testing different types of regression models

##### 3.1. Decision Tree Regressor

In [119]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree mapping the wind features to the image-derived targets.
# With the ~109 training rows reported by the split, min_weight_fraction_leaf=0.2
# (roughly 22 rows per leaf) is the binding constraint; min_samples_leaf=5 acts
# only as a floor.
# random_state pins the tie-breaking between equally good splits so that
# re-running the cell reproduces the same tree and the same metrics.
tree_model = DecisionTreeRegressor(
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.2,
    random_state=69,
)
tree_model.fit(X_train, y_train)

# Error on the rows the model was fitted on ("KNOWN").
print_metrics(y_train, tree_model.predict(X_train), "KNOWN")

# Error on the held-out rows ("NEW"); tree_y_pred keeps these predictions.
tree_y_pred: np.ndarray = tree_model.predict(X_valid)
print_metrics(y_valid, tree_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32043.18515966367
KNOWN DATA: RMSE: 161.4592185193721
KNOWN DATA: MAE: 138.59284102011105

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 28839.059434411603
NEW DATA: RMSE: 151.53751660503704
NEW DATA: MAE: 132.44716979524563



The hyperparameters were tuned empirically to strike a good balance between fitting known (training) data and handling new (validation) data.
The problem, however, is that we have very little data for training and validation.

- min_samples_leaf: The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
- min_weight_fraction_leaf: The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.

[scikit-learn DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

##### 3.2. Random Forest Regressor

In [120]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with the same leaf-size constraints as the single tree.
# The forest is stochastic (bootstrap sampling of rows per tree), so
# random_state is fixed to make the reported metrics reproducible under
# Restart & Run All.
forest_model = RandomForestRegressor(
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.2,
    random_state=69,
)
forest_model.fit(X_train, y_train)

# Error on the rows the model was fitted on ("KNOWN").
print_metrics(y_train, forest_model.predict(X_train), "KNOWN")

# Error on the held-out rows ("NEW"); forest_y_pred keeps these predictions.
forest_y_pred: np.ndarray = forest_model.predict(X_valid)
print_metrics(y_valid, forest_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 30571.456668111925
KNOWN DATA: RMSE: 157.46129864366827
KNOWN DATA: MAE: 134.5279319875471

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 33181.50205269339
NEW DATA: RMSE: 162.20623502835264
NEW DATA: MAE: 141.8095404589318



##### 3.3. Linear Regressor

In [121]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline; fit() returns the estimator itself, so the
# model can be constructed and trained in one expression.
linear_model = LinearRegression().fit(X_train, y_train)

# Error on the rows the model was fitted on ("KNOWN").
print_metrics(y_train, linear_model.predict(X_train), "KNOWN")

# Error on the held-out rows ("NEW"); linear_y_pred keeps these predictions.
linear_y_pred: np.ndarray = linear_model.predict(X_valid)
print_metrics(y_valid, linear_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 34118.12093391345
KNOWN DATA: RMSE: 165.68850079534394
KNOWN DATA: MAE: 140.75140784601382

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 31545.353160031467
NEW DATA: RMSE: 157.60853103398122
NEW DATA: MAE: 134.06559680626225



##### 3.4. Nearest Neighbour Regressor

In [122]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with scikit-learn's default settings.
# NOTE(review): the inputs are not rescaled before fitting, so distances are
# presumably dominated by winddir (0-360) over windspeed — confirm if intended.
neighbour_model = KNeighborsRegressor().fit(X_train, y_train)

# Error on the rows the model was fitted on ("KNOWN").
print_metrics(y_train, neighbour_model.predict(X_train), "KNOWN")

# Error on the held-out rows ("NEW"); neighbour_y_pred keeps these predictions.
neighbour_y_pred: np.ndarray = neighbour_model.predict(X_valid)
print_metrics(y_valid, neighbour_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 26414.827752691675
KNOWN DATA: RMSE: 145.38858181886513
KNOWN DATA: MAE: 119.16325077791825

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 39761.7333372892
NEW DATA: RMSE: 177.06280238977783
NEW DATA: MAE: 152.70546441885142



##### 3.5. Multi-Layer Perceptron (MLP) Regressor

In [123]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with default architecture and training settings.
# Weight initialisation and batch shuffling are random, so random_state is
# fixed to make the reported metrics reproducible under Restart & Run All.
# NOTE(review): warnings are globally silenced at the top of the notebook, so
# a possible ConvergenceWarning would not be shown here; the inputs are also
# not standardised — both are worth checking given the high error.
mlp_model = MLPRegressor(random_state=69)
mlp_model.fit(X_train, y_train)

# Error on the rows the model was fitted on ("KNOWN").
print_metrics(y_train, mlp_model.predict(X_train), "KNOWN")

# Error on the held-out rows ("NEW"); mlp_y_pred keeps these predictions.
mlp_y_pred: np.ndarray = mlp_model.predict(X_valid)
print_metrics(y_valid, mlp_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 66977.3782916429
KNOWN DATA: RMSE: 228.0570859349259
KNOWN DATA: MAE: 181.0006193839465

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 57612.60579089132
NEW DATA: RMSE: 206.0266477391596
NEW DATA: MAE: 169.13270158142888



In [124]:
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.neuralforecast import NeuralForecastRNN

import prophet

In [125]:
# Naive baseline forecaster configured with strategy="last".
# It is only constructed in this cell; no fit/predict call is made here.
naive_model = NaiveForecaster(strategy="last")



In [126]:
# RNN forecaster with default hyperparameters.
# It is only constructed in this cell; no fit/predict call is made here.
rnn_model = NeuralForecastRNN()