In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

#### 1. Loading in our data

In [2]:
weather_df: pd.DataFrame = pd.read_feather("./training_data/processed_data/preprocessed_weather_hartbeespoort.feather")
images_df: pd.DataFrame = pd.read_feather("./training_data/processed_data/preprocessed_image_hartbeespoort.feather")

In [3]:
weather_df

Unnamed: 0_level_0,windspeed,winddir
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-04,18.4,292.2
2021-01-09,15.4,330.5
2021-01-21,21.7,85.6
2021-02-10,13.6,46.0
2021-02-13,16.6,117.2
...,...,...
2024-05-10,14.8,320.4
2024-05-13,13.9,29.4
2024-05-15,16.4,183.5
2024-05-18,11.6,218.0


In [4]:
images_df

Unnamed: 0_level_0,center_x_1,center_y_1,x_axis_length_1,y_axis_length_1,angle_1,center_x_2,center_y_2,x_axis_length_2,y_axis_length_2,angle_2,center_x_3,center_y_3,x_axis_length_3,y_axis_length_3,angle_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-01-04,704.028564,442.499176,121.826691,451.231567,58.160427,960.999756,336.523529,112.876732,203.835342,117.014908,1092.642456,257.937164,133.023499,147.132126,142.699448
2021-01-09,715.279602,351.457184,100.243324,367.353882,66.100555,807.207031,124.622177,54.882431,117.928635,119.937805,184.712097,572.285339,51.416313,93.895416,133.982315
2021-01-21,325.976746,490.977356,136.785980,500.891998,74.853523,813.122131,146.077484,87.973320,149.937958,163.590485,1233.187134,566.028442,8.314092,24.543709,143.964798
2021-02-10,167.179871,545.132202,91.021637,191.275513,108.308769,881.044556,144.769745,63.307846,291.755676,111.700424,592.362976,408.737030,60.006161,177.342026,67.738052
2021-02-13,280.800171,504.206818,134.417603,405.863342,72.619293,787.453369,126.050583,35.755112,48.203991,129.081055,918.028381,556.878296,14.486405,27.316944,2.405948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-10,777.354004,107.631660,30.748890,90.560562,146.724838,243.170166,542.966003,13.301406,35.620926,173.263412,713.954590,443.647552,9.775264,37.554825,63.395508
2024-05-13,716.543701,305.779480,22.611246,165.610962,61.839939,120.704819,541.091553,13.267200,20.426884,71.220276,797.884949,141.748245,10.134691,28.270313,174.367538
2024-05-15,800.668274,145.144821,26.662622,81.719994,131.096573,756.216187,80.918289,15.954613,37.925011,145.252411,112.768906,556.176453,10.941070,24.187414,82.034576
2024-05-18,774.593811,107.590065,32.553211,89.817474,143.160324,233.447922,540.661499,23.895515,50.522202,166.417343,107.577980,557.288330,9.227895,16.054379,118.356796


#### 2. Preparing our data

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

result: list[pd.DataFrame] = train_test_split(weather_df, images_df, test_size=0.1, random_state=69, shuffle=False)
X_train: pd.DataFrame = result[0]
X_valid: pd.DataFrame = result[1]
y_train: pd.DataFrame = result[2]
y_valid: pd.DataFrame = result[3]

print(f"Training data rows: {len(X_train)}, Test data rows: {len(X_valid)}")

Training data rows: 287, Test data rows: 32


In [6]:
def print_metrics(actual, prediction, data_type) -> None:
    print("Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)")
    print(f"{data_type} DATA: MSE: {mean_squared_error(actual, prediction)}")
    print(f"{data_type} DATA: RMSE: {mean_squared_error(actual, prediction, squared=False)}")
    print(f"{data_type} DATA: MAE: {mean_absolute_error(actual, prediction)}\n")

#### 3. Testing different types of supervised regression models

##### 3.1. Decision Tree Regressor

In [7]:
from sklearn.tree import DecisionTreeRegressor

# tree_model = DecisionTreeRegressor(min_samples_split=10, min_samples_leaf=10, min_weight_fraction_leaf=0.3)
tree_model = DecisionTreeRegressor()

tree_model.fit(X_train, y_train)

tree_y_pred: np.ndarray = tree_model.predict(X_train)
print_metrics(y_train, tree_y_pred, "KNOWN")

tree_y_pred: np.ndarray = tree_model.predict(X_valid)
print_metrics(y_valid, tree_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 0.0
KNOWN DATA: RMSE: 0.0
KNOWN DATA: MAE: 0.0

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 70408.90494990522
NEW DATA: RMSE: 205.9825822913128
NEW DATA: MAE: 164.5662872088452



##### 3.1.1. Use GridSearch for attempt at optimal hyperparameters

In [8]:
params_tree = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "max_depth": [None, 2, 4, 10, 20],
    "min_samples_split": [2, 4, 5, 10],
    "min_samples_leaf": [1, 2, 4, 5, 10],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3],
    "max_leaf_nodes": [None, 6, 8, 10, 20],
    "min_impurity_decrease": [0.0, 0.1, 0.2, 0.3]
}

grid_tree = GridSearchCV(tree_model, param_grid=params_tree, n_jobs=-1, verbose=True)

grid_tree.fit(X_train, y_train)

grid_tree.best_params_

Fitting 5 folds for each of 32000 candidates, totalling 160000 fits




{'criterion': 'friedman_mse',
 'max_depth': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.2}

##### 3.1.2. Testing the newly acquired hyperparameters

In [8]:
tree_model = DecisionTreeRegressor(criterion="friedman_mse", min_weight_fraction_leaf=0.2)

tree_model.fit(X_train, y_train)

tree_y_pred: np.ndarray = tree_model.predict(X_train)
print_metrics(y_train, tree_y_pred, "KNOWN")

tree_y_pred: np.ndarray = tree_model.predict(X_valid)
print_metrics(y_valid, tree_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 31528.223894258812
KNOWN DATA: RMSE: 138.47504761231784
KNOWN DATA: MAE: 112.24267252072096

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32809.82534135598
NEW DATA: RMSE: 140.77275305614256
NEW DATA: MAE: 115.88433876852582



Here we can see that our hyperparameters are a good balance between good behaviour on the known values and the new values.

##### Non-default hyperparameters:

- criterion (default="squared_error"): The function to measure the quality of a split. 
- min_weight_fraction_leaf (default=0.0): The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.


[scikit-learn DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

##### 3.2. Random Forest Regressor

In [9]:
from sklearn.ensemble import RandomForestRegressor

# forest_model = RandomForestRegressor(min_samples_leaf=10, min_weight_fraction_leaf=0.3)
forest_model = RandomForestRegressor()

forest_model.fit(X_train, y_train)

forest_y_pred: np.ndarray = forest_model.predict(X_train)
print_metrics(y_train, forest_y_pred, "KNOWN")

forest_y_pred: np.ndarray = forest_model.predict(X_valid)
print_metrics(y_valid, forest_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 5154.004980176732
KNOWN DATA: RMSE: 55.603586949002185
KNOWN DATA: MAE: 44.28999259398816

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 42192.78268957752
NEW DATA: RMSE: 156.54496602072805
NEW DATA: MAE: 129.90539178813003



##### 3.2.1. Use GridSearch for attempt at optimal hyperparameters

In [10]:
params_forest = {
    "n_estimators": [10, 25, 50],
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "max_depth": [None, 2, 4, 10],
    "min_samples_split": [2, 4, 5],
    "min_samples_leaf": [1, 2, 4, 5],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3],
    "max_leaf_nodes": [None, 6, 8, 10],
    "min_impurity_decrease": [0.0, 0.1, 0.2, 0.3]
}

grid_forest = GridSearchCV(forest_model, param_grid=params_forest, n_jobs=-1, verbose=True)

grid_forest.fit(X_train, y_train)

grid_forest.best_params_

Fitting 5 folds for each of 36864 candidates, totalling 184320 fits


{'criterion': 'friedman_mse',
 'max_depth': 2,
 'max_leaf_nodes': 6,
 'min_impurity_decrease': 0.3,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.1,
 'n_estimators': 10}

##### 3.2.2. Testing the newly acquired hyperparameters

In [11]:
forest_model = RandomForestRegressor(n_estimators=10, criterion="friedman_mse", max_depth=2, max_leaf_nodes=6,
                                     min_impurity_decrease=0.3, min_samples_split=5, min_weight_fraction_leaf=0.1)

forest_model.fit(X_train, y_train)

forest_y_pred: np.ndarray = forest_model.predict(X_train)
print_metrics(y_train, forest_y_pred, "KNOWN")

forest_y_pred: np.ndarray = forest_model.predict(X_valid)
print_metrics(y_valid, forest_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 31182.94395321399
KNOWN DATA: RMSE: 137.91316146030812
KNOWN DATA: MAE: 112.3408952198535

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32333.517574548016
NEW DATA: RMSE: 140.45262783358845
NEW DATA: MAE: 116.0986492141828



Once again these parameters result in a good balance between train and test data performance.

##### Non-default hyperparameters:

- n_estimators (default=100): The number of trees in the forest.
- criterion (default="squared_error"): The function to measure the quality of a split.
- max_depth (default=None): The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
- min_samples_split (default=2): The minimum number of samples required to split an internal node.
- max_leaf_nodes (default=None): Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.
- min_impurity_decrease (default=0.0): A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
- min_weight_fraction_leaf (default=0.0): The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.

[scikit-learn RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

##### 3.3. Linear Regressor

In [12]:
from sklearn.linear_model import LinearRegression

# linear_model = LinearRegression(positive=True)
linear_model = LinearRegression()

linear_model.fit(X_train, y_train)

linear_y_pred: np.ndarray = linear_model.predict(X_train)
print_metrics(y_train, linear_y_pred, "KNOWN")

linear_y_pred: np.ndarray = linear_model.predict(X_valid)
print_metrics(y_valid, linear_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32201.504929329716
KNOWN DATA: RMSE: 139.7326368897543
KNOWN DATA: MAE: 113.8009497001413

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 33888.613503382105
NEW DATA: RMSE: 142.42283683389056
NEW DATA: MAE: 116.19241510558375



##### 3.3.1. Use GridSearch for attempt at optimal hyperparameters

In [12]:
params_linear = {
    "fit_intercept": [True, False],
    "positive": [False, True]
}

grid_linear = GridSearchCV(linear_model, param_grid=params_linear, n_jobs=-1, verbose=True)

grid_linear.fit(X_train, y_train)

grid_linear.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


{'fit_intercept': True, 'positive': True}

##### 3.3.2. Testing the newly acquired hyperparameters

In [14]:
linear_model = LinearRegression(positive=True)

linear_model.fit(X_train, y_train)

linear_y_pred: np.ndarray = linear_model.predict(X_train)
print_metrics(y_train, linear_y_pred, "KNOWN")

linear_y_pred: np.ndarray = linear_model.predict(X_valid)
print_metrics(y_valid, linear_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32412.473880083842
KNOWN DATA: RMSE: 140.1384007924289
KNOWN DATA: MAE: 114.45904001584911

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32201.900449802193
NEW DATA: RMSE: 139.62326538527108
NEW DATA: MAE: 114.78593845659337



Since there weren't many hyperparameters to tune this barely improved the model's performance.

##### Non-default hyperparameters:

- positive (default=False): When set to True, forces the coefficients to be positive.

[scikit-learn LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

##### 3.4. Ridge Regression

In [15]:
from sklearn.linear_model import Ridge

# ridge_model = Ridge(solver="lbfgs", positive=True)
ridge_model = Ridge()

ridge_model.fit(X_train, y_train)

ridge_y_pred: np.ndarray = ridge_model.predict(X_train)
print_metrics(y_train, ridge_y_pred, "KNOWN")

ridge_y_pred: np.ndarray = ridge_model.predict(X_valid)
print_metrics(y_valid, ridge_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32201.504931096286
KNOWN DATA: RMSE: 139.73263689419738
KNOWN DATA: MAE: 113.80098139933503

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 33888.47623139085
NEW DATA: RMSE: 142.42262954938604
NEW DATA: MAE: 116.19228669280068



##### 3.4.1. Use GridSearch for attempt at optimal hyperparameters

In [14]:
params_ridge = {
    "alpha": [1.0, 2.0, 2.5, 5.0, 10.0],
    "fit_intercept": [True, False],
    "max_iter": [None, 1000, 15000, 30000],
    "tol": [0.0001, 0.001, 0.01, 0.1, 0.00001],
    "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"],
    "positive": [False, True]
}

grid_ridge = GridSearchCV(ridge_model, param_grid=params_ridge, n_jobs=-1, verbose=True)

grid_ridge.fit(X_train, y_train)

grid_ridge.best_params_

Fitting 5 folds for each of 3200 candidates, totalling 16000 fits


{'alpha': 10.0,
 'fit_intercept': True,
 'max_iter': None,
 'positive': True,
 'solver': 'auto',
 'tol': 0.1}

##### 3.4.2. Testing the newly acquired hyperparameters

In [16]:
ridge_model = Ridge(alpha=10.0, positive=True, tol=0.1)

ridge_model.fit(X_train, y_train)

ridge_y_pred: np.ndarray = ridge_model.predict(X_train)
print_metrics(y_train, ridge_y_pred, "KNOWN")

ridge_y_pred: np.ndarray = ridge_model.predict(X_valid)
print_metrics(y_valid, ridge_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32476.768121780588
KNOWN DATA: RMSE: 140.40212292213081
KNOWN DATA: MAE: 114.74507282422867

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32431.311246263256
NEW DATA: RMSE: 140.4664504591828
NEW DATA: MAE: 115.54874335312698



We once again get fairly similar performance

##### Non-default hyperparameters:

- alpha (default=1.0): Constant that multiplies the L2 term, controlling regularization strength.
- positive (default=False): When set to True, forces the coefficients to be positive. Only ‘lbfgs’ solver is supported in this case.
- tol (default=0.0001): The precision of the solution (coef_) is determined by tol which specifies a different convergence criterion for each solver.

[scikit-learn Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)

##### 3.5. Lasso Regression

In [18]:
from sklearn.linear_model import Lasso

# lasso_model = Lasso(positive=True)
lasso_model = Lasso()

lasso_model.fit(X_train, y_train)

lasso_y_pred: np.ndarray = lasso_model.predict(X_train)
print_metrics(y_train, lasso_y_pred, "KNOWN")

lasso_y_pred: np.ndarray = lasso_model.predict(X_valid)
print_metrics(y_valid, lasso_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32201.526768166357
KNOWN DATA: RMSE: 139.73280033320987
KNOWN DATA: MAE: 113.80480170393797

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 33875.92975918508
NEW DATA: RMSE: 142.40251619534735
NEW DATA: MAE: 116.18344325520373



##### 3.5.1. Use GridSearch for attempt at optimal hyperparameters

In [16]:
params_lasso = {
    "alpha": [1.0, 2.0, 2.5, 5.0, 10.0],
    "fit_intercept": [True, False],
    "max_iter": [1000, 2000, 5000],
    "tol": [0.0001, 0.001, 0.01, 0.1],
    "positive": [False, True],
    "selection": ["cyclic", "random"]
}

grid_lasso = GridSearchCV(lasso_model, param_grid=params_lasso, n_jobs=-1, verbose=True)

grid_lasso.fit(X_train, y_train)

grid_lasso.best_params_

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


{'alpha': 10.0,
 'fit_intercept': True,
 'max_iter': 2000,
 'positive': True,
 'selection': 'random',
 'tol': 0.1}

##### 3.5.2. Testing the newly acquired hyperparameters

In [19]:
lasso_model = Lasso(alpha=10.0, max_iter=2000, tol=0.1, positive=True, selection="random")

lasso_model.fit(X_train, y_train)

lasso_y_pred: np.ndarray = lasso_model.predict(X_train)
print_metrics(y_train, lasso_y_pred, "KNOWN")

lasso_y_pred: np.ndarray = lasso_model.predict(X_valid)
print_metrics(y_valid, lasso_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 32413.45540763954
KNOWN DATA: RMSE: 140.14877074593727
KNOWN DATA: MAE: 114.4788770918859

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32210.878716307194
NEW DATA: RMSE: 139.6768423773087
NEW DATA: MAE: 114.88449697438517



Performance is similar to all previous models.

##### Non-default hyperparameters:

- alpha (default=1.0): Constant that multiplies the L1 term, controlling regularization strength.
- max_iter (default=1000): The maximum number of iterations.
- tol (default=0.0001): The tolerance for the optimization: if the updates are smaller than tol, the optimization code checks the dual gap for optimality and continues until it is smaller than tol.
- positive (default=False): When set to True, forces the coefficients to be positive.
- selection (default='cyclic'): If set to ‘random’, a random coefficient is updated every iteration rather than looping over features sequentially by default.

[scikit-learn Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)

##### 3.6. Nearest Neighbour Regressor

In [20]:
from sklearn.neighbors import KNeighborsRegressor

# neighbour_model = KNeighborsRegressor(n_neighbors=25)
neighbour_model = KNeighborsRegressor()

neighbour_model.fit(X_train, y_train)

neighbour_y_pred: np.ndarray = neighbour_model.predict(X_train)
print_metrics(y_train, neighbour_y_pred, "KNOWN")

neighbour_y_pred: np.ndarray = neighbour_model.predict(X_valid)
print_metrics(y_valid, neighbour_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 25548.988179546654
KNOWN DATA: RMSE: 124.1419373165785
KNOWN DATA: MAE: 99.74479885158914

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 43643.62094735951
NEW DATA: RMSE: 158.96222559059942
NEW DATA: MAE: 130.52321904371186



##### 3.6.1. Use GridSearch for attempt at optimal hyperparameters

In [18]:
params_neighbour = {
    "n_neighbors": [5, 10, 15, 20, 25, 50, 100],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [30, 10, 5, 50, 100],
    "p": [2.0, 3.0, 5.0, 10.0, 1.0]
}

grid_neighbour = GridSearchCV(neighbour_model, param_grid=params_neighbour, n_jobs=-1, verbose=True)

grid_neighbour.fit(X_train, y_train)

grid_neighbour.best_params_

Fitting 5 folds for each of 1400 candidates, totalling 7000 fits


{'algorithm': 'auto',
 'leaf_size': 5,
 'n_neighbors': 100,
 'p': 1.0,
 'weights': 'uniform'}

##### 3.6.2. Testing the newly acquired hyperparameters

In [21]:
neighbour_model = KNeighborsRegressor(n_neighbors=100, leaf_size=5, p=1)

neighbour_model.fit(X_train, y_train)

neighbour_y_pred: np.ndarray = neighbour_model.predict(X_train)
print_metrics(y_train, neighbour_y_pred, "KNOWN")

neighbour_y_pred: np.ndarray = neighbour_model.predict(X_valid)
print_metrics(y_valid, neighbour_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 31776.91288723784
KNOWN DATA: RMSE: 138.99791257912293
KNOWN DATA: MAE: 113.1102131266104

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 32269.61832787351
NEW DATA: RMSE: 139.62731346708705
NEW DATA: MAE: 113.90359437822303



##### Non-default hyperparameters:

- n_neighbours (default=5): Number of neighbors to use by default for kneighbors queries.
- leaf_size (default=30): Leaf size passed to BallTree or KDTree.
- p (default=2): Power parameter for the Minkowski metric.

[scikit-learn KNeighboursRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

##### 3.7. Multi-Layer Perceptron (MLP) Regressor

In [22]:
from sklearn.neural_network import MLPRegressor

# mlp_model = MLPRegressor(solver="lbfgs")
mlp_model = MLPRegressor()

mlp_model.fit(X_train, y_train)

mlp_y_pred: np.ndarray = mlp_model.predict(X_train)
print_metrics(y_train, mlp_y_pred, "KNOWN")

mlp_y_pred: np.ndarray = mlp_model.predict(X_valid)
print_metrics(y_valid, mlp_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 53827.40393238139
KNOWN DATA: RMSE: 175.2621974214565
KNOWN DATA: MAE: 140.3777905809457

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 48403.44156108747
NEW DATA: RMSE: 166.7486560330132
NEW DATA: MAE: 137.8768305000157



##### 3.7.1. Use GridSearch for attempt at optimal hyperparameters

In [20]:
params_mlp = {
    "hidden_layer_sizes": [100, 200, 400],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "tol": [0.0001, 0.001, 0.01, 0.1]
}

grid_mlp = GridSearchCV(mlp_model, param_grid=params_mlp, n_jobs=-1, verbose=True)

grid_mlp.fit(X_train, y_train)

grid_mlp.best_params_

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

{'activation': 'logistic',
 'alpha': 0.001,
 'hidden_layer_sizes': 400,
 'learning_rate': 'constant',
 'solver': 'sgd',
 'tol': 0.001}

##### 3.7.2. Testing the newly acquired hyperparameters

In [26]:
mlp_model = MLPRegressor(hidden_layer_sizes=400, activation="logistic", solver="sgd", alpha=0.001, tol=0.001)

mlp_model.fit(X_train, y_train)

mlp_y_pred: np.ndarray = mlp_model.predict(X_train)
print_metrics(y_train, mlp_y_pred, "KNOWN")

mlp_y_pred: np.ndarray = mlp_model.predict(X_valid)
print_metrics(y_valid, mlp_y_pred, "NEW")

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 31317.36598249627
KNOWN DATA: RMSE: 138.20512681964055
KNOWN DATA: MAE: 111.55584282935905

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 34739.84018246325
NEW DATA: RMSE: 144.67490206979568
NEW DATA: MAE: 119.35753852253461



With the last model we once again get the same results.

##### Non-default hyperparameters:

- hidden_layer_sizes (default=100): The ith element represents the number of neurons in the ith hidden layer.
- activation (default='identity'): Activation function for the hidden layer.
- solver (default="adam"): The solver for weight optimization.
- alpha (default=0.0001): Strength of the L2 regularization term. The L2 regularization term is divided by the sample size when added to the loss.
- tol (default=0.0001): Tolerance for the optimization.

[scikit-learn MLP Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)

#### 5. Trying to predict for one ellipse all parameters individually

##### 5.1. Preparing data in the ellipse dataframe

In [37]:
shrunk_images_df: pd.DataFrame = images_df.drop(columns=["center_x_2", "center_x_3", "center_y_2", "center_y_3",
                                                "x_axis_length_2", "x_axis_length_3", "y_axis_length_2", "y_axis_length_3", "angle_2", "angle_3"])

shrunk_images_df

Unnamed: 0_level_0,center_x_1,center_y_1,x_axis_length_1,y_axis_length_1,angle_1
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-04,704.028564,442.499176,121.826691,451.231567,58.160427
2021-01-09,715.279602,351.457184,100.243324,367.353882,66.100555
2021-01-21,325.976746,490.977356,136.785980,500.891998,74.853523
2021-02-10,167.179871,545.132202,91.021637,191.275513,108.308769
2021-02-13,280.800171,504.206818,134.417603,405.863342,72.619293
...,...,...,...,...,...
2024-05-10,777.354004,107.631660,30.748890,90.560562,146.724838
2024-05-13,716.543701,305.779480,22.611246,165.610962,61.839939
2024-05-15,800.668274,145.144821,26.662622,81.719994,131.096573
2024-05-18,774.593811,107.590065,32.553211,89.817474,143.160324


In [70]:
center_x_image_df: pd.DataFrame = shrunk_images_df.drop(columns=["center_y_1", "x_axis_length_1", "y_axis_length_1", "angle_1"])
center_y_image_df: pd.DataFrame = shrunk_images_df.drop(columns=["center_x_1", "x_axis_length_1", "y_axis_length_1", "angle_1"])
x_axis_length_image_df: pd.DataFrame = shrunk_images_df.drop(columns=["center_x_1", "center_y_1", "y_axis_length_1", "angle_1"])
y_axis_length_image_df: pd.DataFrame = shrunk_images_df.drop(columns=["center_x_1", "center_y_1", "x_axis_length_1", "angle_1"])
angle_image_df: pd.DataFrame = shrunk_images_df.drop(columns=["center_x_1", "center_y_1", "x_axis_length_1", "y_axis_length_1"])

center_x_X_train, center_x_X_valid, center_x_Y_train, center_x_Y_valid = train_test_split(weather_df, center_x_image_df, test_size=0.1, random_state=69, shuffle=False)
center_y_X_train, center_y_X_valid, center_y_Y_train, center_y_Y_valid = train_test_split(weather_df, center_y_image_df, test_size=0.1, random_state=69, shuffle=False)
x_axis_length_X_train, x_axis_length_X_valid, x_axis_length_Y_train, x_axis_length_Y_valid = train_test_split(weather_df, x_axis_length_image_df, test_size=0.1, random_state=69, shuffle=False)
y_axis_length_X_train, y_axis_length_X_valid, y_axis_length_Y_train, y_axis_length_Y_valid = train_test_split(weather_df, y_axis_length_image_df, test_size=0.1, random_state=69, shuffle=False)
angle_X_train, angle_X_valid, angle_Y_train, angle_Y_valid = train_test_split(weather_df, angle_image_df, test_size=0.1, random_state=69, shuffle=False)

In [71]:
center_x_model = RandomForestRegressor()
center_y_model = RandomForestRegressor()
x_axis_length_model = RandomForestRegressor()
y_axis_length_model = RandomForestRegressor()
angle_model = RandomForestRegressor()

center_x_model.fit(center_x_X_train, center_x_Y_train)
center_y_model.fit(center_y_X_train, center_y_Y_train)
x_axis_length_model.fit(x_axis_length_X_train, x_axis_length_Y_train)
y_axis_length_model.fit(y_axis_length_X_train, y_axis_length_Y_train)
angle_model.fit(angle_X_train, angle_Y_train)

In [72]:
print("\n===== CENTER X =====")
center_x_pred: np.ndarray = center_x_model.predict(center_x_X_train)
print_metrics(center_x_Y_train, center_x_pred, "KNOWN")

center_x_pred: np.ndarray = center_x_model.predict(center_x_X_valid)
print_metrics(center_x_Y_valid, center_x_pred, "NEW")

print("\n===== CENTER Y =====")
center_y_pred: np.ndarray = center_y_model.predict(center_y_X_train)
print_metrics(center_y_Y_train, center_y_pred, "KNOWN")

center_y_pred: np.ndarray = center_y_model.predict(center_y_X_valid)
print_metrics(center_y_Y_valid, center_y_pred, "NEW")


===== CENTER X =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 10463.37291969849
KNOWN DATA: RMSE: 102.29062967690878
KNOWN DATA: MAE: 79.51372909280065

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 87046.09334483337
NEW DATA: RMSE: 295.0357492658023
NEW DATA: MAE: 225.1138179111481


===== CENTER Y =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 3257.0280887968456
KNOWN DATA: RMSE: 57.07037838315816
KNOWN DATA: MAE: 44.698026533492346

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 29687.054057106066
NEW DATA: RMSE: 172.29931531235422
NEW DATA: MAE: 153.5937258934975



In [73]:
print("\n===== X AXIS LENGTH =====")
x_axis_length_pred: np.ndarray = x_axis_length_model.predict(x_axis_length_X_train)
print_metrics(x_axis_length_Y_train, x_axis_length_pred, "KNOWN")

x_axis_length_pred: np.ndarray = x_axis_length_model.predict(x_axis_length_X_valid)
print_metrics(x_axis_length_Y_valid, x_axis_length_pred, "NEW")

print("\n===== Y AXIS LENGTH =====")
y_axis_length_pred: np.ndarray = y_axis_length_model.predict(y_axis_length_X_train)
print_metrics(y_axis_length_Y_train, y_axis_length_pred, "KNOWN")

y_axis_length_pred: np.ndarray = y_axis_length_model.predict(y_axis_length_X_valid)
print_metrics(y_axis_length_Y_valid, y_axis_length_pred, "NEW")


===== X AXIS LENGTH =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 2404.508888790453
KNOWN DATA: RMSE: 49.03579191560439
KNOWN DATA: MAE: 37.92619679565629

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 11027.625727856399
NEW DATA: RMSE: 105.0125027216112
NEW DATA: MAE: 97.18082780748605


===== Y AXIS LENGTH =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 7479.999869367958
KNOWN DATA: RMSE: 86.48699248654654
KNOWN DATA: MAE: 67.6932058348639

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 34272.335470153776
NEW DATA: RMSE: 185.12788949845935
NEW DATA: MAE: 158.9556531110406



In [74]:
print("\n===== ANGLE =====")
angle_pred: np.ndarray = angle_model.predict(angle_X_train)
print_metrics(angle_Y_train, angle_pred, "KNOWN")

angle_pred: np.ndarray = angle_model.predict(angle_X_valid)
print_metrics(angle_Y_valid, angle_pred, "NEW")


===== ANGLE =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 243.76334378724664
KNOWN DATA: RMSE: 15.612922333350879
KNOWN DATA: MAE: 12.436283555844934

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 2296.8514767313436
NEW DATA: RMSE: 47.92547836726665
NEW DATA: MAE: 40.85496607705951



#### 5.2. Trying a GridSearch for a single value and applying this to all models

In [89]:
test_tree_model = DecisionTreeRegressor()

params_test_tree = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "max_depth": [None, 2, 4, 10, 20],
    "min_samples_split": [2, 4, 5, 10],
    "min_samples_leaf": [1, 2, 4, 5, 10],
    "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3],
    "max_leaf_nodes": [None, 6, 8, 10, 20],
    "min_impurity_decrease": [0.0, 0.1, 0.2, 0.3]
}

grid_test_tree = GridSearchCV(test_tree_model, param_grid=params_test_tree, n_jobs=-1, verbose=True)

grid_test_tree.fit(center_x_X_train, center_x_Y_train)

grid_test_tree.best_params_

Fitting 5 folds for each of 32000 candidates, totalling 160000 fits


{'criterion': 'squared_error',
 'max_depth': None,
 'max_leaf_nodes': 6,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

##### Applying this data to all random forests for hopelully improved performance

In [84]:
center_x_model = RandomForestRegressor(max_leaf_nodes=5, min_samples_leaf=10)
center_y_model = RandomForestRegressor(max_leaf_nodes=5, min_samples_leaf=10)
x_axis_length_model = RandomForestRegressor(max_leaf_nodes=5, min_samples_leaf=10)
y_axis_length_model = RandomForestRegressor(max_leaf_nodes=5, min_samples_leaf=10)
angle_model = RandomForestRegressor(max_leaf_nodes=5, min_samples_leaf=10)

center_x_model.fit(center_x_X_train, center_x_Y_train)
center_y_model.fit(center_y_X_train, center_y_Y_train)
x_axis_length_model.fit(x_axis_length_X_train, x_axis_length_Y_train)
y_axis_length_model.fit(y_axis_length_X_train, y_axis_length_Y_train)
angle_model.fit(angle_X_train, angle_Y_train)

In [85]:
print("\n===== CENTER X =====")
center_x_pred: np.ndarray = center_x_model.predict(center_x_X_train)
print_metrics(center_x_Y_train, center_x_pred, "KNOWN")

center_x_pred: np.ndarray = center_x_model.predict(center_x_X_valid)
print_metrics(center_x_Y_valid, center_x_pred, "NEW")

print("\n===== CENTER Y =====")
center_y_pred: np.ndarray = center_y_model.predict(center_y_X_train)
print_metrics(center_y_Y_train, center_y_pred, "KNOWN")

center_y_pred: np.ndarray = center_y_model.predict(center_y_X_valid)
print_metrics(center_y_Y_valid, center_y_pred, "NEW")


===== CENTER X =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 60737.04284767495
KNOWN DATA: RMSE: 246.44886456966069
KNOWN DATA: MAE: 196.76681869732712

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 72059.33505797872
NEW DATA: RMSE: 268.4386988829642
NEW DATA: MAE: 203.2985073483772


===== CENTER Y =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 17865.566423184067
KNOWN DATA: RMSE: 133.66213533826274
KNOWN DATA: MAE: 108.76277110916439

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 24554.540391455324
NEW DATA: RMSE: 156.69888446142596
NEW DATA: MAE: 141.14002858249296



In [86]:
print("\n===== X AXIS LENGTH =====")
x_axis_length_pred: np.ndarray = x_axis_length_model.predict(x_axis_length_X_train)
print_metrics(x_axis_length_Y_train, x_axis_length_pred, "KNOWN")

x_axis_length_pred: np.ndarray = x_axis_length_model.predict(x_axis_length_X_valid)
print_metrics(x_axis_length_Y_valid, x_axis_length_pred, "NEW")

print("\n===== Y AXIS LENGTH =====")
y_axis_length_pred: np.ndarray = y_axis_length_model.predict(y_axis_length_X_train)
print_metrics(y_axis_length_Y_train, y_axis_length_pred, "KNOWN")

y_axis_length_pred: np.ndarray = y_axis_length_model.predict(y_axis_length_X_valid)
print_metrics(y_axis_length_Y_valid, y_axis_length_pred, "NEW")


===== X AXIS LENGTH =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 12450.1291899983
KNOWN DATA: RMSE: 111.58014693483021
KNOWN DATA: MAE: 87.54511543198903

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 11448.275871432754
NEW DATA: RMSE: 106.99661616814222
NEW DATA: MAE: 99.24510786015308


===== Y AXIS LENGTH =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 40794.23990233246
KNOWN DATA: RMSE: 201.975839897579
KNOWN DATA: MAE: 163.48452567717683

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 42029.88434935939
NEW DATA: RMSE: 205.0119127010901
NEW DATA: MAE: 178.32324458708396



In [87]:
print("\n===== ANGLE =====")
angle_pred: np.ndarray = angle_model.predict(angle_X_train)
print_metrics(angle_Y_train, angle_pred, "KNOWN")

angle_pred: np.ndarray = angle_model.predict(angle_X_valid)
print_metrics(angle_Y_valid, angle_pred, "NEW")


===== ANGLE =====
Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
KNOWN DATA: MSE: 1307.2084175360492
KNOWN DATA: RMSE: 36.15533733124404
KNOWN DATA: MAE: 29.70421159686971

Mean Squared Error, Root Mean Squared Error, Mean Absolute Error (perfect = 0.0)
NEW DATA: MSE: 2079.5274416496272
NEW DATA: RMSE: 45.60183594604089
NEW DATA: MAE: 39.56751131710685

