# <font color="red">Forecasting with Random Forest Regressor</font>

# Library Import

In [1]:
# Data manipulation and visualization
import pandas as pd
import math
import json
import multiprocessing

# Preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

import sys
# sys.path.append('../../') # Uncomment this line if running locally
sys.path.append('/kaggle/input/weatherforecasting') # Uncomment this line if running on Kaggle
from historyManagement import *

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Datasets Import

In [2]:
# Load the weather dataset for visibility prediction
# weather_data = pd.read_csv('../../FinalDatasets/finalDataset.csv') # Uncomment this line if running locally
weather_data = pd.read_csv('/kaggle/input/weatherforecasting/finalDataset.csv') # Uncomment this line if running on Kaggle
weather_data['DateTime'] = pd.to_datetime(weather_data['DateTime'])
weather_data.head()

Unnamed: 0,DateTime,Year,Month,Day,Hour,Dew,Precip,PrecipProb,Windgust,WindSpeed,...,Conditions_t+3,Conditions_t+4,Conditions_t+5,Conditions_t+6,Icon_t+1,Icon_t+2,Icon_t+3,Icon_t+4,Icon_t+5,Icon_t+6
0,2023-01-02 00:00:00,2023,1,2,0,15.19,0.0,0.0,10.31,0.24,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
1,2023-01-02 01:00:00,2023,1,2,1,14.72,0.0,0.0,9.72,0.21,...,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,4.0
2,2023-01-02 02:00:00,2023,1,2,2,14.72,0.0,0.0,10.66,0.25,...,0.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,4.0,4.0
3,2023-01-02 03:00:00,2023,1,2,3,14.23,0.0,0.0,9.91,0.33,...,0.0,2.0,2.0,2.0,1.0,1.0,1.0,4.0,4.0,4.0
4,2023-01-02 04:00:00,2023,1,2,4,14.72,0.0,0.0,10.12,0.2,...,2.0,2.0,2.0,2.0,1.0,1.0,4.0,4.0,4.0,4.0


In [3]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14586 entries, 0 to 14585
Columns: 575 entries, DateTime to Icon_t+6
dtypes: datetime64[ns](1), float64(568), int64(6)
memory usage: 64.0 MB


In [4]:
weather_data.describe()

Unnamed: 0,DateTime,Year,Month,Day,Hour,Dew,Precip,PrecipProb,Windgust,WindSpeed,...,Conditions_t+3,Conditions_t+4,Conditions_t+5,Conditions_t+6,Icon_t+1,Icon_t+2,Icon_t+3,Icon_t+4,Icon_t+5,Icon_t+6
count,14586,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,...,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0
mean,2023-11-01 20:30:00,2023.40107,5.727684,15.753599,11.496298,20.789805,0.165143,19.505391,17.075919,8.096791,...,2.044289,2.044358,2.044426,2.044495,3.795283,3.795352,3.79542,3.795489,3.795557,3.795626
min,2023-01-02 00:00:00,2023.0,1.0,1.0,0.0,2.02,0.0,0.0,0.66,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2023-06-02 22:15:00,2023.0,3.0,8.0,5.0,17.1,0.0,0.0,9.17,2.88,...,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,2023-11-01 20:30:00,2023.0,6.0,16.0,11.0,21.85,0.0,0.0,14.66,7.67,...,2.0,2.0,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,2024-04-01 18:45:00,2024.0,8.0,23.0,17.0,24.98,0.01,6.45,23.4175,11.75,...,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0
max,2024-08-31 17:00:00,2024.0,12.0,31.0,23.0,28.88,44.5,100.0,84.12,71.97,...,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0
std,,0.490132,3.192277,8.786893,6.921353,4.824925,0.844688,37.410656,10.200773,6.248622,...,1.567964,1.567897,1.567829,1.567761,1.986011,1.985931,1.985852,1.985773,1.985694,1.985614


# Data Split

In [5]:
# Split the dataset by index: first 80% for training, last 20% for testing (no shuffle, maintain serial order)
split_index = int(len(weather_data) * 0.8)
weather_data_train = weather_data.iloc[:split_index]
weather_data_test = weather_data.iloc[split_index:]

# Some Reused Parameters

In [6]:
# For bootstrap=True cases
bootstrap_true_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True],
    'max_samples': [None, 0.8, 0.9],
    'min_impurity_decrease': [0.0, 0.01, 0.02]
}

# For bootstrap=False cases  
bootstrap_false_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [False],
    'min_impurity_decrease': [0.0, 0.01, 0.02]
}

grid_param = [bootstrap_true_params, bootstrap_false_params]

In [7]:
number_of_model_for_randomized_grid = 20
model_name_for_saving = "Random Forest"
horizon = 6
# core_to_use = max(1, multiprocessing.cpu_count() - 2) # uncomment if you want to use your laptop while training
core_to_use = -1 # uncomment if you want to use all available cores

columns_with_t_plus = [col for col in weather_data.columns if '_t+' in col]
column_to_exclude = columns_with_t_plus + ['DateTime', 'Year']

column_to_predict = [col for col in weather_data.columns if '_t+' in col]
# Remove columns containing 'Conditions' or 'Icon' from column_to_predict
column_to_predict = [col for col in column_to_predict if 'Conditions' not in col and 'Icon' not in col]

print(column_to_exclude)
print(column_to_predict)
print("Core to use:", core_to_use)

['Temp_t+1', 'Temp_t+2', 'Temp_t+3', 'Temp_t+4', 'Temp_t+5', 'Temp_t+6', 'FeelsLike_t+1', 'FeelsLike_t+2', 'FeelsLike_t+3', 'FeelsLike_t+4', 'FeelsLike_t+5', 'FeelsLike_t+6', 'Humidity_t+1', 'Humidity_t+2', 'Humidity_t+3', 'Humidity_t+4', 'Humidity_t+5', 'Humidity_t+6', 'Visibility_t+1', 'Visibility_t+2', 'Visibility_t+3', 'Visibility_t+4', 'Visibility_t+5', 'Visibility_t+6', 'SolarEnergy_t+1', 'SolarEnergy_t+2', 'SolarEnergy_t+3', 'SolarEnergy_t+4', 'SolarEnergy_t+5', 'SolarEnergy_t+6', 'SolarRadiation_t+1', 'SolarRadiation_t+2', 'SolarRadiation_t+3', 'SolarRadiation_t+4', 'SolarRadiation_t+5', 'SolarRadiation_t+6', 'Conditions_t+1', 'Conditions_t+2', 'Conditions_t+3', 'Conditions_t+4', 'Conditions_t+5', 'Conditions_t+6', 'Icon_t+1', 'Icon_t+2', 'Icon_t+3', 'Icon_t+4', 'Icon_t+5', 'Icon_t+6', 'DateTime', 'Year']
['Temp_t+1', 'Temp_t+2', 'Temp_t+3', 'Temp_t+4', 'Temp_t+5', 'Temp_t+6', 'FeelsLike_t+1', 'FeelsLike_t+2', 'FeelsLike_t+3', 'FeelsLike_t+4', 'FeelsLike_t+5', 'FeelsLike_t+6', 

# All Features

In [8]:
X_train_raw = weather_data_train.drop(columns=column_to_exclude)
y_train = weather_data_train[column_to_predict]

In [9]:
X_test_raw = weather_data_test.drop(columns=column_to_exclude)
y_test = weather_data_test[column_to_predict]

In [10]:
print("X_train: ", len(X_train_raw))
print("y_train: ", len(y_train))
print("\nX_test: ", len(X_test_raw))
print("y_test: ", len(y_test))

X_train:  11668
y_train:  11668

X_test:  2918
y_test:  2918


## MinMax Scaler

In [11]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [12]:
X_train

array([[0.        , 0.03333333, 0.        , ..., 0.16666667, 0.16666667,
        0.16666667],
       [0.        , 0.03333333, 0.04347826, ..., 0.16666667, 0.16666667,
        0.16666667],
       [0.        , 0.03333333, 0.08695652, ..., 0.5       , 0.16666667,
        0.16666667],
       ...,
       [0.36363636, 0.03333333, 0.04347826, ..., 0.83333333, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.08695652, ..., 0.83333333, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.13043478, ..., 0.83333333, 0.83333333,
        0.83333333]])

In [13]:
X_test

array([[0.36363636, 0.03333333, 0.17391304, ..., 0.66666667, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.2173913 , ..., 0.66666667, 0.66666667,
        0.83333333],
       [0.36363636, 0.03333333, 0.26086957, ..., 0.66666667, 0.66666667,
        0.66666667],
       ...,
       [0.63636364, 1.        , 0.65217391, ..., 0.66666667, 0.66666667,
        0.66666667],
       [0.63636364, 1.        , 0.69565217, ..., 1.        , 0.66666667,
        0.66666667],
       [0.63636364, 1.        , 0.73913043, ..., 0.83333333, 1.        ,
        0.66666667]])

### Default Parameters

In [14]:
training_model = RandomForestRegressor(random_state=42)
training_model

In [15]:
%%time
training_model.fit(X_train, y_train)

CPU times: user 9min 53s, sys: 516 ms, total: 9min 53s
Wall time: 9min 53s


In [16]:
y_pred = training_model.predict(X_test)
y_pred

array([[2.801660e+01, 2.805460e+01, 2.876600e+01, ..., 3.828136e+02,
        5.899274e+02, 7.666137e+02],
       [2.784090e+01, 2.863510e+01, 2.994040e+01, ..., 5.550000e+02,
        7.192977e+02, 8.364441e+02],
       [2.829170e+01, 2.958530e+01, 3.117650e+01, ..., 6.865952e+02,
        8.081852e+02, 8.433411e+02],
       ...,
       [3.167720e+01, 3.094010e+01, 3.004580e+01, ..., 6.196000e-01,
        3.000000e-04, 0.000000e+00],
       [3.142300e+01, 3.042430e+01, 2.991630e+01, ..., 3.000000e-04,
        0.000000e+00, 0.000000e+00],
       [3.016230e+01, 2.971980e+01, 2.950190e+01, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00]])

In [17]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [18]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [19]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 1625.2666921354387
MAE Score: 10.712404915848007
R2 Score: 0.5396461106675184
RMSE Score: 40.31459651460546


In [20]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,1.875642,1.029697,0.703045,1.369541
1,Temp_t+2,2.054255,1.075729,0.674756,1.433267
2,Temp_t+3,2.190086,1.10208,0.653256,1.479894
3,Temp_t+4,2.311314,1.130025,0.634049,1.520301
4,Temp_t+5,2.375095,1.147384,0.623876,1.541134
5,Temp_t+6,2.446355,1.162075,0.612348,1.564083
6,Temp,2.208791,1.107832,0.650222,1.4862
7,FeelsLike_t+1,13.019668,2.916643,0.478751,3.608278
8,FeelsLike_t+2,13.706951,2.99374,0.451253,3.70229
9,FeelsLike_t+3,14.024264,3.017834,0.438543,3.744898


In [21]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "MinMaxScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 1625.266692, R²: 0.539646
💾 Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Randomized Grid Search

In [22]:
grid_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42), 
    grid_param,
    n_iter=number_of_model_for_randomized_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=core_to_use,
    verbose=1,
    random_state=42
)

In [23]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 12min 29s, sys: 9.99 s, total: 12min 39s
Wall time: 2h 26min 33s


In [24]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 6, 'min_impurity_decrease': 0.0, 'max_samples': 0.8, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
Best Cross-Validation Score (Negative MSE): -1117.5843634304424
Best Cross-Validation Score (MSE): 1117.5843634304424
Best Cross-Validation Score (RMSE): 33.43029110597816


In [25]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [26]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [27]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 1610.9724369810276
MAE Score: 10.777828466735874
R2 Score: 0.4804974123433882
RMSE Score: 40.13692111984958


In [28]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,2.519061,1.236955,0.601178,1.587155
1,Temp_t+2,2.618354,1.255582,0.585443,1.618133
2,Temp_t+3,2.694439,1.261352,0.573404,1.641475
3,Temp_t+4,2.784537,1.280085,0.559123,1.668693
4,Temp_t+5,2.806722,1.278785,0.555523,1.675327
5,Temp_t+6,2.851657,1.286928,0.548123,1.688685
6,Temp,2.712462,1.266615,0.570466,1.646955
7,FeelsLike_t+1,17.484345,3.423181,0.300006,4.181429
8,FeelsLike_t+2,17.799246,3.463127,0.287422,4.218915
9,FeelsLike_t+3,17.970784,3.463019,0.280545,4.239196


In [29]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "MinMaxScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 1610.972437, R²: 0.480497
💾 Registry updated: Model_Training_History/History_Regression.csv


## Standard Scaler

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [31]:
X_train

array([[-1.29540078, -1.55866853, -1.66051296, ..., -1.24648639,
        -1.24626261, -1.24603889],
       [-1.29540078, -1.55866853, -1.51607747, ..., -1.24648639,
        -1.24626261, -1.24603889],
       [-1.29540078, -1.55866853, -1.37164198, ..., -0.26491096,
        -1.24626261, -1.24603889],
       ...,
       [-0.15173127, -1.55866853, -1.51607747, ...,  0.71666448,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -1.37164198, ...,  0.71666448,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -1.2272065 , ...,  0.71666448,
         0.71680079,  0.71693711]])

In [32]:
X_test

array([[-0.15173127, -1.55866853, -1.08277101, ...,  0.22587676,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -0.93833552, ...,  0.22587676,
         0.22603494,  0.71693711],
       [-0.15173127, -1.55866853, -0.79390003, ...,  0.22587676,
         0.22603494,  0.22619311],
       ...,
       [ 0.70602087,  1.74193343,  0.50601936, ...,  0.22587676,
         0.22603494,  0.22619311],
       [ 0.70602087,  1.74193343,  0.65045485, ...,  1.2074522 ,
         0.22603494,  0.22619311],
       [ 0.70602087,  1.74193343,  0.79489033, ...,  0.71666448,
         1.20756664,  0.22619311]])

### Default Parameters

In [33]:
training_model = RandomForestRegressor(random_state=42)
training_model

In [34]:
%%time
training_model.fit(X_train, y_train)

CPU times: user 9min 53s, sys: 454 ms, total: 9min 54s
Wall time: 9min 54s


In [35]:
y_pred = training_model.predict(X_test)
y_pred

array([[2.801550e+01, 2.805530e+01, 2.876590e+01, ..., 3.827500e+02,
        5.897448e+02, 7.664492e+02],
       [2.784090e+01, 2.863510e+01, 2.994040e+01, ..., 5.550000e+02,
        7.192977e+02, 8.364441e+02],
       [2.829170e+01, 2.958530e+01, 3.117650e+01, ..., 6.865952e+02,
        8.081852e+02, 8.433411e+02],
       ...,
       [3.167720e+01, 3.094010e+01, 3.004580e+01, ..., 6.196000e-01,
        3.000000e-04, 0.000000e+00],
       [3.143090e+01, 3.042600e+01, 2.992280e+01, ..., 3.000000e-04,
        0.000000e+00, 0.000000e+00],
       [3.016230e+01, 2.971980e+01, 2.950190e+01, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00]])

In [36]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [37]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [38]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 1625.042996853293
MAE Score: 10.713261916457249
R2 Score: 0.5397243027552328
RMSE Score: 40.311822048293635


In [39]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,1.874114,1.028966,0.703287,1.368983
1,Temp_t+2,2.053223,1.075278,0.674919,1.432907
2,Temp_t+3,2.189103,1.101695,0.653412,1.479562
3,Temp_t+4,2.308363,1.128961,0.634516,1.51933
4,Temp_t+5,2.37304,1.146295,0.624202,1.540467
5,Temp_t+6,2.443835,1.160925,0.612747,1.563277
6,Temp,2.206946,1.10702,0.650514,1.485579
7,FeelsLike_t+1,13.011026,2.915889,0.479097,3.60708
8,FeelsLike_t+2,13.703549,2.993029,0.451389,3.701831
9,FeelsLike_t+3,14.021586,3.01764,0.43865,3.744541


In [40]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "StandardScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 1625.042997, R²: 0.539724
💾 Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Randomized Grid Search

In [41]:
grid_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42), 
    grid_param,
    n_iter=number_of_model_for_randomized_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=core_to_use,
    verbose=1,
    random_state=42
)

In [42]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 12min 36s, sys: 9.66 s, total: 12min 46s
Wall time: 2h 27min 29s


In [43]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 6, 'min_impurity_decrease': 0.0, 'max_samples': 0.8, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
Best Cross-Validation Score (Negative MSE): -1117.3323382290187
Best Cross-Validation Score (MSE): 1117.3323382290187
Best Cross-Validation Score (RMSE): 33.42652147964276


In [44]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [45]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [46]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 1610.9885459712182
MAE Score: 10.778194183868235
R2 Score: 0.4804692618028704
RMSE Score: 40.137121794807584


In [47]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,2.518576,1.236939,0.601255,1.587002
1,Temp_t+2,2.618164,1.255599,0.585473,1.618074
2,Temp_t+3,2.694413,1.261409,0.573409,1.641467
3,Temp_t+4,2.784686,1.280135,0.5591,1.668738
4,Temp_t+5,2.806731,1.278743,0.555522,1.67533
5,Temp_t+6,2.851769,1.287046,0.548105,1.688718
6,Temp,2.71239,1.266645,0.570477,1.646933
7,FeelsLike_t+1,17.48429,3.423252,0.300008,4.181422
8,FeelsLike_t+2,17.799261,3.462939,0.287421,4.218917
9,FeelsLike_t+3,17.972137,3.462948,0.280491,4.239356


In [48]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "StandardScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 1610.988546, R²: 0.480469
💾 Registry updated: Model_Training_History/History_Regression.csv


## Robust Scaler

In [49]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [50]:
X_train

array([[-0.66666667, -0.93333333, -0.91666667, ..., -0.75      ,
        -0.75      , -0.75      ],
       [-0.66666667, -0.93333333, -0.83333333, ..., -0.75      ,
        -0.75      , -0.75      ],
       [-0.66666667, -0.93333333, -0.75      , ..., -0.25      ,
        -0.75      , -0.75      ],
       ...,
       [ 0.        , -0.93333333, -0.83333333, ...,  0.25      ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.75      , ...,  0.25      ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.66666667, ...,  0.25      ,
         0.25      ,  0.25      ]])

In [51]:
X_test

array([[ 0.        , -0.93333333, -0.58333333, ...,  0.        ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.5       , ...,  0.        ,
         0.        ,  0.25      ],
       [ 0.        , -0.93333333, -0.41666667, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.5       ,  1.        ,  0.33333333, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.5       ,  1.        ,  0.41666667, ...,  0.5       ,
         0.        ,  0.        ],
       [ 0.5       ,  1.        ,  0.5       , ...,  0.25      ,
         0.5       ,  0.        ]])

### Default Parameters

In [52]:
training_model = RandomForestRegressor(random_state=42)
training_model

In [53]:
%%time
training_model.fit(X_train, y_train)

CPU times: user 9min 53s, sys: 450 ms, total: 9min 53s
Wall time: 9min 54s


In [54]:
y_pred = training_model.predict(X_test)
y_pred

array([[2.801660e+01, 2.805460e+01, 2.876600e+01, ..., 3.828136e+02,
        5.899274e+02, 7.666137e+02],
       [2.785110e+01, 2.861120e+01, 2.990390e+01, ..., 5.599537e+02,
        7.288701e+02, 8.445670e+02],
       [2.829170e+01, 2.958530e+01, 3.117650e+01, ..., 6.865952e+02,
        8.081852e+02, 8.433411e+02],
       ...,
       [3.167720e+01, 3.094010e+01, 3.004580e+01, ..., 6.196000e-01,
        3.000000e-04, 0.000000e+00],
       [3.143090e+01, 3.042600e+01, 2.992280e+01, ..., 3.000000e-04,
        0.000000e+00, 0.000000e+00],
       [3.017180e+01, 2.972660e+01, 2.949360e+01, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00]])

In [55]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [56]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [57]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 1625.0802990538205
MAE Score: 10.712886442388253
R2 Score: 0.5395144185729712
RMSE Score: 40.312284716371764


In [58]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,1.875901,1.029834,0.703004,1.369635
1,Temp_t+2,2.054798,1.075507,0.67467,1.433457
2,Temp_t+3,2.190932,1.10222,0.653122,1.48018
3,Temp_t+4,2.311782,1.13,0.633975,1.520455
4,Temp_t+5,2.376022,1.147149,0.623729,1.541435
5,Temp_t+6,2.447438,1.162124,0.612176,1.564429
6,Temp,2.209479,1.107806,0.650113,1.486432
7,FeelsLike_t+1,13.029369,2.917476,0.478363,3.609622
8,FeelsLike_t+2,13.71551,2.994056,0.450911,3.703446
9,FeelsLike_t+3,14.028857,3.018045,0.438359,3.745512


In [59]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "RobustScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 1625.080299, R²: 0.539514
💾 Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Randomized Grid Search

In [60]:
grid_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42), 
    grid_param,
    n_iter=number_of_model_for_randomized_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=core_to_use,
    verbose=1,
    random_state=42
)

In [61]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 12min 35s, sys: 9.81 s, total: 12min 45s
Wall time: 2h 26min 33s


In [62]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 6, 'min_impurity_decrease': 0.0, 'max_samples': 0.8, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
Best Cross-Validation Score (Negative MSE): -1117.4549552773517
Best Cross-Validation Score (MSE): 1117.4549552773517
Best Cross-Validation Score (RMSE): 33.428355557480714


In [63]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [64]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [65]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 1611.0999851864644
MAE Score: 10.778633336077654
R2 Score: 0.4804598977745771
RMSE Score: 40.13851000207238


In [66]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,2.519323,1.237062,0.601137,1.587238
1,Temp_t+2,2.618324,1.255645,0.585448,1.618124
2,Temp_t+3,2.694446,1.261359,0.573403,1.641477
3,Temp_t+4,2.785005,1.280215,0.559049,1.668833
4,Temp_t+5,2.807115,1.278878,0.555461,1.675445
5,Temp_t+6,2.852265,1.287236,0.548027,1.688865
6,Temp,2.712746,1.266733,0.570421,1.647042
7,FeelsLike_t+1,17.488306,3.423752,0.299847,4.181902
8,FeelsLike_t+2,17.801418,3.463304,0.287335,4.219173
9,FeelsLike_t+3,17.972927,3.463194,0.28046,4.239449


In [67]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "RobustScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 1611.099985, R²: 0.480460
💾 Registry updated: Model_Training_History/History_Regression.csv


# All Performance

In [68]:
show_model_history(model_type='regression', model_name=model_name_for_saving)


📊 REGRESSION Model Performance History
🔍 Filters Applied:
   • Model Name contains: 'Random Forest'
   • Target: 'Overall' only
----------------------------------------------------------------------------------------------------
              Model         Scaler  Target         MSE       MAE       R2      RMSE
      Random Forest StandardScaler Overall 1625.042997 10.713262 0.539724 40.311822
      Random Forest   MinMaxScaler Overall 1625.266692 10.712405 0.539646 40.314597
      Random Forest   RobustScaler Overall 1625.080299 10.712886 0.539514 40.312285
Random Forest Tuned   MinMaxScaler Overall 1610.972437 10.777828 0.480497 40.136921
Random Forest Tuned StandardScaler Overall 1610.988546 10.778194 0.480469 40.137122
Random Forest Tuned   RobustScaler Overall 1611.099985 10.778633 0.480460 40.138510

📈 Total models shown: 6
🏆 Best R² Score: 0.539724
    Model: Random Forest
    Scaler: StandardScaler
    Target: Overall
    MSE: 1625.042997


# <center><font size="50" color="red">Thank You</font></center>