# <font color="red">Forecasting with Decision Tree Regressor</font>

# Library Import

In [1]:
# Data manipulation and visualization
import pandas as pd
import math
import json
import multiprocessing

# Preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

import sys
# sys.path.append('../../') # Uncomment this line if running locally
sys.path.append('/kaggle/input/weatherforecasting') # Uncomment this line if running on Kaggle
from historyManagement import *

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Datasets Import

In [2]:
# Load the weather dataset for visibility prediction
# weather_data = pd.read_csv('../../FinalDatasets/finalDataset.csv') # Uncomment this line if running locally
weather_data = pd.read_csv('/kaggle/input/weatherforecasting/finalDataset.csv') # Uncomment this line if running on Kaggle
weather_data['DateTime'] = pd.to_datetime(weather_data['DateTime'])
weather_data.head()

Unnamed: 0,DateTime,Year,Month,Day,Hour,Dew,Precip,PrecipProb,Windgust,WindSpeed,...,Conditions_t+3,Conditions_t+4,Conditions_t+5,Conditions_t+6,Icon_t+1,Icon_t+2,Icon_t+3,Icon_t+4,Icon_t+5,Icon_t+6
0,2023-01-02 00:00:00,2023,1,2,0,15.19,0.0,0.0,10.31,0.24,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
1,2023-01-02 01:00:00,2023,1,2,1,14.72,0.0,0.0,9.72,0.21,...,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,4.0
2,2023-01-02 02:00:00,2023,1,2,2,14.72,0.0,0.0,10.66,0.25,...,0.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,4.0,4.0
3,2023-01-02 03:00:00,2023,1,2,3,14.23,0.0,0.0,9.91,0.33,...,0.0,2.0,2.0,2.0,1.0,1.0,1.0,4.0,4.0,4.0
4,2023-01-02 04:00:00,2023,1,2,4,14.72,0.0,0.0,10.12,0.2,...,2.0,2.0,2.0,2.0,1.0,1.0,4.0,4.0,4.0,4.0


In [3]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14586 entries, 0 to 14585
Columns: 575 entries, DateTime to Icon_t+6
dtypes: datetime64[ns](1), float64(568), int64(6)
memory usage: 64.0 MB


In [4]:
weather_data.describe()

Unnamed: 0,DateTime,Year,Month,Day,Hour,Dew,Precip,PrecipProb,Windgust,WindSpeed,...,Conditions_t+3,Conditions_t+4,Conditions_t+5,Conditions_t+6,Icon_t+1,Icon_t+2,Icon_t+3,Icon_t+4,Icon_t+5,Icon_t+6
count,14586,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,...,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0
mean,2023-11-01 20:30:00,2023.40107,5.727684,15.753599,11.496298,20.789805,0.165143,19.505391,17.075919,8.096791,...,2.044289,2.044358,2.044426,2.044495,3.795283,3.795352,3.79542,3.795489,3.795557,3.795626
min,2023-01-02 00:00:00,2023.0,1.0,1.0,0.0,2.02,0.0,0.0,0.66,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2023-06-02 22:15:00,2023.0,3.0,8.0,5.0,17.1,0.0,0.0,9.17,2.88,...,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,2023-11-01 20:30:00,2023.0,6.0,16.0,11.0,21.85,0.0,0.0,14.66,7.67,...,2.0,2.0,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,2024-04-01 18:45:00,2024.0,8.0,23.0,17.0,24.98,0.01,6.45,23.4175,11.75,...,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0
max,2024-08-31 17:00:00,2024.0,12.0,31.0,23.0,28.88,44.5,100.0,84.12,71.97,...,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0
std,,0.490132,3.192277,8.786893,6.921353,4.824925,0.844688,37.410656,10.200773,6.248622,...,1.567964,1.567897,1.567829,1.567761,1.986011,1.985931,1.985852,1.985773,1.985694,1.985614


# Data Split

In [5]:
# Split the dataset by index: first 80% for training, last 20% for testing (no shuffle, maintain serial order)
split_index = int(len(weather_data) * 0.8)
weather_data_train = weather_data.iloc[:split_index]
weather_data_test = weather_data.iloc[split_index:]

# Some Reused Parameters

In [6]:
grid_param = {
    'max_depth': [None, 5, 10, 15, 25],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None, 0.5],
    'criterion': ['squared_error', 'friedman_mse'],
    'min_impurity_decrease': [0.0, 0.01, 0.02],
    'ccp_alpha': [0.0, 0.01, 0.02]
}

In [7]:
number_of_model_for_randomized_grid = 3600
model_name_for_saving = "Decision Tree"
horizon = 6
# core_to_use = max(1, multiprocessing.cpu_count() - 2) # uncomment if you want to use your laptop while training
core_to_use = -1 # uncomment if you want to use all available cores

columns_with_t_plus = [col for col in weather_data.columns if '_t+' in col]
column_to_exclude = columns_with_t_plus + ['DateTime', 'Year']

column_to_predict = [col for col in weather_data.columns if '_t+' in col]
# Remove columns containing 'Conditions' or 'Icon' from column_to_predict
column_to_predict = [col for col in column_to_predict if 'Conditions' not in col and 'Icon' not in col]

print(column_to_exclude)
print(column_to_predict)
print("Core to use:", core_to_use)

['Temp_t+1', 'Temp_t+2', 'Temp_t+3', 'Temp_t+4', 'Temp_t+5', 'Temp_t+6', 'FeelsLike_t+1', 'FeelsLike_t+2', 'FeelsLike_t+3', 'FeelsLike_t+4', 'FeelsLike_t+5', 'FeelsLike_t+6', 'Humidity_t+1', 'Humidity_t+2', 'Humidity_t+3', 'Humidity_t+4', 'Humidity_t+5', 'Humidity_t+6', 'Visibility_t+1', 'Visibility_t+2', 'Visibility_t+3', 'Visibility_t+4', 'Visibility_t+5', 'Visibility_t+6', 'SolarEnergy_t+1', 'SolarEnergy_t+2', 'SolarEnergy_t+3', 'SolarEnergy_t+4', 'SolarEnergy_t+5', 'SolarEnergy_t+6', 'SolarRadiation_t+1', 'SolarRadiation_t+2', 'SolarRadiation_t+3', 'SolarRadiation_t+4', 'SolarRadiation_t+5', 'SolarRadiation_t+6', 'Conditions_t+1', 'Conditions_t+2', 'Conditions_t+3', 'Conditions_t+4', 'Conditions_t+5', 'Conditions_t+6', 'Icon_t+1', 'Icon_t+2', 'Icon_t+3', 'Icon_t+4', 'Icon_t+5', 'Icon_t+6', 'DateTime', 'Year']
['Temp_t+1', 'Temp_t+2', 'Temp_t+3', 'Temp_t+4', 'Temp_t+5', 'Temp_t+6', 'FeelsLike_t+1', 'FeelsLike_t+2', 'FeelsLike_t+3', 'FeelsLike_t+4', 'FeelsLike_t+5', 'FeelsLike_t+6', 

# All Features

In [8]:
X_train_raw = weather_data_train.drop(columns=column_to_exclude)
y_train = weather_data_train[column_to_predict]

In [9]:
X_test_raw = weather_data_test.drop(columns=column_to_exclude)
y_test = weather_data_test[column_to_predict]

In [10]:
print("X_train: ", len(X_train_raw))
print("y_train: ", len(y_train))
print("\nX_test: ", len(X_test_raw))
print("y_test: ", len(y_test))

X_train:  11668
y_train:  11668

X_test:  2918
y_test:  2918


## MinMax Scaler

In [11]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [12]:
X_train

array([[0.        , 0.03333333, 0.        , ..., 0.16666667, 0.16666667,
        0.16666667],
       [0.        , 0.03333333, 0.04347826, ..., 0.16666667, 0.16666667,
        0.16666667],
       [0.        , 0.03333333, 0.08695652, ..., 0.5       , 0.16666667,
        0.16666667],
       ...,
       [0.36363636, 0.03333333, 0.04347826, ..., 0.83333333, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.08695652, ..., 0.83333333, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.13043478, ..., 0.83333333, 0.83333333,
        0.83333333]])

In [13]:
X_test

array([[0.36363636, 0.03333333, 0.17391304, ..., 0.66666667, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.2173913 , ..., 0.66666667, 0.66666667,
        0.83333333],
       [0.36363636, 0.03333333, 0.26086957, ..., 0.66666667, 0.66666667,
        0.66666667],
       ...,
       [0.63636364, 1.        , 0.65217391, ..., 0.66666667, 0.66666667,
        0.66666667],
       [0.63636364, 1.        , 0.69565217, ..., 1.        , 0.66666667,
        0.66666667],
       [0.63636364, 1.        , 0.73913043, ..., 0.83333333, 1.        ,
        0.66666667]])

### Default Parameters

In [14]:
training_model = DecisionTreeRegressor(random_state=42)
training_model

In [15]:
%%time
training_model.fit(X_train, y_train)

CPU times: user 9.73 s, sys: 7.61 ms, total: 9.74 s
Wall time: 9.74 s


In [16]:
y_pred = training_model.predict(X_test)
y_pred

array([[2.89000e+01, 2.93400e+01, 3.00700e+01, ..., 4.15550e+02,
        6.15770e+02, 7.77710e+02],
       [2.81700e+01, 2.89400e+01, 3.00800e+01, ..., 6.32100e+02,
        8.16900e+02, 9.48710e+02],
       [3.09000e+01, 3.19500e+01, 3.26900e+01, ..., 7.99710e+02,
        9.39550e+02, 1.00855e+03],
       ...,
       [2.90500e+01, 2.90000e+01, 2.85200e+01, ..., 6.00000e-02,
        0.00000e+00, 0.00000e+00],
       [3.31500e+01, 3.29500e+01, 3.19500e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.08400e+01, 2.99800e+01, 2.99600e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [17]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [18]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [19]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 3858.6751169741474
MAE Score: 15.152063532861165
R2 Score: 0.20596125336008775
RMSE Score: 62.11823497954644


In [20]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,5.284242,1.579044,0.16339,2.298748
1,Temp_t+2,5.555499,1.63599,0.120413,2.357011
2,Temp_t+3,5.820315,1.67962,0.078502,2.412533
3,Temp_t+4,5.943772,1.708639,0.058921,2.437985
4,Temp_t+5,6.081758,1.721097,0.036883,2.466122
5,Temp_t+6,6.440266,1.771532,-0.020532,2.537768
6,Temp,5.854309,1.682654,0.07293,2.419568
7,FeelsLike_t+1,25.32065,3.736566,-0.013724,5.031963
8,FeelsLike_t+2,26.604849,3.823482,-0.065104,5.157989
9,FeelsLike_t+3,26.932345,3.853314,-0.078228,5.189638


In [21]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "MinMaxScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 3858.675117, R²: 0.205961
💾 Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Grid Search

In [22]:
grid_search = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42), 
    grid_param,
    n_iter=number_of_model_for_randomized_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=core_to_use,
    verbose=1,
    random_state=42
)

In [23]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 3600 candidates, totalling 18000 fits
CPU times: user 1min 20s, sys: 27.1 s, total: 1min 47s
Wall time: 3h 25min 23s


In [24]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.0, 'max_features': 0.5, 'max_depth': 10, 'criterion': 'squared_error', 'ccp_alpha': 0.01}
Best Cross-Validation Score (Negative MSE): -1538.8248422239342
Best Cross-Validation Score (MSE): 1538.8248422239342
Best Cross-Validation Score (RMSE): 39.2278579866902


In [25]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [26]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [27]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 2705.0079773277635
MAE Score: 13.378250841171914
R2 Score: 0.27984613138572334
RMSE Score: 52.00969118662178


In [28]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,5.736973,1.770456,0.091713,2.395198
1,Temp_t+2,5.68044,1.783974,0.100632,2.383367
2,Temp_t+3,5.576978,1.781816,0.117028,2.361563
3,Temp_t+4,5.583891,1.787134,0.115901,2.363026
4,Temp_t+5,5.333757,1.75359,0.155338,2.309493
5,Temp_t+6,5.292043,1.74879,0.161417,2.300444
6,Temp,5.534014,1.77096,0.123671,2.352448
7,FeelsLike_t+1,27.502741,4.23148,-0.101085,5.244306
8,FeelsLike_t+2,28.004495,4.288618,-0.121137,5.291927
9,FeelsLike_t+3,28.389497,4.314014,-0.136564,5.32818


In [29]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "MinMaxScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 2705.007977, R²: 0.279846
💾 Registry updated: Model_Training_History/History_Regression.csv


## Standard Scaler

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [31]:
X_train

array([[-1.29540078, -1.55866853, -1.66051296, ..., -1.24648639,
        -1.24626261, -1.24603889],
       [-1.29540078, -1.55866853, -1.51607747, ..., -1.24648639,
        -1.24626261, -1.24603889],
       [-1.29540078, -1.55866853, -1.37164198, ..., -0.26491096,
        -1.24626261, -1.24603889],
       ...,
       [-0.15173127, -1.55866853, -1.51607747, ...,  0.71666448,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -1.37164198, ...,  0.71666448,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -1.2272065 , ...,  0.71666448,
         0.71680079,  0.71693711]])

In [32]:
X_test

array([[-0.15173127, -1.55866853, -1.08277101, ...,  0.22587676,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -0.93833552, ...,  0.22587676,
         0.22603494,  0.71693711],
       [-0.15173127, -1.55866853, -0.79390003, ...,  0.22587676,
         0.22603494,  0.22619311],
       ...,
       [ 0.70602087,  1.74193343,  0.50601936, ...,  0.22587676,
         0.22603494,  0.22619311],
       [ 0.70602087,  1.74193343,  0.65045485, ...,  1.2074522 ,
         0.22603494,  0.22619311],
       [ 0.70602087,  1.74193343,  0.79489033, ...,  0.71666448,
         1.20756664,  0.22619311]])

### Default Parameters

In [33]:
training_model = DecisionTreeRegressor(random_state=42)
training_model

In [34]:
%%time
training_model.fit(X_train, y_train)

CPU times: user 9.51 s, sys: 0 ns, total: 9.51 s
Wall time: 9.51 s


In [35]:
y_pred = training_model.predict(X_test)
y_pred

array([[2.89000e+01, 2.93400e+01, 3.00700e+01, ..., 4.15550e+02,
        6.15770e+02, 7.77710e+02],
       [2.81700e+01, 2.89400e+01, 3.00800e+01, ..., 6.32100e+02,
        8.16900e+02, 9.48710e+02],
       [3.09000e+01, 3.19500e+01, 3.26900e+01, ..., 7.99710e+02,
        9.39550e+02, 1.00855e+03],
       ...,
       [2.90500e+01, 2.90000e+01, 2.85200e+01, ..., 6.00000e-02,
        0.00000e+00, 0.00000e+00],
       [3.31500e+01, 3.29500e+01, 3.19500e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.08400e+01, 2.99800e+01, 2.99600e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [36]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [37]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [38]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 3852.048725936717
MAE Score: 15.1457847840987
R2 Score: 0.20680656406295905
RMSE Score: 62.0648751383318


In [39]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,5.295156,1.581206,0.161662,2.301121
1,Temp_t+2,5.558349,1.636114,0.119962,2.357615
2,Temp_t+3,5.823461,1.678163,0.078004,2.413185
3,Temp_t+4,5.937007,1.707066,0.059992,2.436597
4,Temp_t+5,6.09405,1.72196,0.034937,2.468613
5,Temp_t+6,6.440588,1.771501,-0.020583,2.537831
6,Temp,5.858102,1.682668,0.072329,2.420352
7,FeelsLike_t+1,25.304029,3.732858,-0.013059,5.030311
8,FeelsLike_t+2,26.565635,3.816847,-0.063534,5.154186
9,FeelsLike_t+3,26.887415,3.84331,-0.076429,5.185308


In [40]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "StandardScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 3852.048726, R²: 0.206807
💾 Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Grid Search

In [41]:
grid_search = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42), 
    grid_param,
    n_iter=number_of_model_for_randomized_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=core_to_use,
    verbose=1,
    random_state=42
)

In [42]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 3600 candidates, totalling 18000 fits
CPU times: user 1min 17s, sys: 26.6 s, total: 1min 43s
Wall time: 3h 23min 44s


In [43]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.0, 'max_features': 0.5, 'max_depth': 10, 'criterion': 'squared_error', 'ccp_alpha': 0.01}
Best Cross-Validation Score (Negative MSE): -1538.8539707044165
Best Cross-Validation Score (MSE): 1538.8539707044165
Best Cross-Validation Score (RMSE): 39.228229257824225


In [44]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [45]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [46]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 2705.316789392663
MAE Score: 13.380298681522449
R2 Score: 0.2796705104796413
RMSE Score: 52.012659895381844


In [47]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,5.735326,1.770002,0.091974,2.394854
1,Temp_t+2,5.678709,1.783757,0.100906,2.383004
2,Temp_t+3,5.574567,1.781472,0.11741,2.361052
3,Temp_t+4,5.581789,1.786501,0.116234,2.362581
4,Temp_t+5,5.332604,1.753147,0.15552,2.309243
5,Temp_t+6,5.290617,1.748203,0.161643,2.300134
6,Temp,5.532269,1.770514,0.123948,2.352078
7,FeelsLike_t+1,27.509628,4.233365,-0.101361,5.244962
8,FeelsLike_t+2,28.006056,4.288986,-0.1212,5.292075
9,FeelsLike_t+3,28.388,4.314082,-0.136505,5.328039


In [48]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "StandardScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 2705.316789, R²: 0.279671
💾 Registry updated: Model_Training_History/History_Regression.csv


## Robust Scaler

In [49]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [50]:
X_train

array([[-0.66666667, -0.93333333, -0.91666667, ..., -0.75      ,
        -0.75      , -0.75      ],
       [-0.66666667, -0.93333333, -0.83333333, ..., -0.75      ,
        -0.75      , -0.75      ],
       [-0.66666667, -0.93333333, -0.75      , ..., -0.25      ,
        -0.75      , -0.75      ],
       ...,
       [ 0.        , -0.93333333, -0.83333333, ...,  0.25      ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.75      , ...,  0.25      ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.66666667, ...,  0.25      ,
         0.25      ,  0.25      ]])

In [51]:
X_test

array([[ 0.        , -0.93333333, -0.58333333, ...,  0.        ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.5       , ...,  0.        ,
         0.        ,  0.25      ],
       [ 0.        , -0.93333333, -0.41666667, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.5       ,  1.        ,  0.33333333, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.5       ,  1.        ,  0.41666667, ...,  0.5       ,
         0.        ,  0.        ],
       [ 0.5       ,  1.        ,  0.5       , ...,  0.25      ,
         0.5       ,  0.        ]])

### Default Parameters

In [52]:
training_model = DecisionTreeRegressor(random_state=42)
training_model

In [53]:
%%time
training_model.fit(X_train, y_train)

CPU times: user 9.5 s, sys: 0 ns, total: 9.5 s
Wall time: 9.5 s


In [54]:
y_pred = training_model.predict(X_test)
y_pred

array([[2.89000e+01, 2.93400e+01, 3.00700e+01, ..., 4.15550e+02,
        6.15770e+02, 7.77710e+02],
       [2.81700e+01, 2.89400e+01, 3.00800e+01, ..., 6.32100e+02,
        8.16900e+02, 9.48710e+02],
       [3.09000e+01, 3.19500e+01, 3.26900e+01, ..., 7.99710e+02,
        9.39550e+02, 1.00855e+03],
       ...,
       [2.90500e+01, 2.90000e+01, 2.85200e+01, ..., 6.00000e-02,
        0.00000e+00, 0.00000e+00],
       [3.31500e+01, 3.29500e+01, 3.19500e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.08400e+01, 2.99800e+01, 2.99600e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [55]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [56]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [57]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 3853.7002826098565
MAE Score: 15.1456745487777
R2 Score: 0.20802393393041296
RMSE Score: 62.07817879585271


In [58]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,5.280756,1.578756,0.163942,2.29799
1,Temp_t+2,5.547633,1.63526,0.121659,2.355341
2,Temp_t+3,5.811506,1.678458,0.079897,2.410707
3,Temp_t+4,5.927846,1.705689,0.061442,2.434717
4,Temp_t+5,6.072603,1.718739,0.038333,2.464265
5,Temp_t+6,6.428213,1.768883,-0.018622,2.535392
6,Temp,5.844759,1.680964,0.074442,2.417594
7,FeelsLike_t+1,25.234099,3.726953,-0.010259,5.023355
8,FeelsLike_t+2,26.507996,3.815298,-0.061226,5.148592
9,FeelsLike_t+3,26.822381,3.842324,-0.073825,5.179033


In [59]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "RobustScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 3853.700283, R²: 0.208024
💾 Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Grid Search

In [60]:
grid_search = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42), 
    grid_param,
    n_iter=number_of_model_for_randomized_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=core_to_use,
    verbose=1,
    random_state=42
)

In [61]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 3600 candidates, totalling 18000 fits
CPU times: user 1min 15s, sys: 25.7 s, total: 1min 41s
Wall time: 3h 22min 32s


In [62]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.0, 'max_features': 0.5, 'max_depth': 10, 'criterion': 'squared_error', 'ccp_alpha': 0.01}
Best Cross-Validation Score (Negative MSE): -1537.3474647233995
Best Cross-Validation Score (MSE): 1537.3474647233995
Best Cross-Validation Score (RMSE): 39.209022746345


In [63]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [64]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [65]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 2705.6112029726883
MAE Score: 13.382557141640595
R2 Score: 0.2797887325892947
RMSE Score: 52.01549002915082


In [66]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,5.736607,1.770583,0.091771,2.395122
1,Temp_t+2,5.679014,1.783789,0.100858,2.383068
2,Temp_t+3,5.571214,1.780631,0.117941,2.360342
3,Temp_t+4,5.578013,1.785915,0.116831,2.361782
4,Temp_t+5,5.331469,1.752868,0.1557,2.308997
5,Temp_t+6,5.29079,1.748426,0.161615,2.300172
6,Temp,5.531185,1.770369,0.124119,2.351847
7,FeelsLike_t+1,27.520072,4.234298,-0.101779,5.245958
8,FeelsLike_t+2,28.005767,4.288628,-0.121188,5.292048
9,FeelsLike_t+3,28.365117,4.311916,-0.135588,5.325891


In [67]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "RobustScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

✨ New regression model added:
   MSE: 2705.611203, R²: 0.279789
💾 Registry updated: Model_Training_History/History_Regression.csv


# All Performance

In [68]:
show_model_history(model_type='regression', model_name=model_name_for_saving)


📊 REGRESSION Model Performance History
🔍 Filters Applied:
   • Model Name contains: 'Decision Tree'
   • Target: 'Overall' only
----------------------------------------------------------------------------------------------------
              Model         Scaler  Target         MSE       MAE       R2      RMSE
Decision Tree Tuned   MinMaxScaler Overall 2705.007977 13.378251 0.279846 52.009691
Decision Tree Tuned   RobustScaler Overall 2705.611203 13.382557 0.279789 52.015490
Decision Tree Tuned StandardScaler Overall 2705.316789 13.380299 0.279671 52.012660
      Decision Tree   RobustScaler Overall 3853.700283 15.145675 0.208024 62.078179
      Decision Tree StandardScaler Overall 3852.048726 15.145785 0.206807 62.064875
      Decision Tree   MinMaxScaler Overall 3858.675117 15.152064 0.205961 62.118235

📈 Total models shown: 6
🏆 Best R² Score: 0.279846
    Model: Decision Tree Tuned
    Scaler: MinMaxScaler
    Target: Overall
    MSE: 2705.007977


# <center><font size="50" color="red">Thank You</font></center>