# <font color="red">Forecasting with Linear Regression</font>

# Library Import

In [1]:
# Data manipulation and visualization
import pandas as pd
import math
import json

# Preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

import sys
sys.path.append('../../') # Uncomment this line if running locally
# sys.path.append('/kaggle/input/weatherdata') # Uncomment this line if running on Kaggle
from historyManagement import *

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Datasets Import

In [2]:
# Load the weather dataset for visibility prediction
weather_data = pd.read_csv('../../FinalDatasets/finalDataset.csv') # Uncomment this line if running locally
# weather_data = pd.read_csv('/kaggle/input/weatherdata/finalDataset.csv') # Uncomment this line if running on Kaggle
weather_data['DateTime'] = pd.to_datetime(weather_data['DateTime'])
weather_data.head()

Unnamed: 0,DateTime,Year,Month,Day,Hour,Dew,Precip,PrecipProb,Windgust,WindSpeed,...,Conditions_t+3,Conditions_t+4,Conditions_t+5,Conditions_t+6,Icon_t+1,Icon_t+2,Icon_t+3,Icon_t+4,Icon_t+5,Icon_t+6
0,2023-01-02 00:00:00,2023,1,2,0,15.19,0.0,0.0,10.31,0.24,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
1,2023-01-02 01:00:00,2023,1,2,1,14.72,0.0,0.0,9.72,0.21,...,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,4.0
2,2023-01-02 02:00:00,2023,1,2,2,14.72,0.0,0.0,10.66,0.25,...,0.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,4.0,4.0
3,2023-01-02 03:00:00,2023,1,2,3,14.23,0.0,0.0,9.91,0.33,...,0.0,2.0,2.0,2.0,1.0,1.0,1.0,4.0,4.0,4.0
4,2023-01-02 04:00:00,2023,1,2,4,14.72,0.0,0.0,10.12,0.2,...,2.0,2.0,2.0,2.0,1.0,1.0,4.0,4.0,4.0,4.0


In [3]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14586 entries, 0 to 14585
Columns: 575 entries, DateTime to Icon_t+6
dtypes: datetime64[ns](1), float64(568), int64(6)
memory usage: 64.0 MB


In [4]:
weather_data.describe()

Unnamed: 0,DateTime,Year,Month,Day,Hour,Dew,Precip,PrecipProb,Windgust,WindSpeed,...,Conditions_t+3,Conditions_t+4,Conditions_t+5,Conditions_t+6,Icon_t+1,Icon_t+2,Icon_t+3,Icon_t+4,Icon_t+5,Icon_t+6
count,14586,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,...,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0,14586.0
mean,2023-11-01 20:30:00,2023.40107,5.727684,15.753599,11.496298,20.789805,0.165143,19.505391,17.075919,8.096791,...,2.044289,2.044358,2.044426,2.044495,3.795283,3.795352,3.79542,3.795489,3.795557,3.795626
min,2023-01-02 00:00:00,2023.0,1.0,1.0,0.0,2.02,0.0,0.0,0.66,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2023-06-02 22:15:00,2023.0,3.0,8.0,5.0,17.1,0.0,0.0,9.17,2.88,...,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,2023-11-01 20:30:00,2023.0,6.0,16.0,11.0,21.85,0.0,0.0,14.66,7.67,...,2.0,2.0,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,2024-04-01 18:45:00,2024.0,8.0,23.0,17.0,24.98,0.01,6.45,23.4175,11.75,...,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0
max,2024-08-31 17:00:00,2024.0,12.0,31.0,23.0,28.88,44.5,100.0,84.12,71.97,...,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0
std,,0.490132,3.192277,8.786893,6.921353,4.824925,0.844688,37.410656,10.200773,6.248622,...,1.567964,1.567897,1.567829,1.567761,1.986011,1.985931,1.985852,1.985773,1.985694,1.985614


# Data Split

In [5]:
# Split the dataset by index: first 80% for training, last 20% for testing (no shuffle, maintain serial order)
split_index = int(len(weather_data) * 0.8)
weather_data_train = weather_data.iloc[:split_index]
weather_data_test = weather_data.iloc[split_index:]

# Some Reused Parameters

In [6]:
grid_param = {
    'fit_intercept': [True, False],
    'positive': [True, False]
}

In [7]:
number_of_model_for_randomized_grid = 500
model_name_for_saving = "Linear Regression"
horizon = 6

columns_with_t_plus = [col for col in weather_data.columns if '_t+' in col]
column_to_exclude = columns_with_t_plus + ['DateTime', 'Year']

column_to_predict = [col for col in weather_data.columns if '_t+' in col]
# Remove columns containing 'Conditions' or 'Icon' from column_to_predict
column_to_predict = [col for col in column_to_predict if 'Conditions' not in col and 'Icon' not in col]

print(column_to_exclude)
print(column_to_predict)

['Temp_t+1', 'Temp_t+2', 'Temp_t+3', 'Temp_t+4', 'Temp_t+5', 'Temp_t+6', 'FeelsLike_t+1', 'FeelsLike_t+2', 'FeelsLike_t+3', 'FeelsLike_t+4', 'FeelsLike_t+5', 'FeelsLike_t+6', 'Humidity_t+1', 'Humidity_t+2', 'Humidity_t+3', 'Humidity_t+4', 'Humidity_t+5', 'Humidity_t+6', 'Visibility_t+1', 'Visibility_t+2', 'Visibility_t+3', 'Visibility_t+4', 'Visibility_t+5', 'Visibility_t+6', 'SolarEnergy_t+1', 'SolarEnergy_t+2', 'SolarEnergy_t+3', 'SolarEnergy_t+4', 'SolarEnergy_t+5', 'SolarEnergy_t+6', 'SolarRadiation_t+1', 'SolarRadiation_t+2', 'SolarRadiation_t+3', 'SolarRadiation_t+4', 'SolarRadiation_t+5', 'SolarRadiation_t+6', 'Conditions_t+1', 'Conditions_t+2', 'Conditions_t+3', 'Conditions_t+4', 'Conditions_t+5', 'Conditions_t+6', 'Icon_t+1', 'Icon_t+2', 'Icon_t+3', 'Icon_t+4', 'Icon_t+5', 'Icon_t+6', 'DateTime', 'Year']
['Temp_t+1', 'Temp_t+2', 'Temp_t+3', 'Temp_t+4', 'Temp_t+5', 'Temp_t+6', 'FeelsLike_t+1', 'FeelsLike_t+2', 'FeelsLike_t+3', 'FeelsLike_t+4', 'FeelsLike_t+5', 'FeelsLike_t+6', 

# All Features

In [8]:
X_train_raw = weather_data_train.drop(columns=column_to_exclude)
y_train = weather_data_train[column_to_predict]

In [9]:
X_test_raw = weather_data_test.drop(columns=column_to_exclude)
y_test = weather_data_test[column_to_predict]

In [10]:
print("X_train: ", len(X_train_raw))
print("y_train: ", len(y_train))
print("\nX_test: ", len(X_test_raw))
print("y_test: ", len(y_test))

X_train:  11668
y_train:  11668

X_test:  2918
y_test:  2918


## MinMax Scaler

In [11]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [12]:
X_train

array([[0.        , 0.03333333, 0.        , ..., 0.16666667, 0.16666667,
        0.16666667],
       [0.        , 0.03333333, 0.04347826, ..., 0.16666667, 0.16666667,
        0.16666667],
       [0.        , 0.03333333, 0.08695652, ..., 0.5       , 0.16666667,
        0.16666667],
       ...,
       [0.36363636, 0.03333333, 0.04347826, ..., 0.83333333, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.08695652, ..., 0.83333333, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.13043478, ..., 0.83333333, 0.83333333,
        0.83333333]], shape=(11668, 525))

In [13]:
X_test

array([[0.36363636, 0.03333333, 0.17391304, ..., 0.66666667, 0.83333333,
        0.83333333],
       [0.36363636, 0.03333333, 0.2173913 , ..., 0.66666667, 0.66666667,
        0.83333333],
       [0.36363636, 0.03333333, 0.26086957, ..., 0.66666667, 0.66666667,
        0.66666667],
       ...,
       [0.63636364, 1.        , 0.65217391, ..., 0.66666667, 0.66666667,
        0.66666667],
       [0.63636364, 1.        , 0.69565217, ..., 1.        , 0.66666667,
        0.66666667],
       [0.63636364, 1.        , 0.73913043, ..., 0.83333333, 1.        ,
        0.66666667]], shape=(2918, 525))

### Default Parameters

In [14]:
training_model = LinearRegression()
training_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [15]:
%%time
training_model.fit(X_train, y_train)

CPU times: total: 1.17 s
Wall time: 380 ms


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [16]:
y_pred = training_model.predict(X_test)
y_pred

array([[ 29.02777413,  28.93167003,  29.5492641 , ..., 375.13327733,
        579.40726406, 800.40536381],
       [ 28.8415935 ,  29.41765613,  31.20762985, ..., 581.62178053,
        805.2357757 , 938.03962627],
       [ 29.87290259,  31.49440983,  33.84177335, ..., 795.32998083,
        926.89848905, 922.85891292],
       ...,
       [ 32.87063822,  32.64674879,  32.04905329, ..., 139.2448738 ,
        131.48384183, 126.11629848],
       [ 32.07503806,  31.56284557,  31.40609752, ..., 134.78953361,
        133.80416859, 103.99651278],
       [ 30.92533288,  30.89612454,  31.14368979, ..., 136.3536927 ,
        117.72055681, 126.07272464]], shape=(2918, 36))

In [17]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [18]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [19]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 1894.056931953848
MAE Score: 12.610107886888349
R2 Score: 0.7107453441342233
RMSE Score: 43.52076437694825


In [20]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,0.688806,0.57419,0.890947,0.829943
1,Temp_t+2,1.116152,0.728973,0.823283,1.056481
2,Temp_t+3,1.434547,0.819701,0.772876,1.197726
3,Temp_t+4,1.820589,0.935615,0.711746,1.349292
4,Temp_t+5,2.095911,1.008809,0.668088,1.447726
5,Temp_t+6,2.285337,1.0598,0.637863,1.511733
6,Temp,1.573557,0.854515,0.7508,1.254415
7,FeelsLike_t+1,3.7179,1.341853,0.851152,1.928186
8,FeelsLike_t+2,5.884714,1.697342,0.76441,2.425843
9,FeelsLike_t+3,7.291214,1.875807,0.708099,2.700225


In [21]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "MinMaxScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

‚ú® New regression model added:
   MSE: 1894.056932, R¬≤: 0.710745
üíæ Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Grid Search

In [22]:
grid_search = GridSearchCV(
    LinearRegression(), 
    grid_param, 
    cv=5, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1, # allow parallel processing, you cant use your own laptop while this runs
    verbose=1
)

In [23]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
CPU times: total: 1min 49s
Wall time: 3min 46s


0,1,2
,estimator,LinearRegression()
,param_grid,"{'fit_intercept': [True, False], 'positive': [True, False]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,True


In [24]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'fit_intercept': True, 'positive': True}
Best Cross-Validation Score (Negative MSE): -1258.5895097862162
Best Cross-Validation Score (MSE): 1258.5895097862162
Best Cross-Validation Score (RMSE): 35.476605105142404


In [25]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [26]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [27]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 2038.8765380801383
MAE Score: 11.981640682067352
R2 Score: 0.7015248600033489
RMSE Score: 45.15392051727223


In [28]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,0.763427,0.603215,0.879133,0.873743
1,Temp_t+2,1.283501,0.772036,0.796787,1.132917
2,Temp_t+3,1.679863,0.869595,0.734037,1.296095
3,Temp_t+4,2.156949,1.004648,0.65849,1.468656
4,Temp_t+5,2.512252,1.083491,0.602156,1.585009
5,Temp_t+6,2.756052,1.136103,0.563273,1.660136
6,Temp,1.858674,0.911515,0.705646,1.363332
7,FeelsLike_t+1,3.716938,1.332868,0.851191,1.927936
8,FeelsLike_t+2,5.926595,1.710116,0.762734,2.43446
9,FeelsLike_t+3,7.370851,1.912966,0.70491,2.714931


In [29]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "MinMaxScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

‚ú® New regression model added:
   MSE: 2038.876538, R¬≤: 0.701525
üíæ Registry updated: Model_Training_History/History_Regression.csv


## Standard Scaler

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [31]:
X_train

array([[-1.29540078, -1.55866853, -1.66051296, ..., -1.24648639,
        -1.24626261, -1.24603889],
       [-1.29540078, -1.55866853, -1.51607747, ..., -1.24648639,
        -1.24626261, -1.24603889],
       [-1.29540078, -1.55866853, -1.37164198, ..., -0.26491096,
        -1.24626261, -1.24603889],
       ...,
       [-0.15173127, -1.55866853, -1.51607747, ...,  0.71666448,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -1.37164198, ...,  0.71666448,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -1.2272065 , ...,  0.71666448,
         0.71680079,  0.71693711]], shape=(11668, 525))

In [32]:
X_test

array([[-0.15173127, -1.55866853, -1.08277101, ...,  0.22587676,
         0.71680079,  0.71693711],
       [-0.15173127, -1.55866853, -0.93833552, ...,  0.22587676,
         0.22603494,  0.71693711],
       [-0.15173127, -1.55866853, -0.79390003, ...,  0.22587676,
         0.22603494,  0.22619311],
       ...,
       [ 0.70602087,  1.74193343,  0.50601936, ...,  0.22587676,
         0.22603494,  0.22619311],
       [ 0.70602087,  1.74193343,  0.65045485, ...,  1.2074522 ,
         0.22603494,  0.22619311],
       [ 0.70602087,  1.74193343,  0.79489033, ...,  0.71666448,
         1.20756664,  0.22619311]], shape=(2918, 525))

### Default Parameters

In [33]:
training_model = LinearRegression()
training_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [34]:
%%time
training_model.fit(X_train, y_train)

CPU times: total: 1.14 s
Wall time: 379 ms


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [35]:
y_pred = training_model.predict(X_test)
y_pred

array([[ 29.02777413,  28.93167003,  29.5492641 , ..., 375.13327733,
        579.40726406, 800.40536381],
       [ 28.8415935 ,  29.41765613,  31.20762985, ..., 581.62178053,
        805.2357757 , 938.03962627],
       [ 29.87290259,  31.49440983,  33.84177335, ..., 795.32998083,
        926.89848905, 922.85891292],
       ...,
       [ 32.87063822,  32.64674879,  32.04905329, ..., 139.2448738 ,
        131.48384183, 126.11629848],
       [ 32.07503806,  31.56284557,  31.40609752, ..., 134.78953361,
        133.80416859, 103.99651278],
       [ 30.92533288,  30.89612454,  31.14368979, ..., 136.3536927 ,
        117.72055681, 126.07272464]], shape=(2918, 36))

In [36]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [37]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [38]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 1894.0569319538517
MAE Score: 12.610107886888384
R2 Score: 0.7107453441342225
RMSE Score: 43.52076437694829


In [39]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,0.688806,0.57419,0.890947,0.829943
1,Temp_t+2,1.116152,0.728973,0.823283,1.056481
2,Temp_t+3,1.434547,0.819701,0.772876,1.197726
3,Temp_t+4,1.820589,0.935615,0.711746,1.349292
4,Temp_t+5,2.095911,1.008809,0.668088,1.447726
5,Temp_t+6,2.285337,1.0598,0.637863,1.511733
6,Temp,1.573557,0.854515,0.7508,1.254415
7,FeelsLike_t+1,3.7179,1.341853,0.851152,1.928186
8,FeelsLike_t+2,5.884714,1.697342,0.76441,2.425843
9,FeelsLike_t+3,7.291214,1.875807,0.708099,2.700225


In [40]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "StandardScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

‚ú® New regression model added:
   MSE: 1894.056932, R¬≤: 0.710745
üíæ Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Grid Search

In [41]:
grid_param = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'positive': [True, False]
}

In [42]:
grid_search = GridSearchCV(
    LinearRegression(), 
    grid_param, 
    cv=5, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1, # allow parallel processing, you cant use your own laptop while this runs
    verbose=1
)

In [43]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: total: 1min 49s
Wall time: 7min 3s


0,1,2
,estimator,LinearRegression()
,param_grid,"{'copy_X': [True, False], 'fit_intercept': [True, False], 'positive': [True, False]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,True


In [44]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'copy_X': True, 'fit_intercept': True, 'positive': True}
Best Cross-Validation Score (Negative MSE): -1258.5650517817562
Best Cross-Validation Score (MSE): 1258.5650517817562
Best Cross-Validation Score (RMSE): 35.476260397366524


In [45]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [46]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [47]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 2038.8581171250528
MAE Score: 11.981485773221895
R2 Score: 0.7015245951055283
RMSE Score: 45.15371653723592


In [48]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,0.763427,0.603215,0.879133,0.873743
1,Temp_t+2,1.283501,0.772036,0.796787,1.132917
2,Temp_t+3,1.679863,0.869595,0.734037,1.296095
3,Temp_t+4,2.156949,1.004648,0.65849,1.468656
4,Temp_t+5,2.512252,1.083491,0.602156,1.585009
5,Temp_t+6,2.756052,1.136103,0.563273,1.660136
6,Temp,1.858674,0.911515,0.705646,1.363332
7,FeelsLike_t+1,3.716938,1.332868,0.851191,1.927936
8,FeelsLike_t+2,5.926595,1.710116,0.762734,2.43446
9,FeelsLike_t+3,7.370851,1.912966,0.70491,2.714931


In [49]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "StandardScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

‚ú® New regression model added:
   MSE: 2038.858117, R¬≤: 0.701525
üíæ Registry updated: Model_Training_History/History_Regression.csv


## Robust Scaler

In [50]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [51]:
X_train

array([[-0.66666667, -0.93333333, -0.91666667, ..., -0.75      ,
        -0.75      , -0.75      ],
       [-0.66666667, -0.93333333, -0.83333333, ..., -0.75      ,
        -0.75      , -0.75      ],
       [-0.66666667, -0.93333333, -0.75      , ..., -0.25      ,
        -0.75      , -0.75      ],
       ...,
       [ 0.        , -0.93333333, -0.83333333, ...,  0.25      ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.75      , ...,  0.25      ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.66666667, ...,  0.25      ,
         0.25      ,  0.25      ]], shape=(11668, 525))

In [52]:
X_test

array([[ 0.        , -0.93333333, -0.58333333, ...,  0.        ,
         0.25      ,  0.25      ],
       [ 0.        , -0.93333333, -0.5       , ...,  0.        ,
         0.        ,  0.25      ],
       [ 0.        , -0.93333333, -0.41666667, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.5       ,  1.        ,  0.33333333, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.5       ,  1.        ,  0.41666667, ...,  0.5       ,
         0.        ,  0.        ],
       [ 0.5       ,  1.        ,  0.5       , ...,  0.25      ,
         0.5       ,  0.        ]], shape=(2918, 525))

### Default Parameters

In [53]:
training_model = LinearRegression()
training_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [54]:
%%time
training_model.fit(X_train, y_train)

CPU times: total: 1.36 s
Wall time: 409 ms


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [55]:
y_pred = training_model.predict(X_test)
y_pred

array([[ 29.02777413,  28.93167003,  29.5492641 , ..., 375.13327733,
        579.40726406, 800.40536381],
       [ 28.8415935 ,  29.41765613,  31.20762985, ..., 581.62178053,
        805.2357757 , 938.03962627],
       [ 29.87290259,  31.49440983,  33.84177335, ..., 795.32998083,
        926.89848905, 922.85891292],
       ...,
       [ 32.87063822,  32.64674879,  32.04905329, ..., 139.2448738 ,
        131.48384183, 126.11629848],
       [ 32.07503806,  31.56284557,  31.40609752, ..., 134.78953361,
        133.80416859, 103.99651278],
       [ 30.92533288,  30.89612454,  31.14368979, ..., 136.3536927 ,
        117.72055681, 126.07272464]], shape=(2918, 36))

In [56]:
y_test

Unnamed: 0,Temp_t+1,Temp_t+2,Temp_t+3,Temp_t+4,Temp_t+5,Temp_t+6,FeelsLike_t+1,FeelsLike_t+2,FeelsLike_t+3,FeelsLike_t+4,...,SolarEnergy_t+3,SolarEnergy_t+4,SolarEnergy_t+5,SolarEnergy_t+6,SolarRadiation_t+1,SolarRadiation_t+2,SolarRadiation_t+3,SolarRadiation_t+4,SolarRadiation_t+5,SolarRadiation_t+6
11668,28.78,29.63,30.87,31.94,33.42,34.10,35.16,38.06,40.18,40.30,...,0.29,0.63,0.86,2.05,0.00,3.13,79.55,181.94,229.71,574.29
11669,29.63,30.87,31.94,33.42,34.10,35.00,38.06,40.18,40.30,39.58,...,0.63,0.86,2.05,2.76,3.13,79.55,181.94,229.71,574.29,775.42
11670,30.87,31.94,33.42,34.10,35.00,35.63,40.18,40.30,39.58,40.38,...,0.86,2.05,2.76,3.19,79.55,181.94,229.71,574.29,775.42,887.00
11671,31.94,33.42,34.10,35.00,35.63,37.05,40.30,39.58,40.38,43.03,...,2.05,2.76,3.19,3.52,181.94,229.71,574.29,775.42,887.00,976.61
11672,33.42,34.10,35.00,35.63,37.05,37.94,39.58,40.38,43.03,42.41,...,2.76,3.19,3.52,2.99,229.71,574.29,775.42,887.00,976.61,835.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,33.45,33.12,32.45,31.42,30.19,29.50,39.85,39.42,38.52,37.19,...,1.44,0.78,0.29,0.00,659.35,538.10,395.90,210.35,82.81,0.00
14582,33.12,32.45,31.42,30.19,29.50,29.02,39.42,38.52,37.19,35.19,...,0.78,0.29,0.00,0.00,538.10,395.90,210.35,82.81,0.00,0.00
14583,32.45,31.42,30.19,29.50,29.02,28.61,38.52,37.19,35.19,34.10,...,0.29,0.00,0.00,0.00,395.90,210.35,82.81,0.00,0.00,0.00
14584,31.42,30.19,29.50,29.02,28.61,28.21,37.19,35.19,34.10,33.60,...,0.00,0.00,0.00,0.00,210.35,82.81,0.00,0.00,0.00,0.00


In [57]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mse)

In [58]:
print("MSE Score:", mse)
print("MAE Score:", mae)
print("R2 Score:", r2)
print("RMSE Score:", rmse)

MSE Score: 1894.0569319538652
MAE Score: 12.61010788688842
R2 Score: 0.710745344134221
RMSE Score: 43.52076437694845


In [60]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,0.688806,0.57419,0.890947,0.829943
1,Temp_t+2,1.116152,0.728973,0.823283,1.056481
2,Temp_t+3,1.434547,0.819701,0.772876,1.197726
3,Temp_t+4,1.820589,0.935615,0.711746,1.349292
4,Temp_t+5,2.095911,1.008809,0.668088,1.447726
5,Temp_t+6,2.285337,1.0598,0.637863,1.511733
6,Temp,1.573557,0.854515,0.7508,1.254415
7,FeelsLike_t+1,3.7179,1.341853,0.851152,1.928186
8,FeelsLike_t+2,5.884714,1.697342,0.76441,2.425843
9,FeelsLike_t+3,7.291214,1.875807,0.708099,2.700225


In [61]:
regression_params = {
    'model_name': model_name_for_saving,
    'scaler_name': "RobustScaler",
    'performance_metrics_df': performance_metrics,
    'parameters': json.dumps(training_model.get_params())
}

save_model_performance_if_better('regression', regression_params)

‚ú® New regression model added:
   MSE: 1894.056932, R¬≤: 0.710745
üíæ Registry updated: Model_Training_History/History_Regression.csv


### Tuning with Grid Search

In [62]:
grid_param = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'positive': [True, False]
}

In [63]:
grid_search = GridSearchCV(
    LinearRegression(), 
    grid_param, 
    cv=5, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1, # allow parallel processing, you cant use your own laptop while this runs
    verbose=1
)

In [64]:
%%time
print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

Starting hyperparameter tuning...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: total: 2min 33s
Wall time: 9min 17s


0,1,2
,estimator,LinearRegression()
,param_grid,"{'copy_X': [True, False], 'fit_intercept': [True, False], 'positive': [True, False]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,True


In [65]:
print("TUNING RESULTS")
print("="*50)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)
print("Best Cross-Validation Score (RMSE):", math.sqrt(-grid_search.best_score_))

TUNING RESULTS
Best Parameters: {'copy_X': True, 'fit_intercept': True, 'positive': True}
Best Cross-Validation Score (Negative MSE): -1258.596020634298
Best Cross-Validation Score (MSE): 1258.596020634298
Best Cross-Validation Score (RMSE): 35.476696867581936


In [66]:
grid_search_best_model = grid_search.best_estimator_
y_pred = grid_search_best_model.predict(X_test)

In [67]:
mse_grid_search = mean_squared_error(y_test, y_pred)
mae_grid_search = mean_absolute_error(y_test, y_pred)
r2_grid_search = r2_score(y_test, y_pred)
rmse_grid_search = math.sqrt(mse_grid_search)

In [68]:
print("MSE Score:", mse_grid_search)
print("MAE Score:", mae_grid_search)
print("R2 Score:", r2_grid_search)
print("RMSE Score:", rmse_grid_search)

MSE Score: 2038.8765388441893
MAE Score: 11.981641836026355
R2 Score: 0.7015241811057934
RMSE Score: 45.15392052573275


In [69]:
performance_metrics = create_history_df_regression(y_test, y_pred, horizon)
performance_metrics

Unnamed: 0,Target,MSE,MAE,R2,RMSE
0,Temp_t+1,0.763427,0.603215,0.879133,0.873743
1,Temp_t+2,1.283501,0.772036,0.796787,1.132917
2,Temp_t+3,1.679863,0.869595,0.734037,1.296095
3,Temp_t+4,2.156949,1.004648,0.65849,1.468656
4,Temp_t+5,2.512252,1.083491,0.602156,1.585009
5,Temp_t+6,2.756052,1.136103,0.563273,1.660136
6,Temp,1.858674,0.911515,0.705646,1.363332
7,FeelsLike_t+1,3.716938,1.332868,0.851191,1.927936
8,FeelsLike_t+2,5.926595,1.710116,0.762734,2.43446
9,FeelsLike_t+3,7.370851,1.912966,0.70491,2.714931


In [70]:
regression_params = {
    'model_name': model_name_for_saving + " Tuned",
    'scaler_name': "RobustScaler",
    'performance_metrics_df': performance_metrics,
    'parameters' : json.dumps({
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "all_params": grid_search_best_model.get_params()
    })
}

save_model_performance_if_better('regression', regression_params)

‚ú® New regression model added:
   MSE: 2038.876539, R¬≤: 0.701524
üíæ Registry updated: Model_Training_History/History_Regression.csv


# All Performance

In [71]:
show_model_history(model_type='regression', model_name=model_name_for_saving)


üìä REGRESSION Model Performance History
üîç Filters Applied:
   ‚Ä¢ Model Name contains: 'Linear Regression'
   ‚Ä¢ Target: 'Overall' only
----------------------------------------------------------------------------------------------------
                  Model         Scaler  Target         MSE       MAE       R2      RMSE
      Linear Regression   MinMaxScaler Overall 1894.056932 12.610108 0.710745 43.520764
      Linear Regression   RobustScaler Overall 1894.056932 12.610108 0.710745 43.520764
      Linear Regression StandardScaler Overall 1894.056932 12.610108 0.710745 43.520764
Linear Regression Tuned   MinMaxScaler Overall 2038.876538 11.981641 0.701525 45.153921
Linear Regression Tuned StandardScaler Overall 2038.858117 11.981486 0.701525 45.153717
Linear Regression Tuned   RobustScaler Overall 2038.876539 11.981642 0.701524 45.153921

üìà Total models shown: 6
üèÜ Best R¬≤ Score: 0.710745
    Model: Linear Regression
    Scaler: MinMaxScaler
    Target: Overall
    MSE:

# <center><font size="50" color="red">Thank You</font></center>