In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor as rfr

In [2]:
path_train = './CH22_Demand_XY_Train.csv'
train = pd.read_csv(path_train)
path_test = './CH22_Demand_raw_X_Test.csv'
test = pd.read_csv(path_test)

In [3]:
train

Unnamed: 0,DateTime,X1,X2,X3,X4,Y
0,2022-01-01 00:00:00,2.186333,13.76,0.0663,0.1547,521163.83540
1,2022-01-01 00:10:00,2.138000,13.90,0.0910,0.1105,449066.62018
2,2022-01-01 00:20:00,2.104333,13.90,0.0806,0.1300,437394.72159
3,2022-01-01 00:30:00,2.040333,14.00,0.1183,0.1248,422107.63292
4,2022-01-01 00:40:00,1.973667,14.14,0.0624,0.1105,406923.83540
...,...,...,...,...,...,...
41927,2022-10-19 03:50:00,5.856667,17.66,0.1092,0.1391,365929.91028
41928,2022-10-19 04:00:00,5.860000,17.66,0.1183,0.1495,368822.51417
41929,2022-10-19 04:10:00,5.846667,17.68,0.1001,0.1976,373857.78769
41930,2022-10-19 04:20:00,5.856667,17.66,0.1183,0.1391,373536.38739


In [4]:
train.describe()

Unnamed: 0,X1,X2,X3,X4,Y
count,41932.0,41932.0,41932.0,41932.0,41932.0
mean,6.520033,12.528736,261.430021,106.26082,504322.861242
std,1.982503,3.212579,363.704356,169.071949,121948.222159
min,1.082333,1.268,0.0052,0.0247,178443.8354
25%,4.96,10.464,0.0858,0.1638,401060.77357
50%,6.64,12.87,18.2195,15.86,506391.83449
75%,7.983333,15.26,473.59,146.25,588871.076658
max,13.336667,17.76,1511.9,1216.8,829691.71704


In [5]:
test

Unnamed: 0,DateTime,X1,X2,X3,X4
0,2022-10-19 04:40:00,5.943333,17.72,0.1001,0.1690
1,2022-10-19 04:50:00,5.990000,17.68,0.1092,0.1872
2,2022-10-19 05:00:00,5.993333,17.66,0.1235,0.1014
3,2022-10-19 05:10:00,6.000000,17.68,0.0949,0.1690
4,2022-10-19 05:20:00,6.013333,17.68,0.1326,0.1300
...,...,...,...,...,...
10479,2022-12-30 23:10:00,2.336667,13.48,0.0520,0.1248
10480,2022-12-30 23:20:00,2.315667,13.52,0.0663,0.1209
10481,2022-12-30 23:30:00,2.300000,13.56,0.1092,0.0962
10482,2022-12-30 23:40:00,2.252667,13.60,0.0858,0.1157


In [6]:
test.describe()

Unnamed: 0,X1,X2,X3,X4
count,10484.0,10484.0,10484.0,10484.0
mean,5.270002,13.144527,141.817033,62.64219
std,1.348219,2.604375,223.97316,120.452849
min,1.513333,4.206,0.0091,0.0143
25%,4.389167,11.412,0.0715,0.1495
50%,5.376667,13.34,0.1092,0.2119
75%,6.386667,15.34,235.5925,80.1255
max,8.853333,17.96,1093.3,1084.2


### Data Prepartion

In [7]:
# parsing to datetime
train.DateTime = pd.to_datetime(train.DateTime)
test.DateTime = pd.to_datetime(train.DateTime)

In [8]:
train.dtypes

DateTime    datetime64[ns]
X1                 float64
X2                 float64
X3                 float64
X4                 float64
Y                  float64
dtype: object

In [9]:
test.dtypes

DateTime     object
X1          float64
X2          float64
X3          float64
X4          float64
dtype: object

In [10]:
train.set_index(train['DateTime'],inplace = True)
test.set_index(test['DateTime'],inplace = True)

In [11]:
# DateTime changed into the index
train_idx = train[['X1', 'X2', 'X3', 'X4', 'Y']]
test_idx = test[['X1', 'X2', 'X3', 'X4']]

In [12]:
test_idx['Y'] = 0

In [13]:
final = test_idx

### Splitting Train and Test

In [14]:
actual_arr = np.array(train_idx['Y'])
train_idx = train_idx.drop('Y', axis=1)
train_idx_arr = np.array(train_idx)

test_arr = np.array(test_idx['Y'])
test_idx = test_idx.drop('Y', axis=1)
test_idx_arr = np.array(test_idx)

In [15]:
X_train, X_test, y_train, y_test = train_idx_arr, test_idx_arr, actual_arr, test_arr

In [16]:
print('X_train Shape:', X_train.shape)
print('y_train Shape:', y_train.shape)
print('X_test Shape:', X_test.shape)
print('y_test Shape:', y_test.shape)

X_train Shape: (41932, 4)
y_train Shape: (41932,)
X_test Shape: (10484, 4)
y_test Shape: (10484,)


### Random Forest

In [17]:
rf = rfr(n_estimators = 100, random_state = 0)
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [21]:
rf_pred = rf.predict(X_test)

final['Y'] = rf_pred

In [22]:
final

Unnamed: 0_level_0,X1,X2,X3,X4,Y
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-10-19 04:40:00,5.943333,17.72,0.1001,0.1690,417047.777609
2022-10-19 04:50:00,5.990000,17.68,0.1092,0.1872,443232.112480
2022-10-19 05:00:00,5.993333,17.66,0.1235,0.1014,449545.081698
2022-10-19 05:10:00,6.000000,17.68,0.0949,0.1690,454606.433705
2022-10-19 05:20:00,6.013333,17.68,0.1326,0.1300,459151.946070
...,...,...,...,...,...
2022-12-30 23:10:00,2.336667,13.48,0.0520,0.1248,348670.123269
2022-12-30 23:20:00,2.315667,13.52,0.0663,0.1209,351636.692945
2022-12-30 23:30:00,2.300000,13.56,0.1092,0.0962,350262.658194
2022-12-30 23:40:00,2.252667,13.60,0.0858,0.1157,382171.107467


### Apply the best param

In [24]:
bp_rf = rfr(n_estimators = 100, min_samples_split = 6, min_samples_leaf = 1, max_features = 'auto', max_depth = 110, bootstrap = True)
bp_rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=110, min_samples_split=6)

In [25]:
bp_rf_pred = bp_rf.predict(X_test)

bp_rf_mse = mean_squared_error(y_test, bp_rf_pred)
bp_r2 = r2_score(y_test, bp_rf_pred)
#mape = 100 * (bp_rf_mse / y_test)
#accuracy = 100 - np.mean(mape)

print('Mean Squared Error:', bp_rf_mse, 'degrees.')
print('Root Mean Squared Error:', np.sqrt(bp_rf_mse))
print('R-squared:', bp_r2)
#print('Accuracy:', round(accuracy, 2), '%.')

Mean Squared Error: 231066472869.07745 degrees.
Root Mean Squared Error: 480693.7412418404
R-squared: 0.0


In [26]:
final['Y'] = bp_rf_pred
final

Unnamed: 0_level_0,X1,X2,X3,X4,Y
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-10-19 04:40:00,5.943333,17.72,0.1001,0.1690,410727.679505
2022-10-19 04:50:00,5.990000,17.68,0.1092,0.1872,446310.212001
2022-10-19 05:00:00,5.993333,17.66,0.1235,0.1014,447746.885508
2022-10-19 05:10:00,6.000000,17.68,0.0949,0.1690,453872.057317
2022-10-19 05:20:00,6.013333,17.68,0.1326,0.1300,458314.439811
...,...,...,...,...,...
2022-12-30 23:10:00,2.336667,13.48,0.0520,0.1248,347219.937960
2022-12-30 23:20:00,2.315667,13.52,0.0663,0.1209,346876.644814
2022-12-30 23:30:00,2.300000,13.56,0.1092,0.0962,350232.908603
2022-12-30 23:40:00,2.252667,13.60,0.0858,0.1157,366069.678216


### Apply to test dataset