In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.model_selection import cross_val_score

In [2]:
path_train = './CH22_Demand_XY_Train.csv'
train = pd.read_csv(path_train)

In [3]:
train

Unnamed: 0,DateTime,X1,X2,X3,X4,Y
0,2022-01-01 00:00:00,2.186333,13.76,0.0663,0.1547,521163.83540
1,2022-01-01 00:10:00,2.138000,13.90,0.0910,0.1105,449066.62018
2,2022-01-01 00:20:00,2.104333,13.90,0.0806,0.1300,437394.72159
3,2022-01-01 00:30:00,2.040333,14.00,0.1183,0.1248,422107.63292
4,2022-01-01 00:40:00,1.973667,14.14,0.0624,0.1105,406923.83540
...,...,...,...,...,...,...
41927,2022-10-19 03:50:00,5.856667,17.66,0.1092,0.1391,365929.91028
41928,2022-10-19 04:00:00,5.860000,17.66,0.1183,0.1495,368822.51417
41929,2022-10-19 04:10:00,5.846667,17.68,0.1001,0.1976,373857.78769
41930,2022-10-19 04:20:00,5.856667,17.66,0.1183,0.1391,373536.38739


In [4]:
train.describe()

Unnamed: 0,X1,X2,X3,X4,Y
count,41932.0,41932.0,41932.0,41932.0,41932.0
mean,6.520033,12.528736,261.430021,106.26082,504322.861242
std,1.982503,3.212579,363.704356,169.071949,121948.222159
min,1.082333,1.268,0.0052,0.0247,178443.8354
25%,4.96,10.464,0.0858,0.1638,401060.77357
50%,6.64,12.87,18.2195,15.86,506391.83449
75%,7.983333,15.26,473.59,146.25,588871.076658
max,13.336667,17.76,1511.9,1216.8,829691.71704


### Data Prepartion

In [5]:
# parsing to datetime
train.DateTime = pd.to_datetime(train.DateTime)

In [6]:
train.dtypes

DateTime    datetime64[ns]
X1                 float64
X2                 float64
X3                 float64
X4                 float64
Y                  float64
dtype: object

In [7]:
train.set_index(train['DateTime'],inplace = True)

In [8]:
# DateTime changed into the index
train_idx = train[['X1', 'X2', 'X3', 'X4', 'Y']]

### Splitting Train and Test

In [9]:
actual_arr = np.array(train_idx['Y'])
train_idx = train_idx.drop('Y', axis=1)
train_idx_arr = np.array(train_idx)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(train_idx_arr, actual_arr, test_size = 0.25, random_state = 0)

In [11]:
print('X_train Shape:', X_train.shape)
print('y_train Shape:', y_train.shape)
print('X_test Shape:', X_test.shape)
print('y_test Shape:', y_test.shape)

X_train Shape: (31449, 4)
y_train Shape: (31449,)
X_test Shape: (10483, 4)
y_test Shape: (10483,)


### Random Forest

In [12]:
rf = rfr(n_estimators = 100, random_state = 0)
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [13]:
rf_pred = rf.predict(X_test)

rf_mse = mean_squared_error(y_test, rf_pred)
r2 = r2_score(y_test, rf_pred)

accuracies_rf = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10)
print('Accuracy(mean):', accuracies_rf.mean())
print('Accuracy(std):', accuracies_rf.std())

print('Mean Squared Error:', rf_mse, 'degrees.')
print('Root Mean Squared Error:', np.sqrt(rf_mse))
print('R-squared:', r2)

Accuracy(mean): 0.44903477739752684
Accuracy(std): 0.008997314078882214
Mean Squared Error: 8016916538.706596 degrees.
Root Mean Squared Error: 89537.23548729096
R-squared: 0.45456268591890747


### Use CV to find the best parameters

In [15]:
rf2 = rfr()

n_estimators = [5,20,50,100]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)]
min_samples_split = [2, 6, 10]
min_samples_leaf = [1, 3, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

rf_cv = RandomizedSearchCV(estimator = rf2, param_distributions = random_grid, n_iter = 100, cv = 5, verbose = 2, random_state = 42, n_jobs = -1)

rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      120],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4],
                                        'min_samples_split': [2, 6, 10],
                                        'n_estimators': [5, 20, 50, 100]},
                   random_state=42, verbose=2)

In [16]:
print ('Best Parameters: ', rf_cv.best_params_)

Best Parameters:  {'n_estimators': 100, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 110, 'bootstrap': True}


### Apply the best param

In [19]:
bp_rf = rfr(n_estimators = 100, min_samples_split = 6, min_samples_leaf = 1, max_features = 'auto', max_depth = 110, bootstrap = True)
bp_rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=110, min_samples_split=6)

In [None]:
bp_rf_pred = bp_rf.predict(X_test)

bp_rf_mse = mean_squared_error(y_test, bp_rf_pred)
bp_r2 = r2_score(y_test, bp_rf_pred)

accuracies_bp = cross_val_score(estimator = bp_rf, X = X_train, y = y_train, cv = 10)
print('Accuracy(mean):', accuracies_bp.mean())
print('Accuracy(std):',accuracies_bp.std())

print('Mean Squared Error:', bp_rf_mse, 'degrees.')
print('Root Mean Squared Error:', np.sqrt(bp_rf_mse))
print('R-squared:', bp_r2)

### Find important features

In [None]:
rf.feature_importances_

In [None]:
order = rf.feature_importances_.argsort()
plt.barh(train_idx.columns[order], rf.feature_importances_[order])

### Only with the two most important inputs

In [None]:
Xs = list(train_idx.columns)

rf_impt = rfr(n_estimators = 100, min_samples_split = 6, min_samples_leaf = 1, max_features = 'auto', max_depth = 110, bootstrap = True)
impt = [Xs.index('X1'), Xs.index('X2')]
train_important = X_train[:, impt]
test_important = X_test[:, impt]

In [None]:
rf_impt.fit(train_important, y_train)

In [None]:
pred_impt = rf_impt.predict(test_important)

rf_mse = mean_squared_error(y_test, pred_better)
r2 = r2_score(y_test, pred_better)

accuracies_impt = cross_val_score(estimator = rf_impt, X = X_train, y = y_train, cv = 10)
print('Accuracy(mean):', accuracies_impt.mean())
print('Accuracy(std):', accuracies_impt.std())

print('Mean Squared Error:', rf_mse, 'degrees.')
print('Root Mean Squared Error:', np.sqrt(rf_mse))
print('R-squared:', r2)
print('Accuracy:', round(accuracy, 2), '%.')

# Apply to test dataset

In [None]:
path_test = './CH22_Demand_raw_X_Test.csv'
test = pd.read_csv(path_test)

In [None]:
test

In [None]:
test.describe()

### Data Prepartion

In [None]:
# parsing to datetime
test.DateTime = pd.to_datetime(test.DateTime)

In [None]:
test.dtypes

In [None]:
test.set_index(test['DateTime'],inplace = True)

In [None]:
# DateTime changed into the index
test_idx = test[['X1', 'X2', 'X3', 'X4']]

In [None]:
test_idx['yhat'] = 0

In [None]:
final = test_idx

### Splitting Train and Test

In [None]:
test_arr = np.array(test_idx['yhat'])
test_idx = test_idx.drop('yhat', axis=1)
test_idx_arr = np.array(test_idx)

In [None]:
X_test, y_test = test_idx_arr, test_arr

In [None]:
print('X_test Shape:', X_test.shape)
print('y_test Shape:', y_test.shape)

### Random Forest

### Apply the best param

In [None]:
bp_rf_pred = bp_rf.predict(X_test)
final['yhat'] = bp_rf_pred
final

In [None]:
final.resample('H',on='DateTime',closed='right').mean()

In [None]:
final.describe()