In [2]:

# Relative paths to the pickled DataFrames read later with pd.read_pickle:
# DF_PATH is the training data, DF_PATH1 is the held-out test data.
DF_PATH = "../data/train.pkl"
DF_PATH1 = "../data/test.pkl"

# Import libraries

In [3]:
# import libraries
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read data

In [4]:
# Load the training DataFrame from the pickle at DF_PATH and print its
# (rows, columns) shape as a quick sanity check.
# NOTE: read_pickle unpickles arbitrary objects; only load files you trust.
train = pd.read_pickle(DF_PATH)
print(train.shape)

(37872, 5)


In [None]:
# Preview the first rows of the training data (features plus the 'PE' target).
train.head()

Unnamed: 0,AT,V,AP,RH,PE
5444,6.75,39.4,1011.28,90.84,483.77
44734,10.08,41.16,1023.14,96.03,469.17
39601,14.32,44.6,1013.85,68.13,466.36
2928,19.04,51.86,1018.05,79.01,458.64
29411,29.17,67.45,1014.1,46.85,435.08


# Split data

In [5]:
# Separate the target column 'PE' from the predictor columns.
y = train['PE']
X = train.drop(columns=['PE'])

In [6]:
# Hold out 20% of the training data as a validation set (fixed seed so the
# split is reproducible).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of every resulting piece.
for split_name, split in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('y_train', y_train),
    ('y_val', y_val),
):
    print(f'--> {split_name}: {split.shape}')

--> X_train: (30297, 4)
--> X_val: (7575, 4)
--> y_train: (30297,)
--> y_val: (7575,)


# Feature Scaling

In [7]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused unchanged for the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


# Model

## XGBRegressor


In [7]:
# Gradient-boosted tree regressor for the 'PE' target.
#
# `early_stopping_rounds` and `eval_metric` are passed to the constructor:
# passing them to `fit()` was deprecated in XGBoost 1.6 and removed in 2.0,
# where it raises a TypeError. (Requires XGBoost >= 1.6.)
#
# Early stopping monitors the LAST entry of `eval_set` (the validation split)
# using the LAST entry of `eval_metric` ('mae'); training stops once that
# value has not improved for 300 consecutive rounds.
xgb_rg = XGBRegressor(
    learning_rate=0.1,
    max_depth=70,
    n_estimators=500,
    random_state=42,
    early_stopping_rounds=300,
    eval_metric=['rmse', 'mae'],
)

# Both splits are evaluated every round so train and validation curves can be
# compared; the training split comes first so that it is not the one used for
# early stopping.
xgb_rg.fit(
    X_train_scaled,
    y_train,
    eval_set=[(X_train_scaled, y_train), (X_val_scaled, y_val)],
    verbose=1,
)

[0]	validation_0-rmse:408.53866	validation_0-mae:408.24207	validation_1-rmse:408.94579	validation_1-mae:408.65098
[1]	validation_0-rmse:367.70009	validation_0-mae:367.42330	validation_1-rmse:368.06506	validation_1-mae:367.79024




[2]	validation_0-rmse:330.94571	validation_0-mae:330.68586	validation_1-rmse:331.26696	validation_1-mae:331.00927
[3]	validation_0-rmse:297.86611	validation_0-mae:297.62272	validation_1-rmse:298.15804	validation_1-mae:297.91657
[4]	validation_0-rmse:268.09428	validation_0-mae:267.86543	validation_1-rmse:268.35870	validation_1-mae:268.13136
[5]	validation_0-rmse:241.29999	validation_0-mae:241.08423	validation_1-rmse:241.53659	validation_1-mae:241.32222
[6]	validation_0-rmse:217.18483	validation_0-mae:216.97983	validation_1-rmse:217.40104	validation_1-mae:217.19705
[7]	validation_0-rmse:195.48220	validation_0-mae:195.28618	validation_1-rmse:195.68042	validation_1-mae:195.48539
[8]	validation_0-rmse:175.95051	validation_0-mae:175.76081	validation_1-rmse:176.12845	validation_1-mae:175.93958
[9]	validation_0-rmse:158.37219	validation_0-mae:158.18764	validation_1-rmse:158.52894	validation_1-mae:158.34500
[10]	validation_0-rmse:142.55237	validation_0-mae:142.37204	validation_1-rmse:142.69861	

# Save model

In [8]:
# Persist the fitted regressor so it can be reloaded without retraining.
# `pickle` and `Path` are imported in the notebook's import cell.
model_path = Path('../models/xgb_model.pkl')
# A fresh checkout may not contain ../models yet; create it instead of
# failing with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(xgb_rg, f)

In [9]:
# Persist the fitted scaler alongside the model: new data must be transformed
# with the same scaler before it is passed to the regressor.
# `pickle` and `Path` are imported in the notebook's import cell.
scaler_path = Path('../models/scaler.pkl')
# A fresh checkout may not contain ../models yet; create it instead of
# failing with FileNotFoundError.
scaler_path.parent.mkdir(parents=True, exist_ok=True)
with scaler_path.open('wb') as f:
    pickle.dump(scaler, f)

# Testing the model

In [12]:
# Load the held-out test DataFrame from the pickle at DF_PATH1 and print its
# (rows, columns) shape.
# NOTE: read_pickle unpickles arbitrary objects; only load files you trust.
test = pd.read_pickle(DF_PATH1)
print(test.shape)

(9468, 5)


In [13]:
# Separate the test target column 'PE' from the test predictors.
target = test['PE']
test_df = test.drop(columns=['PE'])

# Report the shape of both pieces.
for label, obj in (('test_df', test_df), ('target', target)):
    print(f'{label}: {obj.shape}')

test_df: (9468, 4)
target: (9468,)


In [14]:
# Scale the test features with the scaler that was fitted on the training
# split (transform only; the scaler is not refitted on test data).
test_df_scaled = scaler.transform(test_df)

In [15]:
# Predict the target for every scaled test row with the fitted regressor.
y_hat = xgb_rg.predict(test_df_scaled)

In [20]:
# Show the first two test predictions.
y_hat[:2]

array([443.30988, 434.83   ], dtype=float32)

In [16]:
# evaluate RMSE and MAE on data
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def score(y_true, y_pred):
    """Return the evaluation score for the predictions, which is their RMSE."""
    return rmse(y_true, y_pred)


In [17]:
# Compute both test-set metrics first, then report them.
test_score = score(target, y_hat)
test_mae = mean_absolute_error(target, y_hat)

print(f'Score: {test_score}')
print(f'MAE: {test_mae}')

Score: 0.3068514332489778
MAE: 0.03209290698252503


This test score is very close to the score obtained on the validation set, which was 0.3080.

In [22]:
# Reload the pickled regressor from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising; only
# unpickle files from a trusted source.
with open('../models/xgb_model.pkl', 'rb') as model_file:
    loaded_xgb_rg = pickle.load(model_file)

In [23]:
# Predict with the reloaded model on the first two scaled test rows.
first_two_rows = test_df_scaled[:2]
y_pred = loaded_xgb_rg.predict(first_two_rows)

In [24]:
# Verify the save/load round-trip explicitly instead of comparing printed
# arrays by eye: the reloaded model must reproduce the in-memory model's
# predictions for the same two rows.
np.testing.assert_allclose(y_pred, y_hat[:2], rtol=1e-5)
y_pred

array([443.30988, 434.83   ], dtype=float32)