In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Load the sales dataset from a CSV path relative to the notebook's directory.
df = pd.read_csv('../datasets/sales_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so each column of `df` becomes a row.
df.describe().T

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isnull().values.any()

In [None]:
# List the available column names before choosing features.
df.columns

In [None]:
# Candidate feature columns for the model.
train_cols = [
    'Sales Channel',
    'StoreID',
    'SalesTeamID',
    'CustomerID',
    'ProductID',
    'Discount_Applied',
    'Unit_Price',
    'Unit_Cost',
]

# The subset actually used for training; currently the same columns as
# `train_cols`, kept as an independent list so either can be edited alone.
train_cols_few = list(train_cols)

# Feature matrix and regression target.
y = df['Order_Quantity']
x = df[train_cols_few]

In [None]:
# Preview the selected feature columns.
x.head()

In [None]:
# Hold out 30% of the rows as a test split; random_state is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [None]:
# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on the test data would standardize the two splits
# with different means/variances, so the model would see test features on a
# different scale than the one it was trained on.
scaler = StandardScaler().fit(x_train)
x_train_scale = scaler.transform(x_train)
x_test_scale = scaler.transform(x_test)

In [None]:
# Hyperopt search space for the XGBoost regressor.
# Each `hp.*` label is what `fmin` uses as the key in the dict it returns, so
# every label is kept identical to its dictionary key (the original used
# 'reg-lambda' for 'reg_lambda', which made the returned key differ from the
# parameter name).
space = {
    'max_depth': hp.quniform('max_depth', 3, 12, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 100, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    # Fixed (non-searched) settings passed through to the objective unchanged.
    'n_estimators': 30,
    'seed': 0,
    'use_label_encoder': False,
}

In [None]:
def objective(space):
    """Hyperopt objective: fit one XGBRegressor configuration and score it.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space. Must provide 'n_estimators',
        'max_depth', 'gamma', 'reg_alpha', 'reg_lambda', 'min_child_weight'
        and 'colsample_bytree'; 'seed' is used as the random state when
        present (defaults to 0).

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` where ``rmse`` is the root mean
        squared error on the test split. Hyperopt minimizes 'loss'.
    """
    reg = XGBRegressor(
        n_estimators=space['n_estimators'],
        # quniform yields whole-number floats; tree depth must be an int.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=space['reg_alpha'],
        # reg_lambda and colsample_bytree are sampled as fractions; casting
        # them to int (as before) collapsed every sample to 0.
        reg_lambda=space['reg_lambda'],
        min_child_weight=int(space['min_child_weight']),
        colsample_bytree=space['colsample_bytree'],
        random_state=space.get('seed', 0),
        # Regression target, so monitor RMSE rather than a classification
        # metric. These two are constructor arguments in xgboost >= 1.6;
        # passing them to fit() is deprecated.
        eval_metric='rmse',
        early_stopping_rounds=10,
        # 'use_label_encoder' is deliberately not forwarded: it only concerns
        # classifiers.
    )

    # Early stopping watches the last entry of eval_set.
    # NOTE(review): the test split is used both for early stopping and for the
    # score below; a separate validation split would give a cleaner estimate.
    evaluation = [(x_train_scale, y_train), (x_test_scale, y_test)]
    reg.fit(x_train_scale, y_train, eval_set=evaluation, verbose=False)

    pred = reg.predict(x_test_scale)
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))

    print('RMSE: ', rmse)

    # Lower RMSE is better, so it is returned as the loss directly.
    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Run a 10-evaluation TPE search over `space`, recording every evaluation in
# `trials`; `fmin` returns the best sampled values found.
trials = Trials()

search_settings = dict(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)
best_hyperparameters = fmin(**search_settings)

In [None]:
# Show the result returned by the hyperparameter search.
print(best_hyperparameters)

In [None]:
# Final regressor with hard-coded hyperparameters.
# NOTE(review): presumably copied from an earlier search run — confirm they
# still match `best_hyperparameters`.
final_params = {
    'colsample_bytree': 0.97,
    'gamma': 4.19,
    'max_depth': 4,
    'min_child_weight': 8,
    'reg_lambda': 0.12,
    'reg_alpha': 58,
    'use_label_encoder': False,
}
xgb_md = XGBRegressor(**final_params)

In [None]:
# Train the final model on the scaled training features.
xgb_md.fit(x_train_scale, y_train)

In [None]:
# Predict on the scaled test features.
xg_preds = xgb_md.predict(x_test_scale)

In [None]:
# Root mean squared error of the test predictions.
xgb_rmse = np.sqrt(mean_squared_error(y_test, xg_preds))

In [None]:
# Display the test RMSE.
xgb_rmse

In [None]:
# One raw observation to score.
# NOTE(review): values are presumably in the same order as `train_cols_few` — confirm.
single_instance = [1, 259, 6, 15, 12, 0.075, 1963.1, 1001.18]

# The model was trained on standardized features, so the raw values must be
# scaled with training-split statistics before prediction. The scaler is
# refitted on `x_train` here so this cell does not depend on a scaler object
# kept from an earlier cell.
single_instance_frame = pd.DataFrame([single_instance], columns=x_train.columns)
single_instance_scale = StandardScaler().fit(x_train).transform(single_instance_frame)

yhat = xgb_md.predict(single_instance_scale)
# summarize prediction: `predict` returns a length-1 array, so take the scalar
# out before formatting.
print('Prediction: %d' % yhat[0])