In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error,mean_absolute_error
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from statsmodels.tools.eval_measures import rmse
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
pd.set_option('display.max_row', 50)
pd.set_option('display.max_column', 150)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training set, the test set and the sample-submission template in one go.
train, test, Submission = map(
    pd.read_csv, ("train.csv", "test.csv", "sampleSubmission.csv")
)

In [None]:
# Features: everything except the label and the row identifier.
train_x = train.drop(columns=["target", "id"])
# Keep the label as a one-column frame (later cells index it by column name).
train_y = train["target"].to_frame()
test_x = test.drop(columns="id")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integer codes.
# LabelEncoder works on a 1-D label vector, so hand it the "target" column itself
# rather than the one-column DataFrame (which only works via an implicit ravel
# and triggers a DataConversionWarning).
le = LabelEncoder()
train_y_label = pd.DataFrame(le.fit_transform(train_y["target"]))

In [None]:
def create_dummies(df, column_name):
    """Return a copy of ``df`` with one-hot indicator columns appended.

    The indicators are built from ``df[column_name]`` and are named
    ``<column_name>_<value>``. The source column is kept and ``df`` itself
    is not modified.
    """
    indicator_cols = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat((df, indicator_cols), axis=1)

# One-hot version of the label: keep only the indicator columns.
train_y_class = create_dummies(train_y, "target").drop(columns="target")

In [None]:
# Feature transformations: log(x + 1) and sqrt(log(x + 1)), applied element-wise
# to the whole frame at once.
train_x_log = np.log(train_x + 1)
test_x_log = np.log(test_x + 1)
train_x_log_sqrt = np.sqrt(np.log(train_x + 1))
test_x_log_sqrt = np.sqrt(np.log(test_x + 1))

In [None]:
# Columns whose number of distinct values differs between the train and test sets.
# dropna=False keeps NaN counted as a value, exactly as len(Series.unique()) does.
compare_list = [
    col
    for col in train_x.columns
    if train_x[col].nunique(dropna=False) != test_x[col].nunique(dropna=False)
]

In [None]:
# Sanity check: print the shape of every feature frame and of the one-hot target.
for frame in (
    train_x,
    test_x,
    train_x_log,
    test_x_log,
    train_x_log_sqrt,
    test_x_log_sqrt,
    train_y_class,
):
    print(frame.shape)

# 1. XGBoost, train with Original Dataset (X)

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
# Reference search space: one list of candidate values per XGBoost parameter.
hyperparameters_dict = dict(
    learning_rate=[0.1, 0.2, 0.3],       # range [0, 1], default 0.3
    # Minimum loss reduction needed to split a leaf further; larger = more conservative.
    gamma=[0.1, 0.3, 0.5, 0.7, 0.9],
    # Maximum tree depth; deeper trees are more complex and overfit more easily.
    max_depth=[6],
    # Controls over-fitting; too high a value can under-fit, so tune it by grid search.
    min_child_weight=[1],
    # Rarely needed; may help logistic regression with extremely imbalanced classes.
    max_delta_step=[0],
    # Fraction of rows randomly sampled for each tree (typical: 0.5 - 1).
    subsample=[0.8],
    # Fraction of columns randomly sampled for each tree (typical: 0.5 - 1),
    # similar to max_features in GBM.
    colsample_bytree=[0.8, 1],
    colsample_bylevel=[1],               # default 1; colsample_bytree takes precedence
    colsample_bynode=[1],                # default 1; colsample_bytree takes precedence
    # L2 regularisation on weights; larger = more conservative.
    reg_lambda=[1],                      # default 1
    # L1 regularisation on weights; larger = more conservative.
    reg_alpha=[1],                       # default 0
    # Balance of positive/negative weights for unbalanced classes;
    # a typical value is sum(negative instances) / sum(positive instances).
    scale_pos_weight=[1],
    # Number of trees in the ensemble; too many can still overfit.
    n_estimators=[300, 400],
    booster=["gbtree"],
    verbosity=[1],
    objective=["reg:squarederror"],
    seed=[50],
)

In [None]:
# Running history of each tuning iteration (CV score, estimator, training score).
best_score_list, best_params_list, best_R2_list = [], [], []

In [None]:
# Grid handed to GridSearchCV. The trailing notes record the order in which the
# parameters are tuned; this iteration sweeps subsample x colsample_bytree.
hyperparameters_tuning = dict(
    learning_rate=[0.1],                                          # 6th, when boosting
    gamma=[0],                                                    # 3rd
    max_depth=[4],                                                # 1st to tune
    min_child_weight=[0],                                         # 2nd
    max_delta_step=[0],
    subsample=[tenth / 10.0 for tenth in range(1, 10)],           # 4th
    colsample_bytree=[tenth / 10.0 for tenth in range(1, 10)],    # 4th
    colsample_bylevel=[1],
    colsample_bynode=[1],
    reg_lambda=[0],                                               # 5th
    reg_alpha=[0],                                                # 5th
    scale_pos_weight=[1.0],                                       # only for imbalanced classes
    n_estimators=[500],                                           # 1st
    booster=["gbtree"],
    verbosity=[1],
    objective=["multi:softmax"],
    eval_metric=["mlogloss"],
    num_class=[9],
    seed=[2],
)

In [None]:
# Base estimator for the grid search: an XGBoost *classifier* with default settings.
# Every parameter of interest is supplied through the GridSearchCV param_grid.
xgboost = xgb.XGBClassifier()

In [None]:
# 4-fold grid search over hyperparameters_tuning, scored by accuracy.
# The `iid` keyword is no longer passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24 (where it raises TypeError); per-fold averaging (the old
# iid=False behaviour) is what current releases do by default.
xgboost_grid = GridSearchCV(
    estimator=xgboost,
    param_grid=hyperparameters_tuning,
    cv=4,
    scoring="accuracy",
)

In [None]:
# Fit on the raw features. The encoded labels are stored as a one-column DataFrame;
# flatten them to the 1-D vector scikit-learn expects for a classification target.
xgboost_grid.fit(train_x, train_y_label.values.ravel())

In [None]:
# Pull the outcome of the grid search apart. The CV scores come from the search
# itself, so no separate train/test split is made here.
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_
best_rf = xgboost_grid.best_estimator_
# Score of the best estimator on the training data itself.
best_accuracy_score = best_rf.score(train_x, train_y_label)

# Record this iteration in the running history lists.
for tracker, entry in (
    (best_score_list, best_score),
    (best_params_list, best_rf),
    (best_R2_list, best_accuracy_score),
):
    tracker.append(entry)

print(best_score, best_accuracy_score, best_rf, sep="\n")

In [None]:
# Show the tuning history collected so far, one list per line.
print(best_score_list, best_R2_list, best_params_list, sep="\n")

# Prediction for Kaggle Submission Before Boosting Tuning

In [None]:
# Class-probability predictions for the raw test features, from the grid search's best estimator.
xgb_Ori_X_pred = best_rf.predict_proba(test_x)

In [None]:
# Write the predicted probabilities into the nine class columns of the template.
# Uses xgb_Ori_X_pred, the name the prediction cell defines; the previous
# spelling (xgb_OriX_pred) did not exist and raised NameError.
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = xgb_Ori_X_pred

In [None]:
## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_Ori_X_pred.csv",index=False)

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
# Wrap the raw training features and encoded labels in XGBoost's native container.
trainDMat = xgb.DMatrix(train_x, label=train_y_label)

In [None]:
# Strategy: use a low learning_rate together with a very large round budget so the
# model can converge; if convergence is slow, retry with a slightly higher learning
# rate (e.g. 0.075 instead of 0.05).
# Early stopping: the CV error must improve at least once every
# <early_stopping_rounds> rounds for training to continue; the last entry of the
# evaluation history is the best iteration.
num_boost_round, early_stopping_rounds = 150000, 50

In [None]:
hyperparameters_boosting ={'learning_rate': 0.001,               # 6th when boosting
                          'gamma': 0.01,                       # 3rd
                          'max_depth': 6,                     # 1st to tune
                          'min_child_weight': 1,              # 2nd
                          'max_delta_step': 0,
                          'subsample': 0.7,                   # 4th
                          'colsample_bytree': 0.3,            # 4th 
                          'colsample_bylevel': 1,
                          'colsample_bynode': 1,
                          'reg_lambda': 1,                    # 5th
                          'reg_alpha': 0,                     # 5th
                          'scale_pos_weight': 1.0,            # only when dealing with imbalance classes
                          'n_estimators': 87,                # 1st
                          "booster": "gbtree",
                          "verbosity": 1,
                          "objective": "reg:squarederror",
                          "seed": 50
                         }                                  

In [None]:
xgbCV = xgb.cv(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    nfold = 6, #same as CV
    metrics = {'rmse'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,     
)

# Finalise XGBoost Model

In [None]:
num_boost_round = len(xgbCV)

xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
xgb.plot_importance(xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
xgb_OriX_pred_boosted = xgbFinal.predict_proba(xgb.DMatrix(test_x))

In [None]:
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = xgb_OriX_pred_boosted

In [None]:
## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_Ori_X_pred_boosted.csv",index=False)

# Saving the final model

In [None]:
# Persist the trained booster; the context manager guarantees the file is flushed and closed.
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the booster written above. NOTE: only unpickle files you produced yourself;
# pickle.load executes arbitrary code from untrusted input.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction to verify the model.
# The reloaded object is a Booster (saved from xgb.train), which exposes predict()
# and has no predict_proba().
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_x))

In [None]:
# 1. XGBoost, train with Original Dataset (X)

import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

hyperparameters_dict = {"learning_rate": [0.1,0.2,0.3],              # range: [0,1], default = 0.3
                   # Minimum loss reduction required to make a further partition on a leaf node of the tree. 
                   # The larger gamma is, the more conservative the algorithm will be.                   
                   "gamma": [i/10.0 for i in range(1,10,2)],
                   # Maximum depth of a tree. 
                   # Increasing this value will make the model more complex and more likely to overfit
                   "max_depth": [6], 
                   # Used to control over-fitting
                   # Too high values can lead to under-fitting hence, it should be tuned using grid search
                   "min_child_weight": [1],
                   # Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced
                   "max_delta_step": [0],
                   # Denotes the fraction of observations to be randomly samples for each tree
                   # Typical values: 0.5-1
                   "subsample": [0.8],
                   # Similar to max_features in GBM, Typical values: 0.5 - 1
                   # Denotes the fraction of columns to be randomly samples for each tree.
                   "colsample_bytree": [0.8,1],
                   "colsample_bylevel": [1], # colsample_bytree will control the decision over this, default = 1
                   "colsample_bynode": [1],  # colsample_bytree will control the decision over this, default = 1
                   # L2 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_lambda": [1], # default = 1
                   # L1 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_alpha": [1],  # default = 0
                   # Control the balance of positive and negative weights, useful for unbalanced classes. 
                   # A typical value to consider: sum(negative instances) / sum(positive instances)
                   "scale_pos_weight": [1],
                   # No. of trees ensemble, too high sometimes still can cause overfitting
                   "n_estimators": [300,400], 
                   "booster": ["gbtree"],
                   "verbosity": [1],
                   "objective": ["reg:squarederror"],
                   "seed": [50]
                   }

# To track the iteration records for parameters tuning
best_score_list = []
best_params_list = []
best_R2_list = []

hyperparameters_tuning = {'learning_rate': [0.1],               # 6th when boosting
                          'gamma': [0.01],                       # 3rd
                          'max_depth': [6],                     # 1st to tune
                          'min_child_weight': [1],              # 2nd
                          'max_delta_step': [0],
                          'subsample': [0.7],                   # 4th
                          'colsample_bytree': [0.3],            # 4th 
                          'colsample_bylevel': [1],
                          'colsample_bynode': [1],
                          'reg_lambda': [1.0],                    # 5th
                          'reg_alpha': [0],                     # 5th
                          'scale_pos_weight': [1.0],            # only when dealing with imbalance classes
                          'n_estimators': [10],                # 1st
                          "booster": ["gbtree"],
                          "verbosity": [1],
                          "objective": ["multi:softmax"],
                          "eval_metric": ["mlogloss"],
                          "num_class": [9],
                          "seed": [50]
                         }                

# Base estimator for the grid search: an XGBoost classifier with default settings;
# the parameters of interest come from hyperparameters_tuning.
xgboost = xgb.XGBClassifier()

# `iid` is not passed any more: it was deprecated in scikit-learn 0.22 and removed in
# 0.24 (TypeError); per-fold averaging (the old iid=False) is the current default.
xgboost_grid = GridSearchCV(estimator = xgboost, param_grid = hyperparameters_tuning, cv = 6, scoring = "accuracy")

# The encoded labels live in a one-column DataFrame; flatten to the expected 1-D vector.
xgboost_grid.fit(train_x, train_y_label.values.ravel())

# Collect the outcome of the grid search (CV is done by the search itself,
# so no separate train/test split is made).
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_
best_rf = xgboost_grid.best_estimator_
# Score of the best estimator on the training data.
best_accuracy_score = best_rf.score(train_x,train_y_label)
best_score_list.append(best_score)
best_params_list.append(best_rf)
# Was `best_R2_score`, a name that is never defined (NameError); the value
# computed above is best_accuracy_score.
best_R2_list.append(best_accuracy_score)
print(best_score)
print(best_accuracy_score)
print(best_rf)

print(best_score_list)
print(best_R2_list)
print(best_params_list)

# Prediction for Kaggle Submission Before Boosting Tuning

# Class-probability predictions for the raw test features.
xgb_Ori_X_pred = best_rf.predict_proba(test_x)

# Fill the template from the variable defined just above (the previous spelling,
# xgb_OriX_pred, was undefined and raised NameError).
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = xgb_Ori_X_pred

## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_Ori_X_pred.csv",index=False)

# Create XGBoost's DMatrix, after fine tuning the parameters

trainDMat = xgb.DMatrix(data = train_x, label = train_y_label)

# Lower the learning_rate and set a large num_boost_round hyperparameter to ensure convergence. 
# If convergence is slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05)
num_boost_round = 150000
early_stopping_rounds = 50
# Activates early stopping. CV error needs to decrease at least every <early_stopping_rounds> round(s) to continue.
# Last entry in evaluation history is the one from best iteration.

# Native-API parameters for the final model. The target has 9 classes, so the
# objective is "multi:softprob" (Booster.predict then returns per-class
# probabilities) instead of the previous "reg:squarederror", which regressed on the
# integer class codes. "n_estimators" is omitted because the native API takes the
# number of trees from num_boost_round.
hyperparameters_boosting = {
    'learning_rate': 0.001,        # 6th when boosting
    'gamma': 0.01,                 # 3rd
    'max_depth': 6,                # 1st to tune
    'min_child_weight': 1,         # 2nd
    'max_delta_step': 0,
    'subsample': 0.7,              # 4th
    'colsample_bytree': 0.3,       # 4th
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'reg_lambda': 1,               # 5th
    'reg_alpha': 0,                # 5th
    'scale_pos_weight': 1.0,       # only when dealing with imbalance classes
    "booster": "gbtree",
    "verbosity": 1,
    "objective": "multi:softprob",
    "num_class": 9,
    "eval_metric": "mlogloss",
    "seed": 50,
}

# Cross-validate with early stopping, scored by multi-class log loss.
xgbCV = xgb.cv(
    params = hyperparameters_boosting,
    dtrain = trainDMat,
    num_boost_round = num_boost_round,
    nfold = 6, #same as CV
    metrics = {'mlogloss'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,
)

# Finalise XGBoost Model

# One row of CV history per retained round, so its length is the round count to train.
num_boost_round = len(xgbCV)

xgbFinal = xgb.train(
    params = hyperparameters_boosting,
    dtrain = trainDMat,
    num_boost_round = num_boost_round,
)

# plot_importance builds its own figure and returns the axes; resize it afterwards
# instead of relying on `plt`, which this notebook never imports.
ax = xgb.plot_importance(xgbFinal)
fig = ax.figure
fig.set_size_inches(20, 20)

# Prediction for Kaggle Submission

# A Booster has predict() but no predict_proba(); with multi:softprob the output of
# predict() is already the per-class probability matrix.
xgb_OriX_pred_boosted = xgbFinal.predict(xgb.DMatrix(test_x))

# Fill the template with the boosted predictions (previously the undefined
# xgb_OriX_pred was assigned here).
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = xgb_OriX_pred_boosted

## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_Ori_X_pred_boosted.csv",index=False)

# Saving the final model

# Persist the trained booster; the context manager closes the file handle.
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

# NOTE: only unpickle files you produced yourself; pickle.load can execute arbitrary code.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

# Re-forecast the prediction to verify the model
# (a Booster exposes predict(), not predict_proba()).
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_x))

# Compare the reloaded model's output with the in-memory model's output on the same
# features; the sum is the number of matching entries. This replaces the undefined
# `xgbFinal_submission` and the two statements that were fused onto one line.
verify_model = (xgbFinal.predict(xgb.DMatrix(test_x)) == xgb_test_p)
verify_model.sum()

# 2. XGBoost, train with Original Dataset log(X+1)

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
hyperparameters_dict = {"learning_rate": [0.1,0.2,0.3],              # range: [0,1], default = 0.3
                   # Minimum loss reduction required to make a further partition on a leaf node of the tree. 
                   # The larger gamma is, the more conservative the algorithm will be.                   
                   "gamma": [i/10.0 for i in range(1,10,2)],
                   # Maximum depth of a tree. 
                   # Increasing this value will make the model more complex and more likely to overfit
                   "max_depth": [6], 
                   # Used to control over-fitting
                   # Too high values can lead to under-fitting hence, it should be tuned using grid search
                   "min_child_weight": [1],
                   # Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced
                   "max_delta_step": [0],
                   # Denotes the fraction of observations to be randomly samples for each tree
                   # Typical values: 0.5-1
                   "subsample": [0.8],
                   # Similar to max_features in GBM, Typical values: 0.5 - 1
                   # Denotes the fraction of columns to be randomly samples for each tree.
                   "colsample_bytree": [0.8,1],
                   "colsample_bylevel": [1], # colsample_bytree will control the decision over this, default = 1
                   "colsample_bynode": [1],  # colsample_bytree will control the decision over this, default = 1
                   # L2 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_lambda": [1], # default = 1
                   # L1 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_alpha": [1],  # default = 0
                   # Control the balance of positive and negative weights, useful for unbalanced classes. 
                   # A typical value to consider: sum(negative instances) / sum(positive instances)
                   "scale_pos_weight": [1],
                   # No. of trees ensemble, too high sometimes still can cause overfitting
                   "n_estimators": [300,400], 
                   "booster": ["gbtree"],
                   "verbosity": [1],
                   "objective": ["reg:squarederror"],
                   "seed": [50]
                   }

In [None]:
# To track the iteration records for parameters tuning
best_score_list = []
best_params_list = []
best_R2_list = []

In [None]:
hyperparameters_tuning = {'learning_rate': [0.1],               # 6th when boosting
                          'gamma': [0.01],                       # 3rd
                          'max_depth': [6],                     # 1st to tune
                          'min_child_weight': [1],              # 2nd
                          'max_delta_step': [0],
                          'subsample': [0.7],                   # 4th
                          'colsample_bytree': [0.3],            # 4th 
                          'colsample_bylevel': [1],
                          'colsample_bynode': [1],
                          'reg_lambda': [1.0],                    # 5th
                          'reg_alpha': [0],                     # 5th
                          'scale_pos_weight': [1.0],            # only when dealing with imbalance classes
                          'n_estimators': [10],                # 1st
                          "booster": ["gbtree"],
                          "verbosity": [1],
                          "objective": ["multi:softmax"],
                          "eval_metric": ["mlogloss"],
                          "num_class": [9],
                          "seed": [50]
                         }                

In [None]:
# Base estimator for the grid search: an XGBoost *classifier* with default settings.
# Every parameter of interest is supplied through the GridSearchCV param_grid.
xgboost = xgb.XGBClassifier()

In [None]:
# 6-fold grid search scored by accuracy. `iid` is no longer passed: it was deprecated
# in scikit-learn 0.22 and removed in 0.24 (TypeError); per-fold averaging (the old
# iid=False behaviour) is the current default.
xgboost_grid = GridSearchCV(
    estimator=xgboost,
    param_grid=hyperparameters_tuning,
    cv=6,
    scoring="accuracy",
)

In [None]:
# Fit on the log(x + 1) features; flatten the one-column label frame to the 1-D
# vector scikit-learn expects for a classification target.
xgboost_grid.fit(train_x_log, train_y_label.values.ravel())

In [None]:
# Collect the outcome of the grid search (CV is done by the search itself,
# so no separate train/test split is made).
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_
best_rf = xgboost_grid.best_estimator_
# Score on the same log(x + 1) features the model was fitted on; scoring on the raw
# train_x (as before) fed the trees values on a different scale.
best_accuracy_score = best_rf.score(train_x_log,train_y_label)
best_score_list.append(best_score)
best_params_list.append(best_rf)
# Was `best_R2_score`, which is never defined (NameError).
best_R2_list.append(best_accuracy_score)
print(best_score)
print(best_accuracy_score)
print(best_rf)

In [None]:
print(best_score_list)
print(best_R2_list)
print(best_params_list)

# Prediction for Kaggle Submission Before Boosting Tuning

In [None]:
XGBoost_log_X_pred = best_rf.predict_proba(test_x_log)

In [None]:
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = XGBoost_log_X_pred

In [None]:
## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_log_X_pred.csv",index=False)

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
trainDMat = xgb.DMatrix(data = train_x_log, label = train_y_label)

In [None]:
# Lower the learning_rate and set a large num_boost_round hyperparameter to ensure convergence. 
# If convergence is slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05)
num_boost_round = 150000
early_stopping_rounds = 50
# Activates early stopping. CV error needs to decrease at least every <early_stopping_rounds> round(s) to continue.
# Last entry in evaluation history is the one from best iteration.

In [None]:
hyperparameters_boosting ={'learning_rate': 0.001,               # 6th when boosting
                          'gamma': 0.01,                       # 3rd
                          'max_depth': 6,                     # 1st to tune
                          'min_child_weight': 1,              # 2nd
                          'max_delta_step': 0,
                          'subsample': 0.7,                   # 4th
                          'colsample_bytree': 0.3,            # 4th 
                          'colsample_bylevel': 1,
                          'colsample_bynode': 1,
                          'reg_lambda': 1,                    # 5th
                          'reg_alpha': 0,                     # 5th
                          'scale_pos_weight': 1.0,            # only when dealing with imbalance classes
                          'n_estimators': 87,                # 1st
                          "booster": "gbtree",
                          "verbosity": 1,
                          "objective": "reg:squarederror",
                          "seed": 50
                         }                                  

In [None]:
xgbCV = xgb.cv(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    nfold = 6, #same as CV
    metrics = {'rmse'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,     
)

# Finalise XGBoost Model

In [None]:
num_boost_round = len(xgbCV)

xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
xgb.plot_importance(xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
XGBoost_log_X_pred_boosted = xgbFinal.predict_proba(xgb.DMatrix(test_x_log))

In [None]:
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = XGBoost_log_X_pred_boosted

In [None]:
## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_log_X_pred_boosted.csv",index=False)

# Saving the final model

In [None]:
# Persist the trained booster; the context manager guarantees the file is flushed and closed.
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the booster written above. NOTE: only unpickle files you produced yourself;
# pickle.load executes arbitrary code from untrusted input.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction to verify the model
xgb_test_p = xgb_test.predict_proba(xgb.DMatrix(test_x))

In [None]:
verify_model = (xgbFinal_submission == xgb_test_p)
verify_model.sum()

# 3. XGBoost, train with Original Dataset sqrt(log(X+1))

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
hyperparameters_dict = {"learning_rate": [0.1,0.2,0.3],              # range: [0,1], default = 0.3
                   # Minimum loss reduction required to make a further partition on a leaf node of the tree. 
                   # The larger gamma is, the more conservative the algorithm will be.                   
                   "gamma": [i/10.0 for i in range(1,10,2)],
                   # Maximum depth of a tree. 
                   # Increasing this value will make the model more complex and more likely to overfit
                   "max_depth": [6], 
                   # Used to control over-fitting
                   # Too high values can lead to under-fitting hence, it should be tuned using grid search
                   "min_child_weight": [1],
                   # Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced
                   "max_delta_step": [0],
                   # Denotes the fraction of observations to be randomly samples for each tree
                   # Typical values: 0.5-1
                   "subsample": [0.8],
                   # Similar to max_features in GBM, Typical values: 0.5 - 1
                   # Denotes the fraction of columns to be randomly samples for each tree.
                   "colsample_bytree": [0.8,1],
                   "colsample_bylevel": [1], # colsample_bytree will control the decision over this, default = 1
                   "colsample_bynode": [1],  # colsample_bytree will control the decision over this, default = 1
                   # L2 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_lambda": [1], # default = 1
                   # L1 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_alpha": [1],  # default = 0
                   # Control the balance of positive and negative weights, useful for unbalanced classes. 
                   # A typical value to consider: sum(negative instances) / sum(positive instances)
                   "scale_pos_weight": [1],
                   # No. of trees ensemble, too high sometimes still can cause overfitting
                   "n_estimators": [300,400], 
                   "booster": ["gbtree"],
                   "verbosity": [1],
                   "objective": ["reg:squarederror"],
                   "seed": [50]
                   }

In [None]:
# To track the iteration records for parameters tuning
best_score_list = []
best_params_list = []
best_R2_list = []

In [None]:
hyperparameters_tuning = {'learning_rate': [0.1],               # 6th when boosting
                          'gamma': [0.01],                       # 3rd
                          'max_depth': [6],                     # 1st to tune
                          'min_child_weight': [1],              # 2nd
                          'max_delta_step': [0],
                          'subsample': [0.7],                   # 4th
                          'colsample_bytree': [0.3],            # 4th 
                          'colsample_bylevel': [1],
                          'colsample_bynode': [1],
                          'reg_lambda': [1.0],                    # 5th
                          'reg_alpha': [0],                     # 5th
                          'scale_pos_weight': [1.0],            # only when dealing with imbalance classes
                          'n_estimators': [10],                # 1st
                          "booster": ["gbtree"],
                          "verbosity": [1],
                          "objective": ["multi:softmax"],
                          "eval_metric": ["mlogloss"],
                          "num_class": [9],
                          "seed": [50]
                         }                

In [None]:
# Base estimator for the grid search: an XGBoost *classifier* with default settings.
# Every parameter of interest is supplied through the GridSearchCV param_grid.
xgboost = xgb.XGBClassifier()

In [None]:
# 6-fold grid search scored by accuracy. `iid` is no longer passed: it was deprecated
# in scikit-learn 0.22 and removed in 0.24 (TypeError); per-fold averaging (the old
# iid=False behaviour) is the current default.
xgboost_grid = GridSearchCV(
    estimator=xgboost,
    param_grid=hyperparameters_tuning,
    cv=6,
    scoring="accuracy",
)

In [None]:
# Fit on the sqrt(log(x + 1)) features; flatten the one-column label frame to the
# 1-D vector scikit-learn expects for a classification target.
xgboost_grid.fit(train_x_log_sqrt, train_y_label.values.ravel())

In [None]:
# Collect the outcome of the grid search (CV is done by the search itself,
# so no separate train/test split is made).
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_
best_rf = xgboost_grid.best_estimator_
# Score on the same sqrt(log(x + 1)) features the model was fitted on; scoring on
# the raw train_x (as before) fed the trees values on a different scale.
best_accuracy_score = best_rf.score(train_x_log_sqrt,train_y_label)
best_score_list.append(best_score)
best_params_list.append(best_rf)
# Was `best_R2_score`, which is never defined (NameError).
best_R2_list.append(best_accuracy_score)
print(best_score)
print(best_accuracy_score)
print(best_rf)

In [None]:
print(best_score_list)
print(best_R2_list)
print(best_params_list)

# Prediction for Kaggle Submission Before Boosting Tuning

In [None]:
XGBoost_sqrt_log_X_pred = best_rf.predict_proba(test_x_log_sqrt)

In [None]:
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = XGBoost_sqrt_log_X_pred

In [None]:
## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_sqrt_log_X_pred.csv",index=False)

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
trainDMat = xgb.DMatrix(data = train_x_log_sqrt, label = train_y_label)

In [None]:
# Lower the learning_rate and set a large num_boost_round hyperparameter to ensure convergence. 
# If convergence is slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05)
num_boost_round = 150000
early_stopping_rounds = 50
# Activates early stopping. CV error needs to decrease at least every <early_stopping_rounds> round(s) to continue.
# Last entry in evaluation history is the one from best iteration.

In [None]:
hyperparameters_boosting ={'learning_rate': 0.001,               # 6th when boosting
                          'gamma': 0.01,                       # 3rd
                          'max_depth': 6,                     # 1st to tune
                          'min_child_weight': 1,              # 2nd
                          'max_delta_step': 0,
                          'subsample': 0.7,                   # 4th
                          'colsample_bytree': 0.3,            # 4th 
                          'colsample_bylevel': 1,
                          'colsample_bynode': 1,
                          'reg_lambda': 1,                    # 5th
                          'reg_alpha': 0,                     # 5th
                          'scale_pos_weight': 1.0,            # only when dealing with imbalance classes
                          'n_estimators': 87,                # 1st
                          "booster": "gbtree",
                          "verbosity": 1,
                          "objective": "reg:squarederror",
                          "seed": 50
                         }                                  

In [None]:
xgbCV = xgb.cv(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    nfold = 6, #same as CV
    metrics = {'rmse'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,     
)

# Finalise XGBoost Model

In [None]:
num_boost_round = len(xgbCV)

xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
xgb.plot_importance(xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
XGBoost_sqrt_log_X_pred_boosted = xgbFinal.predict_proba(xgb.DMatrix(test_x_log_sqrt))

In [None]:
Submission[['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9']] = XGBoost_sqrt_log_X_pred_boosted

In [None]:
## Creating a Submission File to submit to Kaggle competition ##
Submission.to_csv("XGBoost_sqrt_log_X_pred_boosted.csv",index=False)

# Saving the final model

In [None]:
# Persist the trained booster; the context manager guarantees the file is flushed and closed.
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the booster written above. NOTE: only unpickle files you produced yourself;
# pickle.load executes arbitrary code from untrusted input.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction to verify the model
xgb_test_p = xgb_test.predict_proba(xgb.DMatrix(test_x))

In [None]:
verify_model = (xgbFinal_submission == xgb_test_p)
verify_model.sum()