In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error,mean_absolute_error
from statsmodels.tools.eval_measures import rmse
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Raise pandas' display limits so long/wide frames are not truncated in cell output.
# NOTE(review): the option names are abbreviated ('max_row', 'max_column'); presumably
# pandas resolves them to display.max_rows / display.max_columns - confirm on the
# installed version.
pd.set_option('display.max_row', 100)
pd.set_option('display.max_column', 150)

# Register the jupyternotify magics on the running IPython shell
# (presumably so the %%notify cell magic noted below can be used).
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)
# %%notify

# Install an 'ignore' filter for every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

<IPython.core.display.Javascript object>

In [2]:
# Load the three CSV files from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# NOTE(review): Submission is not referenced again in the portion of the notebook reviewed.
Submission = pd.read_csv("sampleSubmission.csv")

In [3]:
# Separate the target from the predictors; the target is kept as a one-column frame.
train_y = train["revenue"].to_frame()
train_x = train.drop(columns="revenue")

In [4]:
print(train_x.shape)
print(test.shape)

(137, 42)
(100000, 42)


In [5]:
# Stack train predictors on top of test so both go through the same encoding.
data = pd.concat([train_x, test], axis=0)
print(data.shape)
# Total count of missing cells over the whole combined frame.
missing_per_column = data.isnull().sum()
print(missing_per_column.sum())

(100137, 42)
0


In [6]:
def create_dummies(df,column_name):
    """Return ``df`` with indicator columns for ``column_name`` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column to encode; it is also used as the prefix of the new columns.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns (the encoded column is kept)
        followed by one ``<column_name>_<value>`` column per distinct value.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, indicator_columns], axis=1)

In [7]:
def feature_engineer(df):
    """Encode the raw restaurant frame into a purely numeric feature frame.

    Steps, in order:
      1. cast to float wherever the cast succeeds (``errors="ignore"``);
      2. parse ``Open Date`` (``%m/%d/%Y``) and derive ``Open_Year`` and
         ``Open_Month`` from it; the date itself is discarded and the row
         index is reset to 0..n-1;
      3. append indicator columns for ``Type``, ``City Group``, ``City``,
         ``Open_Year`` and ``Open_Month`` via ``create_dummies``;
      4. drop those five source columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Open Date``, ``Type``, ``City Group`` and ``City``.

    Returns
    -------
    pandas.DataFrame
        The engineered frame.
    """
    encoded_columns = ["Type", "City Group", "City", "Open_Year", "Open_Month"]

    frame = df.astype(float,errors="ignore")
    frame["Open Date"] = pd.to_datetime(frame["Open Date"], format='%m/%d/%Y')

    # Move the dates into the index to read year/month off it, then throw the index away.
    frame = frame.set_index("Open Date")
    opened_on = frame.index
    frame["Open_Year"] = opened_on.year
    frame["Open_Month"] = opened_on.month
    frame = frame.reset_index(drop=True)

    for column in encoded_columns:
        frame = create_dummies(frame, column)

    # The indicator columns replace their source columns.
    return frame.drop(encoded_columns, axis=1)

In [8]:
data_rf = feature_engineer(data)

In [9]:
# Split the engineered frame back into train and test by position.
# The boundary comes from the number of training rows that were concatenated first,
# rather than a hard-coded 137, so the split follows whatever train.csv contained.
n_train_rows = len(train_x)
train_rf_x = data_rf.iloc[:n_train_rows]
test_x = data_rf.iloc[n_train_rows:]
print(train_rf_x.shape)
print(test_x.shape)

(137, 139)
(100000, 139)


In [10]:
# Id is removed from both feature matrices; the test ids are kept separately
# (cast to int, re-indexed from 0), and the test features are re-indexed from 0 too.
train_rf_x = train_rf_x.drop(columns="Id")
Submission_id = test_x[["Id"]].astype(int).reset_index(drop=True)
test_rf_x = test_x.drop(columns="Id").reset_index(drop=True)

In [11]:
# Sanity check: train/test must have the same number of columns, and the
# target / id frames the same number of rows as their feature frames.
print(train_rf_x.shape) # training features with Id removed (no scaling has been applied yet)
print(train_y.shape)
print(test_rf_x.shape) # test features with Id removed (no scaling has been applied yet)
print(Submission_id.shape)

(137, 138)
(137, 1)
(100000, 138)
(100000, 1)


In [12]:
# Candidate transformations of the revenue target
from sklearn.preprocessing import StandardScaler

# Standardised target; the fitted scaler stays available as train_y_scaler
train_y_scaler = StandardScaler()
scaled_revenue = train_y_scaler.fit_transform(train_y)
normalize_train_y = pd.DataFrame(scaled_revenue, columns=train_y.columns)

# log(y)
log_transform_y = np.log(train_y)
# log(sqrt(y))
sqrt_log_y = np.log(np.sqrt(train_y))

# Feature Engineering Part_1: Drop Non-Mutual Features
### Some columns are populated only in the test set and sum to zero in the training set, so they are dropped from both sets

In [13]:
# Columns whose values sum to zero over the training rows
column_is_empty = train_rf_x.sum().eq(0)
drop_columns = column_is_empty.index[column_is_empty.values]

In [14]:
# Remove the zero-sum columns from both sets so their layouts stay identical
train_rf_x_drop = train_rf_x.drop(columns=drop_columns)
test_rf_x_drop = test_rf_x.drop(columns=drop_columns)
print(train_rf_x_drop.shape)
print(test_rf_x_drop.shape)

(137, 105)
(100000, 105)


# Feature Engineering Part_2: Create N-way interaction features

In [15]:
from sklearn.preprocessing import PolynomialFeatures

In [16]:
# degree=2 with interaction_only=True yields the original columns plus their pairwise
# products (no squared terms, no bias column); raise the degree for 3-way or higher.
n_way_interactions = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)

In [17]:
n_way_interactions_columns = train_rf_x_drop.select_dtypes("float").columns

In [18]:
# Build the interaction columns. The expansion is fitted on the training columns only
# and then applied to test with transform(), so both frames come from one fitted
# transformer and share one set of generated column names. The source index is carried
# over so the later column-wise concat lines rows up by label.
interaction_inputs_train = train_rf_x_drop[n_way_interactions_columns]
interaction_inputs_test = test_rf_x_drop[n_way_interactions_columns]

train_interactions = pd.DataFrame(
    n_way_interactions.fit_transform(interaction_inputs_train),
    columns=n_way_interactions.get_feature_names(interaction_inputs_train.columns),
    index=interaction_inputs_train.index,
)
test_interactions = pd.DataFrame(
    n_way_interactions.transform(interaction_inputs_test),
    columns=train_interactions.columns,
    index=interaction_inputs_test.index,
)

In [19]:
# Keep only the columns that did not feed the interaction generator;
# the generated block already contains the originals.
train_rf_x_interaction = train_rf_x_drop.drop(columns=n_way_interactions_columns)
test_rf_x_interaction = test_rf_x_drop.drop(columns=n_way_interactions_columns)

In [20]:
# Append the generated interaction block to the right of the remaining columns
train_rf_x_interaction = pd.concat(
    [train_rf_x_interaction, train_interactions],
    axis=1,
)
test_rf_x_interaction = pd.concat(
    [test_rf_x_interaction, test_interactions],
    axis=1,
)
print(train_rf_x_interaction.shape)
print(test_rf_x_interaction.shape)

(137, 771)
(100000, 771)


# MinMax Features

In [21]:
min_max_columns = train_rf_x_interaction.select_dtypes(include="float").columns

In [22]:
# Min-max scale the float feature columns. Only features are scaled, so no
# inverse_transform is needed later.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# One scaler, fitted on the training rows only, is applied to both sets. Fitting a
# second scaler on test would map the same raw value to different scaled values in
# train and test, so a model trained on one would misread the other.
train_score_scaler = MinMaxScaler()
# Alias kept so existing references to test_score_scaler still resolve.
test_score_scaler = train_score_scaler

normalize_train = pd.DataFrame(
    train_score_scaler.fit_transform(train_rf_x_interaction[min_max_columns]),
    columns=min_max_columns,
    index=train_rf_x_interaction.index,
)
normalize_test = pd.DataFrame(
    train_score_scaler.transform(test_rf_x_interaction[min_max_columns]),
    columns=min_max_columns,
    index=test_rf_x_interaction.index,
)

# Write the scaled values back over the raw columns (min-max scaling, not z-scores).
train_rf_x_interaction[min_max_columns] = normalize_train
test_rf_x_interaction[min_max_columns] = normalize_test

# Checks: no NaN introduced in test, and the write-back reproduced the scaled frame.
print(test_rf_x_interaction.isnull().sum().sum())
print(train_rf_x_interaction[min_max_columns].equals(normalize_train))

0
True


# Feature Engineering Part_3: Remove Low Correlation features

In [27]:
features_train = pd.concat([train_rf_x_interaction,log_transform_y],axis=1)

In [28]:
# Rank features by absolute correlation with revenue. The last entry of the
# 'revenue' column is dropped by position (revenue was concatenated last).
corr_with_revenue = features_train.corr()['revenue']
features_train_revenue_corr = corr_with_revenue[:-1].abs().sort_values(ascending=False)

# Keep only features whose absolute correlation exceeds 0.15
is_correlated = features_train_revenue_corr > 0.15
revenue_corr_filter = features_train_revenue_corr[is_correlated]
print(revenue_corr_filter.shape)

(18,)


In [29]:
revenue_corr_filter_columns = revenue_corr_filter.index

In [30]:
# Restrict both sets to the features that passed the correlation filter
train_rf_x_engine = train_rf_x_interaction.loc[:, revenue_corr_filter_columns]
test_rf_x_engine = test_rf_x_interaction.loc[:, revenue_corr_filter_columns]
print(train_rf_x_engine.shape)
print(test_rf_x_engine.shape)

(137, 18)
(100000, 18)


# Feature Engineering Part_4: PCA for dimensional reduction

In [None]:
# PCA cannot extract more components than min(n_samples, n_features). The correlation
# filter above decides how many features remain, so the requested 31 components is
# capped by the actual shape of the training matrix instead of raising.
n_pca_components = min(31, *train_rf_x_engine.shape)
pca_train = PCA(n_components=n_pca_components)
pca_test = PCA(n_components=n_pca_components)

In [None]:
# Learn the principal axes from the training rows only ...
train_x_pca = pca_train.fit_transform(train_rf_x_engine)
# ... and project the test rows onto those same axes. An independently fitted PCA
# on test would give components that do not correspond column-for-column to train.
test_x_pca = pca_train.transform(test_rf_x_engine)
# pca_test is still fitted on test so its explained-variance figure stays available
# as a diagnostic; it is not used to produce features.
pca_test.fit(test_rf_x_engine)
print(train_x_pca.shape)
print(test_x_pca.shape)

In [None]:
print(pca_train.explained_variance_ratio_.sum())
print(pca_test.explained_variance_ratio_.sum())

In [None]:
# Wrap the projected arrays in DataFrames (default integer column labels)
train_x_pca = pd.DataFrame(train_x_pca)
test_x_pca = pd.DataFrame(test_x_pca)

In [None]:
print(train_x_pca.shape)
print(test_x_pca.shape)

# Candidate feature sets and targets to try in the XGBoost models

In [31]:
# Overview of every feature matrix / target variant used by the models below.
print(train_rf_x_engine.shape)
print(test_rf_x_engine.shape)
# train_x_pca / test_x_pca only exist once the PCA cells above have been executed;
# the recorded NameError is from a run where they had not been.
print(train_x_pca.shape)
print(test_x_pca.shape)
print(log_transform_y.shape)
print(sqrt_log_y.shape)

(137, 18)
(100000, 18)


NameError: name 'train_x_pca' is not defined

# 1st: XGBoost, Grid Search with PCA Features with log_transformation_y

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
# Reference search grid for XGBRegressor: every value is a list of candidates.
# NOTE(review): in the cells reviewed, GridSearchCV is configured with
# hyperparameters_tuning, not with this dict - confirm whether it is still needed.
hyperparameters_dict = {"learning_rate": [0.1,0.2,0.3],              # range: [0,1], default = 0.3
                   # Minimum loss reduction required to make a further partition on a leaf node of the tree.
                   # The larger gamma is, the more conservative the algorithm will be.
                   "gamma": [i/10.0 for i in range(1,10,2)],
                   # Maximum depth of a tree.
                   # Increasing this value makes the model more complex and more likely to overfit.
                   "max_depth": [6], 
                   # Used to control over-fitting.
                   # Values that are too high can lead to under-fitting, hence it should be tuned with grid search.
                   "min_child_weight": [1],
                   # Usually this parameter is not needed, but it might help in logistic regression when classes are extremely imbalanced.
                   "max_delta_step": [0],
                   # Fraction of observations randomly sampled for each tree.
                   # Typical values: 0.5-1
                   "subsample": [0.8],
                   # Similar to max_features in GBM. Typical values: 0.5 - 1
                   # Fraction of columns randomly sampled for each tree.
                   "colsample_bytree": [0.8,1],
                   "colsample_bylevel": [1], # colsample_bytree will control the decision over this, default = 1
                   "colsample_bynode": [1],  # colsample_bytree will control the decision over this, default = 1
                   # L2 regularization term on weights. Increasing this value makes the model more conservative.
                   "reg_lambda": [1], # default = 1
                   # L1 regularization term on weights. Increasing this value makes the model more conservative.
                   "reg_alpha": [1],  # default = 0
                   # Controls the balance of positive and negative weights, useful for unbalanced classes.
                   # A typical value to consider: sum(negative instances) / sum(positive instances)
                   "scale_pos_weight": [1],
                   # Number of trees in the ensemble; too many can still cause overfitting.
                   "n_estimators": [300,400], 
                   "booster": ["gbtree"],
                   "verbosity": [1],
                   "objective": ["reg:squarederror"],
                   "seed": [50]
                   }

In [None]:
# Accumulators for manual tuning: each run of the results cell below appends one entry
# to each list. Re-running THIS cell empties them again.
best_score_list = []
best_params_list = []
best_R2_list = []

In [None]:
# Grid handed to GridSearchCV. Every list holds a single value, so the search evaluates
# exactly one parameter combination per fit; values are edited by hand between runs.
# The trailing "1st", "2nd", ... notes record the order in which the author tunes them.
hyperparameters_tuning = {'learning_rate': [0.1],               # 6th when boosting
                          'gamma': [0.5],                       # 3rd
                          'max_depth': [2],                     # 1st to tune
                          'min_child_weight': [0.1],              # 2nd
                          'max_delta_step': [0],
                          'subsample': [0.6],                   # 4th
                          'colsample_bytree': [0.8],            # 4th 
                          'colsample_bylevel': [1],
                          'colsample_bynode': [1],
                          'reg_lambda': [1],                    # 5th
                          'reg_alpha': [0],                     # 5th
                          'scale_pos_weight': [1.0],            # only when dealing with imbalanced classes
                          'n_estimators': [300],                # 1st
                          "booster": ["gbtree"],
                          "verbosity": [1],
                          "objective": ["reg:squarederror"],
                          "seed": [99]
                         }                

In [None]:
# General Parameters setting inside Regressor
xgboost = xgb.XGBRegressor()

In [None]:
# 10-fold grid search scored by negative MSE (higher, i.e. closer to 0, is better).
# NOTE(review): `iid` was deprecated and later removed from GridSearchCV in newer
# scikit-learn releases - confirm against the installed version before re-running.
xgboost_grid = GridSearchCV(estimator = xgboost, param_grid = hyperparameters_tuning, cv = 10, iid = False, scoring="neg_mean_squared_error")

In [None]:
xgboost_grid.fit(train_x_pca,log_transform_y)

In [None]:
# Pull the outcome of the grid search. Scores come from the 10-fold CV inside
# GridSearchCV, so no separate train/test split is made here.
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_      # mean CV score (negative MSE)
best_rf = xgboost_grid.best_estimator_     # name kept as-is; this is the refit XGBRegressor

# Scored on the same rows the estimator was fitted on, so this is an in-sample figure.
best_R2_score = best_rf.score(train_x_pca,log_transform_y)

# Record this run. best_params_list stores the winning parameter dict (it previously
# received the estimator object, leaving best_params unused).
best_score_list.append(best_score)
best_params_list.append(best_params)
best_R2_list.append(best_R2_score)

print(best_score)
print(best_R2_score)
print(best_rf)

In [None]:
print(best_score_list)
print(best_R2_list)
print(best_params_list)

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
trainDMat = xgb.DMatrix(data = train_x_pca, label = log_transform_y)

In [None]:
# Lower the learning_rate and set a large num_boost_round hyperparameter to ensure convergence. 
# If convergence is slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05)
num_boost_round = 150000
early_stopping_rounds = 50
# Activates early stopping. CV error needs to decrease at least every <early_stopping_rounds> round(s) to continue.
# Last entry in evaluation history is the one from best iteration.

In [None]:
# Scalar parameters for the native xgboost API (xgb.cv / xgb.train): the tuned values
# from hyperparameters_tuning above, with learning_rate lowered from 0.1 to 0.001.
# NOTE(review): the native API takes the number of rounds through num_boost_round;
# 'n_estimators' here is presumably not used by xgb.cv / xgb.train - confirm.
hyperparameters_boosting ={'learning_rate': 0.001,               # 6th when boosting
                          'gamma': 0.5,                       # 3rd
                          'max_depth': 2,                     # 1st to tune
                          'min_child_weight': 0.1,              # 2nd
                          'max_delta_step': 0,
                          'subsample': 0.6,                   # 4th
                          'colsample_bytree': 0.8,            # 4th 
                          'colsample_bylevel': 1,
                          'colsample_bynode': 1,
                          'reg_lambda': 1,                    # 5th
                          'reg_alpha': 0,                     # 5th
                          'scale_pos_weight': 1.0,            # only when dealing with imbalanced classes
                          'n_estimators': 300,                # 1st
                          "booster": "gbtree",
                          "verbosity": 1,
                          "objective": "reg:squarederror",
                          "seed": 99
                         }                                

In [None]:
# Cross-validate the boosting run to find how many rounds to train for:
# up to num_boost_round rounds, 10 folds, RMSE metric, with early stopping after
# early_stopping_rounds rounds. The result is kept in xgbCV and its length is used
# as the round count for the final model.
xgbCV = xgb.cv(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    nfold = 10, #same as CV
    metrics = {'rmse'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,     
)

# Finalise XGBoost Model

In [None]:
# Use the length of the CV history as the number of boosting rounds.
# NOTE: this overwrites num_boost_round, so re-running the xgb.cv cell afterwards
# without re-running the cell that sets it to 150000 would cap CV at this value.
num_boost_round = len(xgbCV)

# Train the final booster on the full training DMatrix with that round count.
xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
xgb.plot_importance(xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
# Predict on the PCA-projected test features; the model was trained on log(revenue),
# so np.exp maps the predictions back to the revenue scale.
xgbFinal_submission = xgbFinal.predict(xgb.DMatrix(test_x_pca))
xgbFinal_submission = np.exp(xgbFinal_submission)

In [None]:
## Other submission style
## Creating a Submission File to submit to Kaggle competition ##
# test.csv is read again only to obtain the Id column; predictions are paired with it
# by position, so they must be in the original test row order.
testData = pd.read_csv("test.csv")
submission = pd.DataFrame({
        "Id": testData["Id"],
        "Prediction": xgbFinal_submission
    })
# Written without the index so the file has exactly the two columns Id, Prediction.
submission.to_csv('Best_model_5th(0.001)_trial.csv',header=True, index=False)

# Saving the final model

In [None]:
# Persist the trained booster. The context manager guarantees the file is flushed and
# closed even if pickling fails (the bare open() call left the handle open).
# NOTE(review): every section of this notebook writes to this same file name, so a
# later section replaces the model saved here.
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the booster saved above; the file handle is closed as soon as loading finishes.
# pickle.load executes code embedded in the file - only load files this notebook wrote.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction to verify the model
# The reloaded booster must be fed the same features the submission was predicted from
# (the PCA projection), not the raw test_rf_x matrix it was never trained on.
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_x_pca))
# Same inverse transform as the submission: the target was log(revenue).
xgb_test_p = np.exp(xgb_test_p)

In [None]:
verify_model = (xgbFinal_submission == xgb_test_p)
verify_model.sum()

# 2nd: XGBoost, Grid Search with PCA Features with sqrt_log_transformation_y

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
hyperparameters_dict = {"learning_rate": [0.1,0.2,0.3],              # range: [0,1], default = 0.3
                   # Minimum loss reduction required to make a further partition on a leaf node of the tree. 
                   # The larger gamma is, the more conservative the algorithm will be.                   
                   "gamma": [i/10.0 for i in range(1,10,2)],
                   # Maximum depth of a tree. 
                   # Increasing this value will make the model more complex and more likely to overfit
                   "max_depth": [6], 
                   # Used to control over-fitting
                   # Too high values can lead to under-fitting hence, it should be tuned using grid search
                   "min_child_weight": [1],
                   # Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced
                   "max_delta_step": [0],
                   # Denotes the fraction of observations to be randomly samples for each tree
                   # Typical values: 0.5-1
                   "subsample": [0.8],
                   # Similar to max_features in GBM, Typical values: 0.5 - 1
                   # Denotes the fraction of columns to be randomly samples for each tree.
                   "colsample_bytree": [0.8,1],
                   "colsample_bylevel": [1], # colsample_bytree will control the decision over this, default = 1
                   "colsample_bynode": [1],  # colsample_bytree will control the decision over this, default = 1
                   # L2 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_lambda": [1], # default = 1
                   # L1 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_alpha": [1],  # default = 0
                   # Control the balance of positive and negative weights, useful for unbalanced classes. 
                   # A typical value to consider: sum(negative instances) / sum(positive instances)
                   "scale_pos_weight": [1],
                   # No. of trees ensemble, too high sometimes still can cause overfitting
                   "n_estimators": [300,400], 
                   "booster": ["gbtree"],
                   "verbosity": [1],
                   "objective": ["reg:squarederror"],
                   "seed": [50]
                   }

In [None]:
# To track the iteration records for parameters tuning
best_score_list = []
best_params_list = []
best_R2_list = []

In [None]:
hyperparameters_tuning = {'learning_rate': [0.1],               # 6th when boosting
                          'gamma': [0.1],                       # 3rd
                          'max_depth': [3],                     # 1st to tune
                          'min_child_weight': [0.1],              # 2nd
                          'max_delta_step': [0],
                          'subsample': [0.7],                   # 4th
                          'colsample_bytree': [0.7],            # 4th 
                          'colsample_bylevel': [1],
                          'colsample_bynode': [1],
                          'reg_lambda': [1],                    # 5th
                          'reg_alpha': [0],                     # 5th
                          'scale_pos_weight': [1.0],            # only when dealing with imbalance classes
                          'n_estimators': [260],                # 1st
                          "booster": ["gbtree"],
                          "verbosity": [1],
                          "objective": ["reg:squarederror"],
                          "seed": [99]
                         }                

In [None]:
# General Parameters setting inside Regressor
xgboost = xgb.XGBRegressor()

In [None]:
# 10-fold grid search scored by negative MSE (higher, i.e. closer to 0, is better).
# NOTE(review): `iid` was deprecated and later removed from GridSearchCV in newer
# scikit-learn releases - confirm against the installed version before re-running.
xgboost_grid = GridSearchCV(estimator = xgboost, param_grid = hyperparameters_tuning, cv = 10, iid = False, scoring="neg_mean_squared_error")

In [None]:
xgboost_grid.fit(train_x_pca,sqrt_log_y)

In [None]:
# Pull the outcome of the grid search. Scores come from the 10-fold CV inside
# GridSearchCV, so no separate train/test split is made here.
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_      # mean CV score (negative MSE)
best_rf = xgboost_grid.best_estimator_     # name kept as-is; this is the refit XGBRegressor

# Scored on the same rows the estimator was fitted on, so this is an in-sample figure.
best_R2_score = best_rf.score(train_x_pca,sqrt_log_y)

# Record this run. best_params_list stores the winning parameter dict (it previously
# received the estimator object, leaving best_params unused).
best_score_list.append(best_score)
best_params_list.append(best_params)
best_R2_list.append(best_R2_score)

print(best_score)
print(best_R2_score)
print(best_rf)

In [None]:
print(best_score_list)
print(best_R2_list)
print(best_params_list)

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
trainDMat = xgb.DMatrix(data = train_x_pca, label = sqrt_log_y)

In [None]:
# Lower the learning_rate and set a large num_boost_round hyperparameter to ensure convergence. 
# If convergence is slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05)
num_boost_round = 150000
early_stopping_rounds = 50
# Activates early stopping. CV error needs to decrease at least every <early_stopping_rounds> round(s) to continue.
# Last entry in evaluation history is the one from best iteration.

In [None]:
hyperparameters_boosting ={'learning_rate': 0.001,               # 6th when boosting
                          'gamma': 0.1,                       # 3rd
                          'max_depth': 3,                     # 1st to tune
                          'min_child_weight': 0.1,              # 2nd
                          'max_delta_step': 0,
                          'subsample': 0.7,                   # 4th
                          'colsample_bytree': 0.7,            # 4th 
                          'colsample_bylevel': 1,
                          'colsample_bynode': 1,
                          'reg_lambda': 1,                    # 5th
                          'reg_alpha': 0,                     # 5th
                          'scale_pos_weight': 1.0,            # only when dealing with imbalance classes
                          'n_estimators': 260,                # 1st
                          "booster": "gbtree",
                          "verbosity": 1,
                          "objective": "reg:squarederror",
                          "seed": 99
                         }                

In [None]:
xgbCV = xgb.cv(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    nfold = 10, #same as CV
    metrics = {'rmse'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,     
)

# Finalise XGBoost Model

In [None]:
num_boost_round = len(xgbCV)

xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
xgb.plot_importance(xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
xgbFinal_submission = xgbFinal.predict(xgb.DMatrix(test_x_pca))
xgbFinal_submission = np.square(np.exp(xgbFinal_submission))

In [None]:
## Other submission style
## Creating a Submission File to submit to Kaggle competition ##
testData = pd.read_csv("test.csv")
submission = pd.DataFrame({
        "Id": testData["Id"],
        "Prediction": xgbFinal_submission
    })
submission.to_csv('Best_model_6th(0.001)_trial.csv',header=True, index=False)

# Saving the final model

In [None]:
# Persist the trained booster. The context manager guarantees the file is flushed and
# closed even if pickling fails (the bare open() call left the handle open).
# NOTE(review): every section of this notebook writes to this same file name, so the
# model saved by another section is replaced here.
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the booster saved above; the file handle is closed as soon as loading finishes.
# pickle.load executes code embedded in the file - only load files this notebook wrote.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction to verify the model
# The reloaded booster must be fed the same features the submission was predicted from
# (the PCA projection), not the raw test_rf_x matrix it was never trained on.
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_x_pca))
# Same inverse transform as the submission in this section: the target was
# log(sqrt(revenue)), so exp() must be followed by squaring.
xgb_test_p = np.square(np.exp(xgb_test_p))

In [None]:
verify_model = (xgbFinal_submission == xgb_test_p)
verify_model.sum()

# 3rd: XGBoost, Grid Search with Interaction & High Correlation(>0.10) Features only with log_transformation_y

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
hyperparameters_dict = {"learning_rate": [0.1,0.2,0.3],              # range: [0,1], default = 0.3
                   # Minimum loss reduction required to make a further partition on a leaf node of the tree. 
                   # The larger gamma is, the more conservative the algorithm will be.                   
                   "gamma": [i/10.0 for i in range(1,10,2)],
                   # Maximum depth of a tree. 
                   # Increasing this value will make the model more complex and more likely to overfit
                   "max_depth": [6], 
                   # Used to control over-fitting
                   # Too high values can lead to under-fitting hence, it should be tuned using grid search
                   "min_child_weight": [1],
                   # Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced
                   "max_delta_step": [0],
                   # Denotes the fraction of observations to be randomly samples for each tree
                   # Typical values: 0.5-1
                   "subsample": [0.8],
                   # Similar to max_features in GBM, Typical values: 0.5 - 1
                   # Denotes the fraction of columns to be randomly samples for each tree.
                   "colsample_bytree": [0.8,1],
                   "colsample_bylevel": [1], # colsample_bytree will control the decision over this, default = 1
                   "colsample_bynode": [1],  # colsample_bytree will control the decision over this, default = 1
                   # L2 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_lambda": [1], # default = 1
                   # L1 regularization term on weights. Increasing this value will make model more conservative.
                   "reg_alpha": [1],  # default = 0
                   # Control the balance of positive and negative weights, useful for unbalanced classes. 
                   # A typical value to consider: sum(negative instances) / sum(positive instances)
                   "scale_pos_weight": [1],
                   # No. of trees ensemble, too high sometimes still can cause overfitting
                   "n_estimators": [300,400], 
                   "booster": ["gbtree"],
                   "verbosity": [1],
                   "objective": ["reg:squarederror"],
                   "seed": [50]
                   }

In [None]:
# To track the iteration records for parameters tuning
best_score_list = []
best_params_list = []
best_R2_list = []

In [None]:
hyperparameters_tuning = {'learning_rate': [0.01],               # 6th when boosting
                          'gamma': [0.1],                       # 3rd
                          'max_depth': [3],                     # 1st to tune
                          'min_child_weight': [0.1],              # 2nd
                          'max_delta_step': [0],
                          'subsample': [0.2],                   # 4th
                          'colsample_bytree': [0.8],            # 4th 
                          'colsample_bylevel': [1],
                          'colsample_bynode': [1],
                          'reg_lambda': [1],                    # 5th
                          'reg_alpha': [0],                     # 5th
                          'scale_pos_weight': [1.0],            # only when dealing with imbalance classes
                          'n_estimators': [710],                # 1st
                          "booster": ["gbtree"],
                          "verbosity": [1],
                          "objective": ["reg:squarederror"],
                          "seed": [99]
                         }                

In [None]:
# General Parameters setting inside Regressor
xgboost = xgb.XGBRegressor()

In [None]:
# 10-fold grid search scored by negative MSE (higher, i.e. closer to 0, is better).
# NOTE(review): `iid` was deprecated and later removed from GridSearchCV in newer
# scikit-learn releases - confirm against the installed version before re-running.
xgboost_grid = GridSearchCV(estimator = xgboost, param_grid = hyperparameters_tuning, cv = 10, iid = False, scoring="neg_mean_squared_error")

In [None]:
xgboost_grid.fit(train_rf_x_engine,log_transform_y)

In [None]:
# Pull the outcome of the grid search. Scores come from the 10-fold CV inside
# GridSearchCV, so no separate train/test split is made here.
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_      # mean CV score (negative MSE)
best_rf = xgboost_grid.best_estimator_     # name kept as-is; this is the refit XGBRegressor

# Scored on the same rows the estimator was fitted on, so this is an in-sample figure.
best_R2_score = best_rf.score(train_rf_x_engine,log_transform_y)

# Record this run. best_params_list stores the winning parameter dict (it previously
# received the estimator object, leaving best_params unused).
best_score_list.append(best_score)
best_params_list.append(best_params)
best_R2_list.append(best_R2_score)

print(best_score)
print(best_R2_score)
print(best_rf)

In [None]:
print(best_score_list)
print(best_R2_list)
print(best_params_list)

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
trainDMat = xgb.DMatrix(data = train_rf_x_engine, label = log_transform_y)

In [None]:
# Lower the learning_rate and set a large num_boost_round hyperparameter to ensure convergence. 
# If convergence is slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05)
num_boost_round = 150000
early_stopping_rounds = 50
# Activates early stopping. CV error needs to decrease at least every <early_stopping_rounds> round(s) to continue.
# Last entry in evaluation history is the one from best iteration.

In [None]:
hyperparameters_boosting = {'learning_rate': 0.001,               # 6th when boosting
                          'gamma': 0.1,                       # 3rd
                          'max_depth': 3,                     # 1st to tune
                          'min_child_weight': 0.1,              # 2nd
                          'max_delta_step': 0,
                          'subsample': 0.2,                   # 4th
                          'colsample_bytree': 0.8,            # 4th 
                          'colsample_bylevel': 1,
                          'colsample_bynode': 1,
                          'reg_lambda': 1,                    # 5th
                          'reg_alpha': 0,                     # 5th
                          'scale_pos_weight': 1.0,            # only when dealing with imbalance classes
                          'n_estimators': 710,                # 1st
                          "booster": "gbtree",
                          "verbosity": 1,
                          "objective": "reg:squarederror",
                          "seed": 99
                         }                                                                               

In [None]:
xgbCV = xgb.cv(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    nfold = 10, #same as CV
    metrics = {'rmse'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,     
)

# Finalise XGBoost Model

In [None]:
num_boost_round = len(xgbCV)

xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
xgb.plot_importance(xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
# Predictions from the grid-search estimator, mapped back from the log scale with exp.
# NOTE(review): the next cell assigns xgbFinal_submission again from xgbFinal, so when
# cells run in order this value is replaced before the submission file is written.
xgbFinal_submission = best_rf.predict(test_rf_x_engine)
xgbFinal_submission = np.exp(xgbFinal_submission)

In [None]:
xgbFinal_submission = xgbFinal.predict(xgb.DMatrix(test_rf_x_engine))
xgbFinal_submission = np.exp(xgbFinal_submission)

In [None]:
## Other submission style
## Creating a Submission File to submit to Kaggle competition ##
testData = pd.read_csv("test.csv")
submission = pd.DataFrame({
        "Id": testData["Id"],
        "Prediction": xgbFinal_submission
    })
submission.to_csv('Best_model_8th(0.001)_trial.csv',header=True, index=False)

# Saving the final model

In [None]:
# Persist the trained booster. The context manager guarantees the file is flushed and
# closed even if pickling fails (the bare open() call left the handle open).
# NOTE(review): every section of this notebook writes to this same file name, so the
# model saved by another section is replaced here.
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the booster saved above; the file handle is closed as soon as loading finishes.
# pickle.load executes code embedded in the file - only load files this notebook wrote.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction to verify the model
# The reloaded booster must be fed the same features the submission was predicted from
# (the correlation-filtered test_rf_x_engine), not the raw test_rf_x matrix.
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_rf_x_engine))
# Same inverse transform as the submission: the target was log(revenue).
xgb_test_p = np.exp(xgb_test_p)

In [None]:
verify_model = (xgbFinal_submission == xgb_test_p)
verify_model.sum()

# 4th, XGBoost, Grid Search with Mutual Features & High Correlation(0.10) with sqrt_log_transformation_y

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
# Broad candidate grid for XGBRegressor; each key maps to the list of values to try.
# NOTE(review): the GridSearchCV cell in this section is given
# `hyperparameters_tuning`, not this dict — confirm this grid is still needed.
hyperparameters_dict = dict(
    learning_rate=[0.1, 0.2, 0.3],  # range: [0,1], default = 0.3
    # Minimum loss reduction required to split a leaf node further;
    # the larger gamma is, the more conservative the algorithm becomes.
    gamma=[tenth / 10.0 for tenth in range(1, 10, 2)],
    # Maximum tree depth; raising it makes the model more complex and more likely to overfit.
    max_depth=[6],
    # Controls over-fitting; values that are too high can under-fit, so tune it with grid search.
    min_child_weight=[1],
    # Usually not needed, but may help in logistic regression when classes are extremely imbalanced.
    max_delta_step=[0],
    # Fraction of observations randomly sampled for each tree (typical values: 0.5-1).
    subsample=[0.8],
    # Fraction of columns randomly sampled for each tree, similar to max_features in GBM
    # (typical values: 0.5-1).
    colsample_bytree=[0.8, 1],
    colsample_bylevel=[1],  # colsample_bytree will control the decision over this, default = 1
    colsample_bynode=[1],   # colsample_bytree will control the decision over this, default = 1
    # L2 regularization term on weights; increasing it makes the model more conservative.
    reg_lambda=[1],  # default = 1
    # L1 regularization term on weights; increasing it makes the model more conservative.
    reg_alpha=[1],  # default = 0
    # Balance of positive and negative weights, useful for unbalanced classes.
    # A typical value to consider: sum(negative instances) / sum(positive instances).
    scale_pos_weight=[1],
    # Number of trees in the ensemble; too many can still cause overfitting.
    n_estimators=[300, 400],
    booster=["gbtree"],
    verbosity=[1],
    objective=["reg:squarederror"],
    seed=[50],
)

In [None]:
# Running histories of the tuning iterations (CV score, estimator, R2), reset to empty.
best_score_list, best_params_list, best_R2_list = [], [], []

In [None]:
# Grid handed to GridSearchCV; every entry currently holds a single candidate.
# The trailing notes record the order in which the parameters are tuned.
hyperparameters_tuning = dict(
    learning_rate=[0.05],            # 6th when boosting
    gamma=[0.1],                     # 3rd
    max_depth=[3],                   # 1st to tune
    min_child_weight=[0.1],          # 2nd
    max_delta_step=[0],
    subsample=[0.9],                 # 4th
    colsample_bytree=[0.2],          # 4th
    colsample_bylevel=[1],
    colsample_bynode=[1],
    reg_lambda=[0],                  # 5th
    reg_alpha=[0],                   # 5th
    scale_pos_weight=[1.0],          # only when dealing with imbalance classes
    n_estimators=[130],              # 1st
    booster=["gbtree"],
    verbosity=[1],
    objective=["reg:squarederror"],
    seed=[99],
)

In [None]:
# General Parameters setting inside Regressor
# Base estimator for the grid search, constructed with no arguments; the cell
# that builds GridSearchCV passes it as `estimator` together with the param grid.
xgboost = xgb.XGBRegressor()

In [None]:
# 10-fold grid search over `hyperparameters_tuning`, scored by negative MSE.
xgboost_grid = GridSearchCV(
    estimator=xgboost,
    param_grid=hyperparameters_tuning,
    scoring="neg_mean_squared_error",
    cv=10,
    iid=False,
)

In [None]:
# Run the grid search on the engineered training features against sqrt_log_y.
xgboost_grid.fit(X=train_rf_x_engine, y=sqrt_log_y)

In [None]:
# Collect the grid-search outcome. Cross-validation runs inside GridSearchCV,
# so no separate train/test split is made here.
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_
best_rf = xgboost_grid.best_estimator_
# score() is evaluated on the same data the search was fitted on.
best_R2_score = best_rf.score(train_rf_x_engine, sqrt_log_y)

# Record this iteration in the running histories.
# NOTE(review): `best_params_list` receives the fitted estimator (best_rf), not
# `best_params` — confirm that is intended.
best_score_list += [best_score]
best_params_list += [best_rf]
best_R2_list += [best_R2_score]

print(best_score, best_R2_score, best_rf, sep="\n")

In [None]:
# Show the accumulated tuning histories, one list per line.
print(best_score_list, best_R2_list, best_params_list, sep="\n")

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
# Wrap the engineered training features and the sqrt_log_y target for the native xgboost API.
trainDMat = xgb.DMatrix(train_rf_x_engine, label=sqrt_log_y)

In [None]:
# Boosting budget: pair a lowered learning_rate with a very large
# num_boost_round ceiling so that training can converge. If convergence is
# slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05).
# Early stopping: the CV error must decrease at least once every
# <early_stopping_rounds> round(s) for boosting to continue; the last entry in
# the evaluation history is then the one from the best iteration.
num_boost_round, early_stopping_rounds = 150000, 50

In [None]:
# Scalar hyperparameters for the native xgb.cv / xgb.train calls.
# NOTE(review): `n_estimators` belongs to the sklearn wrapper; the native calls
# below take their round count from `num_boost_round`, so this entry is
# presumably ignored — confirm.
hyperparameters_boosting = dict(
    learning_rate=0.001,           # 6th when boosting
    gamma=0.1,                     # 3rd
    max_depth=3,                   # 1st to tune
    min_child_weight=0.1,          # 2nd
    max_delta_step=0,
    subsample=0.9,                 # 4th
    colsample_bytree=0.2,          # 4th
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_lambda=0,                  # 5th
    reg_alpha=0,                   # 5th
    scale_pos_weight=1.0,          # only when dealing with imbalance classes
    n_estimators=130,              # 1st
    booster="gbtree",
    verbosity=1,
    objective="reg:squarederror",
    seed=99,
)

In [None]:
# Cross-validate the boosting hyperparameters over 10 folds, tracking RMSE.
# Early stopping ends the run once the CV error stops improving for
# `early_stopping_rounds` consecutive rounds.
xgbCV = xgb.cv(
    hyperparameters_boosting,
    trainDMat,
    num_boost_round=num_boost_round,
    nfold=10,  # same as CV
    metrics={'rmse'},
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=True,
)

# Finalise XGBoost Model

In [None]:
# Number of rounds kept by the early-stopped CV run (one history row per round).
# It is stored under its own name so the `num_boost_round` search ceiling is not
# overwritten: re-running the xgb.cv cell afterwards would otherwise be silently
# capped at this smaller value.
best_num_boost_round = len(xgbCV)

# Refit a single booster on the full training DMatrix with that round count.
xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = best_num_boost_round,
)

In [None]:
# Plot the feature importance of the final booster on a 20x20 inch figure.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_importance(booster=xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
# Predict the test set with the final booster (trained on sqrt_log_y) and map the
# predictions back by applying exp and then squaring.
xgbFinal_submission = np.square(np.exp(xgbFinal.predict(xgb.DMatrix(test_rf_x_engine))))

In [None]:
## Other submission style
## Build the Kaggle submission file: test-set Ids paired with the predictions ##
# NOTE(review): the 6th section writes to this same file name — confirm the
# overwrite is intended.
testData = pd.read_csv("test.csv")
submission = pd.DataFrame(dict(Id=testData["Id"], Prediction=xgbFinal_submission))
submission.to_csv('Best_model_1(0.001)_trial.csv', index=False, header=True)

# Saving the final model

In [None]:
# Persist the final booster to disk. The context manager closes the file handle
# deterministically (a bare open() leaves that to garbage collection).
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the pickled booster; the context manager closes the file handle.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction with the reloaded model to verify the pickle round trip.
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_rf_x_engine))
# Apply the same back-transformation as the submission cell (exp, then square);
# with np.exp alone the equality check in the next cell could never match.
xgb_test_p = np.square(np.exp(xgb_test_p))

In [None]:
# Element-wise comparison of the submitted predictions with those of the
# reloaded model; the sum counts the matching entries.
verify_model = np.equal(xgbFinal_submission, xgb_test_p)
verify_model.sum()

# 5th: XGBoost, Grid Search with Interaction & High Correlation(>0.15) Features only with log_transformation_y

In [32]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [33]:
# Broad candidate grid for XGBRegressor; each key maps to the list of values to try.
# NOTE(review): the GridSearchCV cell in this section is given
# `hyperparameters_tuning`, not this dict — confirm this grid is still needed.
hyperparameters_dict = dict(
    learning_rate=[0.1, 0.2, 0.3],  # range: [0,1], default = 0.3
    # Minimum loss reduction required to split a leaf node further;
    # the larger gamma is, the more conservative the algorithm becomes.
    gamma=[tenth / 10.0 for tenth in range(1, 10, 2)],
    # Maximum tree depth; raising it makes the model more complex and more likely to overfit.
    max_depth=[6],
    # Controls over-fitting; values that are too high can under-fit, so tune it with grid search.
    min_child_weight=[1],
    # Usually not needed, but may help in logistic regression when classes are extremely imbalanced.
    max_delta_step=[0],
    # Fraction of observations randomly sampled for each tree (typical values: 0.5-1).
    subsample=[0.8],
    # Fraction of columns randomly sampled for each tree, similar to max_features in GBM
    # (typical values: 0.5-1).
    colsample_bytree=[0.8, 1],
    colsample_bylevel=[1],  # colsample_bytree will control the decision over this, default = 1
    colsample_bynode=[1],   # colsample_bytree will control the decision over this, default = 1
    # L2 regularization term on weights; increasing it makes the model more conservative.
    reg_lambda=[1],  # default = 1
    # L1 regularization term on weights; increasing it makes the model more conservative.
    reg_alpha=[1],  # default = 0
    # Balance of positive and negative weights, useful for unbalanced classes.
    # A typical value to consider: sum(negative instances) / sum(positive instances).
    scale_pos_weight=[1],
    # Number of trees in the ensemble; too many can still cause overfitting.
    n_estimators=[300, 400],
    booster=["gbtree"],
    verbosity=[1],
    objective=["reg:squarederror"],
    seed=[50],
)

In [34]:
# Running histories of the tuning iterations (CV score, estimator, R2), reset to empty.
best_score_list, best_params_list, best_R2_list = [], [], []

In [86]:
# Grid handed to GridSearchCV; every entry currently holds a single candidate.
# The trailing notes record the order in which the parameters are tuned.
hyperparameters_tuning = dict(
    learning_rate=[0.05],            # 6th when boosting
    gamma=[0],                       # 3rd
    max_depth=[2],                   # 1st to tune
    min_child_weight=[0.1],          # 2nd
    max_delta_step=[0],
    subsample=[0.9],                 # 4th
    colsample_bytree=[0.1],          # 4th
    colsample_bylevel=[1],
    colsample_bynode=[1],
    reg_lambda=[0],                  # 5th
    reg_alpha=[0],                   # 5th
    scale_pos_weight=[1.0],          # only when dealing with imbalance classes
    n_estimators=[150],              # 1st
    booster=["gbtree"],
    verbosity=[1],
    objective=["reg:squarederror"],
    seed=[99],
)

In [81]:
# General Parameters setting inside Regressor
# Base estimator for the grid search, constructed with no arguments; the cell
# that builds GridSearchCV passes it as `estimator` together with the param grid.
xgboost = xgb.XGBRegressor()

In [82]:
# 10-fold grid search over `hyperparameters_tuning`, scored by negative MSE.
xgboost_grid = GridSearchCV(
    estimator=xgboost,
    param_grid=hyperparameters_tuning,
    scoring="neg_mean_squared_error",
    cv=10,
    iid=False,
)

In [83]:
# Run the grid search on the engineered training features against log_transform_y.
xgboost_grid.fit(X=train_rf_x_engine, y=log_transform_y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'learning_rate': [0.05], 'gamma': [0], 'max_depth': [2], 'min_child_weight': [0.1], 'max_delta_step': [0], 'subsample': [0.9], 'colsample_bytree': [0.1], 'colsample_bylevel': [1], 'colsample_bynode': [1], 'reg_lambda': [0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9], 'reg_alpha': [0], 'scale_pos_weight': [1.0], 'n_estimators': [150], 'booster': ['gbtree'], 'verbosity': [1], 'objective': ['reg:squarederror'], '

In [84]:
# Collect the grid-search outcome. Cross-validation runs inside GridSearchCV,
# so no separate train/test split is made here.
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_
best_rf = xgboost_grid.best_estimator_
# score() is evaluated on the same data the search was fitted on.
best_R2_score = best_rf.score(train_rf_x_engine, log_transform_y)

# Record this iteration in the running histories.
# NOTE(review): `best_params_list` receives the fitted estimator (best_rf), not
# `best_params` — confirm that is intended.
best_score_list += [best_score]
best_params_list += [best_rf]
best_R2_list += [best_R2_score]

print(best_score, best_R2_score, best_rf, sep="\n")

-0.1939798893670266
0.30542657219387714
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.1, gamma=0,
       importance_type='gain', learning_rate=0.05, max_delta_step=0,
       max_depth=2, min_child_weight=0.1, missing=None, n_estimators=150,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=0.7, scale_pos_weight=1.0,
       seed=99, silent=None, subsample=0.9, verbosity=1)


In [85]:
# Show the accumulated tuning histories, one list per line.
print(best_score_list, best_R2_list, best_params_list, sep="\n")

[-0.3811379085095802, -0.1984436411533798, -0.19418048634820112, -0.2071351983017649, -0.19378022577636905, -0.19786000396856168, -0.19378022577636905, -0.1939798893670266]
[-0.4067121643828935, 0.43976002102661993, 0.39965794303201907, 0.4516438269645361, 0.38569817847278554, 0.37474816732229665, 0.38569817847278554, 0.30542657219387714]
[XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.2, gamma=0,
       importance_type='gain', learning_rate=0.01, max_delta_step=0,
       max_depth=2, min_child_weight=0, missing=None, n_estimators=350,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=0, scale_pos_weight=1.0,
       seed=99, silent=None, subsample=0.6, verbosity=1), XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.2, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
      

# Create XGBoost's DMatrix, after fine tuning the parameters

In [102]:
# Wrap the engineered training features and the log_transform_y target for the native xgboost API.
trainDMat = xgb.DMatrix(train_rf_x_engine, label=log_transform_y)

In [103]:
# Boosting budget: pair a lowered learning_rate with a very large
# num_boost_round ceiling so that training can converge. If convergence is
# slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05).
# Early stopping: the CV error must decrease at least once every
# <early_stopping_rounds> round(s) for boosting to continue; the last entry in
# the evaluation history is then the one from the best iteration.
num_boost_round, early_stopping_rounds = 150000, 50

In [104]:
# Scalar hyperparameters for the native xgb.cv / xgb.train calls.
# NOTE(review): `n_estimators` belongs to the sklearn wrapper; the native calls
# below take their round count from `num_boost_round`, so this entry is
# presumably ignored — confirm.
hyperparameters_boosting = dict(
    learning_rate=0.05,            # 6th when boosting
    gamma=0,                       # 3rd
    max_depth=2,                   # 1st to tune
    min_child_weight=0.1,          # 2nd
    max_delta_step=0,
    subsample=0.9,                 # 4th
    colsample_bytree=0.1,          # 4th
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_lambda=0,                  # 5th
    reg_alpha=0,                   # 5th
    scale_pos_weight=1.0,          # only when dealing with imbalance classes
    n_estimators=150,              # 1st
    booster="gbtree",
    verbosity=1,
    objective="reg:squarederror",
    seed=99,
)

In [105]:
# Cross-validate the boosting hyperparameters over 10 folds, tracking RMSE.
# Early stopping ends the run once the CV error stops improving for
# `early_stopping_rounds` consecutive rounds.
xgbCV = xgb.cv(
    hyperparameters_boosting,
    trainDMat,
    num_boost_round=num_boost_round,
    nfold=10,  # same as CV
    metrics={'rmse'},
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=True,
)

[0]	train-rmse:13.9605+0.0134369	test-rmse:13.9588+0.12786
[1]	train-rmse:13.2631+0.0128132	test-rmse:13.2616+0.127244
[2]	train-rmse:12.6009+0.0121477	test-rmse:12.599+0.12729
[3]	train-rmse:11.9714+0.0113061	test-rmse:11.969+0.127545
[4]	train-rmse:11.374+0.0107412	test-rmse:11.3702+0.129018
[5]	train-rmse:10.8062+0.0106731	test-rmse:10.803+0.129384
[6]	train-rmse:10.267+0.0101941	test-rmse:10.2637+0.129348
[7]	train-rmse:9.75435+0.00965762	test-rmse:9.75128+0.129239
[8]	train-rmse:9.26767+0.00915939	test-rmse:9.26473+0.129428
[9]	train-rmse:8.80524+0.00901517	test-rmse:8.80247+0.12944
[10]	train-rmse:8.36604+0.00853326	test-rmse:8.36278+0.130606
[11]	train-rmse:7.94896+0.00829499	test-rmse:7.94618+0.131195
[12]	train-rmse:7.55252+0.00783757	test-rmse:7.54949+0.13116
[13]	train-rmse:7.17644+0.00784431	test-rmse:7.17305+0.130483
[14]	train-rmse:6.81864+0.00726773	test-rmse:6.81495+0.130119
[15]	train-rmse:6.47943+0.00674688	test-rmse:6.47585+0.13044
[16]	train-rmse:6.1569+0.0064242	te

[130]	train-rmse:0.379352+0.00907348	test-rmse:0.429501+0.0900062
[131]	train-rmse:0.379062+0.00901647	test-rmse:0.429597+0.0897659
[132]	train-rmse:0.378781+0.00914285	test-rmse:0.42988+0.0898576
[133]	train-rmse:0.378378+0.00925128	test-rmse:0.429918+0.0900202
[134]	train-rmse:0.377993+0.00908776	test-rmse:0.429779+0.0895266
[135]	train-rmse:0.377622+0.0091181	test-rmse:0.429874+0.0896063
[136]	train-rmse:0.377161+0.009162	test-rmse:0.430053+0.0893568
[137]	train-rmse:0.376767+0.00939233	test-rmse:0.430207+0.0895385
[138]	train-rmse:0.376378+0.00931716	test-rmse:0.430181+0.0893658
[139]	train-rmse:0.376177+0.00927083	test-rmse:0.429402+0.089584
[140]	train-rmse:0.375902+0.00915164	test-rmse:0.428906+0.0892767
[141]	train-rmse:0.375578+0.00916178	test-rmse:0.428903+0.0893684
[142]	train-rmse:0.375269+0.00920259	test-rmse:0.428748+0.0893651
[143]	train-rmse:0.375057+0.00915243	test-rmse:0.42833+0.0889295
[144]	train-rmse:0.374756+0.00926198	test-rmse:0.428016+0.0887747
[145]	train-rmse

# Finalise XGBoost Model

In [106]:
# Number of rounds kept by the early-stopped CV run (one history row per round).
# It is stored under its own name so the `num_boost_round` search ceiling is not
# overwritten: re-running the xgb.cv cell afterwards would otherwise be silently
# capped at this smaller value.
best_num_boost_round = len(xgbCV)

# Refit a single booster on the full training DMatrix with that round count.
xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = best_num_boost_round,
)

In [None]:
# Plot the feature importance of the final booster on a 20x20 inch figure.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_importance(booster=xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [94]:
# Predict with the grid-search estimator (fitted on log_transform_y) and undo
# the log with np.exp.
# Note: the following cell reassigns `xgbFinal_submission` from `xgbFinal`.
xgbFinal_submission = np.exp(best_rf.predict(test_rf_x_engine))

In [107]:
# Predict the test set with the final booster (trained on log_transform_y) and
# undo the log with np.exp.
xgbFinal_submission = np.exp(xgbFinal.predict(xgb.DMatrix(test_rf_x_engine)))

In [108]:
## Other submission style
## Build the Kaggle submission file: test-set Ids paired with the predictions ##
testData = pd.read_csv("test.csv")
submission = pd.DataFrame(dict(Id=testData["Id"], Prediction=xgbFinal_submission))
submission.to_csv('Best_model_2(0.05)_trial.csv', index=False, header=True)

# Saving the final model

In [None]:
# Persist the final booster to disk. The context manager closes the file handle
# deterministically (a bare open() leaves that to garbage collection).
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the pickled booster; the context manager closes the file handle.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction with the reloaded model to verify the pickle round trip.
# Uses the engineered test features: the booster was trained on `train_rf_x_engine`
# and the submission was predicted from `test_rf_x_engine`, so predicting on
# `test_rf_x` would not reproduce the submitted values.
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_rf_x_engine))
xgb_test_p = np.exp(xgb_test_p)

In [None]:
# Element-wise comparison of the submitted predictions with those of the
# reloaded model; the sum counts the matching entries.
verify_model = np.equal(xgbFinal_submission, xgb_test_p)
verify_model.sum()

# 6th, XGBoost, Grid Search with Mutual Features & High Correlation(0.15) with sqrt_log_transformation_y

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle

In [None]:
# Broad candidate grid for XGBRegressor; each key maps to the list of values to try.
# NOTE(review): the GridSearchCV cell in this section is given
# `hyperparameters_tuning`, not this dict — confirm this grid is still needed.
hyperparameters_dict = dict(
    learning_rate=[0.1, 0.2, 0.3],  # range: [0,1], default = 0.3
    # Minimum loss reduction required to split a leaf node further;
    # the larger gamma is, the more conservative the algorithm becomes.
    gamma=[tenth / 10.0 for tenth in range(1, 10, 2)],
    # Maximum tree depth; raising it makes the model more complex and more likely to overfit.
    max_depth=[6],
    # Controls over-fitting; values that are too high can under-fit, so tune it with grid search.
    min_child_weight=[1],
    # Usually not needed, but may help in logistic regression when classes are extremely imbalanced.
    max_delta_step=[0],
    # Fraction of observations randomly sampled for each tree (typical values: 0.5-1).
    subsample=[0.8],
    # Fraction of columns randomly sampled for each tree, similar to max_features in GBM
    # (typical values: 0.5-1).
    colsample_bytree=[0.8, 1],
    colsample_bylevel=[1],  # colsample_bytree will control the decision over this, default = 1
    colsample_bynode=[1],   # colsample_bytree will control the decision over this, default = 1
    # L2 regularization term on weights; increasing it makes the model more conservative.
    reg_lambda=[1],  # default = 1
    # L1 regularization term on weights; increasing it makes the model more conservative.
    reg_alpha=[1],  # default = 0
    # Balance of positive and negative weights, useful for unbalanced classes.
    # A typical value to consider: sum(negative instances) / sum(positive instances).
    scale_pos_weight=[1],
    # Number of trees in the ensemble; too many can still cause overfitting.
    n_estimators=[300, 400],
    booster=["gbtree"],
    verbosity=[1],
    objective=["reg:squarederror"],
    seed=[50],
)

In [None]:
# Running histories of the tuning iterations (CV score, estimator, R2), reset to empty.
best_score_list, best_params_list, best_R2_list = [], [], []

In [None]:
# Grid handed to GridSearchCV; every entry currently holds a single candidate.
# The trailing notes record the order in which the parameters are tuned.
hyperparameters_tuning = dict(
    learning_rate=[0.05],            # 6th when boosting
    gamma=[0.1],                     # 3rd
    max_depth=[3],                   # 1st to tune
    min_child_weight=[0.1],          # 2nd
    max_delta_step=[0],
    subsample=[0.9],                 # 4th
    colsample_bytree=[0.2],          # 4th
    colsample_bylevel=[1],
    colsample_bynode=[1],
    reg_lambda=[0],                  # 5th
    reg_alpha=[0],                   # 5th
    scale_pos_weight=[1.0],          # only when dealing with imbalance classes
    n_estimators=[130],              # 1st
    booster=["gbtree"],
    verbosity=[1],
    objective=["reg:squarederror"],
    seed=[99],
)

In [None]:
# General Parameters setting inside Regressor
# Base estimator for the grid search, constructed with no arguments; the cell
# that builds GridSearchCV passes it as `estimator` together with the param grid.
xgboost = xgb.XGBRegressor()

In [None]:
# 10-fold grid search over `hyperparameters_tuning`, scored by negative MSE.
xgboost_grid = GridSearchCV(
    estimator=xgboost,
    param_grid=hyperparameters_tuning,
    scoring="neg_mean_squared_error",
    cv=10,
    iid=False,
)

In [None]:
# Run the grid search on the engineered training features against sqrt_log_y.
xgboost_grid.fit(X=train_rf_x_engine, y=sqrt_log_y)

In [None]:
# Collect the grid-search outcome. Cross-validation runs inside GridSearchCV,
# so no separate train/test split is made here.
best_cv_score = xgboost_grid.cv_results_
best_params = xgboost_grid.best_params_
best_score = xgboost_grid.best_score_
best_rf = xgboost_grid.best_estimator_
# score() is evaluated on the same data the search was fitted on.
best_R2_score = best_rf.score(train_rf_x_engine, sqrt_log_y)

# Record this iteration in the running histories.
# NOTE(review): `best_params_list` receives the fitted estimator (best_rf), not
# `best_params` — confirm that is intended.
best_score_list += [best_score]
best_params_list += [best_rf]
best_R2_list += [best_R2_score]

print(best_score, best_R2_score, best_rf, sep="\n")

In [None]:
# Show the accumulated tuning histories, one list per line.
print(best_score_list, best_R2_list, best_params_list, sep="\n")

# Create XGBoost's DMatrix, after fine tuning the parameters

In [None]:
# Wrap the engineered training features and the sqrt_log_y target for the native xgboost API.
trainDMat = xgb.DMatrix(train_rf_x_engine, label=sqrt_log_y)

In [None]:
# Boosting budget: pair a lowered learning_rate with a very large
# num_boost_round ceiling so that training can converge. If convergence is
# slow, retry with a slightly higher learning rate (e.g. 0.075 instead of 0.05).
# Early stopping: the CV error must decrease at least once every
# <early_stopping_rounds> round(s) for boosting to continue; the last entry in
# the evaluation history is then the one from the best iteration.
num_boost_round, early_stopping_rounds = 150000, 50

In [None]:
# Scalar hyperparameters for the native xgb.cv / xgb.train calls.
# NOTE(review): `n_estimators` belongs to the sklearn wrapper; the native calls
# below take their round count from `num_boost_round`, so this entry is
# presumably ignored — confirm.
hyperparameters_boosting = dict(
    learning_rate=0.001,           # 6th when boosting
    gamma=0.1,                     # 3rd
    max_depth=3,                   # 1st to tune
    min_child_weight=0.1,          # 2nd
    max_delta_step=0,
    subsample=0.9,                 # 4th
    colsample_bytree=0.2,          # 4th
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_lambda=0,                  # 5th
    reg_alpha=0,                   # 5th
    scale_pos_weight=1.0,          # only when dealing with imbalance classes
    n_estimators=130,              # 1st
    booster="gbtree",
    verbosity=1,
    objective="reg:squarederror",
    seed=99,
)

In [None]:
# Cross-validate the boosting hyperparameters over 10 folds, tracking RMSE.
# Early stopping ends the run once the CV error stops improving for
# `early_stopping_rounds` consecutive rounds.
xgbCV = xgb.cv(
    hyperparameters_boosting,
    trainDMat,
    num_boost_round=num_boost_round,
    nfold=10,  # same as CV
    metrics={'rmse'},
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=True,
)

# Finalise XGBoost Model

In [None]:
# Number of rounds kept by the early-stopped CV run (one history row per round).
# It is stored under its own name so the `num_boost_round` search ceiling is not
# overwritten: re-running the xgb.cv cell afterwards would otherwise be silently
# capped at this smaller value.
best_num_boost_round = len(xgbCV)

# Refit a single booster on the full training DMatrix with that round count.
xgbFinal = xgb.train(
    params = hyperparameters_boosting, 
    dtrain = trainDMat, 
    num_boost_round = best_num_boost_round,
)

In [None]:
# Plot the feature importance of the final booster on a 20x20 inch figure.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_importance(booster=xgbFinal, ax=ax)

# Prediction for Kaggle Submission

In [None]:
# Predict the test set with the final booster (trained on sqrt_log_y) and map the
# predictions back by applying exp and then squaring.
xgbFinal_submission = np.square(np.exp(xgbFinal.predict(xgb.DMatrix(test_rf_x_engine))))

In [None]:
## Other submission style
## Build the Kaggle submission file: test-set Ids paired with the predictions ##
# NOTE(review): the 4th section writes to this same file name — confirm the
# overwrite is intended.
testData = pd.read_csv("test.csv")
submission = pd.DataFrame(dict(Id=testData["Id"], Prediction=xgbFinal_submission))
submission.to_csv('Best_model_1(0.001)_trial.csv', index=False, header=True)

# Saving the final model

In [None]:
# Persist the final booster to disk. The context manager closes the file handle
# deterministically (a bare open() leaves that to garbage collection).
with open("xgbFinal.pickle.dat", "wb") as model_file:
    pickle.dump(xgbFinal, model_file)

# Loading the final model

In [None]:
# Reload the pickled booster; the context manager closes the file handle.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("xgbFinal.pickle.dat", "rb") as model_file:
    xgb_test = pickle.load(model_file)

In [None]:
# Re-forecast the prediction with the reloaded model to verify the pickle round trip.
xgb_test_p = xgb_test.predict(xgb.DMatrix(test_rf_x_engine))
# Apply the same back-transformation as the submission cell (exp, then square);
# with np.exp alone the equality check in the next cell could never match.
xgb_test_p = np.square(np.exp(xgb_test_p))

In [None]:
# Element-wise comparison of the submitted predictions with those of the
# reloaded model; the sum counts the matching entries.
verify_model = np.equal(xgbFinal_submission, xgb_test_p)
verify_model.sum()