In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Reading in Data to DataFrame
df = pd.read_csv("FinalData.csv")

#Extracting the correct columns
df = df.iloc[:,1:]
train_df = df[df.SalePrice.isnull() == False]
predict_df = df[df.SalePrice.isnull() == True].drop('SalePrice',axis=1)

In [3]:
# Creating the DMatrix to be used by XGBoost Model
X = train_df.drop(['SalePrice','Id'],axis=1)
y = train_df.SalePrice

data_dmatrix = xgb.DMatrix(data=X,label=y)

In [4]:
#Creating the model with KFold Cross Validation
params = {"objective":"reg:squarederror",'colsample_bytree': 0.5,'learning_rate': 0.4,
                'max_depth': 5, 'reg_lambda': 5}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=5,
                    num_boost_round=100,
                    early_stopping_rounds=10,
                    metrics="rmse", as_pandas=True, seed=123)

In [5]:
cv_results.tail(1)

Unnamed: 0,test-rmse-mean,test-rmse-std,train-rmse-mean,train-rmse-std
38,24571.138672,1364.549944,8213.045313,134.424438


In [37]:
#HyperParameter Tuning using Random Search
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

estimator = XGBRegressor(objective = "reg:squarederror",
                         nthread=1,
                         seed =42)

parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': list(np.arange(0.1,1,0.01)),
    'reg_lambda': list(np.arange(1,5,0.1)),
    'colsample_bytree': [0.3,0.5,0.7]
}
random_search = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=parameters,
    scoring = 'neg_mean_squared_error',
    n_iter = 100,
    cv = 10,
    verbose=True
)

In [38]:
random_search.fit(X,y)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


KeyboardInterrupt: 

In [14]:
random_search.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.12,
 'max_depth': 3,
 'n_estimators': 180,
 'reg_lambda': 3.500000000000002}

In [42]:
#Training the model with found parameters

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
dtrain,dtest = xgb.DMatrix(data=X_train,label=y_train), xgb.DMatrix(data=X_test,label=y_test)

evallist = [(dtest, 'eval'), (dtrain, 'train')]

param = {'colsample_bytree': 0.7,
 'learning_rate': 0.12,
 'max_depth': 3,
 'n_estimators': 180,
 'reg_lambda': 3.5,
  "objective":"reg:squarederror",
  'eval_metric': 'rmse'}

no_jobs = 10000

bst = xgb.train(param,data_dmatrix,no_jobs,evallist,early_stopping_rounds=5)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	eval-rmse:163443.79688	train-rmse:167164.12500
Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.

Will train until train-rmse hasn't improved in 5 rounds.
[1]	eval-rmse:144941.98438	train-rmse:148385.90625
[2]	eval-rmse:128709.00781	train-rmse:131891.42188
[3]	eval-rmse:114390.96875	train-rmse:117418.18750
[4]	eval-rmse:101928.33594	train-rmse:104612.18750
[5]	eval-rmse:90897.73438	train-rmse:93444.69531
[6]	eval-rmse:81233.14844	train-rmse:83677.95312
[7]	eval-rmse:72784.27344	train-rmse:75165.43750
[8]	eval-rmse:65279.93750	train-rmse:67555.61719
[9]	eval-rmse:58680.75781	train-rmse:61029.48047
[10]	eval-rmse:52980.59766	train-rmse:55248.57031
[11]	eval-rmse:480

In [83]:
bst.save_model("Model")

In [86]:
df_id = predict_df['Id']
df1 = predict_df.drop('Id',axis=1)
predict_matrix = xgb.DMatrix(df1)
Predictions = bst.predict(predict_matrix)

In [118]:
Id = [x[1] for x in list(predict_df.iloc[:,-1].items())]
Final = pd.Series(dict(zip(Id,Predictions)))
Final.to_csv("Predictions.csv")