In [26]:
import joblib as jb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [195]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xg

In [28]:
house_train = jb.load('house_train_data')

In [29]:
house_test = jb.load('house_test2')

In [30]:
categorical_features = jb.load('categories')

In [31]:
house_data = pd.concat([house_train,house_test]).reset_index(drop = True)

In [32]:
def dummies(data,columns):
    
    for i in columns:
        oh = OneHotEncoder(sparse = False,drop = 'first')
        oh.fit(data[[i]])
        cate = oh.categories_[0]
        u = 0
        for j in cate[1:]:
            if u <= len(cate[1:]):
                data[j] = oh.transform(data[[i]])[:,u]
                u +=1
        data.drop(i,axis = 1,inplace = True)
    return data

In [33]:
house_data.drop('Id',axis = 1,inplace=True)

In [34]:
house_data.shape

(2911, 75)

In [35]:
house_data2 = dummies(house_data,categorical_features)

In [36]:
house_data.shape

(2911, 176)

### Split data

In [188]:
X = house_data.drop('SalePrice',axis = 1)
y = house_data['SalePrice']

In [189]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## Xgboost model

In [190]:
model = xg.XGBRegressor()

In [191]:
import timeit

start = timeit.default_timer()

model.fit(X_train,y_train)

stop = timeit.default_timer()

print('Time: ', stop - start)  


Time:  0.5582910000002812


In [192]:
model.score(X_test,y_test)

0.8734087802768278

## Parameter Tuning

In [198]:
param = {'learning_rate' : [0.01,0.05,0.1,0.2],
        'max_depth':[3,5,7,8],
        'gamma':[0,.1,.2,.3],
        'min_child_weight' : [5,6,8,10],
        'colsample_bytree' : [0.65,0.75,0.85,0.95]}

In [199]:
start = timeit.default_timer()

ra_model = RandomizedSearchCV(xg.XGBRegressor(),param_distributions= param)

stop = timeit.default_timer()

print('Time: ', stop - start)  


Time:  0.00013670000043930486


In [200]:
start = timeit.default_timer()

ra_model.fit(X,y)

stop = timeit.default_timer()

print('Time: ', stop - start)  

Time:  25.888447899999846


In [201]:
ra_model.best_params_ # best parameters

{'min_child_weight': 5,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 0,
 'colsample_bytree': 0.95}

In [None]:
## best - min_child_weight =  5,max_depth = 8,learning_rate = 0.1,gamma = 0.3,colsample_bytree = 0.75(previous one)

### Cross Validation

In [203]:
start = timeit.default_timer()

score = cross_val_score(xg.XGBRegressor(min_child_weight =  5,max_depth = 8,learning_rate = 0.1,gamma = 0,colsample_bytree = 0.95)
                        ,X,y,cv = 10)

stop = timeit.default_timer()

print('Time: ', stop - start)  

Time:  8.415321400000721


In [205]:
score

array([0.86972259, 0.81312961, 0.8824293 , 0.91674525, 0.83123059,
       0.96592368, 0.96405085, 0.92997792, 0.93813305, 0.94411779])

In [206]:
score.mean() # score from the cv

0.9055460617883548

### Best Score after parameter tuning and CV

In [265]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.4, random_state=0)

In [266]:
summa = xg.XGBRegressor(min_child_weight =  5,max_depth = 8,learning_rate = 0.1,gamma = 0,colsample_bytree = 0.95)

In [267]:
summa.fit(X_train2,y_train2)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.95, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=8,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [268]:
summa.score(X_test,y_test)

0.9541076052772117