## 1. Import the Pandas Library

In [151]:
import numpy as np
import pandas as pd

## 2. Exploratory Data Analysis

In [152]:
# Read the data into a data frame
data = pd.read_csv('kc_house_data.csv')

In [153]:
# Check the number of data points in the data set
print(len(data))
# Check the number of features in the data set
print(len(data.columns))
# Check the data types available in this dataset
print(data.dtypes.unique())

21613
21
[dtype('int64') dtype('O') dtype('float64')]


We have one both numerical and categorical columns in this dataset.

## 3. Check Categorical Features
We can get the categorical column list using the below code

In [6]:
data.select_dtypes(include=['O']).columns.tolist()

['date']


We only have one categorical columns which is the date column that we will ignore. If you are interested to fine tune this model further, then you can preprocess this column. 


In [155]:
#view sample data
data.sample(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17399,5153900080,20140714T000000,199000.0,3,1.0,1510,9100,1.0,0,0,...,7,1510,0,1966,0,98003,47.3331,-122.319,1180,7220
5701,1552800010,20150311T000000,352000.0,5,2.75,2980,9838,1.0,0,0,...,7,1710,1270,1968,0,98030,47.3807,-122.222,2240,9838
17305,452002135,20150422T000000,1070000.0,4,2.5,2740,5000,2.0,0,0,...,9,2740,0,2012,0,98107,47.674,-122.371,1660,5000
13795,9100000040,20140807T000000,480000.0,3,1.75,1710,4080,1.0,0,0,...,7,1130,580,1979,0,98136,47.5563,-122.392,1200,4080
5446,7645900235,20140710T000000,880000.0,6,2.5,2640,3680,2.0,0,0,...,8,1760,880,1922,0,98126,47.5771,-122.38,1960,3680


we don not need the id column also. So drop the column while fitting the data in ML model

## 4. Missing Value Columns List

In [10]:
#get the dimension of dataset
data.shape

(21613, 21)

In [9]:
#get record count for each variable
data.count()

id               21613
date             21613
price            21613
bedrooms         21613
bathrooms        21613
sqft_living      21613
sqft_lot         21613
floors           21613
waterfront       21613
view             21613
condition        21613
grade            21613
sqft_above       21613
sqft_basement    21613
yr_built         21613
yr_renovated     21613
zipcode          21613
lat              21613
long             21613
sqft_living15    21613
sqft_lot15       21613
dtype: int64

This dataset contains no missing value. So we can proceed further to make the machine learning model using the lightgbm

## 5. Split the Dataset

In [18]:
#separte x and y variable
X_data = data[['sqft_living','grade', 'sqft_above', 'sqft_living15','bathrooms','view',
                 'sqft_basement','lat','waterfront','yr_built','bedrooms']]
X=X_data.values
y = data.price.values

In [19]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [23]:
#get train and test dataset size
X_train.shape


(17290, 11)

In [22]:
X_test.shape

(4323, 11)

17k records for training and 4k records for testing model efficiency.

## 6. XGBoost Regression Model

In [24]:
# import xgboost library
import xgboost

In [81]:
# Let's try XGboost algorithm to see if we can get better results
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

You can give your custom values for the above parameters except objective. Why because we are doing the regression model, so the objective must be regression(more regression algorithms are available. Check documentation). To know more about parameters https://lightgbm.readthedocs.io/en/latest/Parameters.html
Boosting type traditional Gradient Boosting Decision Tree.

In [46]:
#convert your data to lightgbm dataset format
d_train = lgb.Dataset(X_train, label=y_train)
d_valid = lgb.Dataset(X_test, label=y_test)
valid_sets = [d_valid]

In [156]:
#pass data
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'max_depth': 7, 
    'num_leaves':10,
    'learning_rate': 0.5,
    'verbose': 0
}
n_estimators = 100
model = lgb.train(params=params, train_set=d_train, num_boost_round=n_estimators,
                  valid_sets=valid_sets,verbose_eval=1)

[1]	valid_0's l2: 1.19068e+11
[2]	valid_0's l2: 1.19027e+11
[3]	valid_0's l2: 1.18987e+11
[4]	valid_0's l2: 1.18947e+11
[5]	valid_0's l2: 1.18907e+11
[6]	valid_0's l2: 1.18866e+11
[7]	valid_0's l2: 1.18826e+11
[8]	valid_0's l2: 1.18786e+11
[9]	valid_0's l2: 1.18746e+11
[10]	valid_0's l2: 1.18706e+11
[11]	valid_0's l2: 1.18666e+11
[12]	valid_0's l2: 1.18625e+11
[13]	valid_0's l2: 1.18585e+11
[14]	valid_0's l2: 1.18545e+11
[15]	valid_0's l2: 1.18505e+11
[16]	valid_0's l2: 1.18465e+11
[17]	valid_0's l2: 1.18425e+11
[18]	valid_0's l2: 1.18385e+11
[19]	valid_0's l2: 1.18345e+11
[20]	valid_0's l2: 1.18305e+11
[21]	valid_0's l2: 1.18265e+11
[22]	valid_0's l2: 1.18225e+11
[23]	valid_0's l2: 1.18186e+11
[24]	valid_0's l2: 1.18146e+11
[25]	valid_0's l2: 1.18106e+11
[26]	valid_0's l2: 1.18066e+11
[27]	valid_0's l2: 1.18026e+11
[28]	valid_0's l2: 1.17986e+11
[29]	valid_0's l2: 1.17948e+11
[30]	valid_0's l2: 1.1791e+11
[31]	valid_0's l2: 1.17871e+11
[32]	valid_0's l2: 1.17833e+11
[33]	valid_0's l2:

In [38]:
#predict values
y_pred = model.predict(X_test)

## 7. Computer Error Metrics

In [157]:
y_pred = model.predict(X_test)
# Evaluating the Algorithm
from sklearn.metrics import explained_variance_score
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance',explained_variance_score(y_test,y_pred))
print('R Square :', metrics.r2_score(y_test, y_pred))  


Mean Absolute Error: 224653.524353
Mean Squared Error: 115312905075.0
Root Mean Squared Error: 339577.53912
Explained Variance 0.0311840359791
R Square : 0.0303669237836


In [92]:
from catboost import CatBoostRegressor

In [143]:
modelc=CatBoostRegressor(iterations=200,
                            learning_rate=0.6,
                            depth=8,
                            loss_function='RMSE',
                            eval_metric='RMSE',
                            random_seed=99,
                            od_type='Iter',
                            od_wait=50)
modelc.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, verbose=False)
y_pred = modelc.predict(X_test)
# Evaluating the Algorithm
from sklearn.metrics import explained_variance_score
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance',explained_variance_score(y_test,y_pred))
print('R Square :', metrics.r2_score(y_test, y_pred))  


Mean Absolute Error: 87852.5034367
Mean Squared Error: 21712306616.1
Root Mean Squared Error: 147350.964083
Explained Variance 0.817427881063
R Square : 0.817427454089


In [147]:
modelg=lgb.LGBMRegressor(n_estimators=200)
modelg.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)
y_pred = modelg.predict(X_test)
# Evaluating the Algorithm
from sklearn.metrics import explained_variance_score
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance',explained_variance_score(y_test,y_pred))
print('R Square :', metrics.r2_score(y_test, y_pred))  


Mean Absolute Error: 217080.187761
Mean Squared Error: 111214696777.0
Root Mean Squared Error: 333488.675635
Explained Variance 0.0653091914419
R Square : 0.0648275794798


In [150]:
lgb_params = {}
lgb_params['boost'] = 'gbdt'
lgb_params['objective'] = 'regression_l2'
lgb_params['num_leaves'] = 128
lgb_params['sub_feature'] = 0.8 
lgb_params['max_depth'] = 9
lgb_params['feature_fraction'] = 0.7
lgb_params['bagging_fraction'] = 0.7
lgb_params['bagging_freq'] = 50
lgb_params['learning_rate'] = 0.01
lgb_params['num_iterations'] = 1500
lgb_params['early_stopping_round'] = 50
lgb_params['verbose'] = 2

model = lgb.train(params=lgb_params, train_set=d_train, num_boost_round=n_estimators,
                  valid_sets=valid_sets,verbose_eval=1)

y_pred = model.predict(X_test)
# Evaluating the Algorithm
from sklearn.metrics import explained_variance_score
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Explained Variance',explained_variance_score(y_test,y_pred))
print('R Square :', metrics.r2_score(y_test, y_pred))  




ValueError: For early stopping, at least one dataset and eval metric is required for evaluation