Reference: LightGBM Python-package introduction — https://lightgbm.readthedocs.io/en/latest/Python-Intro.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Extend the module search path before importing the project-local modules
# below; the order of these statements matters.
# (presumably config.py and data_prep.py live in ../ames — confirm)
import sys
sys.path.append('../ames') # path to the directory
import config
from data_prep import clean, add_features, dummify

In [3]:
# Run the raw CSV through clean() and then add_features().
housing = add_features(clean(config.HOUSING_CSV))

# NOTE(review): the line below rebinds `housing` to the pickled frame, so the
# result of the clean/add_features pipeline above is discarded. Confirm which
# of the two sources is intended and drop the other.
housing = pd.read_pickle(config.HOUSING_PICKLE)

In [4]:
# Initial candidate list of predictor column names (43 entries, order preserved).
the_chosen_variables = [
    '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch',
    'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars',
    'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr',
    'LotArea', 'LotFrontage', 'LowQualFinSF', 'MasVnrArea',
    'MiscVal', 'MoSold', 'MSSubClass', 'OpenPorchSF',
    'OverallCond', 'OverallQual', 'PoolArea', 'ScreenPorch',
    'TotalBsmtSF', 'TotRmsAbvGrd', 'WoodDeckSF', 'YearBuilt',
    'YearRemodAdd', 'YrSold', 'DecadeBuilt', 'DecadeRemodel',
    'TotalLivingArea', 'UnusedLotSize', 'HasPool', 'HouseAge',
    'HouseAgeSq', 'Toilets', 'Showers',
]

In [5]:
# The following variables have been removed because they correlate with other variables.
# Having them included would be redundant.
# Columns dropped from the_chosen_variables, each annotated with the reason
# it is considered redundant. Entries are removed in the order listed.
_redundant_columns = (
    'GrLivArea',      # Baked into TotalLivingArea
    # '1stFlrSF',     # Baked into TotalLivingArea?
    'TotalBsmtSF',    # Baked into TotalLivingArea
    'BsmtFinSF1',     # Baked into TotalLivingArea
    'BsmtUnfSF',      # Baked into TotalLivingArea
    'LotArea',        # Baked into UnusedLotSize
    'HouseAgeSq',     # Baked into HouseAge
    'YearBuilt',      # Baked into HouseAge
    'GarageYrBlt',    # Redundant with YearRemodAdd/YearBuilt
    'FullBath',       # Baked into Toilets/Showers
    'HalfBath',       # Baked into Toilets
    'BsmtFullBath',   # Baked into Toilets/Showers
    'BsmtHalfBath',   # Baked into Toilets
    'PoolArea',       # Redundant with HasPool
    'GarageCars',     # Redundant with GarageArea
)

# list.remove mutates the list in place and raises ValueError when the name is
# absent, so re-running this cell without rebuilding the list first will fail.
for _column in _redundant_columns:
    the_chosen_variables.remove(_column)

In [6]:
# Predictor columns actually used for modelling. This assignment rebinds
# the_chosen_variables, replacing any value the name held before this cell.
the_chosen_variables = [
    '3SsnPorch', 'BedroomAbvGr', 'EnclosedPorch', 'Fireplaces',
    'GarageArea', 'KitchenAbvGr', 'LotFrontage', 'LowQualFinSF',
    'MasVnrArea', 'MiscVal', 'MoSold', 'MSSubClass',
    'OpenPorchSF', 'OverallCond', 'OverallQual', 'ScreenPorch',
    'TotRmsAbvGrd', 'WoodDeckSF', 'YearRemodAdd', 'YrSold',
    'TotalLivingArea', 'UnusedLotSize', 'HasPool', 'HouseAge',
    'Toilets', 'Showers', 'UpDownRatio',
]

In [7]:
# Smaller alternative feature set; not referenced by the statements in this cell.
limited_variables = [
    'TotalLivingArea',
    'UnusedLotSize',
    'HasPool',
    'OverallQual',
    'GarageArea',
    'YearBuilt',
]

# Select the chosen predictor columns first, then build the dummy columns.
_selected_df = housing[the_chosen_variables]

# Maps each categorical column to the short label passed to dummify
# (presumably used as the dummy-column prefix — confirm in data_prep).
dummy_vars = {
    'Neighborhood': 'Nbhd',
    'LotConfig': 'LC',
    'SaleCondition': 'SC',
}
dummy_df = dummify(housing, dummy_vars)

# Design matrix: selected columns side by side with the dummy columns.
X = pd.concat([_selected_df, dummy_df], axis=1)

# Target: natural log of the sale price.
y = np.log(housing['SalePrice'])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
_split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

In [9]:
from lightgbm import LGBMRegressor, cv

# Fit a LightGBM regressor with a fixed seed and otherwise default settings.
lgb = LGBMRegressor(random_state=42)
lgb.fit(X_train, y_train)

# Print the estimator's score() for the training split, then the test split.
for _features, _target in ((X_train, y_train), (X_test, y_test)):
    print(lgb.score(_features, _target))

0.9742813178285777
0.9103517916510846


In [10]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the fitted model on the held-out split.
# The square root is taken explicitly rather than via `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# np.sqrt(MSE) gives the same value on every scikit-learn version.
rms = np.sqrt(mean_squared_error(y_test, lgb.predict(X_test)))
print(rms)

0.1169258599396627


## Feature Importance

In [11]:
# One-column table of the fitted model's feature_importances_, indexed by the
# column names of X.
d = pd.DataFrame({'Importance': lgb.feature_importances_}, index=X.columns)

# Show the 25 highest-importance rows (bare last expression so the notebook
# renders the table).
d.sort_values(by='Importance', ascending=False).iloc[:25]

Unnamed: 0,Importance
TotalLivingArea,348
UnusedLotSize,296
HouseAge,241
GarageArea,234
OverallQual,157
OverallCond,152
YearRemodAdd,150
OpenPorchSF,149
LotFrontage,136
MasVnrArea,136
