In [121]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib import style
%matplotlib inline
style.use('seaborn')

In [122]:
df = pd.read_csv('data/train.csv',index_col='Id')
cities = pd.read_csv('data/cities-month.csv')

In [123]:
df.drop(['MSSubClass','MSZoning','LotFrontage','LotShape','Utilities',
         'LotConfig','Neighborhood','Condition1','Condition2',
         'RoofMatl','Exterior1st','RoofStyle','MasVnrType',
         'MasVnrArea','BsmtExposure','BsmtFinType1','BsmtFinType2',
         'BsmtFinSF2','BsmtUnfSF','BsmtFinSF1','Heating','Electrical',
         'LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
         'FireplaceQu','GarageYrBlt','GarageFinish','GarageQual',
         'GarageCond','PavedDrive','WoodDeckSF','OpenPorchSF',
         'EnclosedPorch','3SsnPorch','ScreenPorch','PoolQC','Fence',
         'MiscFeature','MiscVal','SaleType','SaleCondition','LandContour',
         'Functional','BldgType','Foundation','CentralAir','YrSold',
         'Street','Alley','HouseStyle','Exterior2nd','YearRemodAdd','GarageType'],inplace=True,axis=1)

In [124]:
df.columns

Index(['LotArea', 'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt',
       'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'TotalBsmtSF',
       'HeatingQC', '1stFlrSF', '2ndFlrSF', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'PoolArea', 'MoSold',
       'SalePrice'],
      dtype='object')

In [125]:
df.LandSlope.value_counts()

Gtl    1382
Mod      65
Sev      13
Name: LandSlope, dtype: int64

In [126]:
df['ExterQual'].value_counts()

TA    906
Gd    488
Ex     52
Fa     14
Name: ExterQual, dtype: int64

In [127]:
def qualScale(column):
    eq = np.array(df[column].tolist())
    eq[eq == 'Fa'] = 3
    eq[eq == 'Ex'] = 5
    eq[eq == 'Gd'] = 4
    eq[eq == 'TA'] = 3.5
    eq[eq == 'Po'] = 2
    df[column] = eq
    df[column] = df[column].astype(float)

In [128]:
qualScale('KitchenQual')
qualScale('ExterCond')
qualScale('BsmtQual')
qualScale('BsmtCond')
qualScale('HeatingQC')
qualScale('ExterQual')

In [130]:
eq = np.array(df['LandSlope'].tolist())
eq[eq == 'Gtl'] = 0
eq[eq == 'Mod'] = 1
eq[eq == 'Sev'] = 2
df['LandSlope'] = eq.astype(float)

1
2
3
4
5
6
7
8
9
10
11


In [131]:
df = df.rename(columns={'TotalBsmtSF':'BasementArea','1stFlrSF':'1stFlrArea',
                   '2ndFlrSF':'2ndFlrArea','BedroomAbvGr':'Bedrooms',
                   'KitchenAbvGr':'Kitchen','TotRmsAbvGrd':'TotalRooms'})

In [132]:
X = df.drop(['SalePrice'],axis=1)
y = df['SalePrice']

In [160]:
params = {
    'objective':'regression',
    'num_leaves':5,                          
    'learning_rate':0.05, 
    'max_bin' : 55, 
    'bagging_fraction' : 0.8,
    'bagging_freq' : 5, 
    'feature_fraction' : 0.2319,
    'feature_fraction_seed':9, 
    'bagging_seed':9,
    'min_data_in_leaf' :6, 
    'min_sum_hessian_in_leaf' : 11,
    'verbose':3
}

In [163]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [165]:
import lightgbm as lgb
from lightgbm import LGBMRegressor
train = lgb.Dataset(X_train,label=y_train)
test = lgb.Dataset(X_test,label=y_test)
lgbm = lgb.train(params,train,1000,valid_sets=[train,test],early_stopping_rounds=30)

[1]	training's l2: 5.77755e+09	valid_1's l2: 6.66604e+09
Training until validation scores don't improve for 30 rounds.
[2]	training's l2: 5.4706e+09	valid_1's l2: 6.38435e+09
[3]	training's l2: 5.16505e+09	valid_1's l2: 6.09165e+09
[4]	training's l2: 4.87883e+09	valid_1's l2: 5.84302e+09
[5]	training's l2: 4.61904e+09	valid_1's l2: 5.60341e+09
[6]	training's l2: 4.38355e+09	valid_1's l2: 5.39711e+09
[7]	training's l2: 4.12574e+09	valid_1's l2: 5.13891e+09
[8]	training's l2: 3.92631e+09	valid_1's l2: 4.97049e+09
[9]	training's l2: 3.70929e+09	valid_1's l2: 4.75434e+09
[10]	training's l2: 3.50877e+09	valid_1's l2: 4.55531e+09
[11]	training's l2: 3.34386e+09	valid_1's l2: 4.3647e+09
[12]	training's l2: 3.19331e+09	valid_1's l2: 4.22498e+09
[13]	training's l2: 3.02291e+09	valid_1's l2: 4.05652e+09
[14]	training's l2: 2.8915e+09	valid_1's l2: 3.92712e+09
[15]	training's l2: 2.77209e+09	valid_1's l2: 3.80854e+09
[16]	training's l2: 2.65866e+09	valid_1's l2: 3.71109e+09
[17]	training's l2: 2.

In [168]:
lgbm.predict(X.iloc[10:12])

array([123016.19312964, 422618.35222351])