In [177]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib import style
%matplotlib inline
style.use('seaborn')

In [178]:
df = pd.read_csv('data/train.csv',index_col='Id')
cities = pd.read_csv('data/cities-month.csv')

In [179]:
df.drop(['MSSubClass','MSZoning','LotFrontage','LotShape','Utilities',
         'LotConfig','Neighborhood','Condition1','Condition2',
         'RoofMatl','Exterior1st','RoofStyle','MasVnrType',
         'MasVnrArea','BsmtExposure','BsmtFinType1','BsmtFinType2',
         'BsmtFinSF2','BsmtUnfSF','BsmtFinSF1','Heating','Electrical',
         'LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
         'FireplaceQu','GarageYrBlt','GarageFinish','GarageQual',
         'GarageCond','PavedDrive','WoodDeckSF','OpenPorchSF',
         'EnclosedPorch','3SsnPorch','ScreenPorch','PoolQC','Fence',
         'MiscFeature','MiscVal','SaleType','SaleCondition','LandContour',
         'Functional','BldgType','Foundation','CentralAir','YrSold',
         'Street','Alley','HouseStyle','Exterior2nd','YearRemodAdd','GarageType','LandSlope','PoolArea'],inplace=True,axis=1)

In [180]:
df.columns

Index(['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterQual',
       'ExterCond', 'BsmtQual', 'BsmtCond', 'TotalBsmtSF', 'HeatingQC',
       '1stFlrSF', '2ndFlrSF', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'GarageArea', 'MoSold', 'SalePrice'],
      dtype='object')

In [181]:
columns = ['LotArea','GarageArea','1stFlrSF','2ndFlrSF','TotalBsmtSF','TotRmsAbvGrd',
          'BedroomAbvGr','FullBath','HalfBath','KitchenAbvGr','Fireplaces','GarageCars',
          'ExterCond','ExterQual','BsmtQual','BsmtCond','HeatingQC','KitchenQual',
          'OverallCond','OverallQual','YearBuilt','MoSold','SalePrice']

In [182]:
df = df[columns]

In [183]:
df['ExterQual'].value_counts()

TA    906
Gd    488
Ex     52
Fa     14
Name: ExterQual, dtype: int64

In [184]:
def qualScale(column):
    eq = np.array(df[column].tolist())
    eq[eq == 'Fa'] = 3
    eq[eq == 'Ex'] = 5
    eq[eq == 'Gd'] = 4
    eq[eq == 'TA'] = 3.5
    eq[eq == 'Po'] = 2
    df[column] = eq
    df[column] = df[column].astype(float)

In [185]:
qualScale('KitchenQual')
qualScale('ExterCond')
qualScale('BsmtQual')
qualScale('BsmtCond')
qualScale('HeatingQC')
qualScale('ExterQual')

In [186]:
X = df.drop(['SalePrice'],axis=1)
y = df['SalePrice']

In [215]:
params = {
    'metric':'rmse',
    'objective':'regression',
    'num_leaves':12,                          
    'learning_rate':0.05, 
    'max_bin' : 55, 
    'bagging_fraction' : 0.8,
    'bagging_freq' : 5, 
    'feature_fraction' : 0.5,
    'feature_fraction_seed':9, 
    'bagging_seed':9,
    'min_data_in_leaf' :6, 
    'min_sum_hessian_in_leaf' : 11,
    'verbose':0
}

In [212]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [217]:
lgbm.best_score['valid_1']['rmse']

32096.186785197486

In [226]:
import lightgbm as lgb
best = 40000
lgb_m = None
for i in range(5,100):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=i)
    train = lgb.Dataset(X_train,label=y_train)
    test = lgb.Dataset(X_test,label=y_test)
    lgbm = lgb.train(params,train,1000,valid_sets=[train,test],early_stopping_rounds=30)
    score = lgbm.best_score['valid_1']['rmse']
    if best > score:
        best = score
        lgb_m = lgbm

[1]	training's rmse: 76619.1	valid_1's rmse: 78228.6
Training until validation scores don't improve for 30 rounds.
[2]	training's rmse: 74134.9	valid_1's rmse: 75894.3
[3]	training's rmse: 71538.9	valid_1's rmse: 73428.1
[4]	training's rmse: 69104.7	valid_1's rmse: 71085.6
[5]	training's rmse: 67058.5	valid_1's rmse: 69390.3
[6]	training's rmse: 64676.5	valid_1's rmse: 67269.9
[7]	training's rmse: 62404	valid_1's rmse: 65015.5
[8]	training's rmse: 60378.6	valid_1's rmse: 62972.1
[9]	training's rmse: 58423	valid_1's rmse: 61188.5
[10]	training's rmse: 56519.2	valid_1's rmse: 59451.4
[11]	training's rmse: 54765.5	valid_1's rmse: 57818.1
[12]	training's rmse: 53025	valid_1's rmse: 56120.9
[13]	training's rmse: 51403.9	valid_1's rmse: 54734.3
[14]	training's rmse: 49945.5	valid_1's rmse: 53315.9
[15]	training's rmse: 48487.7	valid_1's rmse: 52009.6
[16]	training's rmse: 47208.2	valid_1's rmse: 50850.2
[17]	training's rmse: 45962.7	valid_1's rmse: 49660.8
[18]	training's rmse: 44837.9	valid

In [228]:
lgbm = lgb_m

In [229]:
fi = list(zip(lgbm.feature_importance(),df.drop(['SalePrice'],axis=1).columns))
fi

[(229, 'LotArea'),
 (187, 'GarageArea'),
 (221, '1stFlrSF'),
 (176, '2ndFlrSF'),
 (184, 'TotalBsmtSF'),
 (88, 'TotRmsAbvGrd'),
 (53, 'BedroomAbvGr'),
 (40, 'FullBath'),
 (38, 'HalfBath'),
 (12, 'KitchenAbvGr'),
 (53, 'Fireplaces'),
 (49, 'GarageCars'),
 (2, 'ExterCond'),
 (40, 'ExterQual'),
 (50, 'BsmtQual'),
 (10, 'BsmtCond'),
 (21, 'HeatingQC'),
 (40, 'KitchenQual'),
 (98, 'OverallCond'),
 (112, 'OverallQual'),
 (218, 'YearBuilt'),
 (92, 'MoSold')]

In [230]:
lgbm.save_model('advanced_model.txt',num_iteration=lgbm.best_iteration)

<lightgbm.basic.Booster at 0x11e2201d0>

In [231]:
X_small = X[['LotArea','1stFlrSF', '2ndFlrSF','TotalBsmtSF','GarageArea','TotRmsAbvGrd','YearBuilt','MoSold']]

In [232]:
params = {
    'metric':'rmse',
    'objective':'regression',
    'num_leaves':12,                          
    'learning_rate':0.05, 
    'max_bin' : 55, 
    'bagging_fraction' : 0.8,
    'bagging_freq' : 5, 
    'feature_fraction' : 0.7,
    'feature_fraction_seed':9, 
    'bagging_seed':9,
    'min_data_in_leaf' :6, 
    'min_sum_hessian_in_leaf' : 11,
    'verbose':0
}

In [236]:
import lightgbm as lgb
best = 45000
lgb_m = None
for i in range(5,100):
    X_train,X_test,y_train,y_test = train_test_split(X_small,y,test_size=0.2,random_state=i)
    train = lgb.Dataset(X_train,label=y_train)
    test = lgb.Dataset(X_test,label=y_test)
    lgbm = lgb.train(params,train,1000,valid_sets=[train,test],early_stopping_rounds=30)
    score = lgbm.best_score['valid_1']['rmse']
    if best > score:
        best = score
        lgb_m = lgbm

[1]	training's rmse: 76589	valid_1's rmse: 78085.8
Training until validation scores don't improve for 30 rounds.
[2]	training's rmse: 74279.4	valid_1's rmse: 75924.5
[3]	training's rmse: 72131.3	valid_1's rmse: 73724.4
[4]	training's rmse: 70222.6	valid_1's rmse: 72062.5
[5]	training's rmse: 68217.9	valid_1's rmse: 70265.4
[6]	training's rmse: 66240.5	valid_1's rmse: 68239
[7]	training's rmse: 64174.7	valid_1's rmse: 66052.1
[8]	training's rmse: 62258	valid_1's rmse: 64032.4
[9]	training's rmse: 60370.5	valid_1's rmse: 62021.7
[10]	training's rmse: 58756.7	valid_1's rmse: 60329.4
[11]	training's rmse: 57194.1	valid_1's rmse: 58814.9
[12]	training's rmse: 55783.6	valid_1's rmse: 57464.1
[13]	training's rmse: 54336.1	valid_1's rmse: 56093.1
[14]	training's rmse: 53037	valid_1's rmse: 54857.7
[15]	training's rmse: 51686.7	valid_1's rmse: 53517.6
[16]	training's rmse: 50561.9	valid_1's rmse: 52390.9
[17]	training's rmse: 49553.6	valid_1's rmse: 51380.5
[18]	training's rmse: 48439.1	valid_1

In [237]:
best

26568.338262737594

In [238]:
lgb_m.save_model('basic_model.txt',num_iteration=lgb_m.best_iteration)

<lightgbm.basic.Booster at 0x11e30a320>

In [243]:
cities.tail(12).to_csv('data/cities_small.csv',index=False)

In [250]:
s = ''
for i in X.iloc[0].tolist():
    s += str(i)+' '
s

'8450.0 548.0 856.0 854.0 856.0 8.0 3.0 2.0 1.0 1.0 0.0 2.0 3.0 4.0 4.0 3.5 5.0 4.0 5.0 7.0 2003.0 2.0 '

In [252]:
y.iloc[0]

208500