In [1]:
import requests

def send_line_notification(message):
    line_token = "9jBlZvpTEg7fQSKaevfKIGU1EQCAxl0SujL7i1pywuQ"
    endpoint = 'https://notify-api.line.me/api/notify'
    message = "\n{}".format(message)
    payload = {'message': message}
    headers = {'Authorization': 'Bearer {}'.format(line_token)}
    requests.post(endpoint, data=payload, headers=headers)

In [2]:
import numpy as np
from scipy.stats import stats, norm, skew
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasRegressor
from scipy.special import boxcox1p
import lightgbm as lgb
import xgboost as xgb
import pandas as pd

%matplotlib inline
np.random.seed(2)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Load the data
train = pd.read_csv("./train_set.csv")
test = pd.read_csv("./test_set.csv")

print(train.shape)
print(test.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(49108, 49)
(49108, 48)


In [4]:
train = train.drop('Id', axis=1)
train = train.drop('SQUARE', axis=1)
test = test.drop('SQUARE', axis=1)
train.columns

Index(['BATHRM', 'HF_BATHRM', 'HEAT', 'AC', 'NUM_UNITS', 'ROOMS', 'BEDRM',
       'AYB', 'YR_RMDL', 'EYB', 'STORIES', 'SALEDATE', 'QUALIFIED', 'SALE_NUM',
       'GBA', 'BLDG_NUM', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN', 'EXTWALL',
       'ROOF', 'INTWALL', 'KITCHENS', 'FIREPLACES', 'USECODE', 'LANDAREA',
       'GIS_LAST_MOD_DTTM', 'SOURCE', 'CMPLX_NUM', 'LIVING_GBA', 'FULLADDRESS',
       'CITY', 'STATE', 'ZIPCODE', 'NATIONALGRID', 'LATITUDE', 'LONGITUDE',
       'ASSESSMENT_NBHD', 'ASSESSMENT_SUBNBHD', 'CENSUS_TRACT', 'CENSUS_BLOCK',
       'WARD', 'X', 'Y', 'QUADRANT', 'PRICE'],
      dtype='object')

In [5]:
train.dtypes

BATHRM                  int64
HF_BATHRM               int64
HEAT                   object
AC                     object
NUM_UNITS             float64
ROOMS                   int64
BEDRM                   int64
AYB                   float64
YR_RMDL               float64
EYB                     int64
STORIES               float64
SALEDATE               object
QUALIFIED              object
SALE_NUM                int64
GBA                   float64
BLDG_NUM                int64
STYLE                  object
STRUCT                 object
GRADE                  object
CNDTN                  object
EXTWALL                object
ROOF                   object
INTWALL                object
KITCHENS              float64
FIREPLACES              int64
USECODE                 int64
LANDAREA                int64
GIS_LAST_MOD_DTTM      object
SOURCE                 object
CMPLX_NUM             float64
LIVING_GBA            float64
FULLADDRESS            object
CITY                   object
STATE     

In [6]:
cols_with_none_as_nan = [
    "HEAT", 
    "AC",
    "SALEDATE",
    "STYLE",
    "STRUCT",
    "GRADE",
    "CNDTN",
    "EXTWALL",
    "ROOF",
    "INTWALL",
    "FULLADDRESS",
    "CITY",
    "STATE",
    "NATIONALGRID",
    "ASSESSMENT_SUBNBHD",
    "CENSUS_BLOCK",
    "QUADRANT"
    ]


# fill missing text fields with a default string
object_columns = train.select_dtypes(include=[object])
test_object_columns = test.select_dtypes(include=[object])

# for these colunms the string 'None' will be inserted in place of nan
for col in cols_with_none_as_nan:
    object_columns.loc[:, col] = object_columns.loc[:, col].fillna('None')
    test_object_columns.loc[:, col] = test_object_columns.loc[:, col].fillna('None')

remaining_fix = object_columns.isnull().sum()
print('Fixes remaining on train set\n', remaining_fix[remaining_fix>0])

remaining_fix = test_object_columns.isnull().sum()
print('Fixes remaining on test set\n',remaining_fix[remaining_fix>0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Fixes remaining on train set
 Series([], dtype: int64)
Fixes remaining on test set
 Series([], dtype: int64)


In [7]:
numeric_columns = train.select_dtypes(include=[int, float])

remaining_fix = numeric_columns.isnull().sum()
print('Fixes remaining on train set\n',remaining_fix[remaining_fix>0])

test_numeric_columns = test.select_dtypes(include=[int, float])

remaining_fix = test_numeric_columns.isnull().sum()
print('Fixes remaining on test set\n',remaining_fix[remaining_fix>0])

Fixes remaining on train set
 NUM_UNITS     20053
AYB              56
YR_RMDL       20418
STORIES       20071
GBA           20053
KITCHENS      20053
CMPLX_NUM     29055
LIVING_GBA    29055
X                47
Y                47
dtype: int64
Fixes remaining on test set
 NUM_UNITS     20263
AYB              56
YR_RMDL       20124
STORIES       20278
GBA           20263
KITCHENS      20264
CMPLX_NUM     28845
LIVING_GBA    28845
X                54
Y                54
dtype: int64


In [8]:
cols_with_zero_as_nan = ['NUM_UNITS',
                         'AYB', 
                         'YR_RMDL',
                         'STORIES',
                         'GBA',
                         'CMPLX_NUM',
                         'LIVING_GBA',
                         'KITCHENS'
                        ]
cols_with_mean_as_nan = ['X','Y']

# for these colunms the mean will be inserted in place of nan
for col in cols_with_mean_as_nan:
    numeric_columns.loc[:, col] = numeric_columns.loc[:, col].fillna(numeric_columns[col].mean())
    test_numeric_columns.loc[:, col] = test_numeric_columns.loc[:, col].fillna(test_numeric_columns[col].mean())

# for these colunms a zero will be inserted in place of nan
for col in cols_with_zero_as_nan:
    numeric_columns.loc[:, col] = numeric_columns.loc[:, col].fillna(0)
    test_numeric_columns.loc[:, col] = test_numeric_columns.loc[:, col].fillna(0)


remaining_fix = numeric_columns.isnull().sum()
print('Fixes remaining on train set\n',remaining_fix[remaining_fix>0])

remaining_fix = test_numeric_columns.isnull().sum()
print('Fixes remaining on test set\n',remaining_fix[remaining_fix>0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Fixes remaining on train set
 Series([], dtype: int64)
Fixes remaining on test set
 Series([], dtype: int64)


In [9]:
def fix_skewness(dataframe):
    skewed_feats = dataframe.apply(lambda x: abs(skew(x.dropna()))).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew': skewed_feats})
    skewness = skewness[abs(skewness) > 0.75].dropna()
    print("There are {} skewed numerical features to transform".format(skewness.shape[0]))
    print("\nSkew > .75 in numerical features: \n")
    print(skewness)
    # Fix skewness
    skewed_features = skewness.index
    lam = 0.15
    for feat in skewed_features:
        if feat == 'PRICE': continue
        dataframe[feat] = boxcox1p(dataframe[feat], lam)
    print('Fixed skewness')

fix_skewness(numeric_columns)
fix_skewness(test_numeric_columns)

There are 17 skewed numerical features to transform

Skew > .75 in numerical features: 

                  Skew
FIREPLACES  221.596365
STORIES     178.135568
BLDG_NUM     83.740245
AYB          19.774995
PRICE        18.377059
ZIPCODE      16.634201
LANDAREA     13.677086
KITCHENS      4.464330
USECODE       2.245318
SALE_NUM      1.764593
NUM_UNITS     1.631241
LIVING_GBA    1.543186
BATHRM        1.369842
GBA           1.355860
CMPLX_NUM     1.289017
ROOMS         1.227033
HF_BATHRM     0.990909


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Fixed skewness
There are 16 skewed numerical features to transform

Skew > .75 in numerical features: 

                  Skew
FIREPLACES  167.931247
STORIES      95.583603
BLDG_NUM     83.740245
AYB          19.759579
ZIPCODE      16.335698
LANDAREA      4.606162
USECODE       2.265421
SALE_NUM      1.753085
LIVING_GBA    1.630623
NUM_UNITS     1.612659
KITCHENS      1.527030
BATHRM        1.321382
CMPLX_NUM     1.290780
ROOMS         1.210204
GBA           1.207344
HF_BATHRM     1.138631
Fixed skewness


In [10]:
test_object_columns.isnull().sum()

HEAT                  0
AC                    0
SALEDATE              0
QUALIFIED             0
STYLE                 0
STRUCT                0
GRADE                 0
CNDTN                 0
EXTWALL               0
ROOF                  0
INTWALL               0
GIS_LAST_MOD_DTTM     0
SOURCE                0
FULLADDRESS           0
CITY                  0
STATE                 0
NATIONALGRID          0
ASSESSMENT_NBHD       0
ASSESSMENT_SUBNBHD    0
CENSUS_BLOCK          0
WARD                  0
QUADRANT              0
dtype: int64

In [11]:
strings = np.array([])
for c in object_columns.columns:
    strings = np.append(strings, pd.unique(object_columns[c].values))

for c in test_object_columns.columns:
    strings = np.append(strings, pd.unique(test_object_columns[c].values))

print(len(strings), 'distinct labels generated')

labeler = LabelEncoder()
labeler.fit(strings.astype("str"))

for c in object_columns.columns:
    object_columns.loc[:,c] = labeler.transform(object_columns.loc[:,c])
    test_object_columns.loc[:,c] = labeler.transform(test_object_columns.loc[:,c])

object_columns[0:5]

135516 distinct labels generated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,HEAT,AC,SALEDATE,QUALIFIED,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,...,SOURCE,FULLADDRESS,CITY,STATE,NATIONALGRID,ASSESSMENT_NBHD,ASSESSMENT_SUBNBHD,CENSUS_BLOCK,WARD,QUADRANT
0,125609,125702,85058,125643,92729,125649,125678,125678,125568,125555,...,125645,18317,125682,125577,33683,125636,3868,1655,125686,125629
1,125609,125702,83385,125643,102182,125649,125678,125601,125568,125555,...,125645,17334,125682,125577,33959,125636,3868,1655,125686,125629
2,125693,125702,83796,125643,80543,125649,125537,125678,125667,125623,...,125645,17734,125682,125577,33961,125636,3868,1654,125686,125629
3,125693,125702,83557,125643,80543,125649,125537,125678,125568,125623,...,125645,17860,125682,125577,33919,125636,3868,1654,125686,125629
4,125609,125702,81023,125643,80543,125649,125602,125601,125568,125555,...,125645,18132,125682,125577,33851,125636,3868,1654,125686,125629


In [13]:
# final train dataset
train_ds = object_columns.join(numeric_columns)
train_ds["PRICE"] = np.log1p(train_ds["PRICE"])

# test dataset
test_ds = test_object_columns.join(test_numeric_columns)

#correlation matrix
corrmat = train_ds.corr()['PRICE']
#f, ax = plt.subplots(figsize=(12, 9))
#sns.heatmap(corrmat, vmax=.8, square=True);

best_columns = corrmat[abs(corrmat) > 0.0].index
train_ds = train_ds[best_columns]
test_ds = test_ds[best_columns.drop('PRICE')]

In [14]:
X_train = (train_ds.values[:,:-1])
y_train = np.asarray([[t] for t in (train_ds.values[:,-1])])
X_test = test_ds.values

print('Training set features shape', X_train.shape)
print('Training set labels shape', y_train.shape)
print('Test set shape', test_ds.shape)

Training set features shape (49108, 46)
Training set labels shape (49108, 1)
Test set shape (49108, 46)


In [15]:
from sklearn.preprocessing import RobustScaler
transformer = RobustScaler().fit(X_train)
X_train = pd.DataFrame(transformer.transform(X_train))
X_test = pd.DataFrame(transformer.transform(X_test))

In [16]:
def train_model(depth, learning_rate, n_estimators, model_type='xgb'):
    if model_type == 'xgb':
        model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                                 learning_rate=learning_rate, max_depth=depth, 
                                 min_child_weight=1.7817, n_estimators=n_estimators,
                                 reg_alpha=0.4640, reg_lambda=0.8571,
                                 subsample=0.5213, silent=1,
                                 random_state =7, nthread = -1)
    if model_type == 'lgb':
        model = lgb.LGBMRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                                 learning_rate=learning_rate, max_depth=depth, 
                                 min_child_weight=1.7817, n_estimators=n_estimators,
                                 reg_alpha=0.4640, reg_lambda=0.8571,
                                 subsample=0.5213, silent=1,
                                 random_state =7, nthread = -1)
    score = rmsle_cv(model)
    print(model_type, " score: depth={:d} lr={:.2f} est={:d} -> mean:{:.5f} std:{:.4f}".format(depth, learning_rate, n_estimators, score.mean(), score.std()))
    return score

#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train.flatten(), scoring="neg_mean_squared_error", cv = kf))
    return(rmse)


def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [None]:
result = []
for depth in range(3, 4):
    for learning_rate in range(2, 3, 1):
        for n_estimators in range(4000, 4800, 200):
            score = train_model(depth, learning_rate/100, n_estimators, model_type='xgb')
            result.append([depth, learning_rate/100, n_estimators, score.mean(), score.std()])

xgb  score: depth=3 lr=0.02 est=4000 -> mean:0.51648 std:0.1681


In [None]:
# check results
result = pd.DataFrame(result, columns=['depth', 'learning_rate', 'n_estimators', 'score_mean', 'score_std'])
result.describe()