# In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, date
from operator import le, eq
import gc
from sklearn import model_selection, preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import zipfile
import pickle
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler


# Seed NumPy's global random generator with a fixed value.
RS = 20170501
np.random.seed(RS)

# Unpack each competition archive into the current working directory.
_ARCHIVES = (
    '../input/sberbank-russian-housing-market/train.csv.zip',
    '../input/sberbank-russian-housing-market/test.csv.zip',
    '../input/sberbank-russian-housing-market/macro.csv.zip',
)
for archive_path in _ARCHIVES:
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('./')

# Data Cleaning

# Load both splits indexed by 'id' and tag every row with its origin
# (isTrain = 1 for train.csv, 0 for test.csv) so they can be separated later.
trainDf = pd.read_csv('train.csv').set_index('id').assign(isTrain=1)
testDf = pd.read_csv('test.csv').set_index('id').assign(isTrain=0)

# Stack train on top of test so cleaning and feature engineering run once.
allDf = pd.concat([trainDf, testDf])

# Parse the timestamp column into datetimes.
allDf['timestamp'] = pd.to_datetime(allDf.timestamp)

# String key built from the sub-area name and the metro_km_avto value.
metro_dist_text = allDf.metro_km_avto.astype(str)
allDf['apartment_name'] = allDf['sub_area'] + metro_dist_text

# Ordinal encoding of the ecology rating ('no data' -> 0 ... 'excellent' -> 4).
eco_map = {
    'excellent': 4,
    'good': 3,
    'satisfactory': 2,
    'poor': 1,
    'no data': 0,
}
allDf['ecology'] = allDf.ecology.map(eco_map)
#encode subarea in order
# price_by_area = allDf['price_doc'].groupby(allDf.sub_area).mean().sort_values()
# area_dict = {}
# for i in range(0,price_by_area.shape[0]):
#    area_dict[price_by_area.index[i]] = i
# allDf['sub_area'] = allDf['sub_area'].map(area_dict)
# Price corrections on the training rows.
#
# The original statements referenced an undefined `train_df` and compared
# `product_type` with the string 'Investment' *after* the column had been
# label-encoded to integers. They are applied here to the training rows of
# `allDf` (the frame every later step works on) and run before the encoding.
train_rows = allDf['isTrain'] == 1
price = allDf['price_doc']
keep_price = (price > 1e6) & (price != 2e6) & (price != 3e6)
# Test rows have no price and are always kept; only training rows are filtered.
# NOTE(review): this removes every training row priced at exactly 1000000, so
# the later weight rule keyed on price_doc == 1000000 will match nothing.
allDf = allDf[~train_rows | keep_price].copy()

train_rows = allDf['isTrain'] == 1
is_investment = allDf['product_type'] == 'Investment'
allDf.loc[train_rows & is_investment & (allDf['build_year'] < 2000), 'price_doc'] *= 0.895
allDf.loc[train_rows & ~is_investment, 'price_doc'] *= 0.96

# Integer-encode every remaining object (string) column.
for c in allDf.columns:
    if allDf[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(allDf[c].values))
        allDf[c] = lbl.transform(list(allDf[c].values))
### Outlier handling: implausible values are replaced with NaN ###

# Total area must lie in [3, 2000].
allDf.loc[(allDf['full_sq'] > 2000) | (allDf['full_sq'] < 3), 'full_sq'] = np.nan

# Living area must lie in [3, 500] and not exceed 80% of the (cleaned) total area.
allDf.loc[
    (allDf['life_sq'] > 500)
    | (allDf['life_sq'] < 3)
    | (allDf['life_sq'] > 0.8 * allDf['full_sq']),
    'life_sq',
] = np.nan

# Kitchen area must be smaller than the (cleaned) living area and lie in [2, 500].
allDf.loc[
    (allDf['kitch_sq'] >= allDf['life_sq'])
    | (allDf['kitch_sq'] > 500)
    | (allDf['kitch_sq'] < 2),
    'kitch_sq',
] = np.nan

allDf.loc[allDf['state'] > 30, 'state'] = np.nan

# Repair the one known typo first so the range check below does not discard it.
allDf.loc[allDf['build_year'] == 20052009, 'build_year'] = 2005
allDf.loc[
    (allDf['build_year'] < 1800)
    | (allDf['build_year'] == 4965)
    | (allDf['build_year'] > 2021),
    'build_year',
] = np.nan

allDf.loc[(allDf['num_room'] > 15) | (allDf['num_room'] == 0), 'num_room'] = np.nan

# Floor 0 is treated as missing; max_floor is blanked when it is 0 or below the floor.
allDf.loc[allDf['floor'] == 0, 'floor'] = np.nan
allDf.loc[
    (allDf['max_floor'] == 0) | (allDf['floor'] > allDf['max_floor']),
    'max_floor',
] = np.nan

# Drop rows whose price per square metre is extreme (original author's note:
# this brings the error down a lot). Rows without a price never match.
price_per_sqm = allDf['price_doc'] / allDf['full_sq']
bad_index = allDf.index[price_per_sqm > 600000].append(allDf.index[price_per_sqm < 10000])
allDf.drop(bad_index, axis=0, inplace=True)

#### Feature Engineering ####
print('Feature Engineering...')
gc.collect()

# Calendar parts of the sale timestamp (original author's caveat: 'year' may be
# of no use because the test data is out of range).
sale_date = allDf['timestamp'].dt
allDf['year'] = sale_date.year
allDf['weekday'] = sale_date.weekday

# Sample weights: start at 1, halve rows priced at exactly 1,000,000 and
# boost rows from 2015 by 1.5.
allDf['w'] = 1
allDf.loc[allDf['price_doc'] == 1000000, 'w'] *= 0.5
allDf.loc[allDf['year'] == 2015, 'w'] *= 1.5

# Relative position of the apartment within the building.
allDf['floor_by_max_floor'] = allDf['floor'] / allDf['max_floor']

# Room-size features.
non_kitchen_living = allDf['life_sq'] - allDf['kitch_sq']
allDf['avg_room_size'] = non_kitchen_living / allDf['num_room']
allDf['life_sq_prop'] = allDf['life_sq'] / allDf['full_sq']
allDf['kitch_sq_prop'] = allDf['kitch_sq'] / allDf['full_sq']

# Building age at sale time replaces the raw construction year.
allDf['build_age'] = allDf['year'] - allDf['build_year']
allDf = allDf.drop(columns=['build_year'])

# Population ratios: each new column is numerator / denominator.
for new_col, numerator, denominator in (
    ('popu_den', 'raion_popul', 'area_m'),
    ('gender_rate', 'male_f', 'female_f'),
    ('working_rate', 'work_all', 'full_all'),
):
    allDf[new_col] = allDf[numerator] / allDf[denominator]

# Education: a quota of 0 is treated as missing so the ratio is NaN, not inf.
allDf.loc[allDf['preschool_quota'] == 0, 'preschool_quota'] = np.nan
allDf['preschool_ratio'] = allDf['children_preschool'] / allDf['preschool_quota']
allDf['school_ratio'] = allDf['children_school'] / allDf['school_quota']

# Squared deviation from the column mean.
full_sq_centered = allDf['full_sq'] - allDf['full_sq'].mean()
allDf['square_full_sq'] = full_sq_centered ** 2
build_age_centered = allDf['build_age'] - allDf['build_age'].mean()
allDf['square_build_age'] = build_age_centered ** 2

# Number of missing values among the core apartment attributes.
core_cols = ['full_sq', 'build_age', 'life_sq', 'floor', 'max_floor', 'num_room']
allDf['nan_count'] = allDf[core_cols].isnull().sum(axis=1)

# Pairwise interactions.
allDf['full*maxfloor'] = allDf['max_floor'] * allDf['full_sq']
allDf['full*floor'] = allDf['floor'] * allDf['full_sq']

shifted_age = allDf['build_age'] + 0.5
allDf['full/age'] = allDf['full_sq'] / shifted_age
allDf['age*state'] = allDf['build_age'] * allDf['state']

# new trial
allDf['main_road_diff'] = allDf['big_road2_km'] - allDf['big_road1_km']

# Each distance is divided by the mean distance of all rows sharing the same
# object ID (metro station, road, railroad station).
# best on LB with weekday
for new_col, dist_col, id_col in (
    ('rate_metro_km', 'metro_km_walk', 'ID_metro'),
    ('rate_road1_km', 'big_road1_km', 'ID_big_road1'),
    ('rate_road2_km', 'big_road2_km', 'ID_big_road2'),
    ('rate_railroad_km', 'railroad_station_walk_km', 'ID_railroad_station_walk'),
):
    mean_by_id = allDf[dist_col].groupby(allDf[id_col]).mean().to_dict()
    allDf[new_col] = allDf[dist_col] / allDf[id_col].map(mean_by_id)

# The helper columns are no longer needed once the features are built.
allDf.drop(['year', 'timestamp'], axis=1, inplace=True)

# Split the combined frame back into train and test using the isTrain flag.
# The test frame also loses the target and the sample-weight column.
trainDf = allDf.loc[allDf['isTrain'] == 1].drop(columns=['isTrain'])
testDf = allDf.loc[allDf['isTrain'] == 0].drop(columns=['isTrain', 'price_doc', 'w'])

# Write both frames to CSV without the index.
for outputFile, frame in (
    ('train_featured.csv', trainDf),
    ('test_featured.csv', testDf),
):
    frame.to_csv(outputFile, index=False)

# In [None]:
class LGBregressor(object):
    """LightGBM regressor whose number of boosting rounds is picked by early stopping.

    The target is divided by 10,000,000 for training and predictions are
    multiplied back, so callers work in the original target scale.
    """

    def __init__(self, params):
        """Store the LightGBM parameter dict passed to ``lgb.train``."""
        self.params = params

    def fit(self, X, y, w=None):
        """Fit on ``X``/``y`` with optional per-row weights ``w``.

        A random 80/20 split is used once to find the best iteration with
        early stopping (patience 50, at most 10000 rounds); the final booster
        is then retrained on all rows for that many rounds.

        Args:
            X: 2-D array of features, indexable by an integer index array.
            y: 1-D array-like of targets. It is not modified.
            w: optional 1-D array-like of sample weights; defaults to all ones.
        """
        if w is None:
            w = np.ones(X.shape[0])
        w = np.asarray(w)
        # Rescale into a new array. The previous in-place `y /= ...` modified
        # the caller's data and raised on integer arrays.
        y = np.asarray(y) / 10000000

        split = int(X.shape[0] * 0.8)
        indices = np.random.permutation(X.shape[0])
        train_id, test_id = indices[:split], indices[split:]

        d_train = lgb.Dataset(X[train_id], y[train_id], weight=w[train_id])
        d_valid = lgb.Dataset(X[test_id], y[test_id], weight=w[test_id])
        # Early stopping goes through the callback API; the
        # `early_stopping_rounds` keyword is not accepted by recent LightGBM.
        partial_bst = lgb.train(
            self.params,
            d_train,
            10000,
            valid_sets=[d_valid],
            callbacks=[lgb.early_stopping(50)],
        )
        num_round = partial_bst.best_iteration

        d_all = lgb.Dataset(X, label=y, weight=w)
        self.bst = lgb.train(self.params, d_all, num_round)

    def predict(self, X):
        """Return predictions for ``X`` in the original target scale."""
        return self.bst.predict(X) * 10000000

class XGBregressor(object):
    """XGBoost regressor whose number of boosting rounds is picked by early stopping."""

    def __init__(self, params):
        """Store the XGBoost parameter dict passed to ``xgb.train``."""
        self.params = params

    def fit(self, X, y, w=None):
        """Fit on ``X``/``y`` with optional per-row weights ``w``.

        A random 80/20 split is used once to find the best iteration with
        early stopping (patience 50, at most 10000 rounds); the final booster
        is then retrained on all rows for that many rounds.

        Args:
            X: 2-D array of features, indexable by an integer index array.
            y: 1-D array of targets.
            w: optional 1-D array of sample weights; defaults to all ones.
        """
        if w is None:
            w = np.ones(X.shape[0])
        split = int(X.shape[0] * 0.8)
        indices = np.random.permutation(X.shape[0])
        train_id, test_id = indices[:split], indices[split:]

        d_train = xgb.DMatrix(X[train_id], label=y[train_id], weight=w[train_id])
        d_valid = xgb.DMatrix(X[test_id], label=y[test_id], weight=w[test_id])
        # The last entry of the watchlist is the one early stopping monitors.
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        partial_bst = xgb.train(
            self.params,
            d_train,
            10000,
            early_stopping_rounds=50,
            evals=watchlist,
            verbose_eval=100,
        )
        # best_iteration is a 0-based index, so the number of rounds needed to
        # reach it is best_iteration + 1 (the previous code trained one round
        # too few, and zero rounds when the best iteration was 0).
        num_round = partial_bst.best_iteration + 1

        d_all = xgb.DMatrix(X, label=y, weight=w)
        self.bst = xgb.train(self.params, d_all, num_round)

    def predict(self, X):
        """Return predictions for ``X``."""
        test = xgb.DMatrix(X)
        return self.bst.predict(test)

class Ensemble(object):
    """Two-level stacking: K-fold out-of-fold base predictions feed a stacker model."""

    def __init__(self, n_folds, stacker, base_models):
        """
        Args:
            n_folds: number of K-fold splits used for out-of-fold predictions.
            stacker: second-level model exposing ``fit(X, y)`` and ``predict(X)``.
            base_models: first-level models exposing ``fit(X, y, w)`` and
                ``predict(X)``.
        """
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    @staticmethod
    def _handles_missing(model):
        """Return True for the wrappers in this file that are fed raw NaN values."""
        return isinstance(model, (XGBregressor, LGBregressor))

    def fit_predict(self, trainDf, testDf):
        """Train all base models and the stacker, and predict the test set.

        Args:
            trainDf: training frame containing the feature columns plus
                'price_doc' (target) and 'w' (sample weight).
            testDf: test frame containing only the feature columns.

        Returns:
            The stacker's predictions for ``testDf``.

        Side effects:
            Sets ``self.S_train`` / ``self.S_test`` (out-of-fold and
            fold-averaged test predictions, one column per base model),
            ``self.y`` and ``self.corr`` (correlation of the base predictions
            with the target).
        """
        features = trainDf.drop(['price_doc', 'w'], axis=1)
        X = features.values
        y = trainDf['price_doc'].values
        w = trainDf['w'].values
        T = testDf.values

        # Copies with NaN replaced by -999 for models that reject missing values.
        X_fillna = features.fillna(-999).values
        T_fillna = testDf.fillna(-999).values

        folds = list(KFold(n_splits=self.n_folds, shuffle=True).split(X))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            print('Training base model ' + str(i+1) + '...')
            # Choose the matrices per model instead of overwriting X / T, so
            # the choice no longer depends on module globals or on the order
            # of base_models.
            if self._handles_missing(clf):
                X_model, T_model = X, T
            else:
                X_model, T_model = X_fillna, T_fillna
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                print('Training round ' + str(j+1) + '...')
                clf.fit(X_model[train_idx], y[train_idx], w[train_idx])
                S_train[test_idx, i] = clf.predict(X_model[test_idx])
                S_test_i[:, j] = clf.predict(T_model)
            # Average the per-fold test predictions of this model.
            S_test[:, i] = S_test_i.mean(1)
        self.S_train, self.S_test, self.y = S_train, S_test, y
        # Reset the index so the target lines up with S_train by position even
        # when trainDf does not carry a default 0..n-1 index.
        target = trainDf['price_doc'].reset_index(drop=True)
        self.corr = pd.concat([pd.DataFrame(S_train), target], axis=1).corr()
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)
        return y_pred

import os

# Read the engineered features from the working directory, the same place the
# preprocessing cell writes them and the same place test.csv is read from below.
trainDf = pd.read_csv('train_featured.csv')
testDf = pd.read_csv('test_featured.csv')

# First-level tree booster.
# NOTE(review): 'silent' and 'reg:linear' are deprecated spellings in recent
# XGBoost releases ('verbosity' / 'reg:squarederror'); kept unchanged here
# because the installed version is not visible from this file.
params1 = {'eta':0.05, 'max_depth':5, 'subsample':0.8, 'colsample_bytree':0.8, 'min_child_weight':1,
              'gamma':0, 'silent':1, 'objective':'reg:linear', 'eval_metric':'rmse'}
xgb1 = XGBregressor(params1)

# Second-level (stacker) model: a linear booster over the base predictions.
params2 = {'booster':'gblinear', 'alpha':0,
           'eta':0.1, 'max_depth':2, 'subsample':1, 'colsample_bytree':1, 'min_child_weight':1,
            'gamma':0, 'silent':1, 'objective':'reg:linear', 'eval_metric':'rmse'}
xgb2_meta = XGBregressor(params2)

params_lgb = {'objective':'regression','metric':'rmse',
              'learning_rate':0.05,'max_depth':-1,'sub_feature':0.7,'sub_row':1,
              'num_leaves':15,'min_data':30,'max_bin':20,
              'bagging_fraction':0.9,'bagging_freq':40,'verbosity':0}
lgb1 = LGBregressor(params_lgb)

# scikit-learn base models.
RF = RandomForestRegressor(n_estimators=500, max_features=0.2)
ETR = ExtraTreesRegressor(n_estimators=500, max_features=0.3, max_depth=None)
Ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=15),n_estimators=200)
GBR = GradientBoostingRegressor(n_estimators=200,max_depth=5,max_features=0.5)
LR =LinearRegression()

E = Ensemble(n_folds=5, stacker=xgb2_meta, base_models=[xgb1,lgb1,RF,ETR,Ada,GBR])
prediction = E.fit_predict(trainDf, testDf)

# Build the submission from the ids in the raw test file.
output = pd.read_csv('test.csv')
output = output[['id']].copy()
output['price_doc'] = prediction

# Write to Ensemble/Submission_Stack.csv with a portable path. The previous
# backslash literal only named a sub-directory on Windows, and the directory
# was never created.
submission_dir = 'Ensemble'
os.makedirs(submission_dir, exist_ok=True)
output.to_csv(os.path.join(submission_dir, 'Submission_Stack.csv'),index=False)

# Correlation between the out-of-fold base predictions and the target.
# S_train lives on the ensemble object; the bare name was undefined here.
corr = pd.concat([pd.DataFrame(E.S_train),trainDf['price_doc']],axis=1).corr()