In [1]:
import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.style as style

import seaborn as sns 
sns.set()

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.feature_selection import RFE

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge,RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb
import lightgbm as lgb

import statsmodels.api as sm

from scipy.stats import linregress

In [2]:
style.use('fivethirtyeight')

rand_state=1000 
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=rand_state)
folds = KFold(n_splits = 5, shuffle = True, random_state = rand_state)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## import data set

In [3]:
train=pd.read_csv('data/train.csv')
test=pd.read_csv('data/test.csv')
# train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
# test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
train["SalePrice1"] = np.log1p(train["SalePrice"])

## missing values

In [4]:
# missing value for each variables
def missing_percentage(df):
    nadf=(round(df.isna().sum().sort_values(ascending=True)/len(df),4)).reset_index()
    nadf.columns=['Variable','Missing_percentage']
    nadf=nadf[nadf['Missing_percentage']>0]

#     if len(nadf)>0 :        
#         fig, ax = plt.subplots(figsize=(10,5m
#         sns.barplot(x='Missing_percentage',
#                y='Variable',
#                data=nadf,
#                palette='Spectral')
    
    return nadf

train_na=missing_percentage(train)
test_na=missing_percentage(test)

## Replaced all missing values in LotFrontage by imputing the median value of each neighborhood. 
train['LotFrontage'] = train.groupby('Neighborhood')['LotFrontage'].transform( lambda x: x.fillna(x.median()))
test['LotFrontage'] = test.groupby('Neighborhood')['LotFrontage'].transform( lambda x: x.fillna(x.median()))

typicals=['ExterQual','ExterCond','BsmtQual','BsmtCond',
          'HeatingQC','KitchenQual','FireplaceQu','GarageQual',
          'GarageCond','PoolQC']
nos=['CentralAir']
nas=['FireplaceQu','Fence']

for col in typicals:
    train[col]=train[col].fillna('TA')
    test[col]=test[col].fillna('TA')
    
for col in nos:
    train[col]=train[col].fillna('N')
    test[col]=test[col].fillna('TA')

for col in nas:
    train[col]=train[col].fillna('NA')
    test[col]=test[col].fillna('TA')
    
# numeric columns
def numeric_columns(df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_numeric = df.select_dtypes(include=numerics)
    return df_numeric 

# object columns
def object_columns(df):
    objects = ['object']
    df_object = df.select_dtypes(include=objects)
    return df_object 

## Train Dataset
missing=train_na['Variable']
train_missing=train[missing]

# fill null values with 0 for numeric columns
for col in numeric_columns(train_missing).columns:
    if col !='LotFrontage':
        train[col]=train[col].fillna(0)
    
# fill null values with None for object columns
for col in object_columns(train_missing).columns:
    if col not in typicals and col not in nos and col not in nas:
        train[col]=train[col].fillna('None')
        
## test Dataset
missing=test_na['Variable']
test_missing=test[missing]

# fill null values with 0 for numeric columns
for col in numeric_columns(test_missing).columns:
    if col !='LotFrontage':
        test[col]=test[col].fillna(0)
    
# fill null values with None for object columns
for col in object_columns(test_missing).columns:
    if col not in typicals and col not in nos and col not in nas:
        test[col]=test[col].fillna('None')


In [5]:
missing_percentage(train)

Unnamed: 0,Variable,Missing_percentage


In [6]:
missing_percentage(test)

Unnamed: 0,Variable,Missing_percentage


## feature transformations

In [7]:
def cat_features(df, ls):
    for l in ls:
        df[l]=df[l].astype(str)
    return df

cat_fns=['MSSubClass','YrSold','MoSold', 
         'OverallQual','OverallCond','GarageCars']
cat_features(train, cat_fns)
cat_features(test, cat_fns)

def sum_features(df):
    df['TotalSF']=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                                   df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))
    df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                                  df['EnclosedPorch'] + df['ScreenPorch'] +
                                  df['WoodDeckSF'])
    return df

sum_features(train)
sum_features(test)

def log_features(df, ls):
    for l in ls:
        df[l+'_log'] = np.log1p(df[l])
        df.drop([l], axis=1)
    return df

log_fns = ['TotalSF']
log_features(train,log_fns)
log_features(test,log_fns)

def sqr_features(df, ls):
    for l in ls:
        df[l+'_sqr']=df[l]*df[l]
#         df[l+'_log'+'_sqr']=df[l+'_log']*df[l+'_log']
    return df

sqr_fns = ['TotalSF_log','Total_Bathrooms','Total_porch_sf',
           'GrLivArea', 'GarageArea','TotalBsmtSF','1stFlrSF','YearBuilt','FullBath',
           'YearRemodAdd','TotRmsAbvGrd','MasVnrArea','BsmtFinSF1','Fireplaces','LotFrontage',
           'BsmtFullBath','OpenPorchSF','WoodDeckSF','HalfBath','2ndFlrSF','GarageYrBlt',
           'LotArea','BedroomAbvGr','BsmtUnfSF','ScreenPorch','EnclosedPorch',
           'KitchenAbvGr','LowQualFinSF','MiscVal','BsmtFinSF2','BsmtHalfBath']

sqr_features(train,sqr_fns)
sqr_features(test,sqr_fns)

def binary_features(df):
    df['haspool'] = df['PoolArea'].apply(lambda x: 'Y' if x > 0 else 'N')
    df['has2ndfloor'] = df['2ndFlrSF'].apply(lambda x: 'Y' if x > 0 else 'N')
    df['hasgarage'] = df['GarageArea'].apply(lambda x: 'Y' if x > 0 else 'N')
    df['hasbsmt'] = df['TotalBsmtSF'].apply(lambda x: 'Y' if x > 0 else 'N')
    df['hasfireplace'] = df['Fireplaces'].apply(lambda x: 'Y' if x > 0 else 'N')
    return df

binary_features(train)
binary_features(test)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalSF,Total_Bathrooms,Total_porch_sf,TotalSF_log,TotalSF_log_sqr,Total_Bathrooms_sqr,Total_porch_sf_sqr,GrLivArea_sqr,GarageArea_sqr,TotalBsmtSF_sqr,1stFlrSF_sqr,YearBuilt_sqr,FullBath_sqr,YearRemodAdd_sqr,TotRmsAbvGrd_sqr,MasVnrArea_sqr,BsmtFinSF1_sqr,Fireplaces_sqr,LotFrontage_sqr,BsmtFullBath_sqr,OpenPorchSF_sqr,WoodDeckSF_sqr,HalfBath_sqr,2ndFlrSF_sqr,GarageYrBlt_sqr,LotArea_sqr,BedroomAbvGr_sqr,BsmtUnfSF_sqr,ScreenPorch_sqr,EnclosedPorch_sqr,KitchenAbvGr_sqr,LowQualFinSF_sqr,MiscVal_sqr,BsmtFinSF2_sqr,BsmtHalfBath_sqr,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,TA,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,TA,MnPrv,,0,6,2010,WD,Normal,1778.0,1.0,260,7.483807,56.007363,1.00,67600,802816,532900.0,777924.0,802816,3845521,1,3845521,25,0.0,219024.0,0,6400.0,0.0,0,19600,0,0,3845521.0,135070884,4,72900.0,14400,0,1,0,0,20736.0,0.0,N,N,Y,Y,N
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,TA,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,TA,TA,Gar2,12500,6,2010,WD,Normal,2658.0,1.5,429,7.885705,62.184350,2.25,184041,1766241,97344.0,1766241.0,1766241,3833764,1,3833764,36,11664.0,851929.0,0,6561.0,0.0,1296,154449,1,0,3833764.0,203547289,9,164836.0,0,0,1,0,156250000,0.0,0.0,N,N,Y,Y,N
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,TA,MnPrv,,0,3,2010,WD,Normal,2557.0,2.5,246,7.846981,61.575111,6.25,60516,2653641,232324.0,861184.0,861184,3988009,4,3992004,36,0.0,625681.0,1,5476.0,0.0,1156,44944,1,491401,3988009.0,191268900,9,18769.0,0,0,1,0,0,0.0,0.0,N,Y,Y,Y,Y
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,TA,TA,,0,6,2010,WD,Normal,2530.0,2.5,396,7.836370,61.408691,6.25,156816,2572816,220900.0,857476.0,857476,3992004,4,3992004,49,400.0,362404.0,1,6084.0,0.0,1296,129600,1,459684,3992004.0,99560484,9,104976.0,0,0,1,0,0,0.0,0.0,N,Y,Y,Y,Y
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,TA,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,TA,TA,,0,1,2010,WD,Normal,2560.0,2.0,226,7.848153,61.593507,4.00,51076,1638400,256036.0,1638400.0,1638400,3968064,4,3968064,25,0.0,69169.0,0,1849.0,0.0,6724,0,0,0,3968064.0,25050025,4,1034289.0,20736,0,1,0,0,0.0,0.0,N,N,Y,Y,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,7,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,546.0,546.0,GasA,Gd,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,5,Typ,0,TA,,0.0,,0.0,0.0,TA,TA,Y,0,0,0,0,0,0,TA,TA,,0,6,2006,WD,Normal,1638.0,1.5,0,7.401842,54.787259,2.25,0,1192464,0.0,298116.0,298116,3880900,1,3880900,25,0.0,0.0,0,441.0,0.0,0,0,1,298116,0.0,3748096,9,298116.0,0,0,1,0,0,0.0,0.0,N,Y,N,Y,N
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,5,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,252.0,Unf,0.0,294.0,546.0,GasA,TA,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,6,Typ,0,TA,CarPort,1970.0,Unf,1.0,286.0,TA,TA,Y,0,24,0,0,0,0,TA,TA,,0,4,2006,WD,Abnorml,1638.0,1.5,24,7.401842,54.787259,2.25,576,1192464,81796.0,298116.0,298116,3880900,1,3880900,36,0.0,63504.0,0,441.0,0.0,576,0,1,298116,3880900.0,3587236,9,86436.0,0,0,1,0,0,0.0,0.0,N,Y,Y,Y,N
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,7,1960,1996,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,ALQ,1224.0,Unf,0.0,0.0,1224.0,GasA,Ex,Y,SBrkr,1224,0,0,1224,1.0,0.0,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.0,Unf,2.0,576.0,TA,TA,Y,474,0,0,0,0,0,TA,TA,,0,9,2006,WD,Abnorml,2448.0,2.0,474,7.803435,60.893599,4.00,224676,1498176,331776.0,1498176.0,1498176,3841600,1,3984016,49,0.0,1498176.0,1,25600.0,1.0,0,224676,0,0,3841600.0,400000000,16,0.0,0,0,1,0,0,0.0,0.0,N,N,Y,Y,Y
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,5,5,1992,1992,Gable,CompShg,HdBoard,Wd Shng,,0.0,TA,TA,PConc,Gd,TA,Av,GLQ,337.0,Unf,0.0,575.0,912.0,GasA,TA,Y,SBrkr,970,0,0,970,0.0,1.0,1,0,3,1,TA,6,Typ,0,TA,,0.0,,0.0,0.0,TA,TA,Y,80,32,0,0,0,0,TA,MnPrv,Shed,700,7,2006,WD,Normal,1882.0,1.5,112,7.540622,56.860973,2.25,12544,940900,0.0,831744.0,940900,3968064,1,3968064,36,0.0,113569.0,0,3844.0,0.0,1024,6400,0,0,0.0,109014481,9,330625.0,0,0,1,0,490000,0.0,1.0,N,N,N,Y,N


## data cleansing
- delete outliers

In [8]:
def clean_data(df, target):
    # z-score greater than 3
    df_num=numeric_columns(df)
    z=np.abs(stats.zscore(df_num))
    df_z1=z[(z[target]>3) ]

    # IQR 
    Q1=df_num.quantile(0.25)
    Q3=df_num.quantile(0.75)
    IQR=(Q3-Q1)
    df_iqr=(df_num<(Q1-1.5*IQR)) | (df_num>(Q3+1.5*IQR))
    df_iqr1=df_iqr[(df_iqr[target]) ]

    # merge two data sets
    d1=pd.merge(df_z1, df_iqr1, left_index=True, right_index=True)
    d1

    index = d1.index
    print(index.tolist())

    # drop outliers
    df=df.drop(index)
    return df

clean_data(train,'SalePrice')
    
# train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
# train.drop(train[(train['GrLivArea']>4500) & (train['SalePrice']<300000)].index, inplace=True)


[58, 178, 185, 349, 389, 440, 473, 496, 527, 591, 664, 691, 769, 798, 803, 898, 1046, 1142, 1169, 1182, 1243, 1373]


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,SalePrice1,TotalSF,Total_Bathrooms,Total_porch_sf,TotalSF_log,TotalSF_log_sqr,Total_Bathrooms_sqr,Total_porch_sf_sqr,GrLivArea_sqr,GarageArea_sqr,TotalBsmtSF_sqr,1stFlrSF_sqr,YearBuilt_sqr,FullBath_sqr,YearRemodAdd_sqr,TotRmsAbvGrd_sqr,MasVnrArea_sqr,BsmtFinSF1_sqr,Fireplaces_sqr,LotFrontage_sqr,BsmtFullBath_sqr,OpenPorchSF_sqr,WoodDeckSF_sqr,HalfBath_sqr,2ndFlrSF_sqr,GarageYrBlt_sqr,LotArea_sqr,BedroomAbvGr_sqr,BsmtUnfSF_sqr,ScreenPorch_sqr,EnclosedPorch_sqr,KitchenAbvGr_sqr,LowQualFinSF_sqr,MiscVal_sqr,BsmtFinSF2_sqr,BsmtHalfBath_sqr,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,TA,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,TA,,,0,2,2008,WD,Normal,208500,12.247699,2566,3.5,61,7.850493,61.630243,12.25,3721,2924100,300304,732736,732736,4012009,4,4012009,64,38416.0,498436,0,4225.0,1,3721,0,1,729316,4012009.0,71402500,9,22500,0,0,1,0,0,0,0,N,Y,Y,Y,N
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,TA,,,0,5,2007,WD,Normal,181500,12.109016,2524,2.5,298,7.833996,61.371499,6.25,88804,1592644,211600,1592644,1592644,3904576,4,3904576,36,0.0,956484,1,6400.0,0,0,88804,0,0,3904576.0,92160000,9,80656,0,0,1,0,0,0,1,N,N,Y,Y,Y
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,TA,,,0,9,2008,WD,Normal,223500,12.317171,2706,3.5,42,7.903596,62.466834,12.25,1764,3189796,369664,846400,846400,4004001,4,4008004,36,26244.0,236196,1,4624.0,1,1764,0,1,749956,4004001.0,126562500,9,188356,0,0,1,0,0,0,0,N,Y,Y,Y,Y
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,TA,,,0,2,2006,WD,Abnorml,140000,11.849405,2473,2.0,307,7.813592,61.052213,4.00,94249,2948089,412164,571536,923521,3667225,1,3880900,49,0.0,46656,1,3600.0,1,1225,0,0,571536,3992004.0,91202500,9,291600,0,73984,1,0,0,0,0,N,Y,Y,Y,Y
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,TA,,,0,12,2008,WD,Normal,250000,12.429220,3343,3.5,276,8.114923,65.851975,12.25,76176,4831204,698896,1311025,1311025,4000000,4,4000000,81,122500.0,429025,1,7056.0,1,7056,36864,1,1108809,4000000.0,203347600,16,240100,0,0,1,0,0,0,0,N,Y,Y,Y,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,TA,,,0,8,2007,WD,Normal,175000,12.072547,2600,2.5,40,7.863651,61.837011,6.25,1600,2712609,211600,908209,908209,3996001,4,4000000,49,0.0,0,1,3844.0,0,1600,0,1,481636,3996001.0,62678889,9,908209,0,0,1,0,0,0,0,N,Y,Y,Y,Y
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,TA,MnPrv,,0,2,2010,WD,Normal,210000,12.254868,3615,3.0,349,8.193124,67.127276,9.00,121801,4297329,250000,2377764,4297329,3912484,4,3952144,49,14161.0,624100,4,7225.0,1,0,121801,0,0,3912484.0,173580625,9,346921,0,0,1,0,0,26569,0,N,N,Y,Y,Y
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,TA,GdPrv,Shed,2500,5,2010,WD,Normal,266500,12.493133,3492,2.0,60,8.158516,66.561387,4.00,3600,5475600,63504,1327104,1411344,3767481,4,4024036,81,0.0,75625,4,4356.0,0,3600,0,0,1327104,3767481.0,81757764,16,769129,0,0,1,0,6250000,0,0,N,Y,Y,Y,Y
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,TA,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,TA,,,0,4,2010,WD,Normal,142125,11.864469,2156,2.0,478,7.676474,58.928248,4.00,228484,1162084,57600,1162084,1162084,3802500,1,3984016,25,0.0,2401,0,4624.0,1,0,133956,0,0,3802500.0,94420089,4,0,0,12544,1,0,0,1058841,0,N,N,Y,Y,N


## split data set

In [9]:
train['SalePrice']=train['SalePrice1']
train=train.drop('SalePrice1',axis=1)

train_objs_num = len(train)

df_full = pd.concat(objs=[train, test], axis=0)
df_full_dummy = pd.get_dummies(df_full, drop_first=True)

train_dummy = df_full_dummy[:train_objs_num]
test_dummy = df_full_dummy[train_objs_num:]

test_dummy=test_dummy.drop('SalePrice',axis=1)

# train_dummy=pd.get_dummies(train, drop_first=True)
# test_dummy=pd.get_dummies(test, drop_first=True)

y=train['SalePrice']
X=train.drop('SalePrice',axis=1)

yd=train_dummy['SalePrice']
Xd=train_dummy.drop('SalePrice',axis=1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_state)
Xd_train, Xd_test, yd_train, yd_test = train_test_split(Xd, yd, test_size=0.2, random_state=rand_state)

sc = StandardScaler()
Xd_sc_train = sc.fit_transform(Xd_train)
Xd_sc_test = sc.fit_transform(Xd_test)

test_sc=sc.fit_transform(test_dummy)


# checking the percentage of training set!
np.round(len(X_train)/len(X),3)

0.8

## linear regression

In [10]:
%%time

lm = LinearRegression()

lm_params = [{'n_features_to_select': list(range(250, 300))}]

lm.fit(Xd_train, yd_train)
rfe = RFE(lm)             

lm_search = GridSearchCV(estimator = rfe, 
                        param_grid = lm_params, 
                        scoring=['neg_mean_squared_error', 'r2'],
                        refit='r2',
                        cv = 3, 
                        n_jobs=-1,
                        return_train_score=True)      

# fit the model
lm_result = lm_search.fit(Xd_train, yd_train)   

# summarize result
print('Train R^2 Score : %.3f'%lm_result.best_estimator_.score(Xd_train, yd_train))
print('Test R^2 Score : %.3f'%lm_result.best_estimator_.score(Xd_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%lm_result.best_score_)
print('Best Parameters : ',lm_result.best_params_)
print('Training mse : ',mean_squared_error(yd_train,lm_search.predict(Xd_train)))
print('Testing mse : ',mean_squared_error(yd_test,lm_search.predict(Xd_test)))

# Train R^2 Score : 0.946
# Test R^2 Score : 0.890
# Best R^2 Score Through Grid Search : 0.777
# Best Parameters :  {'n_features_to_select': 261}
# Training mse :  0.008558220652074759
# Testing mse :  0.01837241515214434
# CPU times: user 14.5 s, sys: 9.49 s, total: 24 s
# Wall time: 3min 6s

Train R^2 Score : 0.947
Test R^2 Score : 0.893
Best R^2 Score Through Grid Search : 0.780
Best Parameters :  {'n_features_to_select': 277}
Training mse :  0.008400494139087325
Testing mse :  0.017802367770479433
Wall time: 3min 12s


## lasso regression

In [11]:
%%time

lasso = Lasso()

lasso_alphas = [0.0003]
lasso_param = {
    'alpha':lasso_alphas
       }

# define search
lasso_search = GridSearchCV(lasso, lasso_param, 
                        scoring=['neg_mean_squared_error', 'r2'],
                        refit='r2', n_jobs=-1, cv=folds,
                        return_train_score=True)

# execute search
lasso_result = lasso_search.fit(Xd_sc_train, yd_train)

# summarize result
# print('Best Score: %s' % lasso_result.best_score_)
# print('Best Hyperparameters: %s' % lasso_result.best_params_)

print('Train R^2 Score : %.3f'%lasso_result.best_estimator_.score(Xd_sc_train, yd_train))
print('Test R^2 Score : %.3f'%lasso_result.best_estimator_.score(Xd_sc_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%lasso_result.best_score_)
print('Best Parameters : ',lasso_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,lasso_search.predict(Xd_sc_train)),4))
print('Testing mse : ',round(mean_squared_error(yd_test,lasso_search.predict(Xd_sc_test)),4))

# Train R^2 Score : 0.952
# Test R^2 Score : 0.903
# Best R^2 Score Through Grid Search : 0.864
# Best Parameters :  {'alpha': 0.0003}
# Training mse :  0.0076
# Testing mse :  0.0162
# CPU times: user 814 ms, sys: 266 ms, total: 1.08 s
# Wall time: 4.88 s

Train R^2 Score : 0.952
Test R^2 Score : 0.903
Best R^2 Score Through Grid Search : 0.864
Best Parameters :  {'alpha': 0.0003}
Training mse :  0.0076
Testing mse :  0.0162
Wall time: 2.5 s


## ridge regression

In [12]:
%%time

ridge = Ridge()

# ridge_alphas = [0.1, 0.3, 1, 3, 5, 10, 15, 18, 20]
ridge_alphas = [5]
ridge_param = {
    'alpha':ridge_alphas
       }
# define search
ridge_search = GridSearchCV(ridge, ridge_param, 
                        scoring=['neg_mean_squared_error', 'r2'],
                        refit='r2', n_jobs=-1, cv=folds,
                        return_train_score=True)

# execute search
ridge_result = ridge_search.fit(Xd_sc_train, yd_train)

print('Train R^2 Score : %.3f'%ridge_result.best_estimator_.score(Xd_sc_train, yd_train))
print('Test R^2 Score : %.3f'%ridge_result.best_estimator_.score(Xd_sc_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%ridge_result.best_score_)
print('Best Parameters : ',ridge_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,ridge_search.predict(Xd_sc_train)),4))
print('Testing mse : ',round(mean_squared_error(yd_test,ridge_search.predict(Xd_sc_test)),4))

# Train R^2 Score : 0.953
# Test R^2 Score : 0.899
# Best R^2 Score Through Grid Search : 0.854
# Best Parameters :  {'alpha': 5}
# Training mse :  0.0074
# Testing mse :  0.0168
# CPU times: user 192 ms, sys: 89 ms, total: 281 ms
# Wall time: 443 ms

Train R^2 Score : 0.953
Test R^2 Score : 0.899
Best R^2 Score Through Grid Search : 0.854
Best Parameters :  {'alpha': 5}
Training mse :  0.0074
Testing mse :  0.0168
Wall time: 356 ms


## elastic net regression

In [13]:
%%time

net = ElasticNet()

net_alphas = [0.001]
net_ratios =[0.3]
net_param = {
    'alpha':net_alphas,
    'l1_ratio':net_ratios
       }

# define search
net_search = GridSearchCV(net, net_param, 
                        scoring=['neg_mean_squared_error', 'r2'],
                        refit='r2', n_jobs=-1, cv=folds,
                        return_train_score=True)

# execute search
net_result = net_search.fit(Xd_sc_train, yd_train)

print('Train R^2 Score : %.3f'%net_result.best_estimator_.score(Xd_sc_train, yd_train))
print('Test R^2 Score : %.3f'%net_result.best_estimator_.score(Xd_sc_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%net_result.best_score_)
print('Best Parameters : ',net_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,net_search.predict(Xd_sc_train)),4))
print('Testing mse : ',round(mean_squared_error(yd_test,net_search.predict(Xd_sc_test)),4))

# Train R^2 Score : 0.952
# Test R^2 Score : 0.903
# Best R^2 Score Through Grid Search : 0.863
# Best Parameters :  {'alpha': 0.001, 'l1_ratio': 0.30000000000000004}
# Training mse :  0.0076
# Testing mse :  0.0161
# CPU times: user 3.34 s, sys: 396 ms, total: 3.73 s
# Wall time: 31.7 s


Train R^2 Score : 0.952
Test R^2 Score : 0.903
Best R^2 Score Through Grid Search : 0.862
Best Parameters :  {'alpha': 0.001, 'l1_ratio': 0.3}
Training mse :  0.0076
Testing mse :  0.0161
Wall time: 2 s


## PCA - Linear regression

In [14]:
%%time

n_comp=200
pca=PCA(n_components=n_comp)

Xd_reduced_train = pca.fit_transform(Xd_train)
Xd_reduced_test = pca.transform(Xd_test)
test_reduced=pca.transform(test_dummy)

Xd_sc_reduced_train = sc.fit_transform(Xd_reduced_train)
Xd_sc_reduced_test = sc.fit_transform(Xd_reduced_test)

test_sc_reduced=sc.fit_transform(test_reduced)

pca_lm_params = [{'n_features_to_select': list(range(100, n_comp))}]

pca_lm = LinearRegression()
pca_lm.fit(Xd_reduced_train, yd_train)
pca_rfe = RFE(pca_lm)            

pca_lm_search = GridSearchCV(estimator = pca_rfe, 
                        param_grid = pca_lm_params, 
                        scoring='r2',
                        cv = 3, 
                        n_jobs=-1,
                        return_train_score=True)      

# fit the model
pca_lm_result = pca_lm_search.fit(Xd_reduced_train, yd_train)  

# Train R^2 Score : 0.925
# Test R^2 Score : -0.797
# Best R^2 Score Through Grid Search : 0.809
# Best Parameters :  {'n_features_to_select': 198}
# Training mse :  0.0119
# Testing mse :  0.2991
# CPU times: user 3.61 s, sys: 1.12 s, total: 4.73 s
# Wall time: 1min 23s


Wall time: 1min 24s


## PCA - Lasso regression

In [15]:
%%time

pca_lasso = Lasso()

# pca_lasso_alphas = [1e-5, 5e-5, 1e-4, 5e-4, 1e-4, 7e-3, 
#                 5e-3, 3e-3, 1e-3, 5e-2, 1e-2]
pca_lasso_alphas = [1e-5]
pca_lasso_param = {
    'alpha':pca_lasso_alphas
       }

# define search
pca_lasso_search = GridSearchCV(pca_lasso, pca_lasso_param, 
                        scoring=['neg_mean_squared_error', 'r2'],
                        refit='r2', n_jobs=-1, cv=folds,
                        return_train_score=True)

# execute search
pca_lasso_result = pca_lasso_search.fit(Xd_sc_reduced_train, yd_train)

print('Train R^2 Score : %.3f'%pca_lasso_result.best_estimator_.score(Xd_sc_reduced_train, yd_train))
print('Test R^2 Score : %.3f'%pca_lasso_result.best_estimator_.score(Xd_sc_reduced_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%pca_lasso_result.best_score_)
print('Best Parameters : ',pca_lasso_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,pca_lasso_search.predict(Xd_sc_reduced_train)),4))
print('Testing mse : ',round(mean_squared_error(yd_test,pca_lasso_search.predict(Xd_sc_reduced_test)),4))

# Train R^2 Score : 0.937
# Test R^2 Score : 0.862
# Best R^2 Score Through Grid Search : 0.874
# Best Parameters :  {'alpha': 1e-05}
# Training mse :  0.0099
# Testing mse :  0.023
# CPU times: user 226 ms, sys: 87.1 ms, total: 313 ms
# Wall time: 381 ms

Train R^2 Score : 0.937
Test R^2 Score : 0.860
Best R^2 Score Through Grid Search : 0.874
Best Parameters :  {'alpha': 1e-05}
Training mse :  0.0099
Testing mse :  0.0233
Wall time: 326 ms


## PCA - Ridge regression

In [16]:
%%time

pca_ridge = Ridge()

pca_ridge_alphas = [30]
pca_ridge_param = {
    'alpha':pca_ridge_alphas
       }
# define search
pca_ridge_search = GridSearchCV(pca_ridge, pca_ridge_param, 
                        scoring=['neg_mean_squared_error', 'r2'],
                        refit='r2', n_jobs=-1, cv=folds,
                        return_train_score=True)

# execute search
pca_ridge_result = pca_ridge_search.fit(Xd_sc_reduced_train, yd_train)

# summarize result
print('Train R^2 Score : %.3f'%pca_ridge_result.best_estimator_.score(Xd_sc_reduced_train, yd_train))
print('Test R^2 Score : %.3f'%pca_ridge_result.best_estimator_.score(Xd_sc_reduced_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%pca_ridge_result.best_score_)
print('Best Parameters : ',pca_ridge_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,pca_ridge_search.predict(Xd_sc_reduced_train)),4))
print('Testing mse : ', round(mean_squared_error(yd_test,pca_ridge_search.predict(Xd_sc_reduced_test)),4))

# Train R^2 Score : 0.936
# Test R^2 Score : 0.864
# Best R^2 Score Through Grid Search : 0.879
# Best Parameters :  {'alpha': 30}
# Training mse :  0.01
# Testing mse :  0.0226
# CPU times: user 243 ms, sys: 74.3 ms, total: 317 ms
# Wall time: 406 ms


Train R^2 Score : 0.936
Test R^2 Score : 0.863
Best R^2 Score Through Grid Search : 0.879
Best Parameters :  {'alpha': 30}
Training mse :  0.01
Testing mse :  0.0228
Wall time: 279 ms


## PCA - Elastic net regression

In [17]:
%%time

pca_net = ElasticNet()

pca_net_alphas = [0.05]
pca_net_ratios = [0]
pca_net_param = {
    'alpha':pca_net_alphas,
    'l1_ratio':pca_net_ratios
       }

# define search
pca_net_search = GridSearchCV(pca_net, pca_net_param, 
                        scoring=['neg_mean_squared_error', 'r2'],
                        refit='r2', n_jobs=-1, cv=folds,
                        return_train_score=True)

# execute search
pca_net_result = pca_net_search.fit(Xd_sc_reduced_train, yd_train)

# summarize result
print('Train R^2 Score : %.3f'%pca_net_result.best_estimator_.score(Xd_sc_reduced_train, yd_train))
print('Test R^2 Score : %.3f'%pca_net_result.best_estimator_.score(Xd_sc_reduced_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%pca_net_result.best_score_)
print('Best Parameters : ',pca_net_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,pca_net_search.predict(Xd_sc_reduced_train)),4))
print('Testing mse : ', round(mean_squared_error(yd_test,pca_net_search.predict(Xd_sc_reduced_test)),4))
# Train R^2 Score : 0.935
# Test R^2 Score : 0.866
# Best R^2 Score Through Grid Search : 0.879
# Best Parameters :  {'alpha': 0.05, 'l1_ratio': 0.0}
# Training mse :  0.0103
# Testing mse :  0.0224
# CPU times: user 1.77 s, sys: 594 ms, total: 2.36 s
# Wall time: 5.67 s

Train R^2 Score : 0.935
Test R^2 Score : 0.864
Best R^2 Score Through Grid Search : 0.879
Best Parameters :  {'alpha': 0.05, 'l1_ratio': 0}
Training mse :  0.0103
Testing mse :  0.0226
Wall time: 1.55 s


## PCA - KNN regression

In [18]:
%%time

knn_param = [{'n_neighbors': [2], 
              'weights': ['distance']}]

knn_search = GridSearchCV(estimator=KNeighborsRegressor(), param_grid= knn_param, 
             refit = True, verbose=3, cv=folds )

knn_result = knn_search.fit(Xd_sc_reduced_train,yd_train)

print('Train R^2 Score : %.3f'%knn_result.best_estimator_.score(Xd_sc_reduced_train, yd_train))
print('Test R^2 Score : %.3f'%knn_result.best_estimator_.score(Xd_sc_reduced_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%knn_result.best_score_)
print('Best Parameters : ',knn_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,knn_search.predict(Xd_sc_reduced_train)),4))
print('Testing mse : ', round(mean_squared_error(yd_test,knn_search.predict(Xd_sc_reduced_test)),4))

# Train R^2 Score : 1.000
# Test R^2 Score : -0.208
# Best R^2 Score Through Grid Search : 0.274
# Best Parameters :  {'n_neighbors': 2, 'weights': 'distance'}
# Training mse :  0.0
# Testing mse :  0.2012
# CPU times: user 2.02 s, sys: 1.45 s, total: 3.46 s
# Wall time: 881 ms

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ...n_neighbors=2, weights=distance;, score=0.338 total time=   0.0s
[CV 2/5] END ...n_neighbors=2, weights=distance;, score=0.256 total time=   0.0s
[CV 3/5] END ...n_neighbors=2, weights=distance;, score=0.260 total time=   0.0s
[CV 4/5] END ...n_neighbors=2, weights=distance;, score=0.214 total time=   0.0s
[CV 5/5] END ...n_neighbors=2, weights=distance;, score=0.316 total time=   0.0s
Train R^2 Score : 1.000
Test R^2 Score : -0.164
Best R^2 Score Through Grid Search : 0.277
Best Parameters :  {'n_neighbors': 2, 'weights': 'distance'}
Training mse :  0.0
Testing mse :  0.1937
Wall time: 369 ms


## SVM Regression

In [19]:
%%time

svr_param = {'C': [1], 
                 'gamma': [0.0003], 
                 'kernel': ['rbf']} 

svr_search = GridSearchCV(estimator=SVR(), param_grid= svr_param, 
             refit = True, verbose=3, cv=folds )

svr_result = svr_search.fit(Xd_sc_train,yd_train)

print('Train R^2 Score : %.3f'%svr_result.best_estimator_.score(Xd_sc_train, yd_train))
print('Test R^2 Score : %.3f'%svr_result.best_estimator_.score(Xd_sc_test, yd_test))
print('Best R^2 Score Through Grid Search : %.3f'%svr_result.best_score_)
print('Best Parameters : ',svr_result.best_params_)
print('Training mse : ', round(mean_squared_error(yd_train,svr_search.predict(Xd_sc_train)),4))
print('Testing mse : ', round(mean_squared_error(yd_test,svr_search.predict(Xd_sc_test)),4))
# Train R^2 Score : 0.934
# Test R^2 Score : 0.918
# Best R^2 Score Through Grid Search : 0.882
# Best Parameters :  {'C': 1, 'gamma': 0.0003, 'kernel': 'rbf'}
# Training mse :  0.0104
# Testing mse :  0.0136
# CPU times: user 42.2 s, sys: 60.3 ms, total: 42.3 s
# Wall time: 41.9 s

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END .....C=1, gamma=0.0003, kernel=rbf;, score=0.887 total time=   0.2s
[CV 2/5] END .....C=1, gamma=0.0003, kernel=rbf;, score=0.875 total time=   0.1s
[CV 3/5] END .....C=1, gamma=0.0003, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .....C=1, gamma=0.0003, kernel=rbf;, score=0.871 total time=   0.1s
[CV 5/5] END .....C=1, gamma=0.0003, kernel=rbf;, score=0.869 total time=   0.1s
Train R^2 Score : 0.934
Test R^2 Score : 0.919
Best R^2 Score Through Grid Search : 0.881
Best Parameters :  {'C': 1, 'gamma': 0.0003, 'kernel': 'rbf'}
Training mse :  0.0104
Testing mse :  0.0135
Wall time: 1.78 s


## Decision Tree

In [20]:
%%time

le = preprocessing.LabelEncoder()
X_lb_train = X_train.apply(le.fit_transform)
X_lb_test = X_test.apply(le.fit_transform)

dt_param = {  'bootstrap': [True], 
              'max_depth': [ 20], 
              'max_features': ['auto'], 
              'n_estimators': [1300]}

dt_search = GridSearchCV(estimator = RandomForestRegressor(), param_grid = dt_param, 
                          cv = folds, n_jobs = 1, verbose = 0, return_train_score=True)

dt_result = dt_search.fit(X_lb_train,y_train)

print('Train R^2 Score : %.3f'%dt_result.best_estimator_.score(X_lb_train, y_train))
print('Test R^2 Score : %.3f'%dt_result.best_estimator_.score(X_lb_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%dt_result.best_score_)
print('Best Parameters : ',dt_result.best_params_)
print('Training mse : ', round(mean_squared_error(y_train,dt_search.predict(X_lb_train)),4))
print('Testing mse : ', round(mean_squared_error(y_test,dt_search.predict(X_lb_test)),4))
# Train R^2 Score : 0.983
# Test R^2 Score : 0.110
# Best R^2 Score Through Grid Search : 0.872
# Best Parameters :  {'bootstrap': True, 'max_depth': 20, 'max_features': 'auto', 'n_estimators': 1300}
# Training mse :  0.0027
# Testing mse :  0.1482
# CPU times: user 50min 40s, sys: 7.41 s, total: 50min 47s
# Wall time: 50min 48s

Train R^2 Score : 0.983
Test R^2 Score : 0.115
Best R^2 Score Through Grid Search : 0.869
Best Parameters :  {'bootstrap': True, 'max_depth': 20, 'max_features': 'auto', 'n_estimators': 1300}
Training mse :  0.0027
Testing mse :  0.1474
Wall time: 6min 2s


## Random forest

In [21]:
%%time

ordinal_categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 
                       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 
                       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
                       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 
                       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 
                       'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 
                       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 
                       'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 
                       'haspool', 'has2ndfloor', 'hasgarage', 'hasbsmt', 'hasfireplace',
                       'OverallQual', 'OverallCond', 'GarageCars']

enc = OrdinalEncoder()

X_ord_train=X_train.copy()
X_ord_test=X_test.copy()

X_ord_train[ordinal_categorical]= enc.fit_transform(X_train[ordinal_categorical])
X_ord_test[ordinal_categorical]= enc.fit_transform(X_test[ordinal_categorical])
X_ord_train.head()

test_ord=test.copy()
test_ord[ordinal_categorical]= enc.fit_transform(test[ordinal_categorical])

# Best Parameters :  {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 300}

rf_param = {'n_estimators': [1000], 
            'max_features':['sqrt'], 
            'max_depth':[25]}

rf_search = GridSearchCV(estimator=RandomForestRegressor(random_state=rand_state),
                         param_grid= rf_param, refit = True, verbose=2, cv=folds )

rf_result = rf_search.fit(X_ord_train,y_train)

print('Train R^2 Score : %.3f'%rf_search.best_estimator_.score(X_ord_train, y_train))
print('Test R^2 Score : %.3f'%rf_search.best_estimator_.score(X_ord_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%rf_search.best_score_)
print('Best Parameters : ',rf_search.best_params_)
print('Training mse : ', round(mean_squared_error(y_train,rf_search.predict(X_ord_train)),4))
print('Testing mse : ', round(mean_squared_error(y_test,rf_search.predict(X_ord_test)),4))

# Best Parameters :  {'max_depth': 25, 'max_features': 'sqrt', 'n_estimators': 1000}

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .max_depth=25, max_features=sqrt, n_estimators=1000; total time=  10.9s
[CV] END .max_depth=25, max_features=sqrt, n_estimators=1000; total time=   8.0s
[CV] END .max_depth=25, max_features=sqrt, n_estimators=1000; total time=   5.3s
[CV] END .max_depth=25, max_features=sqrt, n_estimators=1000; total time=   5.9s
[CV] END .max_depth=25, max_features=sqrt, n_estimators=1000; total time=   5.7s
Train R^2 Score : 0.982
Test R^2 Score : 0.905
Best R^2 Score Through Grid Search : 0.862
Best Parameters :  {'max_depth': 25, 'max_features': 'sqrt', 'n_estimators': 1000}
Training mse :  0.0029
Testing mse :  0.0159
Wall time: 44.1 s


## Gradient Boosting regression (GBM)

In [22]:
%%time

n_samples = X_ord_train.shape[0]
n_features = X_ord_train.shape[1]

# n_estimators represents the number of trees in the forest.
gbr_param={'n_estimators':[1500],
            'learning_rate':[0.005],
            'max_depth':[30],
            'max_features':['sqrt']}

gbr_search = GridSearchCV(GradientBoostingRegressor(random_state=1), 
                          param_grid=gbr_param, n_jobs=-1, cv=folds, verbose=5)
gbr_result=gbr_search.fit(X_ord_train,y_train)

print('Train R^2 Score : %.3f'%gbr_result.best_estimator_.score(X_ord_train, y_train))
print('Test R^2 Score : %.3f'%gbr_result.best_estimator_.score(X_ord_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%gbr_result.best_score_)
print('Best Parameters : ',gbr_result.best_params_)
print('Training mse : ', round(mean_squared_error(y_train,gbr_search.predict(X_ord_train)),4))
print('Testing mse : ', round(mean_squared_error(y_test,gbr_search.predict(X_ord_test)),4))
# Train R^2 Score : 1.000
# Test R^2 Score : 0.912
# Best R^2 Score Through Grid Search : 0.867
# Best Parameters :  {'learning_rate': 0.005, 'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 1500}
# Training mse :  0.0
# Testing mse :  0.0146
# CPU times: user 23.2 s, sys: 1.46 s, total: 24.7 s
# Wall time: 1h 12min 3s

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Train R^2 Score : 1.000
Test R^2 Score : 0.911
Best R^2 Score Through Grid Search : 0.866
Best Parameters :  {'learning_rate': 0.005, 'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 1500}
Training mse :  0.0
Testing mse :  0.0148
Wall time: 37.6 s


## ADA regression

In [23]:
%%time
n_samples = X_ord_train.shape[0]
n_features = X_ord_train.shape[1]

ada_param = {
            'base_estimator':[None, DecisionTreeRegressor()],
            'n_estimators': [300,500],
            'learning_rate': [1.0 ]
         }

ada_search = GridSearchCV(AdaBoostRegressor(random_state=1), 
                                        param_grid=ada_param, 
                                        n_jobs=-1, cv=folds, verbose=5)
ada_result=ada_search.fit(X_ord_train,y_train)

print('Train R^2 Score : %.3f'%ada_result.best_estimator_.score(X_ord_train, y_train))
print('Test R^2 Score : %.3f'%ada_result.best_estimator_.score(X_ord_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%ada_result.best_score_)
print('Best Parameters : ',ada_result.best_params_)
print('Training mse : ', round(mean_squared_error(y_train,ada_search.predict(X_ord_train)),4))
print('Testing mse : ', round(mean_squared_error(y_test,ada_search.predict(X_ord_test)),4))

# Train R^2 Score : 1.000
# Test R^2 Score : 0.906
# Best R^2 Score Through Grid Search : 0.872
# Best Parameters :  {'base_estimator': DecisionTreeRegressor(), 'learning_rate': 1.0, 'n_estimators': 500}
# Training mse :  0.0
# Testing mse :  0.0156
# CPU times: user 23.4 s, sys: 734 ms, total: 24.1 s
# Wall time: 55min 5s

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Train R^2 Score : 1.000
Test R^2 Score : 0.907
Best R^2 Score Through Grid Search : 0.874
Best Parameters :  {'base_estimator': DecisionTreeRegressor(), 'learning_rate': 1.0, 'n_estimators': 300}
Training mse :  0.0
Testing mse :  0.0155
Wall time: 1min 26s


## Extreme Gradient Boosting (XGBoost)

In [24]:
%%time

xgb_param = {
        'n_estimators': [300,500],
        'max_depth': [10,15],
        'eta': [0.05]
        }
xgb_search = GridSearchCV(xgb.XGBRegressor(), xgb_param, n_jobs=-1, cv=folds)

xgb_result=xgb_search.fit(X_ord_train, y_train)


print('Train R^2 Score : %.3f'%xgb_result.best_estimator_.score(X_ord_train, y_train))
print('Test R^2 Score : %.3f'%xgb_result.best_estimator_.score(X_ord_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%xgb_result.best_score_)
print('Best Parameters : ',xgb_result.best_params_)
print('Training mse : ', round(mean_squared_error(y_train,xgb_search.predict(X_ord_train)),4))
print('Testing mse : ', round(mean_squared_error(y_test,xgb_search.predict(X_ord_test)),4))

# Train R^2 Score : 1.000
# Test R^2 Score : 0.906
# Best R^2 Score Through Grid Search : 0.884
# Best Parameters :  {'eta': 0.05, 'max_depth': 15, 'n_estimators': 500}
# Training mse :  0.0
# Testing mse :  0.0156
# CPU times: user 25.8 s, sys: 743 ms, total: 26.6 s
# Wall time: 32min 3s

Train R^2 Score : 1.000
Test R^2 Score : 0.902
Best R^2 Score Through Grid Search : 0.883
Best Parameters :  {'eta': 0.05, 'max_depth': 10, 'n_estimators': 500}
Training mse :  0.0
Testing mse :  0.0163
Wall time: 1min 14s


## Cat Boost Regression

In [25]:
from catboost import CatBoostRegressor

cbr_param = {
            'iterations':[100],
            'learning_rate':[0.1],
            'bootstrap_type':['No']
}

cbr_search = GridSearchCV(CatBoostRegressor(),cbr_param, cv=folds)
cbr_result=cbr_search.fit(X_ord_train, y_train)

print('Train R^2 Score : %.3f'%cbr_result.best_estimator_.score(X_ord_train, y_train))
print('Test R^2 Score : %.3f'%cbr_result.best_estimator_.score(X_ord_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%cbr_result.best_score_)
print('Best Parameters : ',cbr_result.best_params_)
print('Training mse : ', round(mean_squared_error(y_train,cbr_search.predict(X_ord_train)),4))
print('Testing mse : ', round(mean_squared_error(y_test,cbr_search.predict(X_ord_test)),4))

# Train R^2 Score : 0.956
# Test R^2 Score : 0.903
# Best R^2 Score Through Grid Search : 0.884
# Best Parameters :  {'bootstrap_type': 'No', 'iterations': 100, 'learning_rate': 0.1}
# Training mse :  0.007
# Testing mse :  0.0161
# CPU times: user 6min 34s, sys: 23.5 s, total: 6min 57s
# Wall time: 2min 16s

0:	learn: 0.3750725	total: 180ms	remaining: 17.8s
1:	learn: 0.3528866	total: 207ms	remaining: 10.2s
2:	learn: 0.3328852	total: 230ms	remaining: 7.44s
3:	learn: 0.3133952	total: 250ms	remaining: 6.01s
4:	learn: 0.2952975	total: 265ms	remaining: 5.03s
5:	learn: 0.2798544	total: 277ms	remaining: 4.34s
6:	learn: 0.2664174	total: 290ms	remaining: 3.85s
7:	learn: 0.2542544	total: 302ms	remaining: 3.47s
8:	learn: 0.2424093	total: 314ms	remaining: 3.17s
9:	learn: 0.2325113	total: 325ms	remaining: 2.92s
10:	learn: 0.2223526	total: 336ms	remaining: 2.72s
11:	learn: 0.2128939	total: 349ms	remaining: 2.56s
12:	learn: 0.2045082	total: 360ms	remaining: 2.41s
13:	learn: 0.1976925	total: 373ms	remaining: 2.29s
14:	learn: 0.1909360	total: 385ms	remaining: 2.18s
15:	learn: 0.1845800	total: 397ms	remaining: 2.08s
16:	learn: 0.1791143	total: 408ms	remaining: 1.99s
17:	learn: 0.1735678	total: 419ms	remaining: 1.91s
18:	learn: 0.1692752	total: 436ms	remaining: 1.86s
19:	learn: 0.1649105	total: 447ms	remaini

74:	learn: 0.0962802	total: 1s	remaining: 333ms
75:	learn: 0.0956188	total: 1.01s	remaining: 319ms
76:	learn: 0.0947198	total: 1.02s	remaining: 306ms
77:	learn: 0.0942439	total: 1.03s	remaining: 292ms
78:	learn: 0.0937177	total: 1.05s	remaining: 279ms
79:	learn: 0.0931881	total: 1.06s	remaining: 265ms
80:	learn: 0.0923781	total: 1.07s	remaining: 252ms
81:	learn: 0.0921658	total: 1.09s	remaining: 239ms
82:	learn: 0.0913917	total: 1.1s	remaining: 225ms
83:	learn: 0.0908682	total: 1.11s	remaining: 212ms
84:	learn: 0.0906504	total: 1.13s	remaining: 199ms
85:	learn: 0.0898027	total: 1.14s	remaining: 185ms
86:	learn: 0.0893583	total: 1.15s	remaining: 172ms
87:	learn: 0.0891840	total: 1.16s	remaining: 158ms
88:	learn: 0.0886777	total: 1.17s	remaining: 145ms
89:	learn: 0.0879128	total: 1.18s	remaining: 132ms
90:	learn: 0.0871984	total: 1.2s	remaining: 118ms
91:	learn: 0.0869811	total: 1.21s	remaining: 105ms
92:	learn: 0.0867586	total: 1.22s	remaining: 91.8ms
93:	learn: 0.0865641	total: 1.23s	r

36:	learn: 0.1244062	total: 462ms	remaining: 786ms
37:	learn: 0.1232613	total: 475ms	remaining: 774ms
38:	learn: 0.1217281	total: 491ms	remaining: 767ms
39:	learn: 0.1208006	total: 505ms	remaining: 758ms
40:	learn: 0.1197682	total: 517ms	remaining: 743ms
41:	learn: 0.1188327	total: 529ms	remaining: 730ms
42:	learn: 0.1180721	total: 541ms	remaining: 717ms
43:	learn: 0.1169462	total: 551ms	remaining: 701ms
44:	learn: 0.1160416	total: 561ms	remaining: 686ms
45:	learn: 0.1150864	total: 573ms	remaining: 673ms
46:	learn: 0.1138115	total: 585ms	remaining: 660ms
47:	learn: 0.1131709	total: 596ms	remaining: 646ms
48:	learn: 0.1124303	total: 608ms	remaining: 633ms
49:	learn: 0.1113207	total: 621ms	remaining: 621ms
50:	learn: 0.1108714	total: 636ms	remaining: 611ms
51:	learn: 0.1100530	total: 655ms	remaining: 604ms
52:	learn: 0.1094918	total: 671ms	remaining: 595ms
53:	learn: 0.1084548	total: 684ms	remaining: 583ms
54:	learn: 0.1079622	total: 701ms	remaining: 574ms
55:	learn: 0.1075846	total: 714

97:	learn: 0.0837830	total: 1.19s	remaining: 24.3ms
98:	learn: 0.0836869	total: 1.2s	remaining: 12.1ms
99:	learn: 0.0830591	total: 1.22s	remaining: 0us
0:	learn: 0.3732954	total: 25.9ms	remaining: 2.56s
1:	learn: 0.3500090	total: 38.3ms	remaining: 1.88s
2:	learn: 0.3307889	total: 56.8ms	remaining: 1.84s
3:	learn: 0.3126063	total: 73.5ms	remaining: 1.76s
4:	learn: 0.2954597	total: 85.6ms	remaining: 1.63s
5:	learn: 0.2802700	total: 97.6ms	remaining: 1.53s
6:	learn: 0.2659895	total: 110ms	remaining: 1.46s
7:	learn: 0.2530501	total: 123ms	remaining: 1.41s
8:	learn: 0.2413393	total: 130ms	remaining: 1.31s
9:	learn: 0.2302445	total: 144ms	remaining: 1.29s
10:	learn: 0.2206663	total: 157ms	remaining: 1.27s
11:	learn: 0.2120576	total: 168ms	remaining: 1.23s
12:	learn: 0.2046436	total: 179ms	remaining: 1.2s
13:	learn: 0.1978326	total: 191ms	remaining: 1.17s
14:	learn: 0.1912935	total: 204ms	remaining: 1.16s
15:	learn: 0.1854299	total: 219ms	remaining: 1.15s
16:	learn: 0.1798339	total: 231ms	rem

## LightGBM Regression

In [26]:
%%time

# lgbr_param = {
#         'n_estimators': [500, 800, 1000,1300, 1500],
#         'max_depth': [10, 15,20,25,30],
#         'learning_rate': [0.1, 0.5, 1],
#         'boosting_type': ['dart']
#         }
# Best Parameters :  {'boosting_type': 'dart', 'learning_rate': 0.5, 'max_depth': 15, 'n_estimators': 1300}
# Best Parameters :  {'boosting_type': 'dart', 'learning_rate': 0.8, 'max_depth': 15, 'n_estimators': 1300}

# lgbr_param = {
#         'n_estimators': [1300,1500, 2000, 3000],
#         'max_depth': [10, 15,20,30],
#         'learning_rate': [ 0.8, 0.9, 1],
#         'boosting_type': ['dart']
#         }
# lgbr_param = {
#         'n_estimators': [3000, 4000],
#         'max_depth': [10, 15],
#         'learning_rate': [ 0.2, 0.8],
#         'boosting_type': ['dart']
#         }

lgbr_param = {
        'n_estimators': [ 5000],
        'max_depth': [15],
        'learning_rate': [ 0.8],
        'boosting_type': ['dart']
        }
lgbr_search = GridSearchCV(lgb.LGBMRegressor(), lgbr_param, n_jobs=-1, cv=folds)

lgbr_result=lgbr_search.fit(X_ord_train, y_train)

print('')
print('Train R^2 Score : %.3f'%lgbr_result.best_estimator_.score(X_ord_train, y_train))
print('Test R^2 Score : %.3f'%lgbr_result.best_estimator_.score(X_ord_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%lgbr_result.best_score_)
print('Best Parameters : ',lgbr_result.best_params_)
print('Training mse : ', round(mean_squared_error(y_train,lgbr_search.predict(X_ord_train)),4))
print('Testing mse : ', round(mean_squared_error(y_test,lgbr_search.predict(X_ord_test)),4))

# Train R^2 Score : 1.000
# Test R^2 Score : 0.985
# Best R^2 Score Through Grid Search : 0.985
# Best Parameters :  {'boosting_type': 'dart', 'learning_rate': 0.8, 'max_depth': 15, 'n_estimators': 3000}
# Training mse :  0.0
# Testing mse :  0.0007


Train R^2 Score : 1.000
Test R^2 Score : 0.899
Best R^2 Score Through Grid Search : 0.871
Best Parameters :  {'boosting_type': 'dart', 'learning_rate': 0.8, 'max_depth': 15, 'n_estimators': 5000}
Training mse :  0.0
Testing mse :  0.0169
Wall time: 3min 23s


## Blended model

In [27]:
def blended_predict(train_or_test):
    if train_or_test=='train':
        return ((0.05 * lasso_search.predict(Xd_sc_train)) + \
                (0.05 * ridge_search.predict(Xd_sc_train)) + \
                (0.1 * net_search.predict(Xd_sc_train)) + \
                (0.2 * svr_search.predict(Xd_sc_train)) + \
                (0.1 * rf_search.predict(X_ord_train)) + \
                (0.1 * gbr_search.predict(X_ord_train)) + \
                (0.1 * ada_search.predict(X_ord_train)) + \
                (0.1 * xgb_search.predict(X_ord_train)) + \
                (0.1 * cbr_search.predict(X_ord_train)) + \
                (0.1 * lgbr_search.predict(X_ord_train)))
    else:
        return ((0.05 * lasso_search.predict(Xd_sc_test)) + \
                (0.05 * ridge_search.predict(Xd_sc_test)) + \
                (0.1 * net_search.predict(Xd_sc_test)) + \
                (0.2 * svr_search.predict(Xd_sc_test)) + \
                (0.1 * rf_search.predict(X_ord_test)) + \
                (0.1 * gbr_search.predict(X_ord_test)) + \
                (0.1 * ada_search.predict(X_ord_test)) + \
                (0.1 * xgb_search.predict(X_ord_test)) + \
                (0.1 * cbr_search.predict(X_ord_test)) + \
                (0.1 * lgbr_search.predict(X_ord_test)))  

# def blended_predict(train_or_test):
#     if train_or_test=='train':
#         return (
#                 (0.3 * svr_search.predict(Xd_sc_train)) + \
#                 (0.2 * gbr_search.predict(X_ord_train)) + \
#                 (0.1 * ada_search.predict(X_ord_train)) + \
#                 (0.1 * xgb_search.predict(X_ord_train)) + \
#                 (0.1 * cbr_search.predict(X_ord_train)) + \
#                 (0.2 * lgbr_search.predict(X_ord_train)))
#     else:
#         return (
#                 (0.3 * svr_search.predict(Xd_sc_test)) + \
#                 (0.2 * gbr_search.predict(X_ord_test)) + \
#                 (0.1 * ada_search.predict(X_ord_test)) + \
#                 (0.1 * xgb_search.predict(X_ord_test)) + \
#                 (0.1 * cbr_search.predict(X_ord_test)) + \
#                 (0.2 * lgbr_search.predict(X_ord_test)))  

In [28]:
df=pd.DataFrame()    

def score_list(regr, search=None , result=None , 
               X_train=None , y_train=y_train , 
               X_test=None , y_test=y_test ):
    if regr=='Blended Model':
        y_pred_train=blended_predict('train')
        y_pred_test=blended_predict('test')
        
        score_train=round(r2_score(y_train, y_pred_train),4)
        score_test=round(r2_score(y_test, y_pred_test),4)

        mse_train=round(mean_squared_error(y_train,y_pred_train),4)
        mse_test=round(mean_squared_error(y_test,y_pred_test),4)

        rmse_train=round(np.sqrt(mse_train),5) 
        rmse_test=round(np.sqrt(mse_test),5)  
        
    else:
#         score_train=round(result.best_estimator_.score(X_train,y_train),4)
#         score_test=round(result.best_estimator_.score(X_test,y_test),4)

        score_train=round(search.score(X_train,y_train),4)
        score_test=round(search.score(X_test,y_test),4)


        mse_train=round(mean_squared_error(y_train,search.predict(X_train)),4)
        mse_test=round(mean_squared_error(y_test,search.predict(X_test)),4)


        rmse_train=round(np.sqrt(mse_train),5)    
        rmse_test=round(np.sqrt(mse_test),5)    


    df[regr]=[score_train, mse_train, rmse_train, score_test,mse_test,rmse_test]
    return df 
    
    
# stack_gen1 = StackingCVRegressor(regressors=(lasso_search, ridge_search, net_search, svr_search),
#                                 meta_regressor=net_search,
#                                 use_features_in_secondary=True)
# stack_gen2 = StackingCVRegressor(regressors=(rf_search, gbr_search, ada_search, cbr_search, lgbr_search),
#                                 meta_regressor=lgbr_search,
#                                 use_features_in_secondary=True)
# stack_result = stack_gen1.fit(Xd_sc_train, yd_train)
# stack_result = stack_gen2.fit(X_ord_train, yd_train)

    
# df['Scores']=['Train Score', 'Test Score', 'Train MSE', 'Test MSE', 'Train RMSE', 'Test RMSE']
df=score_list('Linear Model', lm_search, lm_result, Xd_train, yd_train, Xd_test, yd_test)
df=score_list('Lasso', lasso_search, lasso_result, Xd_sc_train, yd_train, Xd_sc_test, yd_test)
df=score_list('Ridge', ridge_search, ridge_result, Xd_sc_train, yd_train, Xd_sc_test, yd_test)
df=score_list('ElasticNet', net_search, net_result, Xd_sc_train, yd_train, Xd_sc_test, yd_test)
df=score_list('Decision Tree', dt_search, dt_result, X_lb_train, yd_train, X_lb_test, yd_test)
df=score_list('SVR', svr_search, svr_result, Xd_sc_train, yd_train, Xd_sc_test, yd_test)
df=score_list('Random Forest', rf_search, rf_result, X_ord_train, y_train, X_ord_test, y_test)
df=score_list('GBM', gbr_search, gbr_result, X_ord_train, y_train, X_ord_test, y_test)
df=score_list('ADA', ada_search, ada_result, X_ord_train, y_train, X_ord_test, y_test)
df=score_list('XGBoost', xgb_search, xgb_result, X_ord_train, y_train, X_ord_test, y_test)
df=score_list('Cat Boost', cbr_search, cbr_result, X_ord_train, y_train, X_ord_test, y_test)
df=score_list('LightGBM', lgbr_search, lgbr_result, X_ord_train, y_train, X_ord_test, y_test)
df=score_list('PCA-Linear', pca_lm_search, pca_lm_result, Xd_reduced_train, yd_train, Xd_reduced_test, yd_test)
df=score_list('PCA-Lasso', pca_lasso_search, pca_lasso_result, Xd_reduced_train, yd_train, Xd_reduced_test, yd_test)
df=score_list('PCA-Ridge', pca_ridge_search, pca_ridge_result, Xd_reduced_train, yd_train, Xd_reduced_test, yd_test)
df=score_list('PCA-ElasticNet', pca_net_search, pca_net_result, Xd_reduced_train, yd_train, Xd_reduced_test, yd_test)
df=score_list('PCA-KNN', knn_search, knn_result, Xd_reduced_train, yd_train, Xd_reduced_test, yd_test)
# df=score_list('Stack1', stack_gen1, net_result, Xd_sc_train, yd_train, Xd_sc_test, yd_test)
# df=score_list('Stack2', stack_gen2, lgbr_result, X_ord_train, yd_train, X_ord_test, yd_test)
df=score_list(regr='Blended Model')

df=df.transpose()
df.columns=['Train Score', 'Train MSE', 'Train RMSE', 'Test Score', 'Test MSE',  'Test RMSE']
df1=df.sort_values('Test RMSE',ascending=True)
df1.style.format({
    'Train Score': '{:,.2%}'.format,
    'Train MSE': '{:,.4f}'.format,
    'Train RMSE': '{:,.5f}'.format,
    'Test Score': '{:,.2%}'.format,
    'Test MSE': '{:,.4f}'.format,
    'Test RMSE': '{:,.5f}'.format
})

Unnamed: 0,Train Score,Train MSE,Train RMSE,Test Score,Test MSE,Test RMSE
Blended Model,98.55%,0.0023,0.04796,93.34%,0.0111,0.10536
SVR,93.42%,0.0104,0.10198,91.92%,0.0135,0.11619
GBM,100.00%,0.0,0.0,91.12%,0.0148,0.12166
ADA,99.98%,0.0,0.0,90.70%,0.0155,0.1245
Random Forest,98.17%,0.0029,0.05385,90.46%,0.0159,0.1261
ElasticNet,95.17%,0.0076,0.08718,90.30%,0.0161,0.12689
Lasso,95.20%,0.0076,0.08718,90.28%,0.0162,0.12728
XGBoost,100.00%,0.0,0.0,90.21%,0.0163,0.12767
Ridge,95.32%,0.0074,0.08602,89.91%,0.0168,0.12961
LightGBM,100.00%,0.0,0.0,89.87%,0.0169,0.13


In [29]:
def blended_predict_final():
    return ((0.05 * lasso_search.predict(test_sc)) + \
            (0.05 * ridge_search.predict(test_sc)) + \
            (0.1 * net_search.predict(test_sc)) + \
            (0.2 * svr_search.predict(test_sc)) + \
            (0.1 * rf_search.predict(test_ord)) + \
            (0.1 * gbr_search.predict(test_ord)) + \
            (0.1 * ada_search.predict(test_ord)) + \
            (0.1 * xgb_search.predict(test_ord)) + \
            (0.1 * cbr_search.predict(test_ord)) + \
            (0.1 * lgbr_search.predict(test_ord)))

# def blended_predict_final():
#     return (
#             (0.3 * svr_search.predict(test_sc)) + \
#             (0.2 * gbr_search.predict(test_ord)) + \
#             (0.1 * ada_search.predict(test_ord)) + \
#             (0.1 * xgb_search.predict(test_ord)) + \
#             (0.1 * cbr_search.predict(test_ord)) + \
#             (0.2 * lgbr_search.predict(test_ord)))
y_pred_final=blended_predict_final()

In [30]:
submission = pd.read_csv("data/sample_submission.csv")
# submission = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
submission.iloc[:,1] = np.floor(np.expm1(y_pred_final))
submission

Unnamed: 0,Id,SalePrice
0,1461,129653.0
1,1462,158478.0
2,1463,186752.0
3,1464,196262.0
4,1465,190341.0
...,...,...
1454,2915,86739.0
1455,2916,85262.0
1456,2917,168220.0
1457,2918,118782.0


In [31]:
# Save predictions in format used for competition scoring
submission.to_csv('data/submission.csv', index=False)
# submission.to_csv('submission.csv', index=False)