In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#load traning data
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import xgboost as xgb
from xgboost import XGBRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from lightgbm import LGBMRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
import csv
import datetime
import os.path
from os import path

In [3]:
class StopExecution(Exception):
    def _render_traceback_(self):
        pass

In [4]:
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
REMOVE_ONLY_2_Outliers = True
USE_DUMMY_CAT_FEATURES = True
STACK_MODELLING = True
RANDOM_SEED = 42

In [5]:
df_train = pd.read_csv('./data/train.csv')
df_test =  pd.read_csv('./data/test.csv')
display(df_train.head(5))
display(df_test.head(5))

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [6]:
#Remove Id field as it is not useful in regression 
df_test_id = df_test['Id']
#print(df_test_id)

df_train.drop(['Id'],axis=1,inplace=True)
df_test.drop(['Id'],axis=1,inplace=True)

#df_train.drop(['Id','PoolQC','MiscFeature','Alley'],axis=1,inplace=True)
#df_test.drop(['Id','PoolQC','MiscFeature','Alley'],axis=1,inplace=True)

print(df_train.shape)
print(df_test.shape)

(1460, 80)
(1459, 79)


In [7]:
# Remove outliers  Reference - https://www.kaggle.com/zoupet/neural-network-model-for-house-prices-tensorflow
from sklearn.ensemble import IsolationForest

def RemoveOutliers(df_train):
    if(REMOVE_ONLY_2_Outliers):
        #Deleting outliers
        df_train = df_train.drop(df_train[(df_train['GrLivArea']>4000) & (df_train['SalePrice']<300000)].index)
    else:
        clf = IsolationForest(max_samples = 100, random_state = 42)
        clf.fit(df_train)
        y_noano = clf.predict(df_train)
        y_noano = pd.DataFrame(y_noano, columns = ['Top'])
        y_noano[y_noano['Top'] == 1].index.values

        df_train = df_train.iloc[y_noano[y_noano['Top'] == 1].index.values]
        df_train.reset_index(drop = True, inplace = True)
        print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
        print("Number of rows without outliers:", df_train.shape[0])
        
RemoveOutliers(df_train)
print(df_train.shape)

(1460, 80)


In [8]:
#Log-transformation of the target variable
#Handle skewness(Reference - https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard)
#We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])
df_train["SalePrice"].head()

0   12.248
1   12.109
2   12.317
3   11.849
4   12.429
Name: SalePrice, dtype: float64

In [9]:
ntrain = df_train.shape[0]
ntest = df_test.shape[0]
y_train_all = df_train.SalePrice.values
all_data = pd.concat((df_train, df_test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))

all_data size is : (2919, 79)


In [10]:
total = all_data.isnull().sum().sort_values(ascending=False)
display(total.head(10))

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageFinish     159
GarageQual       159
GarageYrBlt      159
GarageCond       159
dtype: int64

In [11]:
def datapreprocessing(all_data):
    all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
    all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
    all_data["Alley"] = all_data["Alley"].fillna("None")
    all_data["Fence"] = all_data["Fence"].fillna("None")
    all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")
    
    #Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
    all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
        all_data[col] = all_data[col].fillna('None')
        
    #Replacing missing data with 0 (Since No garage = no cars in such garage.)
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        all_data[col] = all_data[col].fillna(0)
        
    # missing values are likely zero for having no basement
    for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
        all_data[col] = all_data[col].fillna(0)
        
    # For all these categorical basement-related features, NaN means that there is no basement.
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        all_data[col] = all_data[col].fillna('None')
        
    #NA most likely means no masonry veneer for these houses. We can fill 0 for the area and None for the type.
    all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
    all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
    
    # 'RL' is by far the most common value. So we can fill in missing values with 'RL'
    all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    
    #For this categorical feature all records are "AllPub", except for one "NoSeWa" and 2 NA . 
    #Since the house with 'NoSewa' is in the training set, this feature won't help in predictive modelling. 
    #We can then safely remove it.
    all_data = all_data.drop(['Utilities'], axis=1)
    
    #data description says NA means typical
    all_data["Functional"] = all_data["Functional"].fillna("Typ")
    
    #It has one NA value. Since this feature has mostly 'SBrkr', we can set that for the missing value.
    all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    
    #Only one NA value, and same as Electrical, we set 'TA' (which is the most frequent) for the missing value in KitchenQual.
    all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
    
    #Exterior 1 & 2 have only one missing value. We will just substitute in the most common string
    all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
    all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
    
    all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
    all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")
    
    #Transforming some numerical variables that are really categorical

    #MSSubClass=The building class
    all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)


    #Changing OverallCond into a categorical variable
    all_data['OverallCond'] = all_data['OverallCond'].astype(str)


    #Year and month sold are transformed into categorical features.
    all_data['YrSold'] = all_data['YrSold'].astype(str)
    all_data['MoSold'] = all_data['MoSold'].astype(str)
    
    
#datapreprocessing(df_train)
#datapreprocessing(df_test)
datapreprocessing(all_data)

#all_data.dtypes

In [12]:
from sklearn.preprocessing import LabelEncoder
def encodeCategories(all_data):
    cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
            'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
            'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
            'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold')
    # process columns, apply LabelEncoder to categorical features
    for c in cols:
        lbl = LabelEncoder() 
        lbl.fit(list(all_data[c].values)) 
        all_data[c] = lbl.transform(list(all_data[c].values))

    # shape        
    print('Shape all_data: {}'.format(all_data.shape))
    #return 
    
#encodeCategories(df_train)
#encodeCategories(df_test)
encodeCategories(all_data)

Shape all_data: (2919, 79)


In [13]:
# Adding total sqfootage feature 
#df_train['TotalSF'] = df_train['TotalBsmtSF'] + df_train['1stFlrSF'] + df_train['2ndFlrSF']
#df_test['TotalSF'] = df_test['TotalBsmtSF'] + df_test['1stFlrSF'] + df_test['2ndFlrSF']
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [14]:
display(all_data.head(5))

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,TotalSF
0,856,854,0,1,3,1Fam,4,3,706.0,0.0,...,0,1,8,856.0,AllPub,0,2003,2003,2,2566.0
1,1262,0,0,1,3,1Fam,4,1,978.0,0.0,...,0,1,6,1262.0,AllPub,298,1976,1976,1,2524.0
2,920,866,0,1,3,1Fam,4,2,486.0,0.0,...,0,1,6,920.0,AllPub,0,2001,2002,2,2706.0
3,961,756,0,1,3,1Fam,1,3,216.0,0.0,...,0,1,7,756.0,AllPub,0,1915,1970,0,2473.0
4,1145,1053,0,1,4,1Fam,4,0,655.0,0.0,...,0,1,9,1145.0,AllPub,192,2000,2000,2,3343.0


In [15]:
from scipy import stats
from scipy.stats import norm, skew #for some statistics

skewness_threshold = 0.75

def skewnesshandling(all_data):
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    print("\nSkew in numerical features: \n")
    skewness = pd.DataFrame({'Skew' :skewed_feats})
    print(skewness.head(10))

    skewness = skewness[abs(skewness) > skewness_threshold]
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
    
    from scipy.special import boxcox1p
    skewed_features = skewness.index
    lam = 0.15
    for feat in skewed_features:
        #all_data[feat] += 1
        all_data[feat] = boxcox1p(all_data[feat], lam)
    
    #all_data[skewed_features] = np.log1p(all_data[skewed_features])
#skewnesshandling(df_train)
#skewnesshandling(df_test)
skewnesshandling(all_data)


Skew in numerical features: 

                Skew
MiscVal       21.947
PoolArea      16.898
LotArea       12.822
LowQualFinSF  12.089
3SsnPorch     11.376
LandSlope      4.975
KitchenAbvGr   4.302
BsmtFinSF2     4.146
EnclosedPorch  4.004
ScreenPorch    3.947
There are 59 skewed numerical features to Box Cox transform


In [16]:
# Separate numerical and categorical columns
def getNumericaldata(df,excludefield):
    quantitative = [f for f in df.columns if df.dtypes[f] != 'object']
    quantitative.remove(excludefield)
    return quantitative

def getCategoricaldata(df):
    qualitative = [f for f in df.columns if df.dtypes[f] == 'object']
    return qualitative

numcols = getNumericaldata(df_train,'SalePrice')
catcols = getCategoricaldata(df_train)
alldata_catcols = getCategoricaldata(all_data)

print(numcols)
print(catcols)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCon

In [17]:
def convertasCategory(df,cols):
    for col in cols:
        df[col] = df[col].astype('category')
        df[col+'_code'] = df[col].cat.codes
        df[col] = df[col +'_code']
        df.drop(labels=col+'_code', axis="columns", inplace=True)
    #return df

if(USE_DUMMY_CAT_FEATURES):
    #df_train = pd.get_dummies(df_train)
    #print(df_train.shape)
    #df_test = pd.get_dummies(df_test)
    #print(df_test.shape)
    all_data = pd.get_dummies(all_data)
    print(all_data.shape)
else:
    convertasCategory(all_data,catcols)
    #convertasCategory(df_train,catcols)
    #convertasCategory(df_test,ccatcolsatcols)

(2919, 223)


In [18]:
   

#print(df_train.head())
#print(df_train.dtypes)
# Handle remaining missing values for numerical features by using median as replacement
#print("NAs for numerical features in train : " + str(df_train.isnull().values.sum()))
######df_train = df_train.fillna(df_train.median())
######df_test = df_test.fillna(df_train.median())
#print("Remaining NAs for numerical features in train : " + str(df_train.isnull().values.sum()))
#print("Remaining NAs for numerical features in test : " + str(df_test.isnull().values.sum()))
print("Remaining NAs for numerical features in all_data : " + str(all_data.isnull().values.sum()))

Remaining NAs for numerical features in all_data : 0


In [19]:
#dropcols=['BldgType_1Fam', 'Condition1_Artery', 'Condition2_Artery','Exterior1st_AsbShng','Exterior1st_CBlock',
#          'Exterior2nd_AsbShng','Foundation_BrkTil','GarageType_2Types','Heating_Floor','HouseStyle_1.5Fin',
#          'LandContour_Bnk','LotConfig_Corner','MSZoning_C (all)','MasVnrType_BrkCmn','MiscFeature_Gar2',
 #         'Neighborhood_Blmngtn','RoofMatl_ClyTile','RoofStyle_Flat','SaleCondition_Abnorml','SaleType_COD',
  #        'Utilities_AllPub','YearBuilt','YearRemodAdd','GarageYrBlt','TotalSF','GrLivArea','MiscFeature_None',
   #       'PoolQC','RoofMatl_CompShg','Heating_GasA','1stFlrSF','Electrical_SBrkr','Condition2_Norm','GarageArea',
    #      'RoofStyle_Gable','LotArea','Street','TotRmsAbvGrd','OverallQual','LotFrontage','KitchenAbvGr','TotalBsmtSF',
     #     'MSZoning_RL','Exterior1st_VinylSd','GarageType_Attchd','GarageCond']
dropcols=['YearRemodAdd', 'GrLivArea', '1stFlrSF', 'GarageArea', 'YearBuilt', 'TotRmsAbvGrd', 'LotFrontage',
          'OverallQual', 'LotArea', 'TotalBsmtSF', 'KitchenAbvGr', 'GarageYrBlt', 'BedroomAbvGr', 'OverallCond', 
          'FullBath', 'GarageCars', 'MoSold', 'BsmtUnfSF']
dropcols=[]
#all_data = all_data.drop(dropcols,axis=1)

In [20]:
df_train_clean = all_data[:ntrain]
df_test_clean = all_data[ntrain:]
print(df_train_clean.shape)
print(df_test_clean.shape)

(1460, 223)
(1459, 223)


In [21]:
from sklearn.model_selection import train_test_split


X = df_train_clean.iloc[:, 1:-2].values  
#print(X[1]) 
print(X.shape)
y = y_train_all ##df_train.iloc[:, -1].values
print(y.shape)

X_test = df_test_clean.iloc[:, 1:-2].values  
#print(X_test[1]) 
print(X_test.shape)
y_test = df_test_clean.iloc[:, -1].values
print(y_test)


X_train, X_train_test, y_train, y_train_test = train_test_split(X, y, test_size=0.1, random_state=101)


(1460, 220)
(1460,)
(1459, 220)
[0 0 0 ... 0 0 0]


In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor    

def calculate_vif_(X, thresh=5.0):
    cols = X.columns
    variables = np.arange(X.shape[1])
    dropped=True
    while dropped:
        dropped=False
        c = X[cols[variables]].values
        vif = [variance_inflation_factor(c, ix) for ix in np.arange(c.shape[1])]

        maxloc = vif.index(max(vif))
        if max(vif) > thresh:
            print('dropping \'' + X[cols[variables]].columns[maxloc] + '\' at index: ' + str(maxloc))
            variables = np.delete(variables, maxloc)
            dropped=True

    print('Remaining variables:')
    print(X.columns[variables])
    return X[cols[variables]]
####Careful before uncomment below code - takes some time and resources to run#########
#df = df_train[numcols]#drop non-numeric cols
#df_colliner = calculate_vif_(df)
#df = df_train[catcols]#drop non-numeric cols
#df_colliner = calculate_vif_(df)
#display(df_colliner)

In [23]:
# we are going to scale to data

print(y.shape)
print(y[1:5])
y_forstack = y
y = y.reshape(-1,1)
print(y.shape)
print(y[1:5])
y_train= y_train.reshape(-1,1)
y_train_test= y_train_test.reshape(-1,1)
y_test= y_test.reshape(-1,1)

sc_X_fd = StandardScaler()
sc_y_fd = StandardScaler()


sc_X_train = StandardScaler()
sc_y_train = StandardScaler()

sc_X_train_test = StandardScaler()
sc_y_train_test = StandardScaler()

sc_X_test = StandardScaler()
sc_y_test = StandardScaler()


X_train = sc_X_train.fit_transform(X_train)
y_train = sc_y_train.fit_transform(y_train)

X_train_test = sc_X_train_test.fit_transform(X_train_test)
y_train_test = sc_y_train_test.fit_transform(y_train_test)

X_test = sc_X_test.fit_transform(X_test)
y_test = sc_y_test.fit_transform(y_test)


X = sc_X_fd.fit_transform(X)
y = sc_y_fd.fit_transform(y)
print("****************")
print(y.shape)
print(y_test.shape)
print(y_train.shape)
print(y_train_test.shape)

(1460,)
[12.10901644 12.31717117 11.84940484 12.4292202 ]
(1460, 1)
[[12.10901644]
 [12.31717117]
 [11.84940484]
 [12.4292202 ]]
****************
(1460, 1)
(1459, 1)
(1314, 1)
(146, 1)




******************Data Processing ends here.***********************

In [24]:
from datetime import datetime
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
#Validation function
n_folds = 5

kfolds = KFold(n_splits=n_folds, shuffle=True, random_state=42)


# rmsle
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))


# build our model scoring function
def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)

# setup models    
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

xgboost = xgb.XGBRegressor(colsample_bytree=0.4, gamma=0,learning_rate=0.03, max_depth=3, min_child_weight=1.5,
                 n_estimators=10000, reg_alpha=0.75, reg_lambda=0.45,subsample=0.6, seed=42) 


ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=alphas_alt, cv=kfolds,))

lasso = make_pipeline(RobustScaler(),
                      LassoCV(max_iter=1e7, alphas=alphas2,
                              random_state=42, cv=kfolds))

elasticnet = make_pipeline(RobustScaler(),
                           ElasticNetCV(max_iter=1e7, alphas=e_alphas,
                                        cv=kfolds, random_state=42, l1_ratio=e_l1ratio))
                                        
svr = make_pipeline(RobustScaler(),
                      SVR(C= 20, epsilon= 0.008, gamma=0.0003,))
rf = RandomForestRegressor(random_state=RANDOM_SEED)
svr = SVR(kernel = 'rbf',gamma='auto')
gbr = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt',
                                min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)
lightgbm = LGBMRegressor(objective='regression', num_leaves=4,learning_rate=0.01, n_estimators=5000,
                         max_bin=200, bagging_fraction=0.75,bagging_freq=5, bagging_seed=7,feature_fraction=0.2,
                         feature_fraction_seed=7,verbose=-1)




stack_gen = StackingCVRegressor(regressors=(ridge, lasso, svr, lightgbm, gbr, xgboost, rf),
                                meta_regressor=xgboost,use_features_in_secondary=True)

print('TEST score on CV')

score = cv_rmse(ridge)
print("\nKernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(svr)
print("\nSVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(lightgbm)
print("\nLightgbm score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(gbr)
print("\nGradientBoosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(xgboost)
print("\nXgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(rf)
print("\nRandomForestRegressor score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )



print('START Fit')
print(datetime.now(), 'StackingCVRegressor')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))
print(datetime.now(), 'lasso')
lasso_model_full_data = lasso.fit(X, y)
print(datetime.now(), 'ridge')
ridge_model_full_data = ridge.fit(X, y)
print(datetime.now(), 'svr')
svr_model_full_data = svr.fit(X, y)
print(datetime.now(), 'GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)
print(datetime.now(), 'xgboost')
xgb_model_full_data = xgboost.fit(X, y)
print(datetime.now(), 'lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)
print(datetime.now(), 'RandomForestRegressor')
rf_model_full_data = rf.fit(X, y)


TEST score on CV

Kernel Ridge score: 0.3324 (0.0552)
 2020-01-01 16:18:48.736658


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



Lasso score: 0.3309 (0.0589)
 2020-01-01 16:19:19.796389


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



SVR score: 0.4736 (0.0614)
 2020-01-01 16:19:22.155391


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



Lightgbm score: 0.3159 (0.0407)
 2020-01-01 16:19:31.645010


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



GradientBoosting score: 0.3166 (0.0457)
 2020-01-01 16:20:37.970764

Xgboost score: 0.3199 (0.0491)
 2020-01-01 16:24:45.138843


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)



RandomForestRegressor score: 0.3823 (0.0397)
 2020-01-01 16:24:46.871294
START Fit
2020-01-01 16:24:46.872294 StackingCVRegressor


  y = column_or_1d(y, warn=True)




KeyboardInterrupt: 

In [None]:
def blend_models_predict(X_pred):
    return ((0.1 * rf_model_full_data.predict(X_pred)) + \
            (0.05 * lasso_model_full_data.predict(X_pred)) + \
            (0.1 * ridge_model_full_data.predict(X_pred)) + \
            (0.1 * svr_model_full_data.predict(X_pred)) + \
            (0.1 * gbr_model_full_data.predict(X_pred)) + \
            (0.15 * xgb_model_full_data.predict(X_pred)) + \
            (0.1 * lgb_model_full_data.predict(X_pred)) + \
            (0.3 * stack_gen_model.predict(np.array(X_pred))))


print('RMSLE score on train data:')
print(X.shape)
print(y.shape)
#print(y.shape)
#print(rmsle(y, blend_models_predict(X)))

In [None]:
## Try 

#try gbr on test data
clf_pred=gbr.predict(X_test)
clf_pred= clf_pred.reshape(-1,1)
clf_pred = np.expm1(sc_y_test.inverse_transform(clf_pred))
#print(clf_pred)
df_clf_pred = pd.DataFrame(clf_pred, columns=['SalePrice'])
df_clf_pred.head()

In [None]:
print('Predict submission', datetime.now(),)
submission = pd.read_csv("./data/sample_submission.csv")
submission.iloc[:,1] = np.floor(np.expm1(sc_y_test.inverse_transform(blend_models_predict(X_test))))
print(submission.head())
submission.to_csv("submission.csv", index=False)
print('Save submission', datetime.now(),)

In [None]:
if(STACK_MODELLING == True):
    print("Stopping execution!!")
    raise StopExecution

In [None]:
ModelA = 'xgb'

#xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
#                max_depth = 5, alpha = 10, n_estimators = 1000)
eval_set = [(X_train, y_train), (X_train_test, y_train_test)]
eval_metric = ["rmse","error"]
xg_reg = xgb.XGBRegressor(colsample_bytree=0.4, gamma=0,learning_rate=0.07, max_depth=3, min_child_weight=1.5,
                 n_estimators=10000, reg_alpha=0.75, reg_lambda=0.45,subsample=0.6, seed=42) 
#xg_reg = xgb.XGBRegressor(gamma=1, colsample_bytree = .5, learning_rate = 0.1,
#                max_depth = 3, alpha = 10, n_estimators = 1000)
#xgbmodel = xg_reg.fit(X,y,verbose=True)
score = cv_rmse(xg_reg)
print("\nxg_reg score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
xgbmodel = xg_reg.fit(X, y, eval_metric=eval_metric, eval_set=eval_set, verbose=True)
print(xgbmodel.feature_importances_)
print("?????????????????????????????")
#X, y = make_regression(n_features=4, n_informative=2,random_state=0, shuffle=False)
#regr = RandomForestRegressor(max_depth=2, random_state=0)
#regr.fit(X, y)
#RandomForestRegressor(max_depth=2, random_state=0)
#print(regr.feature_importances_)
#print(regr.feature_importances_.shape)
#print(X.shape)
Xcols=df_train_clean.columns
result = zip(Xcols,xgbmodel.feature_importances_.tolist())
df_impf = pd.DataFrame(set(result))
df_impf.columns = ['field','score']
print(df_impf.sort_values('score',ascending=False))

In [None]:
predictions = xgbmodel.predict(X_test)
print(predictions)
print(predictions.shape)
predictions = np.expm1(sc_y_test.inverse_transform(predictions))
#predictions = sc_y_test.inverse_transform(predictions)
df_predictions = pd.DataFrame(predictions, columns=['SalePrice'])
#df_predictions.head()

modelaMAE = ''
modelaMSE =  ''
modelaRMSE = ''

#submission data
df_sub1 = pd.concat([df_test_id,df_predictions],axis=1)
print(df_sub1.head())
df_sub1.to_csv('./data/xgb.csv',index=False)

In [None]:
ModelB='SVR'


svr = SVR(kernel = 'rbf',gamma='auto')

score = cv_rmse(svr)
print("\nSVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:

svr.fit(X, y)

svr_pred = sc_y_train_test.inverse_transform(svr.predict(X_train_test))
svr_pred= svr_pred.reshape(-1,1)

modelbMAE = metrics.mean_absolute_error(y_train_test, svr_pred)
modelbMSE = metrics.mean_squared_error(y_train_test, svr_pred)
modelbRMSE = np.sqrt(metrics.mean_squared_error(y_train_test, svr_pred))

print('MAE:', modelbMAE)
print('MSE:', modelbMSE)
print('RMSE:', modelbRMSE)

In [None]:
ModelC = 'GradientBoostingRegressor'

params = {'n_estimators': 2500, 'max_depth': 4, 'min_samples_split': 5,
          'learning_rate': 0.04, 'loss': 'ls'}
params1={'n_estimators':3500, 'learning_rate':0.05, 'max_depth':4, 'max_features':'sqrt',
                                   'min_samples_leaf':15, 'min_samples_split':10, 
                                   'loss':'huber', 'random_state':5}
clf = ensemble.GradientBoostingRegressor(**params1)

score = cv_rmse(clf)
print("\nGradientBoostingRegressor score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
clf.fit(X, y)

clf_pred = sc_y_train_test.inverse_transform(clf.predict(X_train_test))
clf_pred = clf_pred.reshape(-1,1)

modelcMAE = metrics.mean_absolute_error(y_train_test, clf_pred)
modelcMSE = metrics.mean_squared_error(y_train_test, clf_pred)
modelcRMSE = np.sqrt(metrics.mean_squared_error(y_train_test, clf_pred))

print('MAE:', modelbMAE)
print('MSE:', modelbMSE)
print('RMSE:', modelbRMSE)

In [None]:
clf_pred = np.expm1(clf_pred)
#print(clf_pred)
df_clf_pred = pd.DataFrame(clf_pred, columns=['SalePrice'])
df_clf_pred.head()

In [None]:
#try gbr on test data
clf_pred=clf.predict(X_test)
clf_pred= clf_pred.reshape(-1,1)
clf_pred = np.expm1(sc_y_test.inverse_transform(clf_pred))
#print(clf_pred)
df_clf_pred = pd.DataFrame(clf_pred, columns=['SalePrice'])
df_clf_pred.head()

In [None]:
#submission data
df_sub = pd.concat([df_test_id,df_clf_pred],axis=1)
print(df_sub.head())

In [None]:
df_sub.to_csv('./data/submission.csv',index=False)

In [None]:
#Write model parameters to log

# using now() to get current time 
current_time = datetime.datetime.now() 
outliers = '2' if REMOVE_ONLY_2_Outliers == True else '100'
fields = ['date','model','outliers','skewness','cols_dropped','mae', 'mse','rmse','Kaggle_score']
logfileexist = path.exists("Regression_log.csv")
with open('Regression_log.csv', mode='a', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fields)
    if(logfileexist == False):
        writer.writeheader()
        
    writer.writerow({'date': current_time, 'model': ModelA, 'outliers': outliers,'skewness':skewness_threshold,
                     'cols_dropped':len(dropcols),'mae':modelaMAE, 'mse':modelaMSE,'rmse':modelaRMSE })
    writer.writerow({'date': current_time, 'model': ModelB, 'outliers': outliers,'skewness':skewness_threshold,
                     'cols_dropped':len(dropcols),'mae':modelbMAE, 'mse':modelbMSE,'rmse':modelbRMSE })
    writer.writerow({'date': current_time, 'model': ModelC, 'outliers': outliers,'skewness':skewness_threshold,
                     'cols_dropped':len(dropcols),'mae':modelcMAE, 'mse':modelcMSE,'rmse':modelcRMSE })


In [None]:
### References
# Remove Outliers - https://www.kaggle.com/zoupet/neural-network-model-for-house-prices-tensorflow 
# GridSearchCV - https://medium.com/datadriveninvestor/an-introduction-to-grid-search-ff57adcc0998
# Skewness & Stacked models - https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# Stacking Models - https://www.kaggle.com/itslek/blend-stack-lr-gb-0-10649-house-prices-v57
#
#
#
############### TODO's / Issues
### 1. Number of features dropped by multicollinearity analysis is 46 - seems to be very high.
##### 1.1 Lets try collinearity approach discussed here - https://www.kaggle.com/yingbao/feature-engineering-for-house-price-prediction
###### 1.1.1 Removing about 18 numeric fields brought down the score.


