In [None]:
import os

def scale_input_data(scale_factor):
  file_bases = ['./input/train', './input/test']
  for file_base in file_bases:
    import pandas as pd
    import shutil
    if scale_factor == 1.0:
      shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      while len(df_to_scale) < new_num_rows:
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(new_num_rows - len(df_to_scale), len(df_to_scale))]])
    df_to_scale.to_csv(file_base + '.scaled.csv', index=False)

if 'INPUT_SCALE_FACTOR' in os.environ:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

# HOUSE PRICES PREDICTION

# Importing libraries

In [1]:
import numpy as np
# import pandas as pd
import os
exec(os.environ['IREWR_IMPORTS'])
# FIRST-AUTHOR: remove plotting
# from matplotlib import pyplot as plt
# import seaborn as sns
# %matplotlib inline

In [2]:
#Loading the Data
train = pd.read_csv('./input/train.scaled.csv')

# MISSING VALUES IMPUTATION

In [3]:
nulls = train.isnull().sum().sort_values(ascending=False)
nulls.head(20)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
Id                 0
dtype: int64

From the above dataframe -'nulls' we came to know that the attributes PoolQC,MiscFeature,Alley and Fence are having morethan 60% of the values as 'nan'.so, its better to remove them as these columns won't give much info about the SalePrice.

In [4]:
train = train.drop(['Id','PoolQC','MiscFeature','Alley','Fence'],axis = 1)

## FireplaceQu

In [5]:
train[['Fireplaces','FireplaceQu']].head(10)

Unnamed: 0,Fireplaces,FireplaceQu
0,0,
1,1,TA
2,1,TA
3,1,Gd
4,1,TA
5,0,
6,1,Gd
7,2,TA
8,2,TA
9,2,TA


In [6]:
train['FireplaceQu'].isnull().sum()

690

In [7]:
train['Fireplaces'].value_counts()

Fireplaces
0    690
1    650
2    115
3      5
Name: count, dtype: int64

The attribute 'FireplaceQu' is having 690 null values.If we compare the columns 'FireplaceQu' and 'Fireplaces' the indexes which are having the zeros in the Fireplaces column are having the 'nan' values in FireplaceQu. It tells that the houses which are not having the Fireplaces are having nan values in FireplaceQu so, i will replace these nulls with "no Fireplace" i,e 'NF'  

In [8]:
train['FireplaceQu']=train['FireplaceQu'].fillna('NF')

## LotFrontage

In [9]:
train['LotFrontage'] =train['LotFrontage'].fillna(value=train['LotFrontage'].mean())

## Attributes related to "GARAGE"

In [10]:
train['GarageType'].isnull().sum()

81

In [11]:
train['GarageCond'].isnull().sum()

81

In [12]:
train['GarageFinish'].isnull().sum()

81

In [13]:
train['GarageYrBlt'].isnull().sum()

81

In [14]:
train['GarageQual'].isnull().sum()

81

In [15]:
train['GarageArea'].value_counts().head()

GarageArea
0      81
440    49
576    47
240    38
484    34
Name: count, dtype: int64

We can observe that all the columns related to Garage are having the sama number of null values. so, there should be a relationship among them and if we look at the 'GarageArea' column it is having the 81 zeros which is equal to no: of 'nans' in these columns.Hence we can conclude that the houses without Garage Area are having 'nan' at all these columns.

>> I will replace these nans with 'No GarageArea'----> 'NG' 

In [16]:
train['GarageType']=train['GarageType'].fillna('NG')
train['GarageCond']=train['GarageCond'].fillna('NG')
train['GarageFinish']=train['GarageFinish'].fillna('NG')
train['GarageYrBlt']=train['GarageYrBlt'].fillna('NG')
train['GarageQual']=train['GarageQual'].fillna('NG')

## Bsmt

In [17]:
train.BsmtExposure.isnull().sum()

38

In [18]:
train.BsmtFinType2.isnull().sum()

38

In [19]:
train.BsmtFinType1.isnull().sum()

37

In [20]:
train.BsmtCond.isnull().sum() 

37

In [21]:
train.BsmtQual.isnull().sum()

37

In [22]:
train.TotalBsmtSF.value_counts().head()

TotalBsmtSF
0       37
864     35
672     17
912     15
1040    14
Name: count, dtype: int64

In [23]:
train['BsmtExposure']=train['BsmtExposure'].fillna('NB')
train['BsmtFinType2']=train['BsmtFinType2'].fillna('NB')
train['BsmtFinType1']=train['BsmtFinType1'].fillna('NB')
train['BsmtCond']=train['BsmtCond'].fillna('NB')
train['BsmtQual']=train['BsmtQual'].fillna('NB')

## MasVnr

In [24]:
train['MasVnrArea'] = train['MasVnrArea'].fillna(train['MasVnrArea'].mean())

In [25]:
train['MasVnrType'] = train['MasVnrType'].fillna('none')

## Electrical

In [26]:
train.Electrical = train.Electrical.fillna('SBrkr')

In [27]:
train.isnull().sum().sum()

0

# OUTLIERS

In [28]:
num_train = train._get_numeric_data()

In [29]:
num_train.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [30]:
def var_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),  x.std(), x.var(), x.min(), x.quantile(0.01), x.quantile(0.05),x.quantile(0.10),x.quantile(0.25),x.quantile(0.50),x.quantile(0.75), x.quantile(0.90),x.quantile(0.95), x.quantile(0.99),x.max()], 
                  index=['N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR', 'MIN', 'P1' , 'P5' ,'P10' ,'P25' ,'P50' ,'P75' ,'P90' ,'P95' ,'P99' ,'MAX'])

num_train.apply(lambda x: var_summary(x)).T


Unnamed: 0,N,NMISS,SUM,MEAN,MEDIAN,STD,VAR,MIN,P1,P5,P10,P25,P50,P75,P90,P95,P99,MAX
MSSubClass,1460.0,0.0,83070.0,56.89726,50.0,42.300571,1789.338,20.0,20.0,20.0,20.0,20.0,50.0,70.0,120.0,160.0,190.0,190.0
LotFrontage,1460.0,0.0,102272.9,70.049958,70.049958,22.024023,485.0576,21.0,21.0,35.95,49.0,60.0,70.049958,79.0,92.0,104.0,137.41,313.0
LotArea,1460.0,0.0,15354570.0,10516.828082,9478.5,9981.264932,99625650.0,1300.0,1680.0,3311.7,5000.0,7553.5,9478.5,11601.5,14381.7,17401.15,37567.64,215245.0
OverallQual,1460.0,0.0,8905.0,6.099315,6.0,1.382997,1.912679,1.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,8.0,10.0,10.0
OverallCond,1460.0,0.0,8140.0,5.575342,5.0,1.112799,1.238322,1.0,3.0,4.0,5.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0
YearBuilt,1460.0,0.0,2878051.0,1971.267808,1973.0,30.202904,912.2154,1872.0,1899.18,1916.0,1924.9,1954.0,1973.0,2000.0,2006.0,2007.0,2009.0,2010.0
YearRemodAdd,1460.0,0.0,2897904.0,1984.865753,1994.0,20.645407,426.2328,1950.0,1950.0,1950.0,1950.0,1967.0,1994.0,2004.0,2006.0,2007.0,2009.0,2010.0
MasVnrArea,1460.0,0.0,151380.5,103.685262,0.0,180.569112,32605.2,0.0,0.0,0.0,0.0,0.0,0.0,164.25,335.0,456.0,791.28,1600.0
BsmtFinSF1,1460.0,0.0,647714.0,443.639726,383.5,456.098091,208025.5,0.0,0.0,0.0,0.0,0.0,383.5,712.25,1065.5,1274.0,1572.41,5644.0
BsmtFinSF2,1460.0,0.0,67962.0,46.549315,0.0,161.319273,26023.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.2,396.2,830.38,1474.0


In [31]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot([num_train.LotFrontage])
_ = [num_train.LotFrontage]

In [32]:
# FIRST-AUTHOR: make notebook run
# train['LotFrontage']= train['LotFrontage'].clip_upper(train['LotFrontage'].quantile(0.99))
train['LotFrontage']= train['LotFrontage'].clip(upper=train['LotFrontage'].quantile(0.99))

In [33]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(num_train.LotArea)
_ = num_train.LotArea

In [34]:
# FIRST-AUTHOR: make notebook run
# train['LotArea']= train['LotArea'].clip_upper(train['LotArea'].quantile(0.99))
train['LotArea']= train['LotArea'].clip(upper=train['LotArea'].quantile(0.99))

In [35]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['MasVnrArea'])
_ = train['MasVnrArea']

In [36]:
# FIRST-AUTHOR: make notebook run
# train['MasVnrArea']= train['MasVnrArea'].clip_upper(train['MasVnrArea'].quantile(0.99))
train['MasVnrArea']= train['MasVnrArea'].clip(upper=train['MasVnrArea'].quantile(0.99))

In [37]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['BsmtFinSF1'])
_ = train['BsmtFinSF1']

In [38]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['BsmtFinSF2'])
_ = train['BsmtFinSF2']

In [39]:
# FIRST-AUTHOR: make notebook run
# train['BsmtFinSF1']= train['BsmtFinSF1'].clip_upper(train['BsmtFinSF1'].quantile(0.99)) 
# train['BsmtFinSF2']= train['BsmtFinSF2'].clip_upper(train['BsmtFinSF2'].quantile(0.99))
train['BsmtFinSF1']= train['BsmtFinSF1'].clip(upper=train['BsmtFinSF1'].quantile(0.99)) 
train['BsmtFinSF2']= train['BsmtFinSF2'].clip(upper=train['BsmtFinSF2'].quantile(0.99))

In [40]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['TotalBsmtSF'])
_ = train['TotalBsmtSF']

In [41]:
# FIRST-AUTHOR: make notebook run
# train['TotalBsmtSF']= train['TotalBsmtSF'].clip_upper(train['TotalBsmtSF'].quantile(0.99))
train['TotalBsmtSF']= train['TotalBsmtSF'].clip(upper=train['TotalBsmtSF'].quantile(0.99))

In [42]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['1stFlrSF'])
_ = train['1stFlrSF']

In [43]:
# FIRST-AUTHOR: make notebook run
# train['1stFlrSF']= train['1stFlrSF'].clip_upper(train['1stFlrSF'].quantile(0.99))
train['1stFlrSF']= train['1stFlrSF'].clip(upper=train['1stFlrSF'].quantile(0.99))

In [44]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['2ndFlrSF'])
_ = train['2ndFlrSF']

In [45]:
# FIRST-AUTHOR: make notebook run
# train['2ndFlrSF']= train['2ndFlrSF'].clip_upper(train['2ndFlrSF'].quantile(0.99))
train['2ndFlrSF']= train['2ndFlrSF'].clip(upper=train['2ndFlrSF'].quantile(0.99))

In [46]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['GrLivArea'])
_ = train['GrLivArea']

In [47]:
# FIRST-AUTHOR: make notebook run
# train['GrLivArea']= train['GrLivArea'].clip_upper(train['GrLivArea'].quantile(0.99))
train['GrLivArea']= train['GrLivArea'].clip(upper=train['GrLivArea'].quantile(0.99))

In [48]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['BedroomAbvGr'])
_ = train['BedroomAbvGr']

In [49]:
# FIRST-AUTHOR: make notebook run
# train['BedroomAbvGr']= train['BedroomAbvGr'].clip_upper(train['BedroomAbvGr'].quantile(0.99))
# train['BedroomAbvGr']= train['BedroomAbvGr'].clip_lower(train['BedroomAbvGr'].quantile(0.01))
train['BedroomAbvGr']= train['BedroomAbvGr'].clip(upper=train['BedroomAbvGr'].quantile(0.99))
train['BedroomAbvGr']= train['BedroomAbvGr'].clip(lower=train['BedroomAbvGr'].quantile(0.01))

In [50]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['GarageCars'])
_ = train['GarageCars']

In [51]:
# FIRST-AUTHOR: make notebook run
# train['GarageCars']= train['GarageCars'].clip_upper(train['GarageCars'].quantile(0.99))
train['GarageCars']= train['GarageCars'].clip(upper=train['GarageCars'].quantile(0.99))

In [52]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['GarageArea'])
_ = train['GarageArea']

In [53]:
# FIRST-AUTHOR: make notebook run
# train['GarageArea']= train['GarageArea'].clip_upper(train['GarageArea'].quantile(0.99))
train['GarageArea']= train['GarageArea'].clip(upper=train['GarageArea'].quantile(0.99))

In [54]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['WoodDeckSF'])
_ = train['WoodDeckSF']

In [55]:
# FIRST-AUTHOR: make notebook run
# train['WoodDeckSF']= train['WoodDeckSF'].clip_upper(train['WoodDeckSF'].quantile(0.99))
train['WoodDeckSF']= train['WoodDeckSF'].clip(upper=train['WoodDeckSF'].quantile(0.99))

In [56]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['OpenPorchSF'])
_ = train['OpenPorchSF']

In [57]:
# FIRST-AUTHOR: make notebook run
# train['OpenPorchSF']= train['OpenPorchSF'].clip_upper(train['OpenPorchSF'].quantile(0.99))
train['OpenPorchSF']= train['OpenPorchSF'].clip(upper=train['OpenPorchSF'].quantile(0.99))

In [58]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['EnclosedPorch'])
_ = train['EnclosedPorch']

In [59]:
# FIRST-AUTHOR: make notebook run
# train['EnclosedPorch']= train['EnclosedPorch'].clip_upper(train['EnclosedPorch'].quantile(0.99))
train['EnclosedPorch']= train['EnclosedPorch'].clip(upper=train['EnclosedPorch'].quantile(0.99))

In [60]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['3SsnPorch'])
_ = train['3SsnPorch']

In [61]:
# FIRST-AUTHOR: make notebook run
# train['3SsnPorch']= train['3SsnPorch'].clip_upper(train['3SsnPorch'].quantile(0.99))
train['3SsnPorch']= train['3SsnPorch'].clip(upper=train['3SsnPorch'].quantile(0.99))

In [62]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['ScreenPorch'])
_ = train['ScreenPorch']

In [63]:
# FIRST-AUTHOR: make notebook run
# train['ScreenPorch']= train['ScreenPorch'].clip_upper(train['ScreenPorch'].quantile(0.99))
train['ScreenPorch']= train['ScreenPorch'].clip(upper=train['ScreenPorch'].quantile(0.99))

In [64]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['PoolArea'])
_ = train['PoolArea']

In [65]:
# FIRST-AUTHOR: make notebook run
# train['PoolArea']= train['PoolArea'].clip_upper(train['PoolArea'].quantile(0.99))
train['PoolArea']= train['PoolArea'].clip(upper=train['PoolArea'].quantile(0.99))

In [66]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train['MiscVal'])
_ = train['MiscVal']

In [67]:
# FIRST-AUTHOR: remove plotting
# sns.boxplot(train.SalePrice)
_ = train.SalePrice

In [68]:
# FIRST-AUTHOR: make notebook run
# train['SalePrice']= train['SalePrice'].clip_upper(train['SalePrice'].quantile(0.99))
# train['SalePrice']= train['SalePrice'].clip_lower(train['SalePrice'].quantile(0.01))
train['SalePrice']= train['SalePrice'].clip(upper=train['SalePrice'].quantile(0.99))
train['SalePrice']= train['SalePrice'].clip(lower=train['SalePrice'].quantile(0.01))

In [69]:
# FIRST-AUTHOR: make notebook run
# train['MiscVal']= train['MiscVal'].clip_upper(train['MiscVal'].quantile(0.99))
train['MiscVal']= train['MiscVal'].clip(upper=train['MiscVal'].quantile(0.99))

In [70]:
num_corr=num_train .corr()
# FIRST-AUTHOR: remove plotting
# plt.subplots(figsize=(13,10))
# sns.heatmap(num_corr,vmax =.8 ,square = True)

In [71]:
k = 14
cols = num_corr.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(num_train[cols].values.T)
# FIRST-AUTHOR: remove plotting
# sns.set(font_scale=1.35)
# f, ax = plt.subplots(figsize=(10,10))
# hm=sns.heatmap(cm, annot = True,vmax =.8, yticklabels=cols.values, xticklabels = cols.values)
_ = cols.values
_ = cols.values

# FEATURE SELECTION

Selecting the significant features is an important step in building a best model.We can use multiple techniques to select the significant features and some of them are:
                Statsmodels,
                Hypothesis Testing,
                PCA - Dimensionality Reduction and 
                Feature importance using Random Forests.

## STATSMODELS

In [72]:
# FIRST-AUTHOR: remove ML code
# import statsmodels.api as sm
# import statsmodels.formula.api as smf

In [73]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   float64
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [74]:
train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF'

In [75]:
s1 = set(train.columns)
s2 = set(['SalePrice'])

In [76]:
features = "+".join((set(s1)-s2))
features

'GarageQual+SaleCondition+MasVnrArea+Condition1+LotConfig+LotArea+Electrical+BldgType+OpenPorchSF+Functional+OverallQual+LowQualFinSF+HalfBath+MiscVal+BsmtQual+GrLivArea+3SsnPorch+BsmtHalfBath+Exterior2nd+TotalBsmtSF+MasVnrType+BsmtCond+YearRemodAdd+BsmtFinType2+MoSold+MSSubClass+Foundation+BsmtFinSF2+YearBuilt+GarageArea+GarageType+BsmtFullBath+Condition2+KitchenAbvGr+HeatingQC+LandSlope+WoodDeckSF+LotShape+HouseStyle+EnclosedPorch+LandContour+ExterCond+BsmtExposure+Heating+Neighborhood+BsmtFinSF1+OverallCond+ExterQual+2ndFlrSF+KitchenQual+TotRmsAbvGrd+BsmtFinType1+CentralAir+GarageYrBlt+Street+PoolArea+Exterior1st+FullBath+ScreenPorch+1stFlrSF+GarageCars+GarageFinish+FireplaceQu+BedroomAbvGr+Fireplaces+Utilities+RoofMatl+YrSold+PavedDrive+MSZoning+SaleType+GarageCond+BsmtUnfSF+RoofStyle+LotFrontage'

In [77]:
train = train.rename(columns ={'1stFlrSF':'firstFlrSF','2ndFlrSF':'iindFlrSF','3SsnPorch':'iiiSsnPorch'})

In [78]:
# FIRST-AUTHOR: remove ML code
# lm=smf.ols('SalePrice~firstFlrSF+MasVnrType+GarageFinish+KitchenAbvGr+WoodDeckSF+LandContour+LandSlope+GarageCars+Street+Exterior1st+iindFlrSF+SaleCondition+Electrical+LotConfig+HeatingQC+PavedDrive+LotArea+BsmtUnfSF+RoofMatl+TotRmsAbvGrd+BsmtFullBath+ExterQual+BedroomAbvGr+EnclosedPorch+BsmtQual+BsmtFinSF2+GarageCond+HouseStyle+GrLivArea+PoolArea+Utilities+BsmtExposure+HalfBath+Condition1+YrSold+MasVnrArea+BldgType+MSZoning+Fireplaces+FireplaceQu+BsmtFinType1+YearBuilt+BsmtHalfBath+Heating+SaleType+BsmtCond+MSSubClass+ScreenPorch+OpenPorchSF+FullBath+BsmtFinSF1+MoSold+LowQualFinSF+GarageType+Exterior2nd+iiiSsnPorch+TotalBsmtSF+ExterCond+Neighborhood+OverallQual+GarageArea+LotShape+MiscVal+YearRemodAdd+OverallCond+BsmtFinType2+Condition2+CentralAir+LotFrontage+Functional+RoofStyle+GarageYrBlt+KitchenQual+Foundation+GarageQual',data = train).fit()

In [79]:
# FIRST-AUTHOR: remove ML code
# lm.summary()

In [80]:
# FIRST-AUTHOR: remove ML code
# imc = pd.DataFrame(lm.pvalues)
# imc
imc = pd.DataFrame(pd.Series(0, index=['firstFlrSF', 'MasVnrType', 'GarageFinish+KitchenAbvGr',
                                       'WoodDeckSF', 'LandContour', 'LandSlope', 'GarageCars',
                                       'Street', 'Exterior1st', 'iindFlrSF', 'SaleCondition',
                                       'Electrical', 'LotConfig', 'HeatingQC', 'PavedDrive', 'LotArea',
                                       'BsmtUnfSF', 'RoofMatl', 'TotRmsAbvGrd', 'BsmtFullBath', 'ExterQual',
                                       'BedroomAbvGr', 'EnclosedPorch', 'BsmtQual', 'BsmtFinSF2', 'GarageCond',
                                       'HouseStyle', 'GrLivArea', 'PoolArea', 'Utilities', 'BsmtExposure',
                                       'HalfBath', 'Condition1', 'YrSold', 'MasVnrArea', 'BldgType',
                                       'MSZoning', 'Fireplaces', 'FireplaceQu', 'BsmtFinType1', 'YearBuilt',
                                       'BsmtHalfBath', 'Heating', 'SaleType', 'BsmtCond', 'MSSubClass',
                                       'ScreenPorch', 'OpenPorchSF', 'FullBath', 'BsmtFinSF1', 'MoSold',
                                       'LowQualFinSF', 'GarageType', 'Exterior2nd', 'iiiSsnPorch',
                                       'TotalBsmtSF', 'ExterCond', 'Neighborhood', 'OverallQual',
                                       'GarageArea', 'LotShape', 'MiscVal', 'YearRemodAdd',
                                       'OverallCond', 'BsmtFinType2', 'Condition2', 'CentralAir',
                                       'LotFrontage', 'Functional', 'RoofStyle', 'GarageYrBlt', 'KitchenQual',
                                       'Foundation', 'GarageQual']))
imc

Unnamed: 0,0
firstFlrSF,0
MasVnrType,0
GarageFinish+KitchenAbvGr,0
WoodDeckSF,0
LandContour,0
...,...
RoofStyle,0
GarageYrBlt,0
KitchenQual,0
Foundation,0


In [81]:
best_features = imc[imc[0] <= 0.05].index
best_features

Index(['firstFlrSF', 'MasVnrType', 'GarageFinish+KitchenAbvGr', 'WoodDeckSF',
       'LandContour', 'LandSlope', 'GarageCars', 'Street', 'Exterior1st',
       'iindFlrSF', 'SaleCondition', 'Electrical', 'LotConfig', 'HeatingQC',
       'PavedDrive', 'LotArea', 'BsmtUnfSF', 'RoofMatl', 'TotRmsAbvGrd',
       'BsmtFullBath', 'ExterQual', 'BedroomAbvGr', 'EnclosedPorch',
       'BsmtQual', 'BsmtFinSF2', 'GarageCond', 'HouseStyle', 'GrLivArea',
       'PoolArea', 'Utilities', 'BsmtExposure', 'HalfBath', 'Condition1',
       'YrSold', 'MasVnrArea', 'BldgType', 'MSZoning', 'Fireplaces',
       'FireplaceQu', 'BsmtFinType1', 'YearBuilt', 'BsmtHalfBath', 'Heating',
       'SaleType', 'BsmtCond', 'MSSubClass', 'ScreenPorch', 'OpenPorchSF',
       'FullBath', 'BsmtFinSF1', 'MoSold', 'LowQualFinSF', 'GarageType',
       'Exterior2nd', 'iiiSsnPorch', 'TotalBsmtSF', 'ExterCond',
       'Neighborhood', 'OverallQual', 'GarageArea', 'LotShape', 'MiscVal',
       'YearRemodAdd', 'OverallCond', 'BsmtFin

In [82]:
# FIRST-AUTHOR: remove ML code
# from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [83]:
# FIRST-AUTHOR: remove ML code
# train['intercept'] = lm.params[0]
train['intercept'] = 210016.3483272409

In [84]:
for i in range(18):
# FIRST-AUTHOR: make notebook run, remove ML code
#     print (vif(train[['firstFlrSF', 'WoodDeckSF', 'GarageCars', 'iindFlrSF', 'LotArea',
#        'BsmtUnfSF', 'GrLivArea', 'PoolArea', 'Fireplaces', 'YearBuilt',
#        'ScreenPorch', 'LowQualFinSF', 'TotalBsmtSF', 'OverallQual',
#        'GarageArea', 'YearRemodAdd', 'OverallCond','intercept']].as_matrix(), i))
    _ = train[['firstFlrSF', 'WoodDeckSF', 'GarageCars', 'iindFlrSF', 'LotArea',
       'BsmtUnfSF', 'GrLivArea', 'PoolArea', 'Fireplaces', 'YearBuilt',
       'ScreenPorch', 'LowQualFinSF', 'TotalBsmtSF', 'OverallQual',
       'GarageArea', 'YearRemodAdd', 'OverallCond','intercept']].to_numpy()

In [85]:
train_a = train[ ['GarageFinish','Exterior1st','SaleCondition', 'LotConfig', 'RoofMatl', 'ExterQual', 'BsmtQual',  'GarageCond',
        'BsmtExposure', 'Condition1','BldgType', 'MSZoning', 'SaleType','GarageType', 'Exterior2nd','Neighborhood', 'Condition2',
       'Functional', 'GarageYrBlt', 'KitchenQual','Foundation', 'GarageQual', 'WoodDeckSF', 'LotArea',
       'BsmtUnfSF', 'Fireplaces', 'YearBuilt','ScreenPorch', 'LowQualFinSF', 'TotalBsmtSF', 'OverallQual',
       'YearRemodAdd', 'OverallCond','SalePrice']]

In [86]:
best_train = train_a
best_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 34 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   GarageFinish   1460 non-null   object 
 1   Exterior1st    1460 non-null   object 
 2   SaleCondition  1460 non-null   object 
 3   LotConfig      1460 non-null   object 
 4   RoofMatl       1460 non-null   object 
 5   ExterQual      1460 non-null   object 
 6   BsmtQual       1460 non-null   object 
 7   GarageCond     1460 non-null   object 
 8   BsmtExposure   1460 non-null   object 
 9   Condition1     1460 non-null   object 
 10  BldgType       1460 non-null   object 
 11  MSZoning       1460 non-null   object 
 12  SaleType       1460 non-null   object 
 13  GarageType     1460 non-null   object 
 14  Exterior2nd    1460 non-null   object 
 15  Neighborhood   1460 non-null   object 
 16  Condition2     1460 non-null   object 
 17  Functional     1460 non-null   object 
 18  GarageYr

## Random Forest Feature Importance

In [87]:
# FIRST-AUTHOR: remove ML code
# from sklearn.ensemble import RandomForestRegressor

In [88]:
train_d = pd.get_dummies(train)

In [89]:
numeric = train._get_numeric_data()
category = train.drop(numeric.columns,axis = 1)

In [90]:
train_dx = train_d.drop(["SalePrice"],axis = 1)
train_dy = train_d.SalePrice

In [91]:
# FIRST-AUTHOR: remove ML code
# from sklearn.cross_validation import train_test_split

In [92]:
# FIRST-AUTHOR: remove ML code
# X_train, X_test, Y_train, Y_test = train_test_split(
#         train_dx,
#         train_dy,
#         test_size=0.20,
#         random_state=123)

In [93]:
# FIRST-AUTHOR: remove ML code
# radm_clf = RandomForestRegressor(oob_score=True,n_estimators=100 )
# radm_clf.fit( X_train, Y_train )

In [94]:
# FIRST-AUTHOR: remove ML code
# indices = np.argsort(radm_clf.feature_importances_)[::-1]
feature_rank = pd.DataFrame( columns = ['rank', 'feature', 'importance'] )
# FIRST-AUTHOR: remove ML code
# for f in range(X_train.shape[1]):
#     feature_rank.loc[f] = [f+1,
#                          X_train.columns[indices[f]],
#                          radm_clf.feature_importances_[indices[f]]]
# f, ax = plt.subplots(figsize=(10,100))
# sns.barplot( y = 'feature', x = 'importance', data = feature_rank, color = 'Yellow')
# plt.show()

In [95]:
ff = feature_rank.head(30)
ff

Unnamed: 0,rank,feature,importance


In [96]:
list(ff.feature)

[]

In [97]:
final_cols = train_d[['OverallQual','GrLivArea','GarageCars', 'TotalBsmtSF', 'BsmtFinSF1', 'firstFlrSF',
 'GarageArea', 'LotArea', 'YearBuilt', 'OpenPorchSF', 'FullBath', 'LotFrontage', 'BsmtUnfSF', 'YearRemodAdd',
 'OverallCond','iindFlrSF','MasVnrArea','GarageType_Detchd','WoodDeckSF','MoSold','BsmtQual_Gd','TotRmsAbvGrd',
 'Neighborhood_Edwards','KitchenAbvGr','MSZoning_RM','MSSubClass','BsmtQual_Ex','GarageType_Attchd',
'ExterQual_Ex','KitchenQual_Gd']]

In [98]:
data_x = final_cols
data_y = train.SalePrice
final_data = pd.concat([data_x,data_y],axis = 1)

In [99]:
feats = "+".join(data_x)
feats

'OverallQual+GrLivArea+GarageCars+TotalBsmtSF+BsmtFinSF1+firstFlrSF+GarageArea+LotArea+YearBuilt+OpenPorchSF+FullBath+LotFrontage+BsmtUnfSF+YearRemodAdd+OverallCond+iindFlrSF+MasVnrArea+GarageType_Detchd+WoodDeckSF+MoSold+BsmtQual_Gd+TotRmsAbvGrd+Neighborhood_Edwards+KitchenAbvGr+MSZoning_RM+MSSubClass+BsmtQual_Ex+GarageType_Attchd+ExterQual_Ex+KitchenQual_Gd'

In [100]:
# FIRST-AUTHOR: remove ML code
# import statsmodels.api as sm
# import statsmodels.formula.api as smf

In [101]:
final_data = final_data.rename(columns ={'1stFlrSF':'firstFlrSF','2ndFlrSF':'iindFlrSF'})

In [102]:
# FIRST-AUTHOR: remove ML code
# lm=smf.ols('SalePrice~OverallQual+GrLivArea+GarageCars+TotalBsmtSF+BsmtFinSF1+firstFlrSF+GarageArea+LotArea+YearBuilt+OpenPorchSF+FullBath+LotFrontage+BsmtUnfSF+YearRemodAdd+OverallCond+iindFlrSF+MasVnrArea+GarageType_Detchd+WoodDeckSF+MoSold+BsmtQual_Gd+TotRmsAbvGrd+Neighborhood_Edwards+KitchenAbvGr+MSZoning_RM+MSSubClass+BsmtQual_Ex+GarageType_Attchd+ExterQual_Ex+KitchenQual_Gd',final_data).fit()

In [103]:
# FIRST-AUTHOR: remove ML code
# lm.summary()

In [104]:
# FIRST-AUTHOR: remove ML code
# lm.pvalues

## Multi-Collinearity

In [105]:
# FIRST-AUTHOR: remove ML code
# from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [106]:
# FIRST-AUTHOR: remove ML code
# final_data['intercept'] = lm.params[0]
final_data['intercept'] = -831389.4696019923

In [107]:
final_data.columns

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtFinSF1',
       'firstFlrSF', 'GarageArea', 'LotArea', 'YearBuilt', 'OpenPorchSF',
       'FullBath', 'LotFrontage', 'BsmtUnfSF', 'YearRemodAdd', 'OverallCond',
       'iindFlrSF', 'MasVnrArea', 'GarageType_Detchd', 'WoodDeckSF', 'MoSold',
       'BsmtQual_Gd', 'TotRmsAbvGrd', 'Neighborhood_Edwards', 'KitchenAbvGr',
       'MSZoning_RM', 'MSSubClass', 'BsmtQual_Ex', 'GarageType_Attchd',
       'ExterQual_Ex', 'KitchenQual_Gd', 'SalePrice', 'intercept'],
      dtype='object')

# Variance Inflation Factor

In [108]:
for i in range(31):
# FIRST-AUTHOR: make notebook run
#     print (vif(final_data[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtFinSF1',
#        'firstFlrSF', 'GarageArea', 'LotArea', 'YearBuilt', 'OpenPorchSF',
#        'FullBath', 'LotFrontage', 'BsmtUnfSF', 'YearRemodAdd', 'OverallCond',
#        'iindFlrSF', 'MasVnrArea', 'GarageType_Detchd', 'WoodDeckSF', 'MoSold',
#        'BsmtQual_Gd', 'TotRmsAbvGrd', 'Neighborhood_Edwards', 'KitchenAbvGr',
#        'MSZoning_RM', 'MSSubClass', 'BsmtQual_Ex', 'GarageType_Attchd',
#        'ExterQual_Ex', 'KitchenQual_Gd','intercept']].as_matrix(), i))
    final_data[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtFinSF1',
       'firstFlrSF', 'GarageArea', 'LotArea', 'YearBuilt', 'OpenPorchSF',
       'FullBath', 'LotFrontage', 'BsmtUnfSF', 'YearRemodAdd', 'OverallCond',
       'iindFlrSF', 'MasVnrArea', 'GarageType_Detchd', 'WoodDeckSF', 'MoSold',
       'BsmtQual_Gd', 'TotRmsAbvGrd', 'Neighborhood_Edwards', 'KitchenAbvGr',
       'MSZoning_RM', 'MSSubClass', 'BsmtQual_Ex', 'GarageType_Attchd',
       'ExterQual_Ex', 'KitchenQual_Gd','intercept']].to_numpy()

In [109]:
final_data = final_data.rename(columns ={'firstFlrSF':'1stFlrSF','iindFlrSF':'2ndFlrSF'})

In [110]:
final_data1 = final_data.drop(['GrLivArea', 'GarageCars', 'BsmtFinSF1', 'TotalBsmtSF',
       '1stFlrSF', 'GarageArea','YearBuilt','BsmtUnfSF','2ndFlrSF'],axis = 1)

# F-TEST / ANOVA

In [111]:
# FIRST-AUTHOR: remove ML code
# import scipy.stats as stats

In [112]:
train.Neighborhood.value_counts()

Neighborhood
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
MeadowV     17
Blmngtn     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: count, dtype: int64

In [113]:
nb1 = train.SalePrice[train.Neighborhood=='NAmes']
nb2 = train.SalePrice[train.Neighborhood=='CollgCr']
nb3 = train.SalePrice[train.Neighborhood=='Edwards']
nb4 = train.SalePrice[train.Neighborhood=='Somerst']
nb5 = train.SalePrice[train.Neighborhood=='Gilbert']
nb6 = train.SalePrice[train.Neighborhood=='NridgHt']
nb7 = train.SalePrice[train.Neighborhood=='Sawyer']
nb8 = train.SalePrice[train.Neighborhood=='NWAmes']
nb9 = train.SalePrice[train.Neighborhood=='SawyerW']
nb10 = train.SalePrice[train.Neighborhood=='BrkSide']
nb11 = train.SalePrice[train.Neighborhood=='Crawfor']
nb12= train.SalePrice[train.Neighborhood=='Mitchel']
nb13 = train.SalePrice[train.Neighborhood=='NoRidge']
nb14 = train.SalePrice[train.Neighborhood=='Timber']
nb15 = train.SalePrice[train.Neighborhood=='IDOTRR']
nb16 = train.SalePrice[train.Neighborhood=='ClearCr']
nb17 = train.SalePrice[train.Neighborhood=='StoneBr']
nb18 = train.SalePrice[train.Neighborhood=='SWISU']
nb19 = train.SalePrice[train.Neighborhood=='Blmngtn']
nb20 = train.SalePrice[train.Neighborhood=='MeadowV']
nb21 = train.SalePrice[train.Neighborhood=='BrDale']
nb22 = train.SalePrice[train.Neighborhood=='Veenker']
nb23 = train.SalePrice[train.Neighborhood=='NPkVill']
nb24 = train.SalePrice[train.Neighborhood=='Blueste']

In [114]:
# FIRST-AUTHOR: remove ML code
# stats.f_oneway(nb1,nb2,nb3,nb4,nb5,nb6,nb7,nb8,nb9,nb10,nb11,nb12,nb13,nb14,nb15,nb16,nb17,nb18,nb19,nb20,nb21,nb22,nb23,nb24)

## GarageQual

In [115]:
train.GarageQual.value_counts()

GarageQual
TA    1311
NG      81
Fa      48
Gd      14
Ex       3
Po       3
Name: count, dtype: int64

In [116]:
gq1 = train.SalePrice[train.GarageQual=='TA']
gq2 = train.SalePrice[train.GarageQual=='NG']
gq3 = train.SalePrice[train.GarageQual=='Fa']
gq4 = train.SalePrice[train.GarageQual=='Gd']
gq5 = train.SalePrice[train.GarageQual=='Ex']
gq6 = train.SalePrice[train.GarageQual=='Po']

In [117]:
# FIRST-AUTHOR: remove ML code
# stats.f_oneway(gq1,gq2,gq3,gq4,gq5)

## GarageCond

In [118]:
train.GarageCond.value_counts()

GarageCond
TA    1326
NG      81
Fa      35
Gd       9
Po       7
Ex       2
Name: count, dtype: int64

In [119]:
gc1 = train.SalePrice[train.GarageQual=='TA']
gc2 = train.SalePrice[train.GarageQual=='NG']
gc3 = train.SalePrice[train.GarageQual=='Fa']
gc4 = train.SalePrice[train.GarageQual=='Gd']
gc5 = train.SalePrice[train.GarageQual=='Po']
gc6 = train.SalePrice[train.GarageQual=='Ex']

In [120]:
# FIRST-AUTHOR: remove ML code
# stats.f_oneway(gc1,gc2,gc3,gc4,gc5)

In [121]:
train.BsmtExposure.value_counts()

BsmtExposure
No    953
Av    221
Gd    134
Mn    114
NB     38
Name: count, dtype: int64

In [122]:
be1 = train.SalePrice[train.BsmtExposure=="No"]
be2 = train.SalePrice[train.BsmtExposure=="Av"]
be3 = train.SalePrice[train.BsmtExposure=="Gd"]
be4 = train.SalePrice[train.BsmtExposure=="Mn"]
be5 = train.SalePrice[train.BsmtExposure=="NB"]

In [123]:
# FIRST-AUTHOR: remove ML code
# stats.f_oneway(be1,be2,be3,be4,be5)

In [124]:
test_data = pd.read_csv('./input/test.scaled.csv')

In [125]:
test1 = test_data[['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 
       'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood', 'Condition1',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual',
       'Foundation', 'BsmtQual', 'BsmtExposure', 'CentralAir', 'FireplaceQu',
       'GarageFinish', 'GarageQual', 'GarageCond']]

In [126]:
nulls1 = test1.isnull().sum().sort_values(ascending = False)
nulls1

FireplaceQu     730
GarageCond       78
GarageQual       78
GarageFinish     78
BsmtExposure     44
BsmtQual         44
MasVnrArea       15
MSZoning          4
BsmtFinSF1        1
TotalBsmtSF       1
RoofStyle         0
CentralAir        0
Foundation        0
ExterQual         0
RoofMatl          0
OverallQual       0
HouseStyle        0
BldgType          0
YearBuilt         0
Neighborhood      0
LotConfig         0
LotShape          0
Fireplaces        0
TotRmsAbvGrd      0
FullBath          0
YearRemodAdd      0
Condition1        0
dtype: int64

In [127]:
test1['FireplaceQu']=test1['FireplaceQu'].fillna('NF')
test1['GarageCond']=test1['GarageCond'].fillna('NG')
test1['GarageFinish']=test1['GarageFinish'].fillna('NG')
test1['GarageQual']=test1['GarageQual'].fillna('NG')
test1['BsmtExposure']=test1['BsmtExposure'].fillna('NB')
test1['BsmtQual'] = test1['BsmtQual'].fillna('NB')
test1['MasVnrArea'] = test1['MasVnrArea'].fillna(test1['MasVnrArea'].mean())
test1['MSZoning'] = test1['MSZoning'].fillna('RL')
test1['BsmtFinSF1'] = test1['BsmtFinSF1'].fillna(test1['BsmtFinSF1'].mean())
test1['TotalBsmtSF'] = test1['TotalBsmtSF'].fillna(test1['TotalBsmtSF'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test1['FireplaceQu']=test1['FireplaceQu'].fillna('NF')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test1['GarageCond']=test1['GarageCond'].fillna('NG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test1['GarageFinish']=test1['GarageFinish'].fillna('NG')
A value is trying to be set on a copy of

# OUTLIERS

In [128]:
test2 = test1._get_numeric_data()

In [129]:
def var_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),  x.std(), x.var(), x.min(), x.quantile(0.01), x.quantile(0.05),x.quantile(0.10),x.quantile(0.25),x.quantile(0.50),x.quantile(0.75), x.quantile(0.90),x.quantile(0.95), x.quantile(0.99),x.max()], 
                  index=['N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR', 'MIN', 'P1' , 'P5' ,'P10' ,'P25' ,'P50' ,'P75' ,'P90' ,'P95' ,'P99' ,'MAX'])

test2.apply(lambda x: var_summary(x)).T


Unnamed: 0,N,NMISS,SUM,MEAN,MEDIAN,STD,VAR,MIN,P1,P5,P10,P25,P50,P75,P90,P95,P99,MAX
OverallQual,1459.0,0.0,8869.0,6.078821,6.0,1.436812,2.064428,1.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0,10.0
YearBuilt,1459.0,0.0,2876211.0,1971.357779,1973.0,30.390071,923.556405,1879.0,1900.0,1915.0,1924.0,1953.0,1973.0,2001.0,2006.0,2007.0,2008.0,2010.0
YearRemodAdd,1459.0,0.0,2894164.0,1983.662783,1992.0,21.130467,446.496632,1950.0,1950.0,1950.0,1950.0,1963.0,1992.0,2004.0,2007.0,2007.0,2009.0,2010.0
MasVnrArea,1459.0,0.0,146934.6,100.709141,0.0,176.709824,31226.36203,0.0,0.0,0.0,0.0,0.0,0.0,162.0,309.2,473.7,735.68,1290.0
BsmtFinSF1,1459.0,0.0,640798.2,439.203704,351.0,455.111888,207126.830247,0.0,0.0,0.0,0.0,0.0,351.0,752.0,1039.2,1290.4,1682.84,4010.0
TotalBsmtSF,1459.0,0.0,1526286.0,1046.11797,988.0,442.746712,196024.651378,0.0,0.0,392.0,564.2,784.0,988.0,1304.0,1615.4,1782.0,2203.36,5095.0
FullBath,1459.0,0.0,2292.0,1.570939,2.0,0.55519,0.308236,0.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,3.0,4.0
TotRmsAbvGrd,1459.0,0.0,9316.0,6.385195,6.0,1.508895,2.276763,3.0,4.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,11.0,15.0
Fireplaces,1459.0,0.0,848.0,0.58122,0.0,0.64742,0.419153,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,4.0


In [130]:
test3 = test1.drop(test2.columns,axis = 1)

In [131]:
# FIRST-AUTHOR: make notebook run
# test2['OverallQual']= test2['OverallQual'].clip_upper(test2['OverallQual'].quantile(0.99))
# test2['OverallQual']= test2['OverallQual'].clip_lower(test2['OverallQual'].quantile(0.01))
# test2['YearBuilt']= test2['YearBuilt'].clip_upper(test2['YearBuilt'].quantile(0.99))
# test2['YearBuilt']= test2['YearBuilt'].clip_lower(test2['YearBuilt'].quantile(0.01))
# test2['MasVnrArea']= test2['MasVnrArea'].clip_upper(test2['MasVnrArea'].quantile(0.99))
# test2['BsmtFinSF1']= test2['BsmtFinSF1'].clip_upper(test2['BsmtFinSF1'].quantile(0.99))
# test2['TotalBsmtSF']= test2['TotalBsmtSF'].clip_upper(test2['TotalBsmtSF'].quantile(0.99))
# test2['TotalBsmtSF']= test2['TotalBsmtSF'].clip_upper(test2['TotalBsmtSF'].quantile(0.99))
# test2['TotRmsAbvGrd']= test2['TotRmsAbvGrd'].clip_upper(test2['TotRmsAbvGrd'].quantile(0.99))
test2['OverallQual']= test2['OverallQual'].clip(upper=test2['OverallQual'].quantile(0.99))
test2['OverallQual']= test2['OverallQual'].clip(lower=test2['OverallQual'].quantile(0.01))
test2['YearBuilt']= test2['YearBuilt'].clip(upper=test2['YearBuilt'].quantile(0.99))
test2['YearBuilt']= test2['YearBuilt'].clip(lower=test2['YearBuilt'].quantile(0.01))
test2['MasVnrArea']= test2['MasVnrArea'].clip(upper=test2['MasVnrArea'].quantile(0.99))
test2['BsmtFinSF1']= test2['BsmtFinSF1'].clip(upper=test2['BsmtFinSF1'].quantile(0.99))
test2['TotalBsmtSF']= test2['TotalBsmtSF'].clip(upper=test2['TotalBsmtSF'].quantile(0.99))
test2['TotalBsmtSF']= test2['TotalBsmtSF'].clip(upper=test2['TotalBsmtSF'].quantile(0.99))
test2['TotRmsAbvGrd']= test2['TotRmsAbvGrd'].clip(upper=test2['TotRmsAbvGrd'].quantile(0.99))

In [132]:
finaltest = pd.concat([test2,test3],axis = 1)

In [133]:
finaltest1 = pd.get_dummies(finaltest)

In [134]:
finaltest.columns

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'MSZoning',
       'LotShape', 'LotConfig', 'Neighborhood', 'Condition1', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'Foundation',
       'BsmtQual', 'BsmtExposure', 'CentralAir', 'FireplaceQu', 'GarageFinish',
       'GarageQual', 'GarageCond'],
      dtype='object')

In [135]:
final_data1.head()

Unnamed: 0,OverallQual,LotArea,OpenPorchSF,FullBath,LotFrontage,YearRemodAdd,OverallCond,MasVnrArea,GarageType_Detchd,WoodDeckSF,...,Neighborhood_Edwards,KitchenAbvGr,MSZoning_RM,MSSubClass,BsmtQual_Ex,GarageType_Attchd,ExterQual_Ex,KitchenQual_Gd,SalePrice,intercept
0,7,8450.0,61.0,2,65.0,2003,5,196.0,False,0.0,...,False,1,False,60,False,True,False,True,208500.0,-831389.469602
1,6,9600.0,0.0,2,80.0,1976,8,0.0,False,298.0,...,False,1,False,20,False,True,False,False,181500.0,-831389.469602
2,7,11250.0,42.0,2,68.0,2002,5,162.0,False,0.0,...,False,1,False,60,False,True,False,True,223500.0,-831389.469602
3,7,9550.0,35.0,1,60.0,1970,5,0.0,True,0.0,...,False,1,False,70,False,False,False,True,140000.0,-831389.469602
4,8,14260.0,84.0,2,84.0,2000,5,350.0,False,192.0,...,False,1,False,60,False,True,False,True,250000.0,-831389.469602


## Statsmodels--- train data

In [136]:
# FIRST-AUTHOR: make notebook run with input scaling
# train1 =final_data1.sample(n = 730 ,random_state = 123)
train1 =final_data1.sample(n = min(len(final_data) - 10, 730) ,random_state = 123)
train2 = final_data1.drop(train1.index)

In [137]:
train1x = train1.drop(['intercept','SalePrice'], axis = 1)
train1y = train1.SalePrice

In [138]:
train2x = train2.drop(['SalePrice','intercept'],axis = 1)
train2y = train2.SalePrice

## Random Forests feature imp----train data

In [139]:
best_train = pd.get_dummies(best_train)

In [140]:
train_s1 = best_train.sample(n = min(len(final_data) - 10, 730) ,random_state = 123)
train_s2 = best_train.drop(train_s1.index)                             

In [141]:
train_s1x = train_s1.drop(['SalePrice'], axis = 1)
train_s1y = train_s1.SalePrice

In [142]:
train_s2x = train_s2.drop(['SalePrice'],axis = 1)
train_s2y = train_s2.SalePrice

# Linear Regression

In [143]:
# FIRST-AUTHOR: remove ML code
# from sklearn.linear_model import LinearRegression

In [144]:
# FIRST-AUTHOR: remove ML code
# X_train = train1x
# Y_train = train1y

In [145]:
# FIRST-AUTHOR: remove ML code
# linreg = LinearRegression()
# linreg.fit(X_train, Y_train)

In [146]:
# FIRST-AUTHOR: remove ML code
# X_train , X_test, Y_train, Y_test = train_test_split(
#         train2x,
#         train2y,
#         test_size=0.20,
#         random_state=123)

In [147]:
# FIRST-AUTHOR: remove ML code
# y_pred = linreg.predict(X_test)

In [148]:
# FIRST-AUTHOR: remove ML code
# from sklearn import metrics

In [149]:
# FIRST-AUTHOR: remove ML code
# rmse = np.sqrt(metrics.mean_squared_error(Y_test, y_pred))
# rmse

In [150]:
# FIRST-AUTHOR: remove ML code
# metrics.r2_score(Y_test, y_pred)

# Decision Tree

>>>> Here we use the GridSearch-Cross validation to get the best Hyper parameters

In [151]:
# FIRST-AUTHOR: remove ML code
# from sklearn import metrics
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.grid_search import GridSearchCV

In [152]:
# FIRST-AUTHOR: remove ML code
# X_train = train1x 
# Y_train = train1y

In [153]:
# FIRST-AUTHOR: remove ML code
# depth_list = list(range(1,20))
# for depth in depth_list:
#     dt_obj = DecisionTreeRegressor(max_depth=depth)
#     dt_obj.fit(X_train, Y_train)
#     print ('depth:', depth, 'R_squared:', metrics.r2_score(Y_test, dt_obj.predict(X_test)))

In [154]:
# FIRST-AUTHOR: remove ML code
# param_grid = {'max_depth': np.arange(3,20)}
# tree = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=10)
# tree.fit(X_train, Y_train)

In [155]:
# FIRST-AUTHOR: remove ML code
# tree.best_params_

In [156]:
# FIRST-AUTHOR: remove ML code
# tree.best_score_

In [157]:
# FIRST-AUTHOR: remove ML code
# tree_final = DecisionTreeRegressor(max_depth=8)
# tree_final.fit(X_train, Y_train)

In [158]:
# FIRST-AUTHOR: remove ML code
# X_train, X_test, Y_train, Y_test = train_test_split(
#         train2x,
#         train2y,
#         test_size=0.20,
#         random_state=123)

In [159]:
# FIRST-AUTHOR: remove ML code
# tree_test_pred = pd.DataFrame({'actual': Y_test, 'predicted': tree_final.predict(X_test)})
tree_test_pred = pd.DataFrame({'actual': train2y, 'predicted': train2y})

In [160]:
tree_test_pred.sample(10)

Unnamed: 0,actual,predicted
1458,142125.0,142125.0
618,314813.0,314813.0
1171,163000.0,163000.0
495,61815.97,61815.97
383,76000.0,76000.0
726,222000.0,222000.0
1268,381000.0,381000.0
288,122000.0,122000.0
1313,333168.0,333168.0
321,354000.0,354000.0


In [161]:
# FIRST-AUTHOR: remove ML code
# metrics.r2_score(Y_test, tree_test_pred.predicted)
_ = tree_test_pred.predicted

In [162]:
# FIRST-AUTHOR: remove ML code
# rmse = np.sqrt(metrics.mean_squared_error(Y_test, tree_test_pred.predicted))
# rmse
_ = tree_test_pred.predicted

# Random Forest

In [163]:
# FIRST-AUTHOR: remove ML code
# from sklearn.ensemble import RandomForestRegressor

In [164]:
# FIRST-AUTHOR: remove ML code
# X_train = train1x
# Y_train = train1y

In [165]:
# FIRST-AUTHOR: remove ML code
# depth_list = list(range(1,20))
# for depth in depth_list:
#     dt_obj = RandomForestRegressor(max_depth=depth)
#     dt_obj.fit(X_train, Y_train)
#     print ('depth:', depth, 'R_Squared:', metrics.r2_score(Y_test, dt_obj.predict(X_test)))

In [166]:
# FIRST-AUTHOR: remove ML code
# radm_clf = RandomForestRegressor(oob_score=True,n_estimators=100)
# radm_clf.fit( X_train, Y_train )

In [167]:
# FIRST-AUTHOR: remove ML code
# X_train, X_test, Y_train, Y_test = train_test_split(
#         train2x,
#         train2y,
#         test_size=0.20,
#         random_state=123)

In [168]:
# FIRST-AUTHOR: remove ML code
# radm_test_pred = pd.DataFrame( { 'actual':  Y_test,
#                             'predicted': radm_clf.predict( X_test ) } )
radm_test_pred = pd.DataFrame( { 'actual':  train2y,
                            'predicted': train2y } )

In [169]:
# FIRST-AUTHOR: remove ML code
# metrics.r2_score( radm_test_pred.actual, radm_test_pred.predicted )
_ = radm_test_pred.actual
_ = radm_test_pred.predicted

In [170]:
# FIRST-AUTHOR: remove ML code
# rmse = np.sqrt(metrics.mean_squared_error(radm_test_pred.actual, radm_test_pred.predicted))
# rmse
_ = radm_test_pred.actual
_ = radm_test_pred.predicted

# Bagged Decision Trees

In [171]:
# FIRST-AUTHOR: remove ML code
# from sklearn.ensemble import BaggingRegressor

In [172]:
# FIRST-AUTHOR: remove ML code, plotting
# from sklearn import metrics
# import matplotlib.pyplot as plt 
# import seaborn as sns

In [173]:
# FIRST-AUTHOR: remove ML code
# param_bag = {'n_estimators': list(range(100, 801, 100)),
#              }

In [174]:
# FIRST-AUTHOR: remove ML code
# from sklearn.grid_search import GridSearchCV
# bag_cl = GridSearchCV(estimator=BaggingRegressor(),
#                   param_grid=param_bag,
#                   cv=5,
#                   verbose=True, n_jobs=-1)

In [175]:
# FIRST-AUTHOR: remove ML code
# bag_cl.get_params()

In [176]:
# FIRST-AUTHOR: remove ML code
# X_train = train1x
# Y_train = train1y

In [177]:
# FIRST-AUTHOR: remove ML code
# bag_cl.fit(X_train, Y_train)

In [178]:
# FIRST-AUTHOR: remove ML code
# bag_cl.best_params_

In [179]:
# FIRST-AUTHOR: remove ML code
# bagclm = BaggingRegressor(oob_score=True, n_estimators=600)
# bagclm.fit(X_train, Y_train)

In [180]:
# FIRST-AUTHOR: remove ML code
# X_train, X_test, Y_train, Y_test = train_test_split(
#         train2x,
#         train2y,
#         test_size=0.20,
#         random_state=123)

In [181]:
# FIRST-AUTHOR: remove ML code
# y_pred = pd.DataFrame( { 'actual':  Y_test,
#                             'predicted': bagclm.predict( X_test) } )
y_pred = pd.DataFrame( { 'actual':  train2y,
                            'predicted': train2y } )

In [182]:
# FIRST-AUTHOR: remove ML code
# bagclm.estimators_features_

In [183]:
# FIRST-AUTHOR: remove ML code
# metrics.r2_score(y_pred.actual, y_pred.predicted)
_ = y_pred.actual, y_pred.predicted

In [184]:
# FIRST-AUTHOR: remove ML code
# rmse = np.sqrt(metrics.mean_squared_error(Y_test, y_pred.predicted))
# rmse
_ = y_pred.predicted

# AdaBoost 

In [185]:
# FIRST-AUTHOR: remove ML code
# from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

In [186]:
# FIRST-AUTHOR: remove ML code
# [10**x for x in range(-3, 3)]

In [187]:
# FIRST-AUTHOR: remove ML code
# paragrid_ada = {'n_estimators': [100, 200, 400, 600, 800],
#                'learning_rate': [10**x for x in range(-3, 3)]}

In [188]:
# FIRST-AUTHOR: remove ML code
# from sklearn.grid_search import GridSearchCV
# ada = GridSearchCV(estimator=AdaBoostRegressor(),
#                   param_grid=paragrid_ada,
#                   cv=5,
#                   verbose=True, n_jobs=-1)

In [189]:
# FIRST-AUTHOR: remove ML code
# X_train = train1x
# Y_train = train1y

In [190]:
# FIRST-AUTHOR: remove ML code
# ada.fit(X_train, Y_train)

In [191]:
# FIRST-AUTHOR: remove ML code
# ada.best_params_

In [192]:
# FIRST-AUTHOR: remove ML code
# ada_clf = AdaBoostRegressor(learning_rate=0.1, n_estimators=800)

In [193]:
# FIRST-AUTHOR: remove ML code
# ada_clf.fit(X_train, Y_train)

In [194]:
# FIRST-AUTHOR: remove ML code
# X_train, X_test, Y_train, Y_test = train_test_split(
#         train2x,
#         train2y,
#         test_size=0.20,
#         random_state=123)

In [195]:
# FIRST-AUTHOR: remove ML code
# ada_test_pred = pd.DataFrame({'actual': Y_test,
#                             'predicted': ada_clf.predict(X_test)})
ada_test_pred = pd.DataFrame({'actual': train2y,
                            'predicted': train2y})

In [196]:
# FIRST-AUTHOR: remove ML code
# metrics.r2_score(ada_test_pred.actual, ada_test_pred.predicted)
_ = ada_test_pred.actual
_ = ada_test_pred.predicted

In [197]:
# FIRST-AUTHOR: remove ML code
# rmse = np.sqrt(metrics.mean_squared_error(Y_test, y_pred.predicted))
# rmse
_ = y_pred.predicted

## Gradient Boosting

In [198]:
# FIRST-AUTHOR: remove ML code
# param_test1 = {'n_estimators': [100, 200, 400, 600, 800],
#               'max_depth': list(range(1,10))}
# gsearch1 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=500, min_samples_leaf=50,
#                                                                max_features='sqrt',subsample=0.8, random_state=10), 
#                         param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

In [199]:
# FIRST-AUTHOR: remove ML code
# X_train = train1x
# Y_train = train1y

In [200]:
# FIRST-AUTHOR: remove ML code
# gsearch1.fit(X_train, Y_train)

In [201]:
# FIRST-AUTHOR: remove ML code
# gsearch1.best_params_

In [202]:
# FIRST-AUTHOR: remove ML code
# gbm = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=500, min_samples_leaf=50,max_depth=1, n_estimators=200,
#                                                                max_features='sqrt',subsample=0.8, random_state=10)

In [203]:
# FIRST-AUTHOR: remove ML code
# gbm.fit(X_train, Y_train)

In [204]:
# FIRST-AUTHOR: remove ML code
# X_train, X_test, Y_train, Y_test = train_test_split(
#         train2x,
#         train2y,
#         test_size=0.20,
#         random_state=123)

In [205]:
# FIRST-AUTHOR: remove ML code
# gbm_test_pred = pd.DataFrame({'actual': Y_test,
#                             'predicted': gbm.predict(X_test)})
gbm_test_pred = pd.DataFrame({'actual': train2y,
                            'predicted': train2y})

In [206]:
# FIRST-AUTHOR: remove ML code
# metrics.r2_score(gbm_test_pred.actual, gbm_test_pred.predicted)
_ = gbm_test_pred.actual
_ = gbm_test_pred.predicted

In [207]:
# FIRST-AUTHOR: remove ML code
# rmse = np.sqrt(metrics.mean_squared_error(gbm_test_pred.actual, gbm_test_pred.predicted))
# rmse
_ = gbm_test_pred.actual
_ = gbm_test_pred.predicted