In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## EDA

### 1) train

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

### 2) test

In [81]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [82]:
test.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,439.203704,...,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,421.321334,42.74688,22.376841,4955.517327,1.436812,1.11374,30.390071,21.130467,177.6259,455.268042,...,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [83]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

## Missing Value

### 1) train

In [6]:
train.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

#### 1) drop: Alley, FireplaceQu, PoolQC, Fence, MiscFeature

In [3]:
train_drop = train.drop(['Alley','FireplaceQu','PoolQC','Fence','MiscFeature'], axis=1)
train_drop.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
MasVnrType         8
MasVnrArea         8
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          37
                ... 
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
KitchenQual        0
TotRmsAbvGrd       0
Functional         0
Fireplaces         0
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageCars         0
GarageArea         0
GarageQual        81
GarageCond   

#### 2) One-Hot-Encoding

In [4]:
train_dummy = pd.get_dummies(train_drop)
train_dummy.isnull().sum()

Id                         0
MSSubClass                 0
LotFrontage              259
LotArea                    0
OverallQual                0
OverallCond                0
YearBuilt                  0
YearRemodAdd               0
MasVnrArea                 8
BsmtFinSF1                 0
BsmtFinSF2                 0
BsmtUnfSF                  0
TotalBsmtSF                0
1stFlrSF                   0
2ndFlrSF                   0
LowQualFinSF               0
GrLivArea                  0
BsmtFullBath               0
BsmtHalfBath               0
FullBath                   0
HalfBath                   0
BedroomAbvGr               0
KitchenAbvGr               0
TotRmsAbvGrd               0
Fireplaces                 0
GarageYrBlt               81
GarageCars                 0
GarageArea                 0
WoodDeckSF                 0
OpenPorchSF                0
                        ... 
GarageFinish_RFn           0
GarageFinish_Unf           0
GarageQual_Ex              0
GarageQual_Fa 

In [5]:
train_dummy.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


#### 3) Fill : LotFrontage

In [6]:
train_dummy['LotFrontage'].isnull().sum()

259

In [7]:
train_dummy['LotFrontage'].mean(), train_dummy['LotFrontage'].median(), train_dummy['LotFrontage'].mode()

(70.04995836802665, 69.0, 0    60.0
 dtype: float64)

In [8]:
train_dummy['LotFrontage'].value_counts()

60.0     143
70.0      70
80.0      69
50.0      57
75.0      53
65.0      44
85.0      40
78.0      25
21.0      23
90.0      23
68.0      19
24.0      19
64.0      19
73.0      18
72.0      17
79.0      17
63.0      17
55.0      17
100.0     16
51.0      15
66.0      15
74.0      15
52.0      14
59.0      13
71.0      12
82.0      12
40.0      12
43.0      12
67.0      12
57.0      12
        ... 
129.0      2
124.0      2
118.0      2
101.0      2
122.0      2
121.0      2
115.0      2
109.0      2
116.0      2
114.0      2
150.0      1
111.0      1
153.0      1
182.0      1
46.0       1
112.0      1
149.0      1
141.0      1
33.0       1
152.0      1
160.0      1
168.0      1
128.0      1
144.0      1
39.0       1
106.0      1
38.0       1
138.0      1
140.0      1
137.0      1
Name: LotFrontage, Length: 110, dtype: int64

In [9]:
train_dummy['LotFrontage'].fillna(train_dummy['LotFrontage'].median(), inplace=True)
train_dummy['LotFrontage'].isnull().sum()

0

#### 4) Fill : MasVnrArea

In [10]:
train_dummy['MasVnrArea'].isnull().sum()

8

In [11]:
train_dummy['MasVnrArea'].mean(), train_dummy['MasVnrArea'].median(), train_dummy['MasVnrArea'].mode()

(103.68526170798899, 0.0, 0    0.0
 dtype: float64)

In [12]:
train_dummy['MasVnrArea'].value_counts()

0.0       861
72.0        8
180.0       8
108.0       8
120.0       7
16.0        7
80.0        6
200.0       6
106.0       6
340.0       6
170.0       5
132.0       5
360.0       5
84.0        5
320.0       5
100.0       4
196.0       4
246.0       4
216.0       4
160.0       4
183.0       4
178.0       4
270.0       4
300.0       4
210.0       4
268.0       4
252.0       4
168.0       4
336.0       4
220.0       4
         ... 
14.0        1
53.0        1
24.0        1
127.0       1
365.0       1
115.0       1
562.0       1
259.0       1
378.0       1
219.0       1
161.0       1
247.0       1
109.0       1
278.0       1
375.0       1
225.0       1
604.0       1
762.0       1
290.0       1
299.0       1
202.0       1
731.0       1
167.0       1
309.0       1
1129.0      1
651.0       1
337.0       1
415.0       1
293.0       1
621.0       1
Name: MasVnrArea, Length: 327, dtype: int64

In [13]:
train_dummy['MasVnrArea'].fillna(train_dummy['MasVnrArea'].mode()[0], inplace=True)
train_dummy['MasVnrArea'].isnull().sum()

0

#### 5) Fill : GarageYrBlt

In [14]:
train_dummy['GarageYrBlt'].isnull().sum()

81

In [15]:
train_dummy['GarageYrBlt'].mean(), train_dummy['GarageYrBlt'].median(), train_dummy['GarageYrBlt'].mode()

(1978.5061638868744, 1980.0, 0    2005.0
 dtype: float64)

In [16]:
train_dummy['GarageYrBlt'].value_counts()

2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
1977.0    35
1998.0    31
1999.0    30
2008.0    29
1976.0    29
2000.0    27
2002.0    26
1968.0    26
1950.0    24
1993.0    22
2009.0    21
1965.0    21
1966.0    21
1962.0    21
1958.0    21
2001.0    20
1996.0    20
1957.0    20
1970.0    20
1960.0    19
1997.0    19
1978.0    19
1954.0    19
1974.0    18
1994.0    18
          ..
1922.0     5
1936.0     5
1916.0     5
1931.0     4
1945.0     4
1935.0     4
1928.0     4
1946.0     4
1982.0     4
1938.0     3
1921.0     3
1924.0     3
1910.0     3
1952.0     3
1932.0     3
2010.0     3
1923.0     3
1937.0     2
1934.0     2
1918.0     2
1947.0     2
1929.0     2
1914.0     2
1915.0     2
1942.0     2
1908.0     1
1927.0     1
1933.0     1
1900.0     1
1906.0     1
Name: GarageYrBlt, Length: 97, dtype: int64

In [17]:
train_dummy['GarageYrBlt'].fillna(train_dummy['GarageYrBlt'].median(), inplace=True)
train_dummy['GarageYrBlt'].isnull().sum()

0

In [18]:
train_dummy.isnull().sum()

Id                       0
MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
OverallCond              0
YearBuilt                0
YearRemodAdd             0
MasVnrArea               0
BsmtFinSF1               0
BsmtFinSF2               0
BsmtUnfSF                0
TotalBsmtSF              0
1stFlrSF                 0
2ndFlrSF                 0
LowQualFinSF             0
GrLivArea                0
BsmtFullBath             0
BsmtHalfBath             0
FullBath                 0
HalfBath                 0
BedroomAbvGr             0
KitchenAbvGr             0
TotRmsAbvGrd             0
Fireplaces               0
GarageYrBlt              0
GarageCars               0
GarageArea               0
WoodDeckSF               0
OpenPorchSF              0
                        ..
GarageFinish_RFn         0
GarageFinish_Unf         0
GarageQual_Ex            0
GarageQual_Fa            0
GarageQual_Gd            0
GarageQual_Po            0
G

### 2) test

In [19]:
test.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu       730
GarageType         76
GarageYrBlt        78
GarageFinish       78
GarageCars          1
GarageArea          1
GarageQual         78
GarageCond

#### 1) drop: Alley, FireplaceQu, PoolQC, Fence, MiscFeature

In [20]:
test_drop = test.drop(['Alley','FireplaceQu','PoolQC','Fence','MiscFeature'], axis=1)
test_drop.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
Street             0
LotShape           0
LandContour        0
Utilities          2
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          44
                ... 
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
KitchenQual        1
TotRmsAbvGrd       0
Functional         2
Fireplaces         0
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual   

#### 2) One-Hot-Encoding

In [21]:
test_dummy = pd.get_dummies(test_drop)
test_dummy.isnull().sum()

Id                         0
MSSubClass                 0
LotFrontage              227
LotArea                    0
OverallQual                0
OverallCond                0
YearBuilt                  0
YearRemodAdd               0
MasVnrArea                15
BsmtFinSF1                 1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
1stFlrSF                   0
2ndFlrSF                   0
LowQualFinSF               0
GrLivArea                  0
BsmtFullBath               2
BsmtHalfBath               2
FullBath                   0
HalfBath                   0
BedroomAbvGr               0
KitchenAbvGr               0
TotRmsAbvGrd               0
Fireplaces                 0
GarageYrBlt               78
GarageCars                 1
GarageArea                 1
WoodDeckSF                 0
OpenPorchSF                0
                        ... 
GarageFinish_Fin           0
GarageFinish_RFn           0
GarageFinish_Unf           0
GarageQual_Fa 

In [22]:
test_dummy.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


#### 3) Fill : LotFrontage

In [23]:
test_dummy['LotFrontage'].isnull().sum()

227

In [24]:
test_dummy['LotFrontage'].mean(), test_dummy['LotFrontage'].median(), test_dummy['LotFrontage'].mode()

(68.58035714285714, 67.0, 0    60.0
 dtype: float64)

In [25]:
test_dummy['LotFrontage'].value_counts()

60.0     133
80.0      68
70.0      63
50.0      60
75.0      52
65.0      49
85.0      36
63.0      30
24.0      30
21.0      27
68.0      25
74.0      24
64.0      24
90.0      23
72.0      22
62.0      22
78.0      21
82.0      16
73.0      15
59.0      14
57.0      14
56.0      14
76.0      14
53.0      14
52.0      14
100.0     12
88.0      12
51.0      12
81.0      12
43.0      11
        ... 
128.0      2
160.0      2
124.0      2
37.0       2
36.0       2
104.0      2
109.0      2
115.0      1
123.0      1
150.0      1
134.0      1
121.0      1
101.0      1
135.0      1
140.0      1
117.0      1
119.0      1
195.0      1
133.0      1
155.0      1
25.0       1
28.0       1
49.0       1
200.0      1
126.0      1
22.0       1
136.0      1
149.0      1
31.0       1
131.0      1
Name: LotFrontage, Length: 115, dtype: int64

In [26]:
test_dummy['LotFrontage'].fillna(test_dummy['LotFrontage'].median(), inplace=True)
test_dummy['LotFrontage'].isnull().sum()

0

#### 4) Fill : MasVnrArea

In [27]:
test_dummy['MasVnrArea'].isnull().sum()

15

In [28]:
test_dummy['MasVnrArea'].mean(), test_dummy['MasVnrArea'].median(), test_dummy['MasVnrArea'].mode()

(100.70914127423822, 0.0, 0    0.0
 dtype: float64)

In [29]:
test_dummy['MasVnrArea'].value_counts()

0.0      877
176.0     10
144.0      9
120.0      8
216.0      8
200.0      7
128.0      6
504.0      6
302.0      6
198.0      6
196.0      5
140.0      5
203.0      5
260.0      5
210.0      5
60.0       5
256.0      5
164.0      5
23.0       4
16.0       4
112.0      4
178.0      4
40.0       4
194.0      4
174.0      4
420.0      4
143.0      4
306.0      4
50.0       4
186.0      4
        ... 
146.0      1
257.0      1
138.0      1
222.0      1
276.0      1
125.0      1
95.0       1
292.0      1
680.0      1
119.0      1
153.0      1
572.0      1
444.0      1
394.0      1
500.0      1
308.0      1
284.0      1
101.0      1
85.0       1
366.0      1
286.0      1
886.0      1
268.0      1
418.0      1
226.0      1
634.0      1
177.0      1
615.0      1
549.0      1
442.0      1
Name: MasVnrArea, Length: 303, dtype: int64

In [30]:
test_dummy['MasVnrArea'].fillna(test_dummy['MasVnrArea'].mode()[0], inplace=True)
test_dummy['MasVnrArea'].isnull().sum()

0

#### 5) Fill : GarageYrBlt

In [31]:
test_dummy['GarageYrBlt'].isnull().sum()

78

In [32]:
test_dummy['GarageYrBlt'].mean(), test_dummy['GarageYrBlt'].median(), test_dummy['GarageYrBlt'].mode()

(1977.7212165097756, 1979.0, 0    2005.0
 dtype: float64)

In [33]:
test_dummy['GarageYrBlt'].value_counts()

2005.0    77
2007.0    66
2006.0    56
2004.0    46
2003.0    42
2008.0    32
1977.0    31
2000.0    28
1993.0    27
1950.0    27
2002.0    27
1998.0    27
1956.0    25
1997.0    25
1999.0    24
1978.0    22
1968.0    22
1958.0    21
1967.0    21
1994.0    21
1976.0    21
2001.0    21
1979.0    20
1996.0    20
1920.0    19
1930.0    19
1959.0    19
1975.0    19
1954.0    18
1966.0    18
          ..
1946.0     5
1981.0     5
1924.0     5
1982.0     5
1915.0     5
1900.0     5
1925.0     5
1942.0     4
1941.0     4
1927.0     4
1937.0     4
1983.0     4
1935.0     4
1928.0     3
1923.0     3
1947.0     3
1922.0     3
1934.0     2
1921.0     2
1917.0     2
1936.0     2
2010.0     2
1932.0     1
1943.0     1
2207.0     1
1918.0     1
1895.0     1
1919.0     1
1896.0     1
1916.0     1
Name: GarageYrBlt, Length: 97, dtype: int64

In [34]:
test_dummy['GarageYrBlt'].fillna(test_dummy['GarageYrBlt'].median(), inplace=True)
test_dummy['GarageYrBlt'].isnull().sum()

0

#### 6) Fill : BsmtFinSF1

In [35]:
test_dummy['BsmtFinSF1'].isnull().sum()

1

In [36]:
test_dummy['BsmtFinSF1'].mean(), test_dummy['BsmtFinSF1'].median(), test_dummy['BsmtFinSF1'].mode()

(439.2037037037037, 350.5, 0    0.0
 dtype: float64)

In [37]:
test_dummy['BsmtFinSF1'].value_counts()

0.0       462
24.0       15
276.0       6
602.0       6
300.0       5
758.0       5
16.0        5
288.0       5
330.0       4
624.0       4
476.0       4
864.0       4
544.0       4
600.0       4
468.0       4
384.0       4
456.0       4
700.0       4
375.0       4
915.0       4
60.0        4
252.0       4
500.0       4
368.0       4
779.0       3
435.0       3
800.0       3
637.0       3
36.0        3
68.0        3
         ... 
388.0       1
286.0       1
568.0       1
417.0       1
362.0       1
134.0       1
1200.0      1
552.0       1
994.0       1
460.0       1
262.0       1
1104.0      1
332.0       1
728.0       1
492.0       1
261.0       1
154.0       1
1048.0      1
1129.0      1
438.0       1
908.0       1
1080.0      1
924.0       1
234.0       1
324.0       1
278.0       1
210.0       1
580.0       1
1328.0      1
771.0       1
Name: BsmtFinSF1, Length: 669, dtype: int64

In [38]:
test_dummy['BsmtFinSF1'].fillna(test_dummy['BsmtFinSF1'].mode()[0], inplace=True)
test_dummy['BsmtFinSF1'].isnull().sum()

0

#### 7) Fill : BsmtFinSF2

In [39]:
test_dummy['BsmtFinSF2'].isnull().sum()

1

In [40]:
test_dummy['BsmtFinSF2'].mean(), test_dummy['BsmtFinSF2'].median(), test_dummy['BsmtFinSF2'].mode()

(52.61934156378601, 0.0, 0    0.0
 dtype: float64)

In [41]:
test_dummy['BsmtFinSF2'].value_counts()

0.0       1278
162.0        3
294.0        3
483.0        3
144.0        2
288.0        2
590.0        2
596.0        2
435.0        2
116.0        2
72.0         2
270.0        2
247.0        2
159.0        2
60.0         2
42.0         2
168.0        2
252.0        2
465.0        1
691.0        1
841.0        1
1526.0       1
344.0        1
761.0        1
259.0        1
393.0        1
474.0        1
613.0        1
774.0        1
522.0        1
          ... 
174.0        1
859.0        1
755.0        1
799.0        1
497.0        1
419.0        1
373.0        1
679.0        1
273.0        1
543.0        1
202.0        1
95.0         1
492.0        1
382.0        1
904.0        1
105.0        1
127.0        1
210.0        1
154.0        1
138.0        1
748.0        1
404.0        1
206.0        1
278.0        1
121.0        1
308.0        1
167.0        1
186.0        1
250.0        1
750.0        1
Name: BsmtFinSF2, Length: 161, dtype: int64

In [42]:
test_dummy['BsmtFinSF2'].fillna(test_dummy['BsmtFinSF2'].mode()[0], inplace=True)
test_dummy['BsmtFinSF2'].isnull().sum()

0

#### 8) Fill : BsmtUnfSF

In [43]:
test_dummy['BsmtUnfSF'].isnull().sum()

1

In [44]:
test_dummy['BsmtUnfSF'].mean(), test_dummy['BsmtUnfSF'].median(), test_dummy['BsmtUnfSF'].mode()

(554.2949245541838, 460.0, 0    0.0
 dtype: float64)

In [45]:
test_dummy['BsmtUnfSF'].value_counts()

0.0       123
384.0      11
624.0       8
480.0       7
100.0       7
672.0       7
348.0       7
738.0       7
120.0       6
216.0       6
784.0       6
294.0       6
816.0       6
322.0       5
728.0       5
546.0       5
525.0       5
585.0       5
226.0       5
228.0       5
30.0        5
832.0       5
240.0       5
150.0       5
768.0       5
186.0       5
600.0       5
396.0       5
456.0       5
306.0       5
         ... 
1270.0      1
129.0       1
390.0       1
161.0       1
1105.0      1
1214.0      1
470.0       1
290.0       1
747.0       1
613.0       1
615.0       1
1146.0      1
1694.0      1
809.0       1
577.0       1
1721.0      1
2140.0      1
1090.0      1
1254.0      1
354.0       1
671.0       1
1043.0      1
1153.0      1
1335.0      1
1324.0      1
406.0       1
1604.0      1
1369.0      1
549.0       1
1211.0      1
Name: BsmtUnfSF, Length: 793, dtype: int64

In [46]:
test_dummy['BsmtUnfSF'].fillna(test_dummy['BsmtUnfSF'].mode()[0], inplace=True)
test_dummy['BsmtUnfSF'].isnull().sum()

0

#### 9) Fill : TotalBsmtSF

In [47]:
test_dummy['TotalBsmtSF'].isnull().sum()

1

In [48]:
test_dummy['TotalBsmtSF'].mean(), test_dummy['TotalBsmtSF'].median(), test_dummy['TotalBsmtSF'].mode()

(1046.1179698216736, 988.0, 0    0.0
 dtype: float64)

In [49]:
test_dummy['TotalBsmtSF'].value_counts()

0.0       41
864.0     39
960.0     13
546.0     12
672.0     12
384.0     12
1008.0    12
768.0     12
1040.0    11
912.0     11
624.0     10
988.0     10
816.0     10
600.0      9
720.0      9
738.0      8
936.0      8
728.0      8
780.0      8
756.0      8
1168.0     7
784.0      7
483.0      7
832.0      7
1100.0     6
894.0      6
840.0      6
572.0      5
1073.0     5
984.0      5
          ..
1829.0     1
1388.0     1
629.0      1
1748.0     1
1172.0     1
1166.0     1
1702.0     1
245.0      1
1127.0     1
1116.0     1
1678.0     1
690.0      1
908.0      1
1236.0     1
1260.0     1
2024.0     1
994.0      1
407.0      1
969.0      1
810.0      1
1436.0     1
1406.0     1
1369.0     1
1726.0     1
1196.0     1
918.0      1
1910.0     1
1700.0     1
750.0      1
1075.0     1
Name: TotalBsmtSF, Length: 736, dtype: int64

In [50]:
test_dummy['TotalBsmtSF'].fillna(test_dummy['TotalBsmtSF'].median(), inplace=True)
test_dummy['TotalBsmtSF'].isnull().sum()

0

#### 10) Fill : BsmtFullBath

In [51]:
test_dummy['BsmtFullBath'].isnull().sum()

2

In [52]:
test_dummy['BsmtFullBath'].mean(), test_dummy['BsmtFullBath'].median(), test_dummy['BsmtFullBath'].mode()

(0.4344543582704187, 0.0, 0    0.0
 dtype: float64)

In [53]:
test_dummy['BsmtFullBath'].value_counts()

0.0    849
1.0    584
2.0     23
3.0      1
Name: BsmtFullBath, dtype: int64

In [54]:
test_dummy['BsmtFullBath'].fillna(test_dummy['BsmtFullBath'].mode()[0], inplace=True)
test_dummy['BsmtFullBath'].isnull().sum()

0

#### 11) Fill : BsmtHalfBath

In [55]:
test_dummy['BsmtHalfBath'].isnull().sum()

2

In [56]:
test_dummy['BsmtHalfBath'].mean(), test_dummy['BsmtHalfBath'].median(), test_dummy['BsmtHalfBath'].mode()

(0.06520247083047358, 0.0, 0    0.0
 dtype: float64)

In [57]:
test_dummy['BsmtHalfBath'].value_counts()

0.0    1364
1.0      91
2.0       2
Name: BsmtHalfBath, dtype: int64

In [58]:
test_dummy['BsmtHalfBath'].fillna(test_dummy['BsmtHalfBath'].mode()[0], inplace=True)
test_dummy['BsmtHalfBath'].isnull().sum()

0

#### 12) Fill : GarageCars

In [59]:
test_dummy['GarageCars'].isnull().sum()

1

In [60]:
test_dummy['GarageCars'].mean(), test_dummy['GarageCars'].median(), test_dummy['GarageCars'].mode()

(1.7661179698216736, 2.0, 0    2.0
 dtype: float64)

In [61]:
test_dummy['GarageCars'].value_counts()

2.0    770
1.0    407
3.0    193
0.0     76
4.0     11
5.0      1
Name: GarageCars, dtype: int64

In [62]:
test_dummy['GarageCars'].fillna(test_dummy['GarageCars'].mode()[0], inplace=True)
test_dummy['GarageCars'].isnull().sum()

0

#### 13) Fill : GarageArea

In [63]:
test_dummy['GarageArea'].isnull().sum()

1

In [64]:
test_dummy['GarageArea'].mean(), test_dummy['GarageArea'].median(), test_dummy['GarageArea'].mode()

(472.76886145404666, 480.0, 0    0.0
 dtype: float64)

In [65]:
test_dummy['GarageArea'].value_counts()

0.0       76
576.0     50
440.0     47
484.0     34
400.0     33
528.0     32
240.0     31
480.0     30
308.0     28
264.0     27
288.0     23
336.0     17
280.0     14
506.0     14
462.0     13
495.0     13
384.0     12
216.0     11
286.0     11
525.0     11
420.0     10
312.0     10
624.0     10
550.0      9
504.0      9
672.0      8
478.0      8
390.0      8
470.0      7
460.0      7
          ..
494.0      1
404.0      1
476.0      1
412.0      1
326.0      1
1200.0     1
332.0      1
836.0      1
760.0      1
428.0      1
283.0      1
485.0      1
543.0      1
443.0      1
569.0      1
581.0      1
787.0      1
711.0      1
1174.0     1
609.0      1
776.0      1
904.0      1
345.0      1
619.0      1
984.0      1
364.0      1
369.0      1
316.0      1
226.0      1
353.0      1
Name: GarageArea, Length: 459, dtype: int64

In [66]:
test_dummy['GarageArea'].fillna(round(test_dummy['GarageArea'].mean()), inplace=True)
test_dummy['GarageArea'].isnull().sum()

0

In [67]:
test_dummy.isnull().sum()

Id                       0
MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
OverallCond              0
YearBuilt                0
YearRemodAdd             0
MasVnrArea               0
BsmtFinSF1               0
BsmtFinSF2               0
BsmtUnfSF                0
TotalBsmtSF              0
1stFlrSF                 0
2ndFlrSF                 0
LowQualFinSF             0
GrLivArea                0
BsmtFullBath             0
BsmtHalfBath             0
FullBath                 0
HalfBath                 0
BedroomAbvGr             0
KitchenAbvGr             0
TotRmsAbvGrd             0
Fireplaces               0
GarageYrBlt              0
GarageCars               0
GarageArea               0
WoodDeckSF               0
OpenPorchSF              0
                        ..
GarageFinish_Fin         0
GarageFinish_RFn         0
GarageFinish_Unf         0
GarageQual_Fa            0
GarageQual_Gd            0
GarageQual_Po            0
G

## Feature Selection

In [68]:
x = train_dummy.drop(['SalePrice'], axis=1)
y = train_dummy['SalePrice']

In [69]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1022, 271), (438, 271), (1022,), (438,))

In [70]:
train_dummy.dtypes

Id                         int64
MSSubClass                 int64
LotFrontage              float64
LotArea                    int64
OverallQual                int64
OverallCond                int64
YearBuilt                  int64
YearRemodAdd               int64
MasVnrArea               float64
BsmtFinSF1                 int64
BsmtFinSF2                 int64
BsmtUnfSF                  int64
TotalBsmtSF                int64
1stFlrSF                   int64
2ndFlrSF                   int64
LowQualFinSF               int64
GrLivArea                  int64
BsmtFullBath               int64
BsmtHalfBath               int64
FullBath                   int64
HalfBath                   int64
BedroomAbvGr               int64
KitchenAbvGr               int64
TotRmsAbvGrd               int64
Fireplaces                 int64
GarageYrBlt              float64
GarageCars                 int64
GarageArea                 int64
WoodDeckSF                 int64
OpenPorchSF                int64
          

## Feature Scaling

In [71]:
std_scaler = StandardScaler().fit(x_train)
x_train_std = std_scaler.transform(x_train)
x_test_std = std_scaler.transform(x_test)

  return self.partial_fit(X, y)
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [72]:
minmax_scaler = MinMaxScaler().fit(x_train)
x_train_minmax = minmax_scaler.transform(x_train)
x_test_minmax = minmax_scaler.transform(x_test)

  return self.partial_fit(X, y)


## Machine Learning : Regression

### 1) LinearRegression

In [106]:
linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)
pred_lr = linear_regression.predict(x_test)

In [107]:
linear_regression.intercept_, linear_regression.coef_

(235289.65897679538,
 array([ 1.67896640e+00, -5.83964888e+00,  1.15652116e+02,  7.24270719e-01,
         7.86255911e+03,  5.48817823e+03,  2.64510932e+02,  9.45659945e+01,
         2.22287644e+01,  1.53801656e+01,  8.72223685e+00, -2.91707226e+00,
         2.11853070e+01,  2.20133502e+01,  3.93996582e+01, -3.83624375e+01,
         2.30505459e+01, -8.64894364e+02, -4.61219158e+03,  2.89582885e+03,
         3.13944513e+02, -5.09822925e+03, -2.09028590e+04,  2.18176213e+03,
         1.06878153e+03,  3.78353514e+01,  3.54015714e+03,  1.72595406e+01,
         1.05727266e+01, -6.22455158e-01, -7.03706225e+00,  3.08704157e+01,
         1.41977786e+01,  8.37040090e+01, -1.01007479e-01, -3.01946930e+02,
        -5.38509257e+02, -1.34718970e+04,  7.23915479e+03,  1.49551779e+04,
        -3.00194010e+03, -5.72049558e+03, -1.52090282e+04,  1.52090282e+04,
        -2.48066369e+03,  3.51139962e+03,  1.40246860e+03, -2.43320454e+03,
        -6.01586899e+02,  5.19543690e+03, -9.98275023e+03,  5.38890

In [108]:
r2_score(y_test, pred_lr)

0.72858914471345

In [109]:
mean_absolute_error(y_test, pred_lr), mean_squared_error(y_test, pred_lr), np.sqrt(mean_squared_error(y_test, pred_lr))

(18687.67265435129, 1707943110.585058, 41327.26836587507)

In [110]:
np.mean(cross_val_score(linear_regression, x_train, y_train, scoring='r2', cv=5)), np.mean(cross_val_score(linear_regression, x_test, y_test, scoring='r2', cv=5))

(0.8139732790898748, 0.3633315512573051)

In [111]:
print('train score: ', linear_regression.score(x_train, y_train))
print('test score:  ', linear_regression.score(x_test, y_test))

train score:  0.9308579821461429
test score:   0.72858914471345


### 2) Ridge 

In [112]:
ridge = Ridge()  # alpha = ?
ridge.fit(x_train, y_train)
pred_ridge = ridge.predict(x_test)

In [113]:
r2_score(y_test, pred_ridge)

0.7472655456763602

In [114]:
mean_absolute_error(y_test, pred_ridge), mean_squared_error(y_test, pred_ridge), np.sqrt(mean_squared_error(y_test, pred_ridge))

(17958.872457345587, 1590415643.5224423, 39880.01559079989)

In [156]:
np.mean(cross_val_score(ridge, x_train, y_train, cv=5)), np.mean(cross_val_score(ridge, x_test, y_test, cv=5))

(0.844570440172873, 0.4150856477554263)

In [116]:
print('train score: ', ridge.score(x_train, y_train))
print('test score:  ', ridge.score(x_test, y_test))

train score:  0.927865057325873
test score:   0.7472655456763602


### 3) Lasso

In [117]:
lasso = Lasso()  # alpha = ?, max_iter = ?
lasso.fit(x_train, y_train)
pred_lasso = lasso.predict(x_test)



In [118]:
r2_score(y_test, pred_lasso)

0.7374760849104058

In [119]:
mean_absolute_error(y_test, pred_lasso), mean_squared_error(y_test, pred_lasso), np.sqrt(mean_squared_error(y_test, pred_lasso))

(18531.744018094494, 1652019082.5370758, 40645.03761269112)

In [157]:
np.mean(cross_val_score(lasso, x_train, y_train, cv=5)), np.mean(cross_val_score(lasso, x_test, y_test, cv=5))



(0.8175343412562338, 0.3865664789409827)

In [121]:
print('train score: ', lasso.score(x_train, y_train))
print('test score:  ', lasso.score(x_test, y_test))

train score:  0.9308183761681194
test score:   0.7374760849104058


### 4) GradientBoostingRegressor

In [100]:
gradient = GradientBoostingRegressor(n_estimators=1000, subsample=0.5, max_depth=2, learning_rate=0.05)
gradient.fit(x_train, y_train)
pred_gbr = gradient.predict(x_test)

In [154]:
gradient

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=1000, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=0.5, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [102]:
r2_score(y_test, pred_gbr)

0.9122327633641496

In [103]:
mean_absolute_error(y_test, pred_gbr), mean_squared_error(y_test, pred_gbr), np.sqrt(mean_squared_error(y_test, pred_gbr))

(14898.102001575442, 552304538.4055346, 23501.160362959414)

In [104]:
np.mean(cross_val_score(gradient, x_train, y_train, scoring='r2', cv=5)), np.mean(cross_val_score(gradient, x_test, y_test, scoring='r2', cv=5))

(0.8565520157673389, 0.7545559744179188)

In [105]:
print('train score: ', gradient.score(x_train, y_train))
print('test score:  ', gradient.score(x_test, y_test))

train score:  0.9846469988154608
test score:   0.9122327633641496


### 5) GradientBoostingRegressor + GridSearchCV

In [134]:
param_grid = {'n_estimators':[500,1000],
             'learning_rate':[0.1,0.05],
             'subsample':[0.4,0.5],
             'max_depth':[2,4],
             'min_samples_leaf':[3,5],
             'max_features':[1.0,0.3]}

estimator = GradientBoostingRegressor(warm_start=True)
cv = ShuffleSplit(x_train.shape[0], test_size=0.2)

grid_gbr = GridSearchCV(estimator=estimator, param_grid = param_grid, cv=cv, n_jobs=-1, verbose=1)
grid_gbr.fit(x_train, y_train)
pred_grid_gbr = grid_gbr.predict(x_test)

Fitting 1022 folds for each of 64 candidates, totalling 65408 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 24.4min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 32.6min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 41.0min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 49.9min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 52.1min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

In [135]:
grid_gbr.best_score_, grid_gbr.best_params_, grid_gbr.best_estimator_

(0.8587775387165633,
 {'learning_rate': 0.05,
  'max_depth': 4,
  'max_features': 0.3,
  'min_samples_leaf': 5,
  'n_estimators': 500,
  'subsample': 0.5},
 GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='ls', max_depth=4, max_features=0.3,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=5,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, n_iter_no_change=None, presort='auto',
              random_state=None, subsample=0.5, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=True))

In [136]:
r2_score(y_test, pred_grid_gbr)

0.8927405036376319

In [137]:
mean_absolute_error(y_test, pred_grid_gbr), mean_squared_error(y_test, pred_grid_gbr), np.sqrt(mean_squared_error(y_test, pred_grid_gbr))

(15212.66609836621, 674966068.1903024, 25980.109087344157)

In [138]:
print('train score: ', grid_gbr.score(x_train, y_train))
print('test score:  ', grid_gbr.score(x_test, y_test))

train score:  0.98474606350716
test score:   0.8927405036376318


In [139]:
best_estimator = grid_gbr.best_estimator_
best_estimator.fit(x_train, y_train)
pred_best = best_estimator.predict(x_test)

In [140]:
r2_score(y_test, pred_best)

0.8927405036376319

In [141]:
print('train score: ', best_estimator.score(x_train, y_train))
print('test score:  ', best_estimator.score(x_test, y_test))

train score:  0.98474606350716
test score:   0.8927405036376318


## Evaluation

In [142]:
models = [linear_regression, ridge, lasso, gradient, best_estimator]

for model in models:
    print(model)
    print('r2 score: ', r2_score(y_test, model.predict(x_test)))
    print('train score: ', model.score(x_train, y_train))
    print('test score', model.score(x_test, y_test))
    print('-'*20)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
r2 score:  0.72858914471345
train score:  0.9308579821461429
test score 0.72858914471345
--------------------
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
r2 score:  0.7472655456763602
train score:  0.927865057325873
test score 0.7472655456763602
--------------------
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
r2 score:  0.7374760849104058
train score:  0.9308183761681194
test score 0.7374760849104058
--------------------
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_

## Predict Test dataset

In [143]:
train_dummy.shape

(1460, 272)

In [144]:
train_dummy.columns.unique()

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=272)

In [145]:
test_dummy.shape

(1459, 255)

In [146]:
test_dummy.columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=255)

In [147]:
col_list = []
for col in train_dummy.columns.to_list():
    if col not in test_dummy.columns.to_list():
        col_list.append(col)
col_list

['SalePrice',
 'Utilities_NoSeWa',
 'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'HouseStyle_2.5Fin',
 'RoofMatl_ClyTile',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'Exterior2nd_Other',
 'Heating_Floor',
 'Heating_OthW',
 'Electrical_Mix',
 'GarageQual_Ex']

In [148]:
train_dummy['Utilities_NoSeWa'].value_counts()

0    1459
1       1
Name: Utilities_NoSeWa, dtype: int64

In [149]:
train_dummy['Utilities_NoSeWa'].mode()[0]

0

In [150]:
test_copy = test_dummy.copy()
for col in col_list[1:]:
    test_copy[col] = train_dummy[col].mode()[0]
test_copy.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,Exterior1st_ImStucc,Exterior1st_Stone,Exterior2nd_Other,Heating_Floor,Heating_OthW,Electrical_Mix,GarageQual_Ex
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,0,0,0,0,0,0,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,0,0,0,0,0,0,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,0,0,0,0,0,0,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,0,0,0,0,0,0,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,0,0,0,0,0,0,0


In [151]:
test_final = pd.DataFrame(columns=x.columns)
test_final

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial


In [152]:
for col in test_final.columns.to_list():
    test_final[col] = test_copy[col]
test_final.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [153]:
models = [linear_regression, ridge, lasso, gradient, best_estimator]
cnt = 1
for model in models:
    predictions = model.predict(test_final)
    result = np.vstack([test_final['Id'], predictions]).T
    pd.DataFrame(result, dtype=int, columns=['Id','SalePrice']).set_index('Id').to_csv('house_price_result_{}.csv'.format(cnt))
    cnt += 1