In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing

In [2]:
# Read the train and test CSVs (first column becomes the index) and stack
# them so every feature below is processed over both splits together.
train_df, test_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/orignal/train.csv', '../data/orignal/test.csv')
]
combine_df = pd.concat([train_df, test_df])

### MSSubClass
涉及销售的寓所类型


In [3]:
# Rows with a missing MSSubClass.
combine_df.loc[pd.isnull(combine_df['MSSubClass'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### MSZoning
售卖的地产区域类型

In [4]:
# Fill missing zoning with 'RL', then integer-encode the categories
# (fit and transform in a single call).
combine_df['MSZoning'] = combine_df['MSZoning'].fillna('RL')
le = preprocessing.LabelEncoder()
combine_df['MSZoning'] = le.fit_transform(combine_df['MSZoning'])

### LotFrontage
与房产相连的街道长度(英尺)  
填充中位数  
数值标准化

In [5]:
# Impute missing LotFrontage with the column median, then standardize it into
# its own Id-indexed frame.
lot_frontage_filled = combine_df['LotFrontage'].fillna(combine_df['LotFrontage'].median())
lot_frontage_df = pd.DataFrame(
    preprocessing.scale(lot_frontage_filled.values),
    # Reuse the real row labels instead of a hard-coded range(1, 2920), so the
    # cell keeps working if the number of rows changes.
    index=combine_df.index,
    columns=['LotFrontage'],
)
lot_frontage_df.index.name = 'Id'

### LotArea
房产占地面积  
数值标准化

In [6]:
# Standardize LotArea into its own Id-indexed frame. The index is taken from
# combine_df rather than a hard-coded range(1, 2920).
lot_area_df = pd.DataFrame(
    preprocessing.scale(combine_df['LotArea']),
    index=combine_df.index,
    columns=['LotArea'],
)
lot_area_df.index.name = 'Id'



### Street
取值不平衡 丢弃该特征

In [7]:
# Class balance of Street; the output shows it is almost entirely 'Pave'.
combine_df['Street'].value_counts()

Pave    2907
Grvl      12
Name: Street, dtype: int64

### Alley


In [8]:
# Flag rows that have any alley value (NaN -> 0, otherwise 1); vectorized
# replacement for the row-wise lambda.
combine_df['Alley_Access'] = combine_df['Alley'].notnull().astype('int64')
# Give the missing level an explicit label so it is encoded like the others.
combine_df['Alley'] = combine_df['Alley'].fillna('NoAccess')
le = preprocessing.LabelEncoder()
combine_df['Alley'] = le.fit_transform(combine_df['Alley'])
# Kept as the last expression so the counts are actually rendered; the
# original call sat mid-cell and its result was discarded.
combine_df['Alley'].value_counts()

### LotShape
地块的大致形状

In [9]:
# Integer-encode LotShape.
le = preprocessing.LabelEncoder()
combine_df['LotShape'] = le.fit_transform(combine_df['LotShape'])
# Last expression so the counts are rendered (previously discarded mid-cell).
combine_df['LotShape'].value_counts()

### LandContour
住宅的地面是否平坦

In [10]:
# Integer-encode LandContour.
le = preprocessing.LabelEncoder()
combine_df['LandContour'] = le.fit_transform(combine_df['LandContour'])
# Last expression so the counts are rendered (previously discarded mid-cell).
combine_df['LandContour'].value_counts()

### Utilities
配套设施  
[不平衡] 丢弃

In [11]:
# Class balance of Utilities; the output shows a single non-'AllPub' row.
combine_df['Utilities'].value_counts()

AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64

### LotConfig
住宅的地理类型

In [12]:
# Integer-encode LotConfig.
le = preprocessing.LabelEncoder()
combine_df['LotConfig'] = le.fit_transform(combine_df['LotConfig'])
# Last expression so the counts are rendered (previously discarded mid-cell).
combine_df['LotConfig'].value_counts()

### LandSlope
住宅的倾斜度

In [13]:
# Integer-encode LandSlope.
le = preprocessing.LabelEncoder()
combine_df['LandSlope'] = le.fit_transform(combine_df['LandSlope'])
# Last expression so the counts are rendered (previously discarded mid-cell).
combine_df['LandSlope'].value_counts()

### Neighborhood
在 Ames 市范围内的地理位置

In [14]:
# Integer-encode Neighborhood.
le = preprocessing.LabelEncoder()
combine_df['Neighborhood'] = le.fit_transform(combine_df['Neighborhood'])
# Last expression so the counts are rendered (previously discarded mid-cell).
combine_df['Neighborhood'].value_counts()

### Condition1
附近的情况

In [15]:
# Integer-encode Condition1 in one fit_transform call, then show code counts.
le = preprocessing.LabelEncoder()
combine_df['Condition1'] = le.fit_transform(combine_df['Condition1'])
combine_df['Condition1'].value_counts()

2    2511
1     164
0      92
6      50
4      39
5      28
3      20
8       9
7       6
Name: Condition1, dtype: int64

### Condition2
附近的情况

In [16]:
# Integer-encode Condition2 in one fit_transform call, then show code counts.
le = preprocessing.LabelEncoder()
combine_df['Condition2'] = le.fit_transform(combine_df['Condition2'])
combine_df['Condition2'].value_counts()

2    2889
1      13
0       5
3       4
4       4
7       2
5       1
6       1
Name: Condition2, dtype: int64

### BldgType
住宅类型

In [17]:
# Integer-encode BldgType in one fit_transform call, then show code counts.
le = preprocessing.LabelEncoder()
combine_df['BldgType'] = le.fit_transform(combine_df['BldgType'])
combine_df['BldgType'].value_counts()

0    2425
4     227
2     109
3      96
1      62
Name: BldgType, dtype: int64

### HouseStyle
住宅风格

In [18]:
# Integer-encode HouseStyle in one fit_transform call, then show code counts.
le = preprocessing.LabelEncoder()
combine_df['HouseStyle'] = le.fit_transform(combine_df['HouseStyle'])
combine_df['HouseStyle'].value_counts()

2    1471
5     872
0     314
7     128
6      83
4      24
1      19
3       8
Name: HouseStyle, dtype: int64

### OverallQual
房屋整体材料与装修质量评分

In [19]:
# Standardize OverallQual into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
overall_qual_df = pd.DataFrame(
    preprocessing.scale(combine_df['OverallQual'].values),
    index=combine_df.index,
    columns=['OverallQual'],
)
overall_qual_df.index.name = 'Id'



### OverallCond
住宅的整体状况

In [20]:
# Standardize OverallCond into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
overall_cond_df = pd.DataFrame(
    preprocessing.scale(combine_df['OverallCond'].values),
    index=combine_df.index,
    columns=['OverallCond'],
)
overall_cond_df.index.name = 'Id'



### YearBuilt
原始施工日期  
计算原始施工日到目前(2016年)总共多少年

In [21]:
# Age of the building in years, measured against the reference year 2016.
year_built_df = (2016 - combine_df['YearBuilt']).to_frame()

### YearRemodAdd
改造时间年份  
计算改造日到目前(2016年)总共多少年

In [22]:
# Years since the remodel, measured against the reference year 2016.
year_remodadd_df = (2016 - combine_df['YearRemodAdd']).to_frame()

### RoofStyle
屋顶类型

In [23]:
# Integer-encode RoofStyle in one fit_transform call, then show code counts.
le = preprocessing.LabelEncoder()
combine_df['RoofStyle'] = le.fit_transform(combine_df['RoofStyle'])
combine_df['RoofStyle'].value_counts()

1    2310
3     551
2      22
0      20
4      11
5       5
Name: RoofStyle, dtype: int64

### RoofMatl
屋顶材料

In [24]:
# Integer-encode RoofMatl in one fit_transform call, then show code counts.
le = preprocessing.LabelEncoder()
combine_df['RoofMatl'] = le.fit_transform(combine_df['RoofMatl'])
combine_df['RoofMatl'].value_counts()

1    2876
5      23
6       9
7       7
3       1
4       1
2       1
0       1
Name: RoofMatl, dtype: int64

### Exterior1st
房子的外观

In [25]:
# Rows with a missing Exterior1st.
combine_df.loc[pd.isnull(combine_df['Exterior1st'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2152,1518,0,0,1,2,0,TA,Gd,1035.0,0.0,...,0,Pave,5,1580.0,AllPub,0,1940,2007,2008,0


In [26]:
# Fill the missing Exterior1st with 'VinylSd', integer-encode in one
# fit_transform call, then show code counts.
combine_df['Exterior1st'] = combine_df['Exterior1st'].fillna('VinylSd')
le = preprocessing.LabelEncoder()
combine_df['Exterior1st'] = le.fit_transform(combine_df['Exterior1st'])
combine_df['Exterior1st'].value_counts()

12    1026
8      450
6      442
13     411
9      221
5      126
3       87
14      56
0       44
11      43
2        6
1        2
10       2
4        2
7        1
Name: Exterior1st, dtype: int64

### Exterior2nd
房子的外观

In [27]:
# Rows with a missing Exterior2nd.
combine_df.loc[pd.isnull(combine_df['Exterior2nd'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2152,1518,0,0,1,2,0,TA,Gd,1035.0,0.0,...,0,Pave,5,1580.0,AllPub,0,1940,2007,2008,0


In [28]:
# Fill the missing Exterior2nd with 'VinylSd', integer-encode in one
# fit_transform call, then show code counts.
combine_df['Exterior2nd'] = combine_df['Exterior2nd'].fillna('VinylSd')
le = preprocessing.LabelEncoder()
combine_df['Exterior2nd'] = le.fit_transform(combine_df['Exterior2nd'])
combine_df['Exterior2nd'].value_counts()

13    1015
8      447
6      406
14     391
10     270
5      126
15      81
3       47
12      47
0       38
2       22
7       15
11       6
1        4
4        3
9        1
Name: Exterior2nd, dtype: int64

### MasVnrType
表层砌体类型

In [29]:
# Treat a missing MasVnrType as the literal category 'None', integer-encode
# in one fit_transform call, then show code counts.
combine_df['MasVnrType'] = combine_df['MasVnrType'].fillna('None')
le = preprocessing.LabelEncoder()
combine_df['MasVnrType'] = le.fit_transform(combine_df['MasVnrType'])
combine_df['MasVnrType'].value_counts()

2    1766
1     879
3     249
0      25
Name: MasVnrType, dtype: int64

### MasVnrArea
表层砌面面积

In [30]:
# Median of MasVnrArea; the next cell uses it as the imputation value.
combine_df['MasVnrArea'].median()

0.0

In [31]:
# Impute missing MasVnrArea with the column median, then standardize it into
# its own Id-indexed frame. The index is taken from combine_df rather than a
# hard-coded range(1, 2920).
combine_df['MasVnrArea'] = combine_df['MasVnrArea'].fillna(combine_df['MasVnrArea'].median())
mas_vnr_area_df = pd.DataFrame(
    preprocessing.scale(combine_df['MasVnrArea']),
    index=combine_df.index,
    columns=['MasVnrArea'],
)
mas_vnr_area_df.index.name = 'Id'

### ExterQual
外观材料质量

In [32]:
# Check whether ExterQual has any missing values before encoding it.
combine_df['ExterQual'].isnull().any()

False

In [33]:
# Integer-encode ExterQual in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['ExterQual'] = le.fit_transform(combine_df['ExterQual'])

In [34]:
# Counts per encoded ExterQual code.
combine_df['ExterQual'].value_counts()

3    1798
2     979
0     107
1      35
Name: ExterQual, dtype: int64

### ExterCond
外部材料现状

In [35]:
# Check whether ExterCond has any missing values before encoding it.
combine_df['ExterCond'].isnull().any()

False

In [36]:
# Integer-encode ExterCond in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['ExterCond'] = le.fit_transform(combine_df['ExterCond'])

In [37]:
# Counts per encoded ExterCond code.
combine_df['ExterCond'].value_counts()

4    2538
2     299
1      67
0      12
3       3
Name: ExterCond, dtype: int64

### Foundation
地基类型

In [38]:
# Check whether Foundation has any missing values before encoding it.
combine_df['Foundation'].isnull().any()

False

In [39]:
# Integer-encode Foundation in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['Foundation'] = le.fit_transform(combine_df['Foundation'])

In [40]:
# Counts per encoded Foundation code.
combine_df['Foundation'].value_counts()

2    1308
1    1235
0     311
3      49
4      11
5       5
Name: Foundation, dtype: int64

### Bsmt
是否有地下室

In [41]:
# 1 where BsmtQual is present, 0 where it is missing.
combine_df['Has_Bsmt'] = combine_df['BsmtQual'].notnull().astype('int64')

### BsmtQual
地下室高度

In [42]:
# Label missing BsmtQual explicitly as 'No_Bsmt'.
combine_df.loc[combine_df['BsmtQual'].isnull(), 'BsmtQual'] = 'No_Bsmt'

In [43]:
# Integer-encode BsmtQual in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['BsmtQual'] = le.fit_transform(combine_df['BsmtQual'])

### BsmtCond
地下室的环境条件

In [44]:
# Label missing BsmtCond explicitly as 'No_Bsmt'.
combine_df.loc[combine_df['BsmtCond'].isnull(), 'BsmtCond'] = 'No_Bsmt'

In [45]:
# Integer-encode BsmtCond in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['BsmtCond'] = le.fit_transform(combine_df['BsmtCond'])

### BsmtExposure
光照条件

In [46]:
# Label missing BsmtExposure explicitly as 'No_Bsmt'.
combine_df.loc[combine_df['BsmtExposure'].isnull(), 'BsmtExposure'] = 'No_Bsmt'

In [47]:
# Integer-encode BsmtExposure in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['BsmtExposure'] = le.fit_transform(combine_df['BsmtExposure'])

### BsmtFinType1
地下室装修完成度

In [48]:
# Label missing BsmtFinType1 explicitly as 'No_Bsmt'.
combine_df.loc[combine_df['BsmtFinType1'].isnull(), 'BsmtFinType1'] = 'No_Bsmt'

In [49]:
# Integer-encode BsmtFinType1 in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['BsmtFinType1'] = le.fit_transform(combine_df['BsmtFinType1'])

### BsmtFinSF1
Type1完成的面积

In [50]:
# Missing BsmtFinSF1 becomes 0.
combine_df.loc[combine_df['BsmtFinSF1'].isnull(), 'BsmtFinSF1'] = 0

In [51]:
# Standardize BsmtFinSF1 into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
bsmt_fin_SF1_df = pd.DataFrame(
    preprocessing.scale(combine_df['BsmtFinSF1']),
    index=combine_df.index,
    columns=['BsmtFinSF1'],
)
bsmt_fin_SF1_df.index.name = 'Id'

### BsmtFinType2
地下室装修完成度

In [52]:
# Label missing BsmtFinType2 explicitly as 'No_Bsmt'.
combine_df.loc[combine_df['BsmtFinType2'].isnull(), 'BsmtFinType2'] = 'No_Bsmt'

In [53]:
# Integer-encode BsmtFinType2 in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['BsmtFinType2'] = le.fit_transform(combine_df['BsmtFinType2'])

### BsmtFinSF2
Type2完成的面积

In [54]:
# Missing BsmtFinSF2 becomes 0.
combine_df.loc[combine_df['BsmtFinSF2'].isnull(), 'BsmtFinSF2'] = 0

In [55]:
# Standardize BsmtFinSF2 into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
bsmt_fin_SF2_df = pd.DataFrame(
    preprocessing.scale(combine_df['BsmtFinSF2']),
    index=combine_df.index,
    columns=['BsmtFinSF2'],
)
bsmt_fin_SF2_df.index.name = 'Id'

### BsmtUnfSF
未完成的地下室面积

In [56]:
# Rows with a missing BsmtUnfSF.
combine_df.loc[pd.isnull(combine_df['BsmtUnfSF'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2121,896,0,0,1,2,0,2,4,0.0,0.0,...,Pave,4,,AllPub,0,1946,1950,2008,0,0


In [57]:
# Row 2121 is the one with missing BsmtUnfSF (its Has_Bsmt is 0); set it to 0.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df.loc[2121, 'BsmtUnfSF'] = 0

In [58]:
# Standardize BsmtUnfSF into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
bsmt_unf_sf_df = pd.DataFrame(
    preprocessing.scale(combine_df['BsmtUnfSF']),
    index=combine_df.index,
    columns=['BsmtUnfSF'],
)
bsmt_unf_sf_df.index.name = 'Id'

### TotalBsmtSF
地下室总面积

In [59]:
# Rows with a missing TotalBsmtSF.
combine_df.loc[pd.isnull(combine_df['TotalBsmtSF'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2121,896,0,0,1,2,0,2,4,0.0,0.0,...,Pave,4,,AllPub,0,1946,1950,2008,0,0


In [60]:
# Row 2121 is the one with missing TotalBsmtSF (its Has_Bsmt is 0); set it to 0.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df.loc[2121, 'TotalBsmtSF'] = 0

In [61]:
# Standardize TotalBsmtSF into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
total_bsmt_sf_df = pd.DataFrame(
    preprocessing.scale(combine_df['TotalBsmtSF']),
    index=combine_df.index,
    columns=['TotalBsmtSF'],
)
total_bsmt_sf_df.index.name = 'Id'

### Heating
供暖类型

In [62]:
# Rows with a missing Heating.
combine_df.loc[pd.isnull(combine_df['Heating'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [63]:
# Integer-encode Heating in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['Heating'] = le.fit_transform(combine_df['Heating'])

### HeatingQC
供暖效果

In [64]:
# Rows with a missing HeatingQC.
combine_df.loc[pd.isnull(combine_df['HeatingQC'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [65]:
# Integer-encode HeatingQC in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['HeatingQC'] = le.fit_transform(combine_df['HeatingQC'])

### CentralAir
中央空调

In [66]:
# Rows with a missing CentralAir.
combine_df.loc[pd.isnull(combine_df['CentralAir'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [67]:
# Integer-encode CentralAir in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['CentralAir'] = le.fit_transform(combine_df['CentralAir'])

### Electrical
电力系统

In [68]:
# Rows with a missing Electrical.
combine_df.loc[pd.isnull(combine_df['Electrical'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1380,754,640,0,1,3,0,4,3,0.0,0.0,...,Pave,7,384.0,AllPub,100,2006,2007,2008,0,1


In [69]:
# Category frequencies of Electrical; 'SBrkr' is the most common (see output).
combine_df['Electrical'].value_counts()

SBrkr    2671
FuseA     188
FuseF      50
FuseP       8
Mix         1
Name: Electrical, dtype: int64

In [70]:
# Fill the single missing Electrical (row 1380) with the most frequent value.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df.loc[1380, 'Electrical'] = 'SBrkr'

In [71]:
# Integer-encode Electrical in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['Electrical'] = le.fit_transform(combine_df['Electrical'])

### 1stFlrSF
一楼面积

In [72]:
# Rows with a missing 1stFlrSF.
combine_df.loc[pd.isnull(combine_df['1stFlrSF'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [73]:
# Standardize 1stFlrSF into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
fst_flr_sf_df = pd.DataFrame(
    preprocessing.scale(combine_df['1stFlrSF']),
    index=combine_df.index,
    columns=['1stFlrSF'],
)
fst_flr_sf_df.index.name = 'Id'



### 2ndFlrSF
二楼面积

In [74]:
# Rows with a missing 2ndFlrSF.
combine_df.loc[pd.isnull(combine_df['2ndFlrSF'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [75]:
# Standardize 2ndFlrSF into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
snd_flr_sf_df = pd.DataFrame(
    preprocessing.scale(combine_df['2ndFlrSF']),
    index=combine_df.index,
    columns=['2ndFlrSF'],
)
snd_flr_sf_df.index.name = 'Id'



### LowQualFinSF
低质量完成的面积

In [76]:
# Rows with a missing LowQualFinSF.
combine_df.loc[pd.isnull(combine_df['LowQualFinSF'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [77]:
# Standardize LowQualFinSF into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
low_qual_fin_sf_df = pd.DataFrame(
    preprocessing.scale(combine_df['LowQualFinSF']),
    index=combine_df.index,
    columns=['LowQualFinSF'],
)
low_qual_fin_sf_df.index.name = 'Id'



### GrLivArea
地面以上居住面积

In [78]:
# Rows with a missing GrLivArea.
combine_df.loc[pd.isnull(combine_df['GrLivArea'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [79]:
# Standardize GrLivArea into its own Id-indexed frame. The index is taken
# from combine_df rather than a hard-coded range(1, 2920).
gr_liv_area_df = pd.DataFrame(
    preprocessing.scale(combine_df['GrLivArea']),
    index=combine_df.index,
    columns=['GrLivArea'],
)
gr_liv_area_df.index.name = 'Id'



### BsmtFullBath
地下室全浴室

In [80]:
# Rows with a missing BsmtFullBath.
combine_df.loc[pd.isnull(combine_df['BsmtFullBath'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2121,896,0,0,1,2,0,2,4,0.0,0.0,...,Pave,4,0.0,AllPub,0,1946,1950,2008,0,0
2189,3820,0,0,1,5,0,2,4,0.0,0.0,...,Pave,11,0.0,AllPub,0,1959,1996,2008,0,0


In [81]:
# Distribution of BsmtFullBath values.
combine_df['BsmtFullBath'].value_counts()

0.0    1705
1.0    1172
2.0      38
3.0       2
Name: BsmtFullBath, dtype: int64

In [82]:
# Look up the Has_Bsmt flag of row 2121, one of the rows with missing bath counts.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df.loc[2121, 'Has_Bsmt']

0

In [83]:
# Look up the Has_Bsmt flag of row 2189, the other row with missing bath counts.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df.loc[2189, 'Has_Bsmt']

0

In [84]:
# Missing BsmtFullBath becomes 0 (both affected rows have Has_Bsmt == 0),
# then the column is cast back to integers.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
combine_df['BsmtFullBath'] = combine_df['BsmtFullBath'].fillna(0).astype(int)

### BsmtHalfBath
地下室半浴室

In [85]:
# Rows with a missing BsmtHalfBath.
combine_df.loc[pd.isnull(combine_df['BsmtHalfBath'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2121,896,0,0,1,2,0,2,4,0.0,0.0,...,Pave,4,0.0,AllPub,0,1946,1950,2008,0,0
2189,3820,0,0,1,5,0,2,4,0.0,0.0,...,Pave,11,0.0,AllPub,0,1959,1996,2008,0,0


In [86]:
# Distribution of BsmtHalfBath values.
combine_df['BsmtHalfBath'].value_counts()

0.0    2742
1.0     171
2.0       4
Name: BsmtHalfBath, dtype: int64

In [87]:
# Missing BsmtHalfBath becomes 0, then the column is cast back to integers.
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
combine_df['BsmtHalfBath'] = combine_df['BsmtHalfBath'].fillna(0).astype(int)

### FullBath
地上全浴室个数

In [88]:
# Rows with a missing FullBath.
combine_df.loc[pd.isnull(combine_df['FullBath'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [89]:
# Distribution of FullBath values.
combine_df['FullBath'].value_counts()

2    1530
1    1309
3      64
0      12
4       4
Name: FullBath, dtype: int64

### HalfBath
地上半浴室个数

In [90]:
# Rows with a missing HalfBath.
combine_df.loc[pd.isnull(combine_df['HalfBath'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [91]:
# Distribution of HalfBath values.
combine_df['HalfBath'].value_counts()

0    1834
1    1060
2      25
Name: HalfBath, dtype: int64

### BedroomAbvGr
地上卧室

In [92]:
# Rows with a missing BedroomAbvGr.
combine_df.loc[pd.isnull(combine_df['BedroomAbvGr'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [93]:
# Distribution of BedroomAbvGr values.
combine_df['BedroomAbvGr'].value_counts()

3    1596
2     742
4     400
1     103
5      48
6      21
0       8
8       1
Name: BedroomAbvGr, dtype: int64

### KitchenAbvGr
地上厨房

In [94]:
# Rows with a missing KitchenAbvGr.
combine_df.loc[pd.isnull(combine_df['KitchenAbvGr'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [95]:
# Distribution of KitchenAbvGr values.
combine_df['KitchenAbvGr'].value_counts()

1    2785
2     129
0       3
3       2
Name: KitchenAbvGr, dtype: int64

### KitchenQual
厨房质量

In [96]:
# Rows with a missing KitchenQual.
combine_df.loc[pd.isnull(combine_df['KitchenQual'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1556,725,499,0,1,3,0,0,3,0.0,0.0,...,Pave,6,689.0,AllPub,0,1917,1950,2010,0,1


In [97]:
# Category frequencies of KitchenQual; 'TA' is the most common (see output).
combine_df['KitchenQual'].value_counts()

TA    1492
Gd    1151
Ex     205
Fa      70
Name: KitchenQual, dtype: int64

In [98]:
# Fill the single missing KitchenQual (row 1556) with the most frequent value.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df.loc[1556, 'KitchenQual'] = 'TA'

In [99]:
# Integer-encode KitchenQual in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['KitchenQual'] = le.fit_transform(combine_df['KitchenQual'])

### TotRmsAbvGrd
地上的房间总数量

In [100]:
# Rows with a missing TotRmsAbvGrd.
combine_df.loc[pd.isnull(combine_df['TotRmsAbvGrd'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [101]:
# Distribution of TotRmsAbvGrd values.
combine_df['TotRmsAbvGrd'].value_counts()

6     844
7     649
5     583
8     347
4     196
9     143
10     80
11     32
3      25
12     16
15      1
13      1
14      1
2       1
Name: TotRmsAbvGrd, dtype: int64

### Functional
家庭功能

In [102]:
# Rows with a missing Functional.
combine_df.loc[pd.isnull(combine_df['Functional'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2217,733,0,0,1,2,0,2,4,0.0,0.0,...,Pave,4,0.0,AllPub,0,1952,1952,2008,0,0
2474,866,504,0,0,3,0,0,3,0.0,0.0,...,Pave,6,771.0,AllPub,14,1910,1950,2007,1,1


In [103]:
# Category frequencies of Functional; 'Typ' is the most common (see output).
combine_df['Functional'].value_counts()

Typ     2717
Min2      70
Min1      65
Mod       35
Maj1      19
Maj2       9
Sev        2
Name: Functional, dtype: int64

In [104]:
# Fill the two missing Functional values (rows 2217 and 2474) with the most
# frequent category. .loc replaces the .ix indexer, removed in pandas 1.0.
combine_df.loc[[2217, 2474], 'Functional'] = 'Typ'

In [105]:
# Integer-encode Functional in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['Functional'] = le.fit_transform(combine_df['Functional'])

### Fireplaces
壁炉数量

In [106]:
# Rows with a missing Fireplaces.
combine_df.loc[pd.isnull(combine_df['Fireplaces'])]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [107]:
# Distribution of Fireplaces values.
combine_df['Fireplaces'].value_counts()

0    1420
1    1268
2     219
3      11
4       1
Name: Fireplaces, dtype: int64

###  HasFireplace
是否有壁炉

In [108]:
# 1 where FireplaceQu is present, 0 where it is missing.
combine_df['Has_Fireplace'] = combine_df['FireplaceQu'].notnull().astype('int64')

###  FireplaceQu
壁炉质量

In [109]:
# Label missing FireplaceQu explicitly as 'No_Fp'.
combine_df.loc[combine_df['FireplaceQu'].isnull(), 'FireplaceQu'] = 'No_Fp'

In [110]:
# Integer-encode FireplaceQu in one fit_transform call.
le = preprocessing.LabelEncoder()
combine_df['FireplaceQu'] = le.fit_transform(combine_df['FireplaceQu'])

### Has_Garage
是否有车库

In [111]:
# 1 where GarageType is present, 0 where it is missing.
combine_df['Has_Garage'] = combine_df['GarageType'].notnull().astype('int64')

In [112]:
# Force rows 2127 and 2577 to "no garage": they have a GarageType but no
# GarageYrBlt (see the set difference computed in the GarageYrBlt section).
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df.loc[[2127, 2577], 'Has_Garage'] = 0

### GarageType
车库所在位置

In [113]:
# Snapshot of rows with a missing GarageType, taken before the fill below;
# it is compared against the GarageYrBlt-missing rows later.
type_df = combine_df.loc[pd.isnull(combine_df['GarageType'])]

In [114]:
# Label missing GarageType as 'No_GT', and apply the same label to rows 2127
# and 2577, which are treated as having no garage.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
combine_df['GarageType'] = combine_df['GarageType'].fillna('No_GT')
combine_df.loc[[2127, 2577], 'GarageType'] = 'No_GT'

In [115]:
# Replace the GarageType category strings (including 'No_GT') with integer labels.
le = preprocessing.LabelEncoder()
combine_df['GarageType'] = le.fit_transform(combine_df['GarageType'])

### GarageYrBlt
车库建造年份

In [116]:
# Rows with a missing garage construction year.
yt_df = combine_df.loc[combine_df['GarageYrBlt'].isnull()]

In [117]:
# Ids that lack GarageYrBlt but were not among the rows lacking GarageType.
set(yt_df.index).difference(type_df.index)

{2127, 2577}

In [118]:
# Missing garage construction years are set to 2016, the same reference year
# the next cell subtracts from (so those rows get an age of 0).
combine_df['GarageYrBlt'] = combine_df['GarageYrBlt'].where(
    combine_df['GarageYrBlt'].notnull(), 2016
)

In [119]:
# Garage age relative to 2016, i.e. 2016 - GarageYrBlt (a Series despite the _df suffix).
year_garage_df = combine_df['GarageYrBlt'].rsub(2016)

### GarageCars
车库能停几辆车

In [120]:
# Show every row whose GarageCars value is missing.
combine_df.loc[combine_df['GarageCars'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2577,942,886,0,1,3,0,4,3,548.0,0.0,...,859.0,AllPub,174,1923,1999,2007,0,1,0,0


In [121]:
# Median garage capacity, shown for reference before imputing.
combine_df.loc[:, 'GarageCars'].median()

2.0

In [122]:
# Id 2577 is the only row with a missing GarageCars and it is flagged
# Has_Garage = 0 (see the output above), so it gets 0 rather than the median.
# `.loc` replaces `.ix` (deprecated in pandas 0.20, removed in 1.0); the index
# holds Id labels, so the same row is targeted.
combine_df.loc[2577, 'GarageCars'] = 0

In [123]:
# Standardise GarageCars with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
garage_cars_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['GarageCars']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['GarageCars'],
)

### GarageArea
车库面积

In [124]:
# Show every row whose GarageArea value is missing.
combine_df.loc[combine_df['GarageArea'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2577,942,886,0,1,3,0,4,3,548.0,0.0,...,859.0,AllPub,174,1923,1999,2007,0,1,0,0


In [125]:
# Id 2577 is the only row with a missing GarageArea and it is flagged
# Has_Garage = 0 (see the output above), so its area is set to 0.
# `.loc` replaces `.ix` (deprecated in pandas 0.20, removed in 1.0); the index
# holds Id labels, so the same row is targeted.
combine_df.loc[2577, 'GarageArea'] = 0

In [126]:
# Standardise GarageArea with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
garage_area_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['GarageArea']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['GarageArea'],
)

### GarageQual
车库质量

In [127]:
# Rows flagged as having a garage whose GarageQual is nevertheless missing.
combine_df.loc[(combine_df['Has_Garage'] == 1) & combine_df['GarageQual'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [128]:
# Missing GarageQual becomes the explicit category 'No_GT'.
combine_df['GarageQual'] = combine_df['GarageQual'].where(
    combine_df['GarageQual'].notnull(), 'No_GT'
)

In [129]:
# Replace the GarageQual category strings (including 'No_GT') with integer labels.
le = preprocessing.LabelEncoder()
combine_df['GarageQual'] = le.fit_transform(combine_df['GarageQual'])

### GarageCond
车库条件

In [130]:
# Rows flagged as having a garage whose GarageCond is nevertheless missing.
combine_df.loc[(combine_df['Has_Garage'] == 1) & combine_df['GarageCond'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [131]:
# Missing GarageCond becomes the explicit category 'No_GT'.
# Fix: the fill must read from GarageCond itself. The previous code read from
# GarageQual, which is already label-encoded at this point, so GarageCond was
# silently replaced by a copy of the encoded GarageQual column.
combine_df['GarageCond'] = combine_df['GarageCond'].fillna('No_GT')

In [132]:
# Replace the GarageCond values with integer labels.
le = preprocessing.LabelEncoder()
combine_df['GarageCond'] = le.fit_transform(combine_df['GarageCond'])

### PavedDrive
汽车开的道路情况

In [133]:
# Show every row whose PavedDrive value is missing.
combine_df.loc[combine_df['PavedDrive'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [134]:
# Replace the PavedDrive category strings with integer labels.
le = preprocessing.LabelEncoder()
combine_df['PavedDrive'] = le.fit_transform(combine_df['PavedDrive'])

### WoodDeckSF
木甲板面积平方英尺


In [135]:
# Show every row whose WoodDeckSF value is missing.
combine_df.loc[combine_df['WoodDeckSF'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [136]:
# Standardise WoodDeckSF with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
wood_deck_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['WoodDeckSF']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['WoodDeckSF'],
)



### OpenPorchSF
开放玄关面积平方英尺

In [137]:
# Show every row whose OpenPorchSF value is missing.
combine_df.loc[combine_df['OpenPorchSF'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [138]:
# Standardise OpenPorchSF with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
open_porch_sf_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['OpenPorchSF']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['OpenPorchSF'],
)



### EnclosedPorch
封闭走廊地区平方英尺

In [139]:
# Show every row whose EnclosedPorch value is missing.
combine_df.loc[combine_df['EnclosedPorch'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [140]:
# Standardise EnclosedPorch with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
enclose_porch_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['EnclosedPorch']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['EnclosedPorch'],
)



### 3SsnPorch
三季门廊面积（平方英尺）

In [141]:
# Show every row whose 3SsnPorch value is missing.
combine_df.loc[combine_df['3SsnPorch'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [142]:
# Standardise 3SsnPorch with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
three_ssn_porch_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['3SsnPorch']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['3SsnPorch'],
)



### ScreenPorch
纱窗门廊面积（平方英尺）

In [143]:
# Show every row whose ScreenPorch value is missing.
combine_df.loc[combine_df['ScreenPorch'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [144]:
# Standardise ScreenPorch with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
screen_porch_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['ScreenPorch']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['ScreenPorch'],
)



### Has_Pool
是否有游泳池

In [145]:
# Indicator column: 0 when PoolArea equals 0, 1 for any other value.
combine_df['Has_Pool'] = (combine_df['PoolArea'] != 0).astype('int64')

### PoolArea
游泳池面积

In [146]:
# Show every row whose PoolArea value is missing.
combine_df.loc[combine_df['PoolArea'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage,Has_Pool
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [147]:
# Standardise PoolArea with preprocessing.scale and store it as a one-column
# frame whose index is named 'Id' and runs from 1 to 2919.
pool_area_df = pd.DataFrame(
    data=preprocessing.scale(combine_df['PoolArea']),
    index=pd.Index(np.arange(1, 2920), name='Id'),
    columns=['PoolArea'],
)



### PoolQC
游泳池质量

In [148]:
# Show every row whose PoolQC value is missing.
combine_df.loc[combine_df['PoolQC'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage,Has_Pool
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,856,854,0,1,3,0,4,3,706.0,0.0,...,AllPub,0,2003,2003,2008,0,1,0,1,0
2,1262,0,0,1,3,0,4,1,978.0,0.0,...,AllPub,298,1976,1976,2007,0,1,1,1,0
3,920,866,0,1,3,0,4,2,486.0,0.0,...,AllPub,0,2001,2002,2008,0,1,1,1,0
4,961,756,0,1,3,0,1,3,216.0,0.0,...,AllPub,0,1915,1970,2006,0,1,1,1,0
5,1145,1053,0,1,4,0,4,0,655.0,0.0,...,AllPub,192,2000,2000,2008,0,1,1,1,0
6,796,566,320,1,1,0,4,3,732.0,0.0,...,AllPub,40,1993,1995,2009,0,1,0,1,0
7,1694,0,0,1,3,0,4,0,1369.0,0.0,...,AllPub,255,2004,2005,2007,0,1,1,1,0
8,1107,983,0,1,3,0,4,2,859.0,32.0,...,AllPub,235,1973,1973,2009,0,1,1,1,0
9,1022,752,0,1,2,0,4,3,0.0,0.0,...,AllPub,90,1931,1950,2008,0,1,1,1,0
10,1077,0,0,1,2,1,4,3,851.0,0.0,...,AllPub,0,1939,1950,2008,0,1,1,1,0


In [149]:
# Missing PoolQC becomes the explicit category 'No_Pool'.
combine_df['PoolQC'] = combine_df['PoolQC'].where(
    combine_df['PoolQC'].notnull(), 'No_Pool'
)

In [150]:
# Replace the PoolQC category strings (including 'No_Pool') with integer labels.
le = preprocessing.LabelEncoder()
combine_df['PoolQC'] = le.fit_transform(combine_df['PoolQC'])

### Fence
栅栏质量

In [151]:
# Show every row whose Fence value is missing.
combine_df.loc[combine_df['Fence'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage,Has_Pool
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,856,854,0,1,3,0,4,3,706.0,0.0,...,AllPub,0,2003,2003,2008,0,1,0,1,0
2,1262,0,0,1,3,0,4,1,978.0,0.0,...,AllPub,298,1976,1976,2007,0,1,1,1,0
3,920,866,0,1,3,0,4,2,486.0,0.0,...,AllPub,0,2001,2002,2008,0,1,1,1,0
4,961,756,0,1,3,0,1,3,216.0,0.0,...,AllPub,0,1915,1970,2006,0,1,1,1,0
5,1145,1053,0,1,4,0,4,0,655.0,0.0,...,AllPub,192,2000,2000,2008,0,1,1,1,0
7,1694,0,0,1,3,0,4,0,1369.0,0.0,...,AllPub,255,2004,2005,2007,0,1,1,1,0
8,1107,983,0,1,3,0,4,2,859.0,32.0,...,AllPub,235,1973,1973,2009,0,1,1,1,0
9,1022,752,0,1,2,0,4,3,0.0,0.0,...,AllPub,90,1931,1950,2008,0,1,1,1,0
10,1077,0,0,1,2,1,4,3,851.0,0.0,...,AllPub,0,1939,1950,2008,0,1,1,1,0
11,1040,0,0,1,3,0,4,3,906.0,0.0,...,AllPub,0,1965,1965,2008,0,1,0,1,0


In [152]:
# Missing Fence becomes the explicit category 'No_Fence'.
combine_df['Fence'] = combine_df['Fence'].where(
    combine_df['Fence'].notnull(), 'No_Fence'
)

In [153]:
# Replace the Fence category strings (including 'No_Fence') with integer labels.
le = preprocessing.LabelEncoder()
combine_df['Fence'] = le.fit_transform(combine_df['Fence'])

### MoSold
销售的月份


In [154]:
# Show every row whose MoSold value is missing.
combine_df.loc[combine_df['MoSold'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage,Has_Pool
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [155]:
# Number of sales per calendar month.
combine_df.loc[:, 'MoSold'].value_counts()

6     503
7     446
5     394
4     279
8     233
3     232
10    173
9     158
11    142
2     133
1     122
12    104
Name: MoSold, dtype: int64

### YrSold
销售的年份

In [156]:
# Show every row whose YrSold value is missing.
combine_df.loc[combine_df['YrSold'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage,Has_Pool
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [157]:
# Number of sales per year.
combine_df.loc[:, 'YrSold'].value_counts()

2007    692
2009    647
2008    622
2006    619
2010    339
Name: YrSold, dtype: int64

### SaleType
销售的类型

In [158]:
# Show every row whose SaleType value is missing.
combine_df.loc[combine_df['SaleType'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage,Has_Pool
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2490,1176,0,0,1,3,0,4,2,190.0,873.0,...,AllPub,0,1958,1998,2007,0,1,1,1,0


In [159]:
# Frequency of each sale type, used to choose the fill value for the missing row.
combine_df.loc[:, 'SaleType'].value_counts()

WD       2525
New       239
COD        87
ConLD      26
CWD        12
ConLI       9
ConLw       8
Oth         7
Con         5
Name: SaleType, dtype: int64

In [160]:
# Id 2490 is the only row with a missing SaleType; fill it with 'WD', the most
# frequent category in the counts above.
# `.loc` replaces `.ix` (deprecated in pandas 0.20, removed in 1.0); the index
# holds Id labels, so the same row is targeted.
combine_df.loc[2490, 'SaleType'] = 'WD'

In [161]:
# Replace the SaleType category strings with integer labels.
le = preprocessing.LabelEncoder()
combine_df['SaleType'] = le.fit_transform(combine_df['SaleType'])

### SaleCondition
销售条件

In [162]:
# Show every row whose SaleCondition value is missing.
combine_df.loc[combine_df['SaleCondition'].isnull()]

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Alley_Access,Has_Bsmt,Has_Fireplace,Has_Garage,Has_Pool
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [163]:
# Replace the SaleCondition category strings with integer labels.
le = preprocessing.LabelEncoder()
combine_df['SaleCondition'] = le.fit_transform(combine_df['SaleCondition'])

In [165]:
###  MiscFeature
# Other miscellaneous features

In [None]:
# Frequency of each MiscFeature category.
combine_df.loc[:, 'MiscFeature'].value_counts()

#### 排除的特征 
【Street】：不平衡  
【Utilities】：不平衡  
【Condition2】：不平衡

### 特征合并
合并所有特征   
分离训练集和测试集

In [166]:
# Assemble the feature matrix X_df by merging the selected features on their
# index, one after another, in the order listed below (this order fixes the
# column order of X_df).
#   * string entries are columns taken directly from combine_df;
#   * all other entries are standalone single-feature frames.
# Commented-out entries are features currently left out of the matrix.
feature_sources = [
    gr_liv_area_df,
    overall_qual_df,
    bsmt_fin_SF1_df,
    total_bsmt_sf_df,
    'LotShape',
    lot_area_df,
    lot_frontage_df,
    # 'Alley_Access',
    # 'Alley',
    'LandContour',
    'LotConfig',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    overall_cond_df,
    year_built_df,
    year_remodadd_df,
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    mas_vnr_area_df,
    'ExterQual',
    # 'ExterCond',
    'Foundation',
    # 'Has_Bsmt',
    'BsmtQual',
    # 'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    bsmt_fin_SF2_df,
    bsmt_unf_sf_df,
    # 'Heating',
    'HeatingQC',
    'CentralAir',
    'Electrical',
    fst_flr_sf_df,
    snd_flr_sf_df,
    # low_qual_fin_sf_df,
    'BsmtFullBath',
    # 'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'KitchenQual',
    'TotRmsAbvGrd',
    'Functional',
    'Has_Fireplace',
    'FireplaceQu',
    # 'Has_Garage',
    'GarageType',
    'GarageYrBlt',
    garage_cars_df,
    garage_area_df,
    'GarageQual',
    # 'GarageCond',
    'PavedDrive',
    wood_deck_df,
    open_porch_sf_df,
    # enclose_porch_df,
    # three_ssn_porch_df,
    screen_porch_df,
    # 'Has_Pool',
    pool_area_df,
    # 'PoolQC',
    # 'Fence',
    'MoSold',
    'YrSold',
    'SaleType',
    'SaleCondition',
]

X_df = None
for feature_source in feature_sources:
    # Column names are wrapped into a one-column frame; frames are used as-is.
    if isinstance(feature_source, str):
        feature_frame = pd.DataFrame(combine_df[feature_source])
    else:
        feature_frame = feature_source
    # The first entry seeds the result; every later one is merged on the index.
    if X_df is None:
        X_df = feature_frame
    else:
        X_df = pd.merge(X_df, feature_frame, left_index=True, right_index=True)

In [222]:
X_df = pd.merge(gr_liv_area_df, overall_qual_df, left_index=True, right_index=True)
X_df = pd.merge(X_df, bsmt_fin_SF1_df, left_index=True, right_index=True)
X_df = pd.merge(X_df, total_bsmt_sf_df, left_index=True, right_index=True)
X_df = pd.merge(X_df, pd.DataFrame(combine_df['LotShape']), left_index=True, right_index=True)
X_df = pd.merge(X_df, lot_area_df, left_index=True, right_index=True)

# X_df = pd.merge(X_df, lot_frontage_df, left_index=True, right_index=True)
# X_df = pd.merge(pd.DataFrame(combine_df['LandSlope']), lot_area_df, left_index=True, right_index=True)
#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Alley_Access']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Alley']), left_index=True, right_index=True)



# X_df = pd.merge(X_df, pd.DataFrame(combine_df['LotShape']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['LandContour']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['LotConfig']), left_index=True, right_index=True)
#--------------------------------------------------------------------------------------------
#X_df = pd.merge(X_df, pd.DataFrame(combine_df['LandSlope']), left_index=True, right_index=True)
#--------------------------------------------------------------------------------------------
#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Neighborhood']), left_index=True, right_index=True)

#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Condition1']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Condition2']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['BldgType']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['HouseStyle']), left_index=True, right_index=True)
#*******************************************************************************************************************************
# 
# X_df = pd.merge(X_df, overall_cond_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, year_built_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, year_remodadd_df, left_index=True, right_index=True)


# X_df = pd.merge(X_df, pd.DataFrame(combine_df['RoofStyle']), left_index=True, right_index=True)
#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['RoofMatl']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Exterior1st']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Exterior2nd']), left_index=True, right_index=True)
#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['MasVnrType']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, mas_vnr_area_df, left_index=True, right_index=True)

#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['ExterQual']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['ExterCond']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Foundation']), left_index=True, right_index=True)
#*******************************************************************************************************************************

# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Has_Bsmt']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['BsmtQual']), left_index=True, right_index=True)

#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['BsmtCond']), left_index=True, right_index=True)
#*******************************************************************************************************************************

# X_df = pd.merge(X_df, pd.DataFrame(combine_df['BsmtExposure']), left_index=True, right_index=True)
#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['BsmtFinType1']), left_index=True, right_index=True)
# 
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['BsmtFinType2']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, bsmt_fin_SF2_df, left_index=True, right_index=True)

# X_df = pd.merge(X_df, bsmt_unf_sf_df, left_index=True, right_index=True)
#*******************************************************************************************************************************


#X_df = pd.merge(X_df, pd.DataFrame(combine_df['Heating']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['HeatingQC']), left_index=True, right_index=True)

#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['CentralAir']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Electrical']), left_index=True, right_index=True)
#*******************************************************************************************************************************

# X_df = pd.merge(X_df, fst_flr_sf_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, snd_flr_sf_df, left_index=True, right_index=True)
#X_df = pd.merge(X_df, low_qual_fin_sf_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, gr_liv_area_df, left_index=True, right_index=True)

# X_df = pd.merge(X_df, pd.DataFrame(combine_df['BsmtFullBath']), left_index=True, right_index=True)
#X_df = pd.merge(X_df, pd.DataFrame(combine_df['BsmtHalfBath']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['FullBath']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['HalfBath']), left_index=True, right_index=True)
#X_df = pd.merge(X_df, pd.DataFrame(combine_df['BedroomAbvGr']), left_index=True, right_index=True)
#X_df = pd.merge(X_df, pd.DataFrame(combine_df['KitchenAbvGr']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['KitchenQual']), left_index=True, right_index=True)
#X_df = pd.merge(X_df, pd.DataFrame(combine_df['TotRmsAbvGrd']), left_index=True, right_index=True)
#X_df = pd.merge(X_df, pd.DataFrame(combine_df['Functional']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Has_Fireplace']), left_index=True, right_index=True)

#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['FireplaceQu']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Has_Garage']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['GarageType']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['GarageYrBlt']), left_index=True, right_index=True)
#*******************************************************************************************************************************

# X_df = pd.merge(X_df, garage_cars_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, garage_area_df, left_index=True, right_index=True)

#*******************************************************************************************************************************
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['GarageQual']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['GarageCond']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['PavedDrive']), left_index=True, right_index=True)
#*******************************************************************************************************************************

# X_df = pd.merge(X_df, wood_deck_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, open_porch_sf_df, left_index=True, right_index=True)

#*******************************************************************************************************************************
# X_df = pd.merge(X_df, enclose_porch_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, three_ssn_porch_df, left_index=True, right_index=True)
# X_df = pd.merge(X_df, screen_porch_df, left_index=True, right_index=True)


# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Has_Pool']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pool_area_df, left_index=True, right_index=True)

# X_df = pd.merge(X_df, pd.DataFrame(combine_df['PoolQC']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['Fence']), left_index=True, right_index=True)
# Attach the MoSold column of combine_df to the feature matrix; rows are matched on the index.
X_df = pd.merge(
    X_df,
    combine_df[['MoSold']],
    left_index=True,
    right_index=True,
)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['YrSold']), left_index=True, right_index=True)
# X_df = pd.merge(X_df, pd.DataFrame(combine_df['SaleType']), left_index=True, right_index=True)
#*******************************************************************************************************************************

#X_df = pd.merge(X_df, pd.DataFrame(combine_df['SaleCondition']), left_index=True, right_index=True)

In [167]:
# Training rows of the feature matrix. Rows are selected by train_df's own index
# instead of a hard-coded 1..1460 label range, so the features stay aligned with
# train_df['SalePrice']; a label missing from X_df raises KeyError rather than
# silently yielding a shorter frame.
X_train_df = X_df.loc[train_df.index]

In [168]:
# Test rows of the feature matrix. Rows are selected by test_df's own index
# instead of a hard-coded 1461..2919 label range; a label missing from X_df
# raises KeyError rather than silently yielding a shorter frame.
X_test_df = X_df.loc[test_df.index]

In [169]:
# Training target: the SalePrice column of train_df, carrying train_df's index.
y_train_df = train_df.loc[:, 'SalePrice']

In [170]:
# Persist the prepared data sets under ../data/offline/ as CSV files.
# Each file is written with a header row and with the index as its first column.
# Training features.
X_train_df.to_csv('../data/offline/X_train.csv', header = True, index=True)
# Test features.
X_test_df.to_csv('../data/offline/X_test.csv', header = True, index=True)
# Training target (SalePrice).
y_train_df.to_csv('../data/offline/y_train.csv', header = True, index=True)

In [171]:
# Sanity check: display the number of rows in the test feature matrix.
X_test_df.shape[0]

1459

In [172]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the training target.
y_train_df.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64