# 範例 : (Kaggle)房價預測
***
- 以下用房價預測資料, 觀察均值編碼的效果

# [教學目標]
- 以下用房價預測資料, 觀察均值編碼的效果

# [範例重點]
- 觀察標籤編碼與均值編碼, 在特徵數量 / 線性迴歸分數 / 線性迴歸時間上, 分別有什麼影響 (In[6], Out[6], In[13], Out[13])
- 觀察標籤編碼與均值編碼, 在特徵數量 / 梯度提升樹分數 / 梯度提升樹時間上, 分別有什麼影響 (In[15], Out[15], In[16], Out[16])

In [4]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder

# Read the train and test splits of the house-price data from the data folder.
data_path = 'data/'
df_train, df_test = (
    pd.read_csv(data_path + file_name)
    for file_name in ('house_train.csv.gz', 'house_test.csv.gz')
)

# Regression target is log1p(SalePrice); the test Ids are set aside in `ids`.
train_Y = np.log1p(df_train['SalePrice'])
ids = df_test['Id']

# Remove the Id column (plus the target from train), then stack train on top
# of test so later preprocessing sees both splits at once.
df_train = df_train.drop(columns=['Id', 'SalePrice'])
df_test = df_test.drop(columns=['Id'])
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [5]:
# Collect the names of every categorical (object-dtype) column in object_features.
object_features = [
    feature for feature, dtype in zip(df.columns, df.dtypes) if dtype == 'object'
]
# Object-dtype columns are categorical, not numeric, so the message says so.
print(f'{len(object_features)} Categorical Features : {object_features}\n')

# Keep only the categorical columns; missing values become their own 'None' category.
df = df[object_features]
df = df.fillna('None')
# Number of training rows, used later to slice the train part back out of df.
train_num = train_Y.shape[0]
df.head()

43 Numeric Features : ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']



Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [6]:
# Baseline: label encoding + linear regression.
# Every categorical column gets its own freshly fitted LabelEncoder; the
# encoders are fitted on train and test rows together (df holds both).
df_temp = pd.DataFrame(
    {col: LabelEncoder().fit_transform(df[col]) for col in df.columns}
)
# Only the first train_num rows have a target, so only those are scored.
train_X = df_temp[:train_num]
estimator = LinearRegression()
start = time.time()
print(f'shape : {train_X.shape}')
# Mean score over 5 cross-validation folds.
cv_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(f'score : {cv_scores.mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.66156068668513
time : 0.07003569602966309 sec


In [9]:
# Display the target vector (log1p of SalePrice) to check its values and length.
train_Y

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1455    12.072547
1456    12.254868
1457    12.493133
1458    11.864469
1459    11.901590
Name: SalePrice, Length: 1460, dtype: float64

In [13]:
# Mean encoding + linear regression.
# Attach the target to the training rows, which adds one extra 'SalePrice'
# column, so that per-category target means can be computed.
data = pd.concat([df[:train_num], train_Y], axis=1)
for c in df.columns:
    # Lookup table: mean of the (log) target for each category value of column c.
    mean_df = data.groupby([c])['SalePrice'].mean().reset_index()
    print(mean_df)
    mean_df.columns = [c, f'{c}_mean']
    # Replace the raw category with its mean: left-merge the lookup table
    # (row order is preserved), then drop the original column.
    data = pd.merge(data, mean_df, on=c, how='left')
    data = data.drop([c] , axis=1)
data = data.drop(['SalePrice'] , axis=1)
# NOTE: the means above are computed from every training row, including the
# rows each CV fold later holds out, so the score below is optimistic
# (target leakage). Keep this in mind when comparing against label encoding.
estimator = LinearRegression()
start = time.time()
# Report the shape of the matrix that is actually scored (the mean-encoded
# `data`), not the label-encoded train_X left over from another cell.
print(f'shape : {data.shape}')
print(f'score : {cross_val_score(estimator, data, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

  MSZoning  SalePrice
0  C (all)  11.118275
1       FV  12.246621
2       RH  11.749848
3       RL  12.085891
4       RM  11.692901
  Street  SalePrice
0   Grvl  11.667444
1   Pave  12.025529
  Alley  SalePrice
0  Grvl  11.673363
1  None  12.037682
2  Pave  11.996819
  LotShape  SalePrice
0      IR1  12.163471
1      IR2  12.318455
2      IR3  12.205419
3      Reg  11.936101
  LandContour  SalePrice
0         Bnk  11.810748
1         HLS  12.259262
2         Low  12.120092
3         Lvl  12.022700
  Utilities  SalePrice
0    AllPub  12.024189
1    NoSeWa  11.831386
  LotConfig  SalePrice
0    Corner  12.027452
1   CulDSac  12.249815
2       FR2  12.034748
3       FR3  12.195689
4    Inside  12.001906
  LandSlope  SalePrice
0       Gtl  12.020571
1       Mod  12.076873
2       Sev  12.130567
   Neighborhood  SalePrice
0       Blmngtn  12.169421
1       Blueste  11.826543
2        BrDale  11.547874
3       BrkSide  11.679736
4       ClearCr  12.239905
5       CollgCr  12.163647
6       C

In [11]:
# Inspect the column names of the most recently built per-category mean table.
mean_df.columns

Index(['SaleCondition', 'SaleCondition_mean'], dtype='object')

In [21]:
# Preview the first rows of the mean-encoded feature table.
data.head()

Unnamed: 0,MSZoning_mean,Street_mean,Alley_mean,LotShape_mean,LandContour_mean,Utilities_mean,LotConfig_mean,LandSlope_mean,Neighborhood_mean,Condition1_mean,...,GarageType_mean,GarageFinish_mean,GarageQual_mean,GarageCond_mean,PavedDrive_mean,PoolQC_mean,Fence_mean,MiscFeature_mean,SaleType_mean,SaleCondition_mean
0,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.001906,12.020571,12.163647,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.00504
1,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.034748,12.020571,12.34418,11.818342,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.00504
2,12.085891,12.025529,12.037682,12.163471,12.0227,12.024189,12.001906,12.020571,12.163647,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.00504
3,12.085891,12.025529,12.037682,12.163471,12.0227,12.024189,12.027452,12.020571,12.206664,12.042923,...,11.765651,11.818982,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,11.788783
4,12.085891,12.025529,12.037682,12.163471,12.0227,12.024189,12.034748,12.020571,12.676003,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.00504


In [15]:
# Baseline: label encoding + gradient boosting trees.
# Integer-encode each categorical column with its own LabelEncoder, fitted on
# all rows of df.
encoded_columns = {
    col: LabelEncoder().fit_transform(df[col]) for col in df.columns
}
df_temp = pd.DataFrame(encoded_columns)
# Keep just the rows that have a target.
train_X = df_temp[:train_num]
estimator = GradientBoostingRegressor()
start = time.time()
print(f'shape : {train_X.shape}')
# Average the scores of the 5 cross-validation folds.
fold_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(f'score : {fold_scores.mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.777060850510822
time : 0.8589715957641602 sec


In [16]:
# Mean encoding + gradient boosting trees.
# Attach the target to the training rows so per-category means can be computed.
data = pd.concat([df[:train_num], train_Y], axis=1)
for c in df.columns:
    # Lookup table: mean of the (log) target for each category value of column c.
    mean_df = data.groupby([c])['SalePrice'].mean().reset_index()
    mean_df.columns = [c, f'{c}_mean']
    # Replace the raw category with its mean: left-merge the lookup table
    # (row order is preserved), then drop the original column.
    data = pd.merge(data, mean_df, on=c, how='left')
    data = data.drop([c] , axis=1)
data = data.drop(['SalePrice'] , axis=1)
# NOTE: the means are computed from every training row, including the rows
# each CV fold later holds out, so the score below is optimistic (target leakage).
estimator = GradientBoostingRegressor()
start = time.time()
# Report the shape of the matrix that is actually scored (the mean-encoded
# `data`), not the label-encoded train_X left over from the previous cell.
print(f'shape : {data.shape}')
print(f'score : {cross_val_score(estimator, data, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.806806005059097
time : 0.7969541549682617 sec


# 作業1
* 請仿照範例，將鐵達尼範例中的類別型特徵改用均值編碼實作一次

# 作業2
* 觀察鐵達尼生存預測中，均值編碼與標籤編碼兩者比較，哪一個效果比較好? 可能的原因是什麼?