分类变量的取值只有有限的几个类别。同样，我们有三种方法对其进行预处理。

1. 直接丢弃；  
2. 标签编码，例如将“从不”“很少”“大部分时间”“每天”分别表示成 0、1、2、3；  
3. 独热编码。 
  
<img src='input/one-hot.png'>

与标签编码相比，独热编码不假设类别之间存在顺序，因此在没有内在顺序的分类变量（名义变量）上通常表现得较好。但也要注意，独热编码会为每个不同取值新增一列，所以最好只用于不同取值不超过15个的变量。

In [8]:
# 同样利用墨尔本房价的数据集
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Load the Melbourne housing data; the sale price is the prediction target
# and every other column is a candidate feature.
data = pd.read_csv('./input/melb_data.csv')
y = data['Price']
X = data.drop(labels=['Price'], axis='columns')

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
X_train_full.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
12167,St Kilda,11/22 Charnwood Cr,1,u,S,hockingstuart,29/07/2017,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,Port Phillip,-37.85984,144.9867,Southern Metropolitan,13240.0
6524,Williamstown,18 James St,2,h,SA,Hunter,17/09/2016,8.0,3016.0,2.0,2.0,1.0,193.0,,,Hobsons Bay,-37.858,144.9005,Western Metropolitan,6380.0
8413,Sunshine,10 Dundalk St,3,h,S,Barry,8/04/2017,12.6,3020.0,3.0,1.0,1.0,555.0,,,Brimbank,-37.7988,144.822,Western Metropolitan,3755.0
2919,Glenroy,1/2 Prospect St,3,u,SP,Brad,18/06/2016,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,Moreland,-37.7083,144.9158,Northern Metropolitan,8870.0
6043,Sunshine North,35 Furlong Rd,3,h,S,First,22/05/2016,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,Brimbank,-37.7623,144.8272,Western Metropolitan,4217.0


In [10]:
# Print each column's dtype and non-null count to spot columns with missing values.
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10864 entries, 12167 to 2732
Data columns (total 20 columns):
Suburb           10864 non-null object
Address          10864 non-null object
Rooms            10864 non-null int64
Type             10864 non-null object
Method           10864 non-null object
SellerG          10864 non-null object
Date             10864 non-null object
Distance         10864 non-null float64
Postcode         10864 non-null float64
Bedroom2         10864 non-null float64
Bathroom         10864 non-null float64
Car              10815 non-null float64
Landsize         10864 non-null float64
BuildingArea     5708 non-null float64
YearBuilt        6557 non-null float64
CouncilArea      9792 non-null object
Lattitude        10864 non-null float64
Longtitude       10864 non-null float64
Regionname       10864 non-null object
Propertycount    10864 non-null float64
dtypes: float64(11), int64(1), object(8)
memory usage: 1.7+ MB


In [11]:
# The summary above shows four columns with missing values; collect the names
# of every training column that contains at least one null.
cols_with_missing = X_train_full.columns[X_train_full.isnull().any().values].tolist()
cols_with_missing

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [12]:
# Drop the columns that contain missing values from both splits.
# Reassigning (instead of inplace=True) avoids mutating the frames returned by
# train_test_split in place, and errors='ignore' lets this cell be re-run
# after the columns have already been removed.
X_train_full = X_train_full.drop(cols_with_missing, axis=1, errors='ignore')
X_valid_full = X_valid_full.drop(cols_with_missing, axis=1, errors='ignore')

In [13]:
# Low-cardinality categorical columns: dtype is 'object' and the column has
# fewer than 10 distinct values.
distinct_counts = X_train_full.nunique()
low_cardinality_cols = [
    cname
    for cname, dtype in X_train_full.dtypes.items()
    if dtype == 'object' and distinct_counts[cname] < 10
]

# Numerical columns: dtype is int64 or float64.
numerical_cols = [
    cname
    for cname, dtype in X_train_full.dtypes.items()
    if dtype in ('int64', 'float64')
]

In [15]:
# Keep only the selected columns (categoricals first, then numerics) and copy
# them so later edits do not touch the full frames.
my_cols = [*low_cardinality_cols, *numerical_cols]
X_train, X_valid = X_train_full[my_cols].copy(), X_valid_full[my_cols].copy()

In [16]:
# Preview the first rows of the reduced training frame.
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [17]:
# Boolean mask over the columns: True where the column's dtype is 'object'.
s = X_train.dtypes == 'object'
# Names of the object-dtype (categorical) columns, in column order.
object_cols = s.index[s.values].tolist()
print('Categorical Variables:', object_cols, sep='\n')

Categorical Variables:
['Type', 'Method', 'Regionname']


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a random forest.

    A RandomForestRegressor (100 trees, random_state=0) is fitted on
    X_train / y_train; its predictions for X_valid are then compared
    against y_valid with mean_absolute_error.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [19]:
# Compare the scores of the three approaches.
# Approach 1: drop every object-dtype (categorical) column and score the
# model on the remaining columns only.
drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
)

print("MAE from Approch 1:")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approch 1:
175750.98024934545


In [20]:
# Approach 2: label encoding with LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# Work on copies so the original frames keep their string categories.
label_X_train, label_X_valid = X_train.copy(), X_valid.copy()

label_encoder = LabelEncoder()
for col in object_cols:
    # Learn the category -> integer mapping from the training column only,
    # then apply that same mapping to both splits.
    label_encoder.fit(X_train[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

print("MAE from Approch 2:")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approch 2:
165918.2209748229


In [35]:
# Approach 3: one-hot encoding.
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore' lets transform() accept validation categories that
# never appeared in the training data; sparse=False makes the encoder return
# a dense NumPy array instead of a sparse matrix.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - switch the keyword when upgrading.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training categoricals only, then reuse the fitted encoder on the
# validation set. The encoder has to be fitted before transform() is called
# and expects 2-D input, so pass the DataFrame slice X_train[object_cols]
# rather than a single Series such as X_train['Type'].
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# Building a DataFrame from a bare array resets the index to 0..n-1; restore
# the original row labels so the concat below aligns rows correctly.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove the raw categorical columns; their one-hot versions replace them.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The encoded columns are labelled with integers while the numeric columns
# have string names; recent scikit-learn versions reject mixed name types,
# so make every column name a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))