# Day_004

#### 在初步 EDA 的過程中，需要先思考幾個問題：

．不同資料類型各有多少個欄位？

．類別型欄位 (pandas 中的 object) 的類別數量？

．模型怎麼處理類別型的資料？有什麼表示方法？

#### 另外，處理類別型資料時會使用到兩種方法：

．Label encoding：把每個類別 mapping 到某個整數，不會增加新欄位

．One Hot encoding：為每個類別新增一個欄位，用 0/1 表示該筆資料是否屬於該類別

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Set the data directory and derive the full path of each CSV file from it
dir_data = 'D:/Coding Project/GitHub/ML100-Days/data/Topic_1/'
f_app_train, f_app_test = (
    os.path.join(dir_data, csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv')
)

# Read the training and testing tables into DataFrames
app_train, app_test = pd.read_csv(f_app_train), pd.read_csv(f_app_test)

#### ❖ 檢視資料中各個欄位類型的數量

In [3]:
 app_train.dtypes.value_counts() # DataFrame.dtypes.value_counts(): list the column dtypes and count how many columns have each one

float64    65
int64      41
object     16
dtype: int64

#### ❖ 檢視資料中類別型欄位各自類別的數量

In [4]:
# pandas containers: Series (1-D, a single column) / DataFrame (2-D table).
# (Panel, the former 3-D container, was removed in pandas 0.25.)
# DataFrame.select_dtypes(include=..., exclude=...) picks columns by dtype.
# DataFrame.nunique() counts the distinct non-NaN values of every column
# (axis=0 is the default), which is what applying Series.nunique per column does.
app_train.select_dtypes(include=["object"]).nunique()

NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

In [5]:
# Frequency of each category in NAME_CONTRACT_TYPE
app_train['NAME_CONTRACT_TYPE'].value_counts()

Cash loans         278232
Revolving loans     29279
Name: NAME_CONTRACT_TYPE, dtype: int64

In [6]:
# Number of distinct categories in NAME_CONTRACT_TYPE
app_train['NAME_CONTRACT_TYPE'].nunique()

2

In [7]:
# Collect the categorical (object-dtype) columns that hold at most two
# distinct values. NaN is counted as a value (dropna=False) so that this list
# agrees with the label-encoding loop further down, which tests the column
# with unique() and therefore also counts NaN. With the default dropna=True a
# two-category column that contains missing values would be listed here even
# though it is never label encoded.
app_col_obj = app_train.select_dtypes(include=["object"])
encoding_col = [
    col for col in app_col_obj.columns
    if app_col_obj[col].nunique(dropna=False) <= 2
]

### Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# One LabelEncoder instance, re-fitted for every column that gets encoded
le = LabelEncoder()
le_count = 0

# Walk over every column name of the training frame
for col in app_train:
    # Only object-dtype (categorical) columns are candidates
    if app_train[col].dtype != 'object':
        continue
    # Leave columns with more than two distinct values alone;
    # unique() returns the distinct values of the column
    if len(app_train[col].unique()) > 2:
        continue

    # 1. Fit the encoder on the training data only
    le.fit(app_train[col])
    # 2. Apply the fitted mapping to both the training and the testing data
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # Keep track of how many columns were label encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [10]:
# Frequency of each value in NAME_CONTRACT_TYPE after label encoding
app_train['NAME_CONTRACT_TYPE'].value_counts()

0    278232
1     29279
Name: NAME_CONTRACT_TYPE, dtype: int64

In [11]:
# Keep only the first three columns named in encoding_col, then preview the first rows
app_train_encode = app_train.loc[:, encoding_col[:3]]
app_train_encode.head()

Unnamed: 0,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY
0,0,0,1
1,0,0,0
2,1,1,1
3,0,0,1
4,0,0,1


### One Hot Encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Create a One Hot Encoder object
ohe = OneHotEncoder()

for col in app_train_encode:
    # The encoder needs a 2-D input of shape (n_samples, 1). A pandas Series
    # has no reshape() in current pandas, so reshape the underlying NumPy
    # array instead.
    col_values = app_train_encode[col].values.reshape(-1, 1)
    # Fit on this column and encode it in a single step; toarray() converts
    # the sparse result into a dense array. ohe_ft is overwritten on every
    # iteration, so after the loop it holds the encoding of the last column.
    ohe_ft = ohe.fit_transform(col_values).toarray()


#### ❖說明：

* n_values='auto' 表示每個特徵使用幾維的數值由資料集自動推斷（此參數已於 scikit-learn 0.20 棄用、0.22 移除，新版請改用 categories='auto'）

* categorical_features='all' 指定對哪些特徵進行編碼，預設為 'all'（此參數已於 scikit-learn 0.20 棄用、0.22 移除，新版請改用 ColumnTransformer 選擇要編碼的欄位）

* dtype=<class 'numpy.float64'> 表示編碼數值格式，預設是浮點型

* sparse=True 表示編碼結果的輸出格式，預設為 True，即輸出稀疏矩陣；指定 False 則直接輸出一般陣列，就不用再呼叫 toarray()（scikit-learn 1.2 起此參數更名為 sparse_output）

* handle_unknown='error'，其值可以指定為 "error" 或 "ignore"，決定轉換時若遇到未知的類別，是要拋出錯誤還是忽略它

In [14]:
# One-hot encode the training and testing frames with pandas
app_train_ohe, app_test_ohe = pd.get_dummies(app_train), pd.get_dummies(app_test)

# Preview a few of the generated indicator columns
for dummy_col in ('CODE_GENDER_F', 'CODE_GENDER_M', 'NAME_EDUCATION_TYPE_Academic degree'):
    print(app_train_ohe[dummy_col].head())

0    0
1    1
2    0
3    1
4    0
Name: CODE_GENDER_F, dtype: uint8
0    1
1    0
2    1
3    0
4    1
Name: CODE_GENDER_M, dtype: uint8
0    0
1    0
2    0
3    0
4    0
Name: NAME_EDUCATION_TYPE_Academic degree, dtype: uint8


### 練習時間
#### E.g. 使用 One Hot encoding 轉換以下欄位，並觀察轉換前後的欄位數量 (shape) 與欄位名稱 (head) 變化

In [15]:
# Isolate the weekday column as a single-column DataFrame and inspect it
sub_train = app_train[['WEEKDAY_APPR_PROCESS_START']]
print(sub_train.shape)
sub_train.head()

(307511, 1)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START
0,WEDNESDAY
1,MONDAY
2,MONDAY
3,WEDNESDAY
4,THURSDAY


In [16]:
# One-hot encode the single-column frame, then show the resulting shape and first rows
sub_train_ohe = pd.get_dummies(sub_train)
print(sub_train_ohe.shape)
sub_train_ohe.head()

(307511, 7)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0
