# [教學目標]
- 了解如何檢視 DataFrame 各欄位的資料型態、各型態的欄位數量，以及 Label Encoding / One Hot Encoding 的寫法

# [範例重點]
- 檢視 DataFrame 的資料型態 (In[4], In[5])
- 了解 Label Encoding 如何寫 (In[8])
- 了解 One Hot Encoding 如何寫 (In[10])

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Directory that holds the CSV data files
dir_data = '/Users/anncheng/Documents/GitHub/4th-ML100Days/data'

# os.path.join concatenates the directory and a file name into a single path
f_app_train, f_app_test = (
    os.path.join(dir_data, csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv')
)

# Read each comma-separated values (csv) file into its own DataFrame
app_train, app_test = (
    pd.read_csv(csv_path) for csv_path in (f_app_train, f_app_test)
)

In [3]:
# Display the training DataFrame
app_train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Inspect the dtype of every column
app_train.dtypes

SK_ID_CURR                      int64
TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 122, dtype: object

檢視資料中各個欄位類型的數量

In [5]:
# Count how many columns there are of each dtype.
# The result shows 16 object columns, so the next cell lists 16 columns.
app_train.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

檢視資料中類別型欄位各自類別的數量

In [6]:
# Count the distinct non-null categories in each object (categorical) column.
categorical_cols = app_train.select_dtypes(include=["object"])
categorical_cols.nunique()



NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

#### Label encoding
有仔細閱讀[參考資料](https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621)的人可以發現，Label encoding 的表示方式會讓同一個欄位底下的類別之間有大小關係 (0<1<2<...)，所以在這裡我們只對類別數量小於等於 2 的類別型欄位示範使用 Label encoding，但不表示這樣處理是最好的，一切取決於欄位本身的意義適合哪一種表示方法。

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Label-encode every object column of app_train that has at most two unique
# values, applying the same fitted mapping to app_test.
le = LabelEncoder()
le_count = 0

for col in app_train:
    train_values = app_train[col]

    # Only categorical (object) columns are candidates.
    if train_values.dtype != 'object':
        continue

    # Skip columns with more than 2 unique values. unique() counts NaN as a
    # value, so a two-category column that also contains NaN is skipped.
    if len(train_values.unique()) > 2:
        continue

    # fit() learns the label -> integer mapping from the training data only
    # (background: https://blog.csdn.net/quintind/article/details/79850455).
    le.fit(train_values)
    print(app_test[col])

    # transform() applies the learned mapping. fit_transform() would do fit and
    # transform in one call; they are kept separate here so the mapping fitted
    # on the training data is reused for the testing data.
    app_train[col] = le.transform(train_values)
    app_test[col] = le.transform(app_test[col])
    print(app_test[col])

    # Keep track of how many columns were label encoded.
    le_count += 1

print('%d columns were label encoded.' % le_count)
# Q: Why was EMERGENCYSTATE_MODE not converted, although it is an object column
#    with only 2 categories?
# A: app_train['EMERGENCYSTATE_MODE'] contains NaN, which unique() counts as an
#    extra value, so the column fails the "<= 2 unique values" check.

0        Cash loans
1        Cash loans
2        Cash loans
3        Cash loans
4        Cash loans
            ...    
48739    Cash loans
48740    Cash loans
48741    Cash loans
48742    Cash loans
48743    Cash loans
Name: NAME_CONTRACT_TYPE, Length: 48744, dtype: object
0        0
1        0
2        0
3        0
4        0
        ..
48739    0
48740    0
48741    0
48742    0
48743    0
Name: NAME_CONTRACT_TYPE, Length: 48744, dtype: int64
0        N
1        N
2        Y
3        N
4        Y
        ..
48739    N
48740    N
48741    Y
48742    N
48743    Y
Name: FLAG_OWN_CAR, Length: 48744, dtype: object
0        0
1        0
2        1
3        0
4        1
        ..
48739    0
48740    0
48741    1
48742    0
48743    1
Name: FLAG_OWN_CAR, Length: 48744, dtype: int64
0        Y
1        Y
2        Y
3        Y
4        N
        ..
48739    Y
48740    N
48741    Y
48742    N
48743    N
Name: FLAG_OWN_REALTY, Length: 48744, dtype: object
0        1
1        1
2        1
3    

In [9]:
# Display app_train again to inspect it after the label-encoding cell
app_train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,M,0,0,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,0,F,0,1,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,0,F,0,1,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,0,F,0,1,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### One Hot encoding
pandas 中的 one hot encoding 非常方便，一行程式碼就搞定

In [10]:
# One-hot encode the remaining categorical columns of both data sets.
# NOTE: train and test are encoded independently, so their resulting dummy
# columns are not guaranteed to match.
app_train, app_test = pd.get_dummies(app_train), pd.get_dummies(app_test)

# app_train['CODE_GENDER'] would now raise an error: get_dummies replaces the
# original column with one indicator column per category (e.g. CODE_GENDER_F
# and CODE_GENDER_M).
# Which columns are converted? With default arguments get_dummies converts the
# object/string/category columns, so columns that were already label encoded
# to integers are not converted again.
for dummy_col in (
    'CODE_GENDER_F',
    'CODE_GENDER_M',
    'NAME_EDUCATION_TYPE_Academic degree',
):
    print(app_train[dummy_col].head())

0    0
1    1
2    0
3    1
4    0
Name: CODE_GENDER_F, dtype: uint8
0    1
1    0
2    1
3    0
4    1
Name: CODE_GENDER_M, dtype: uint8
0    0
1    0
2    0
3    0
4    0
Name: NAME_EDUCATION_TYPE_Academic degree, dtype: uint8


可以觀察到原來的類別型欄位都轉為 0/1 了

## 作業
將下列部分資料片段 sub_train 使用 One Hot encoding, 並觀察轉換前後的欄位數量 (使用 shape) 與欄位名稱 (使用 head) 變化

In [11]:
# Reload the raw training data (earlier cells overwrote app_train with
# encoded versions).
app_train = pd.read_csv(f_app_train)
# Single-column DataFrame holding only the weekday column
sub_train = app_train['WEEKDAY_APPR_PROCESS_START'].to_frame()
print(sub_train.shape)
sub_train.head()

(307511, 1)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START
0,WEDNESDAY
1,MONDAY
2,MONDAY
3,WEDNESDAY
4,THURSDAY


In [12]:
# Homework placeholder: replace the string below with code that one-hot encodes
# sub_train and compares its shape and head() before and after.
"""
Your Code Here
"""

'\nYour Code Here\n'