In [18]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## 使用SciKit-Learn套件做決策樹分析

In [2]:
df = pd.read_csv("airline.csv")

In [3]:
df.head()

Unnamed: 0,user_id,is_loyal,depart_on_time,arrive_on_time,register_method,register_rate,class,seat_rate,meal_rate,flight_rate,...,tv_ad,youtube_ad_1,youtube_ad_2,youtube_ad_3,dm_message,dm_post,dm_email,credit_card_vendor,credit_card_bonus,coupon
0,00411460f7c92d2124a67ea0f4cb5f85,Satisfied,1,1,phone,3,3,3,3,3,...,1,0,1,0,0,0,0,Vendor A,2,196.7
1,006f52e9102a8d3be2fe5614f42ba989,Satisfied,0,0,mobile_app,2,2,4,4,2,...,1,0,0,0,1,1,0,Vendor A,3,53.6
2,00ac8ed3b4327bdd4ebbebcb2ba10a00,Unsatisfied,1,1,others,4,3,1,3,2,...,0,1,1,1,1,0,1,Vendor C,1,107.8
3,00ec53c4682d36f5c4359f4ae7bd7ba1,Unsatisfied,0,0,others,3,3,1,2,2,...,1,1,1,1,1,0,1,Vendor B,2,37.5
4,01161aaa0b6d1345dd8fe4e481144d84,Satisfied,1,1,phone,4,2,4,4,5,...,1,0,0,1,1,0,0,Vendor B,1,43.3


In [4]:
df.drop("user_id", axis=1, inplace=True)

In [5]:
#轉換有階層的類別型資料為數字
size_mapping = {
           'Vendor A': 1,
           'Vendor B': 2,
           'Vendor C': 3}
df['credit_card_vendor'] = df['credit_card_vendor'].map(size_mapping)


In [6]:
#轉換無階層的類別型資料為dummy
from sklearn.preprocessing import OneHotEncoder
pf = pd.get_dummies(df[['register_method']])
df = pd.concat([df, pf], axis=1)
df.drop(['register_method'], axis=1, inplace=True)

In [7]:
df

Unnamed: 0,is_loyal,depart_on_time,arrive_on_time,register_rate,class,seat_rate,meal_rate,flight_rate,package_rate,tv_ad,...,dm_message,dm_post,dm_email,credit_card_vendor,credit_card_bonus,coupon,register_method_mobile_app,register_method_others,register_method_phone,register_method_website
0,Satisfied,1,1,3,3,3,3,3,3,1,...,0,0,0,1,2,196.7,0,0,1,0
1,Satisfied,0,0,2,2,4,4,2,4,1,...,1,1,0,1,3,53.6,1,0,0,0
2,Unsatisfied,1,1,4,3,1,3,2,2,0,...,1,0,1,3,1,107.8,0,1,0,0
3,Unsatisfied,0,0,3,3,1,2,2,3,1,...,1,0,1,2,2,37.5,0,1,0,0
4,Satisfied,1,1,4,2,4,4,5,3,1,...,1,0,0,2,1,43.3,0,0,1,0
5,Satisfied,1,1,4,3,2,3,4,2,1,...,0,0,1,2,1,160.1,0,0,1,0
6,Satisfied,0,0,2,3,3,2,3,2,1,...,0,0,0,1,3,146.0,0,1,0,0
7,Satisfied,1,1,2,2,4,3,4,3,1,...,0,1,1,2,2,41.4,1,0,0,0
8,Satisfied,1,1,3,3,3,3,4,2,0,...,0,0,0,2,3,124.5,0,1,0,0
9,Unsatisfied,1,1,3,3,2,1,3,2,0,...,1,0,1,2,2,94.8,0,1,0,0


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
#建立測試集跟訓練集
X = df.drop("is_loyal", axis = 1) #除去結果本身的所有變數
Y = df["is_loyal"]
x_train, x_test, y_train, y_test = train_test_split(X, Y,
                                                    test_size=0.20,
                                                    random_state=0)

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
#開函數學習機
dtree = DecisionTreeClassifier()

In [12]:
#訓練
dtree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [13]:
#預測
predictions = dtree.predict(x_test)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

In [15]:
#評估決策樹結果
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

   Satisfied       0.85      0.88      0.87       125
 Unsatisfied       0.79      0.75      0.77        75

    accuracy                           0.83       200
   macro avg       0.82      0.81      0.82       200
weighted avg       0.83      0.83      0.83       200



In [16]:
#製成混淆矩陣：
print(confusion_matrix(y_test,predictions))
print((94+34)/(94+31+41+34))

[[110  15]
 [ 19  56]]
0.64
