# 泰坦尼克号生存预测学习目标：
- 熟练使用决策树相关的API
- 决策树相关的可视化API

---

# 小结：
相比于其他学习模型，决策树模型在模型描述上有巨大的优势，决策树的逻辑推断非常直观，具有清晰的可解释性，也有很方便的模型可视化，在决策树的使用中无需考虑数据的量化和标准化，能达到比较好的识别率

In [1]:
# 导入Pandas模块
import pandas as pd

# 加载数据
titanic = pd.read_csv('../file/titanic.csv')
'''
数据特性：
    1、891 条目
    2、15个字段
    3、包含四种数据类型：布尔、浮点数、整形、对象、
    4、包含空的字段：age、embarked、deck、embark_town
    5、内存占用：92.4k
'''
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [9]:
# 1、观察数据基本特点
titanic.head()

# 2、查看pandas对数据的统计特征
titanic.info()

# 3、选择三个特征数据Sex、Pclass、Age
X = titanic[['class', 'age', 'sex']]
y = titanic['survived']
X.info()

# 4、对数据进行补全（对空值进行中位数填充，对类别的特征值进行one-hot编码）
# X['age'].fillna(X['age'].mean(), inplace = True)
X['age'].fillna(X['age'].mean())
X = pd.get_dummies(X)
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   

Unnamed: 0,age,class_First,class_Second,class_Third,sex_female,sex_male
0,22.0,False,False,True,False,True
1,38.0,True,False,False,True,False
2,26.0,False,False,True,True,False
3,35.0,True,False,False,True,False
4,35.0,False,False,True,False,True


In [11]:
# 导入数据集划分包
from sklearn.model_selection import train_test_split

# 对数据集进行划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 33)

In [12]:
# 导入决策树模型
from sklearn.tree import DecisionTreeClassifier

# 使用默认的配置初始化决策树模型
dtc = DecisionTreeClassifier()

# 使用分割数据进行模型学习
dtc.fit(X_train, y_train)

# 使用训练好的模型来对测试数据进行预测
y_predict = dtc.predict(X_test)

In [None]:
# 导入评估模型的性能模块（关键指标：精确率、召回率、F1-score、支持度）
from sklearn.metrics import classification_report

# 输出预测准确率
dtc.score(X_test, y_test)
# 输出更加详细的分类性能
report = classification_report(y_predict, y_test, target_names = ['died', 'survived'])
print(report)

# 决策树可视化
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize = (30, 20))

feature_names = ['class', 'age', 'sex_female', 'sex_male']
class_names = ['died','survived']
plot_tree(dtc, max_depth = 3, filled = True, feature_names = feature_names, class_names = class_names)
plt.show()

              precision    recall  f1-score   support

        died       0.90      0.84      0.87       143
    survived       0.74      0.82      0.78        80

    accuracy                           0.83       223
   macro avg       0.82      0.83      0.82       223
weighted avg       0.84      0.83      0.84       223



In [7]:
# 导入Pandas依赖
import pandas as pd
# 导入numpy包
import numpy as np
# 导入特征提取模块（DictVectorizer将字典格式数据转化为数值向量）
from sklearn.feature_extraction import DictVectorizer
# 导入训练集测试集划分包
from sklearn.model_selection import train_test_split
# 导入决策树模型和图形可视化模块
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# 导入绘制决策树模块
from sklearn.tree import plot_tree
# 导入模型保存和加载模块
import joblib
# 导入matplotlib绘图模块
import matplotlib.pyplot as plt

In [30]:
def model_training():
    '''
    模型训练
    :return: 
    '''
    # 1、加载数据
    data = pd.read_csv('../file/titanic.csv')
    print(data.head())
    
    # 2、确定特征和目标值
    x = data[['pclass', 'sex', 'age']]
    y = data['survived']
    print('处理前：', x.head())
    
    # 3、对数据集空缺值进行处理
    x['age'].fillna(x['age'].mean())
    # 类别特征进行独热编码
    x = pd.get_dummies(x, columns=['pclass', 'sex'])
    print('处理后：', x.head())
    
    
    # 4、数据集划分
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 22)

    print('测试')
    
    # 5、决策树机器学习
    estimator = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)
    estimator.fit(x_train, y_train)


    # 6、模型评估
    print('准确度：', estimator.score(x_test, y_test))
    print('预测结果：', estimator.predict(x_test))
    
    # 7、模型保存
    joblib.dump(estimator, '../file/dt.pth')    

In [13]:
def decision_tree_visualization():
    '''
    决策树可视化
    :return: 
    '''
    # 1、模型加载
    estimator = joblib.load('../file/dt.pth')
    
    # 2、决策树可视化
    fig, ax = plt.subplots(figsize = (50, 50))
    feature_names_arr = ['class', 'age', 'sex_female', 'sex_male']
    class_names_arr = ['died','survived']
    plot_tree(estimator, max_depth = 3, filled = True, feature_names = feature_names_arr, class_names = class_names_arr)
    
    plt.savefig('../file/tree.png', dpi = 100)

In [31]:
# 模型训练
model_training()

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
处理前：    pclass     sex   age
0       3    male  22.0
1       1  female  38.0
2       3  female  26.0
3       1  female  35.0
4       3    male  35.0
处理后：     age  pclass_1  pclass_2  pclass_3  sex_female  sex_male
0  22

In [None]:
# 加载模型
decision_tree_visualization()