In [1]:
import pandas as pd

In [2]:
# 读取泰坦尼克乘客档案
titanic = pd.read_csv('datasets/Titanic/titanic.txt')

In [3]:
# 人工选取pclass、age以及sex作为判别乘客是否能够生还的特征。
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']

In [7]:
# 对于缺失的年龄信息，我们使用全体乘客的平均年龄代替，
#这样可以在保证顺利训练模型的同时，尽可能不影响预测任务。
X['age'].fillna(X['age'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [8]:
# 对原始数据进行分割，25%的乘客数据用于测试。
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25, random_state = 33)

In [9]:
# 对类别型特征进行转化，成为特征向量。
from sklearn.feature_extraction import DictVectorizer

In [15]:
X_train.head()

Unnamed: 0,pclass,age,sex
1086,3rd,31.194181,male
12,1st,31.194181,female
1036,3rd,31.194181,male
833,3rd,32.0,male
1108,3rd,31.194181,male


In [22]:
X_train.to_dict(orient='record')[:5]

[{'age': 31.19418104265403, 'pclass': '3rd', 'sex': 'male'},
 {'age': 31.19418104265403, 'pclass': '1st', 'sex': 'female'},
 {'age': 31.19418104265403, 'pclass': '3rd', 'sex': 'male'},
 {'age': 32.0, 'pclass': '3rd', 'sex': 'male'},
 {'age': 31.19418104265403, 'pclass': '3rd', 'sex': 'male'}]

In [23]:
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='record'))
X_test = vec.transform(X_test.to_dict(orient='record'))

In [26]:
X_train

array([[31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [31.19418104,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       ...,
       [12.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ],
       [18.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ]])

In [27]:
# 使用单一决策树进行模型训练以及预测分析。
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

In [28]:
# 使用随机森林分类器进行集成模型的训练以及预测分析。
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

In [29]:
# 使用梯度提升决策树进行集成模型的训练以及预测分析。
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

In [30]:
# 从sklearn.metrics导入classification_report。
from sklearn.metrics import classification_report

In [32]:
# 输出单一决策树在测试集上的分类准确性，以及更加详细的精确率、召回率、F1指标。
print('The accuracy of decision tree is', dtc.score(X_test, y_test))
print("")
print(classification_report(dtc_y_pred, y_test))

The accuracy of decision tree is 0.7811550151975684

              precision    recall  f1-score   support

           0       0.91      0.78      0.84       236
           1       0.58      0.80      0.67        93

    accuracy                           0.78       329
   macro avg       0.74      0.79      0.75       329
weighted avg       0.81      0.78      0.79       329



In [34]:
# 输出随机森林分类器在测试集上的分类准确性，以及更加详细的精确率、召回率、F1指标。
print('The accuracy of random forest classifier is', rfc.score(X_test, y_test))
print("")
print(classification_report(rfc_y_pred, y_test))

The accuracy of random forest classifier is 0.7781155015197568

              precision    recall  f1-score   support

           0       0.89      0.78      0.83       231
           1       0.60      0.78      0.68        98

    accuracy                           0.78       329
   macro avg       0.74      0.78      0.75       329
weighted avg       0.80      0.78      0.78       329



In [35]:
# 输出梯度提升决策树在测试集上的分类准确性，以及更加详细的精确率、召回率、F1指标。
print('The accuracy of gradient tree boosting is', gbc.score(X_test, y_test))
print(classification_report(gbc_y_pred, y_test))

The accuracy of gradient tree boosting is 0.790273556231003
              precision    recall  f1-score   support

           0       0.92      0.78      0.84       239
           1       0.58      0.82      0.68        90

    accuracy                           0.79       329
   macro avg       0.75      0.80      0.76       329
weighted avg       0.83      0.79      0.80       329

