# 泰坦尼克乘客生存预测

In [1]:
import pandas as pd

# Column legend for the Titanic CSV files:
#   PassengerId -> passenger number
#   Survived    -> whether the passenger survived (present in the training file)
#   Pclass      -> ticket class
#   Name        -> passenger name
#   Sex         -> passenger sex
#   Age         -> passenger age
#   SibSp       -> number of siblings / spouses
#   Parch       -> number of parents / children
#   Ticket      -> ticket number
#   Fare        -> ticket fare
#   Cabin       -> cabin
#   Embarked    -> port of embarkation
# Load the training and the test split, in that order, from the local data folder.
train_data, test_data = (
    pd.read_csv('./Titanic_Data/train.csv'),
    pd.read_csv('./Titanic_Data/test.csv'),
)

## 1. 数据探索

In [2]:
# Quick look at the training data: schema, summary statistics and sample rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_data.info()
print('-' * 30)
# Summary statistics of the numeric columns.
print(train_data.describe())
print('-' * 30)
# Summary statistics of the object (string) columns.
print(train_data.describe(include=['O']))
print('-' * 30)
# First and last rows of the training set.
print(train_data.head())
print('-' * 30)
print(train_data.tail())
print('-' * 30)
# First rows of the test set, for comparison with the training layout.
print(test_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008  

## 2. 数据清洗

In [3]:
# Fill missing ages with the mean age of the respective dataset.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# is deprecated in pandas and does not update the parent frame once
# Copy-on-Write is active.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

# Fill missing fares with the mean fare of the respective dataset.
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

In [4]:
# Count how often each embarkation port occurs in the training set, to choose
# a fill value for the missing Embarked entries.
print(
    train_data.loc[:, 'Embarked'].value_counts()
)

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [5]:
# 'S' is the most frequent port in the training set, so missing Embarked
# values are filled with 'S' in both datasets.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the selected column, which is deprecated and does not update the parent
# frame once Copy-on-Write is active.
train_data['Embarked'] = train_data['Embarked'].fillna('S')
test_data['Embarked'] = test_data['Embarked'].fillna('S')

## 3. 特征选择

In [6]:
# Columns left out of the model, and why:
#   PassengerId - just a passenger number, treated as irrelevant for classification
#   Name        - passenger name, treated as irrelevant for classification
#   Cabin       - too many missing values
#   Ticket      - ticket numbers are unstructured, with no apparent pattern
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
# Target column first, then the same feature columns from both datasets.
train_labels = train_data.loc[:, 'Survived']
train_features = train_data.loc[:, features]
test_features = test_data.loc[:, features]

In [7]:
# One-hot encode the string-valued features (Sex and Embarked); numeric
# features pass through unchanged.
from sklearn.feature_extraction import DictVectorizer
dvec = DictVectorizer(sparse = False)
# orient must be spelled 'records': the abbreviation 'record' was deprecated
# and later removed from pandas, where it now raises a ValueError.
# The vectorizer is fitted on the training rows only and then reused for the
# test rows so that both matrices share the same column layout.
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
test_features = dvec.transform(test_features.to_dict(orient='records'))
print(dvec.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


## 4. 决策树模型

In [8]:
from sklearn.tree import DecisionTreeClassifier
# Build a decision tree that chooses splits by entropy (information gain).
# random_state is fixed because the estimator permutes the features randomly
# at every split; without a seed the fitted tree, and therefore the scores
# reported below, can differ from run to run.
clf = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
# Fit the tree on the encoded training features and survival labels.
clf.fit(train_features, train_labels)

DecisionTreeClassifier(criterion='entropy')

## 5. 模型预测&评估

In [11]:
# Predict survival labels for the encoded test-set features with the fitted tree.
pred_labels = clf.predict(X=test_features)

In [12]:
# Accuracy of the tree on the same samples it was fitted on (training-set
# accuracy), rounded to six decimals and printed with four.
acc_decision_tree = round(
    clf.score(X=train_features, y=train_labels),
    6,
)
print(u'score准确率为 %.4lf' % (acc_decision_tree,))

score准确率为 0.9820


In [13]:
# Estimate the tree's accuracy with 10-fold cross-validation on the training
# data and report the mean over the folds.
import numpy as np
from sklearn.model_selection import cross_val_score
print(
    u'cross_val_score准确率为 %.4lf'
    % np.mean(
        cross_val_score(estimator=clf, X=train_features, y=train_labels, cv=10)
    )
)

cross_val_score准确率为 0.7823
