1.处理数据

In [44]:
import pandas as pd

# Load the labelled Titanic training set from CSV and print a per-column
# summary (non-null counts and dtypes) to see which fields have gaps.
train_csv_path = '../../../../dataset/titanic/train.csv'
data = pd.read_csv(train_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [45]:
# Select the feature columns used for classification and fill every missing
# value with 0.
#
# Encode sex as 1 for 'male' and 0 for anything else. The dtype guard keeps
# this cell idempotent: re-running it on the already-encoded integer column
# would otherwise compare numbers against 'male' and turn every value into 0.
if not pd.api.types.is_numeric_dtype(data['Sex']):
    data['Sex'] = (data['Sex'] == 'male').astype('int64')
data = data.fillna(0)
dataset_X = data[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same NumPy array on old and new pandas alike.
dataset_X = dataset_X.values

In [47]:
# There are two classes: survived and deceased. The 'Survived' column is
# already the label for the first class; add a 'Deceased' column, the logical
# negation of 'Survived', as the label for the second class. Together the two
# columns form the 2-column target [Deceased, Survived] fed to the model.
data['Deceased'] = (data['Survived'] == 0).astype('int64')
dataset_Y = data[['Deceased', 'Survived']]
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the
# version-independent way to get the underlying NumPy array.
dataset_Y = dataset_Y.values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
Deceased       891 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 90.6+ KB


2.切分训练集与验证集

In [48]:
from sklearn.model_selection import train_test_split

# Shuffle the labelled data and split it into a training set and a
# validation set with sklearn's train_test_split. test_size=0.2 holds out
# 20% of the rows for validation; random_state fixes the shuffle so the
# split is reproducible.
split_parts = train_test_split(dataset_X, dataset_Y,
                               test_size=0.2,
                               random_state=42)
X_train, X_test, y_train, y_test = split_parts

3.建立模型

In [49]:
import numpy as np
import tensorflow as tf

# Input placeholders. The leading None dimension means any number of records
# can be fed at once: X takes 6 features per record, y takes the 2-column
# [Deceased, Survived] label.
X = tf.placeholder(tf.float32, shape=[None, 6])
y = tf.placeholder(tf.float32, shape=[None, 2])

# Trainable parameters of the single-layer softmax classifier.
W = tf.Variable(tf.random_normal([6, 2]), name='weights')
b = tf.Variable(tf.zeros([2]), name='bias')

y_pred = tf.nn.softmax(tf.matmul(X, W) + b)

# Cross-entropy cost. Sum over the class axis only (axis=1) so that
# cross_entropy holds one value per sample; without the axis argument the sum
# collapses the whole batch to a scalar and the mean below does nothing.
# The 1e-10 offset keeps log() away from log(0).
cross_entropy = -tf.reduce_sum(y * tf.log(y_pred + 1e-10), axis=1)
# The cost of a batch is the mean of the per-sample cross-entropies.
cost = tf.reduce_mean(cross_entropy)

# Minimise the cost with gradient descent (learning rate 0.001); TensorFlow
# builds the back-propagation part of the graph automatically.
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)

# Saver used to write and restore model checkpoints.
saver = tf.train.Saver()

with tf.Session() as sess:
    # Initialise all variables; this must run before any other op.
    tf.global_variables_initializer().run()
    # Training loop: 100 epochs, one gradient step per training sample.
    for epoch in range(100):
        total_loss = 0.
        for i in range(len(X_train)):
            feed = {X: [X_train[i]], y: [y_train[i]]}
            # sess.run triggers execution of the training op and the cost.
            _, loss = sess.run([train_op, cost], feed_dict=feed)
            total_loss += loss
        if epoch % 10 == 0:
            print('Epoch: %04d, total loss = %.9f' % (epoch, total_loss))
            # Periodic snapshot; global_step appends "-<epoch>" to the path.
            saver.save(sess, "./mymodel.ckpt", global_step=epoch)
    print('Training complete!')
    # Save the fully trained weights under the plain path as well. The
    # periodic snapshots above are written as "./mymodel.ckpt-<epoch>" and the
    # last one is taken at epoch 90, so without this call nothing named
    # "./mymodel.ckpt" exists and the final epochs are never persisted.
    saver.save(sess, "./mymodel.ckpt")

    # Evaluate accuracy on the held-out validation set.
    pred = sess.run(y_pred, feed_dict={X: X_test})
    correct = np.equal(np.argmax(pred, 1), np.argmax(y_test, 1))
    accuracy = np.mean(correct.astype(np.float32))
    print("Accuracy on validation set: %.9f" % accuracy)

Epoch: 0000, total loss = 2382.947940569
Epoch: 0010, total loss = 1116.872451963
Epoch: 0020, total loss = 1071.427973176
Epoch: 0030, total loss = 1054.925065220
Epoch: 0040, total loss = 1011.089648127
Epoch: 0050, total loss = 1004.657435875
Epoch: 0060, total loss = 960.662514119
Epoch: 0070, total loss = 961.212779934
Epoch: 0080, total loss = 957.269151649
Epoch: 0090, total loss = 954.692457028
Training complete!
Accuracy on validation set: 0.636871517


In [51]:
# Load the unlabelled test set and apply the same preprocessing as for the
# training data: fill missing values with 0 and encode sex as 1 for 'male',
# 0 otherwise.
testdata = pd.read_csv('../../../../dataset/titanic/test.csv')
testdata = testdata.fillna(0)
testdata['Sex'] = testdata['Sex'].apply(lambda s:1 if s == 'male' else 0)
# NOTE: this rebinds X_test, which previously held the validation split.
X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]

# Open a session, restore the trained weights and predict.
with tf.Session() as sess:
    # Checkpoints written with global_step are named "./mymodel.ckpt-<step>",
    # so a hard-coded "./mymodel.ckpt" may not exist. Ask TensorFlow for the
    # most recently saved checkpoint in this directory and fall back to the
    # plain path only when no checkpoint state file is found.
    checkpoint_path = tf.train.latest_checkpoint('.') or './mymodel.ckpt'
    saver.restore(sess, checkpoint_path)
    # Forward pass; argmax over [Deceased, Survived] gives 1 for "survived".
    probabilities = sess.run(y_pred, feed_dict={X: X_test.values})
    predictions = np.argmax(probabilities, 1)
    # Build the submission table and write it out as CSV.
    submission = pd.DataFrame({"PassengerId": testdata["PassengerId"], "Survived": predictions})
    submission.to_csv("titanic_submission.csv", index=False)

INFO:tensorflow:Restoring parameters from ./mymodel.ckpt
