1.处理数据

In [1]:
import tensorflow as tf
import pandas as pd
import re

# 从CSV文件中读入数据
data = pd.read_csv('../../../../dataset/titanic/train.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [15]:
# 取部分特征字段用于分类，清洗Age等数据
data['Sex'] = data['Sex'].apply(lambda s: 1 if s == 'male' else 0)

mean_age = data["Age"].mean()
data.loc[data.Age.isnull(), "Age"] = mean_age

def get_title(name):
    if(pd.isnull(name)):
        return 'Null'
    title_search = re.search('([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1).lower()
    else:
        return 'None'
    
titles = {'mr': 1,
          'mrs': 2, 'mme': 2,
          'ms': 3, 'miss': 3, 'mlle': 3,
          'don': 4, 'sir': 4, 'jonkheer': 4,
          'major': 4, 'col': 4, 'dr': 4, 'master': 4, 'capt': 4,
          'dona': 5, 'lady': 5, 'countness': 5,
          'rev': 6}
data['Title'] = data['Name'].apply(lambda name: titles.get(get_title(name)))
data['Honor'] = data['Title'].apply(lambda title: 1 if title == 4 or title == 5 else 0)

data = data.fillna(0)

In [16]:
# 两种分类分别是幸存和死亡，‘Survived’字段是其中一种分类的标签，
# 新增‘Deceased’字段表示第二段分类的标签，取值为‘Survived’字段取非
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased', 'Survived']]
dataset_Y = dataset_Y.as_matrix()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
Title          891 non-null float64
Honor          891 non-null int64
Deceased       891 non-null int64
dtypes: float64(3), int64(8), object(4)
memory usage: 104.5+ KB


2.使用TFRecord格式存储数据

In [17]:
# 将train.csv转化为train.tfrecords
def transform_to_tfrecord(data, tfrecord_file):    
    def int_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    
    def float_feature(value):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    
    writer = tf.python_io.TFRecordWriter(tfrecord_file)
    for i in range(len(data)):
        features = tf.train.Features(feature={
                'Age': float_feature(data['Age'][i]),
                'Survived': int_feature(data['Survived'][i]),
                'Pclass': int_feature(data['Pclass'][i]),
                'Parch': int_feature(data['Parch'][i]),
                'SibSp': int_feature(data['SibSp'][i]),
                'Sex': int_feature(data['Sex'][i]),
                'Fare': float_feature(data['Fare'][i])})
        example = tf.train.Example(features=features)
        writer.write(example.SerializeToString())
    writer.close()
    
transform_to_tfrecord(data, '../../../../dataset/titanic/train.tfrecords')

3.多线程方式读取TFRecord格式数据

In [18]:
def read_and_decode(train_files, num_threads=2, num_epochs=100,
                    batch_size=10, min_after_dequeue=10):
    # read data from trainFile with TFRecord format
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer(train_files, num_epochs=num_epochs)
    _, serialized_example = reader.read(filename_queue)
    featuresdict = tf.parse_single_example(serialized_example, features={
            'Age': tf.FixedLenFeature([], tf.float32),
            'Survived': tf.FixedLenFeature([], tf.int64),
            'Pclass': tf.FixedLenFeature([], tf.int64),
            'Parch': tf.FixedLenFeature([], tf.int64),
            'SibSp': tf.FixedLenFeature([], tf.int64),
            'Sex': tf.FixedLenFeature([], tf.int64),
            'Fare': tf.FixedLenFeature([], tf.float32)})
    
    # decode features to same format of float32
    labels = featuresdict.pop('Survived')
    features = [tf.cast(value, tf.float32) for value in featuresdict.values()]
    
    # get data with shuffle hatch and return
    features, labels = tf.train.shuffle_batch(
        [features, labels],
        batch_size=batch_size,
        num_threads=num_threads,
        capacity=min_after_dequeue + 3 * batch_size,
        min_after_dequeue=min_after_dequeue)
    return features, labels

def train_with_queuerunner():
    x, y = read_and_decode(['../../../../dataset/titanic/train.tfrecords'])
    
    with tf.Session() as sess:
        tf.group(tf.global_variables_initializer(),
                tf.local_variables_initializer()).run()
        
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        
        try:
            step = 0
            while not coord.should_stop():
                # Run training steps or whatever
                features, labels = sess.run([x, y])
                if(step%100 == 0):
                    print('step %d:' % step, labels)
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            # when done, ask the thread to stop.
            coord.request_stop()
            
        # wait for threads to finish.
        coord.join(threads)
        
train_with_queuerunner()

step 0: [0 1 1 0 1 1 0 0 0 0]
step 100: [0 0 0 0 0 0 0 0 0 0]
step 200: [0 1 0 1 0 0 0 0 0 0]
step 300: [1 0 1 1 0 0 1 0 1 0]
step 400: [1 1 1 1 1 1 0 1 0 0]
step 500: [1 1 1 1 0 0 1 1 0 1]
step 600: [0 1 0 0 0 1 1 0 1 1]
step 700: [0 1 0 0 0 0 0 1 0 0]
step 800: [1 0 0 0 1 0 0 1 0 0]
step 900: [0 0 0 1 0 0 0 0 1 0]
step 1000: [1 1 0 0 1 1 0 0 0 1]
step 1100: [0 1 0 0 1 1 0 0 1 1]
step 1200: [0 0 0 0 0 0 1 0 1 0]
step 1300: [0 0 0 1 0 0 1 0 1 1]
step 1400: [1 0 0 1 0 0 1 0 0 0]
step 1500: [1 0 1 0 1 0 0 0 0 0]
step 1600: [1 0 1 1 0 0 1 1 1 0]
step 1700: [1 1 0 0 0 0 1 1 0 0]
step 1800: [0 1 1 0 0 1 0 1 1 0]
step 1900: [0 0 1 0 1 1 1 1 1 0]
step 2000: [0 0 1 0 0 0 0 0 1 1]
step 2100: [1 0 0 0 0 0 1 0 1 0]
step 2200: [0 0 1 0 0 0 1 0 0 0]
step 2300: [0 1 0 1 0 0 0 0 1 0]
step 2400: [0 0 0 1 0 1 0 0 0 1]
step 2500: [0 1 1 0 0 0 1 1 0 0]
step 2600: [0 0 0 0 0 0 0 0 0 1]
step 2700: [0 1 0 1 1 0 1 0 1 0]
step 2800: [1 1 1 1 0 1 0 0 0 1]
step 2900: [1 0 0 0 0 0 0 0 0 0]
step 3000: [0 1 1 0 0 

4.建立模型

In [38]:
import numpy as np

tf.reset_default_graph()

# 声明输入数据占位符
# shape参数的第一个元素为None，表示可以同时放入任意条记录
with tf.name_scope('input'):
    X = tf.placeholder(tf.float32, shape=[None, 7], name='input_x')
    y = tf.placeholder(tf.float32, shape=[None, 2], name='input_y')

with tf.name_scope('classifier'):
    # 声明变量
    W = tf.Variable(tf.random_normal([7, 2]), name='weights')
    b = tf.Variable(tf.zeros([2]), name='bias')
    y_pred = tf.nn.softmax(tf.matmul(X, W) + b)
    
    # 添加直方图参数概要记录算子
    tf.summary.histogram('weights', W)
    tf.summary.histogram('bias',b)

with tf.name_scope('cost'):
    # 使用交叉熵作为代价函数
    cross_entropy = - tf.reduce_sum(y * tf.log(y_pred + 1e-10))
    # 批量样本的代价值为所有样本交叉熵的平均值
    cost = tf.reduce_mean(cross_entropy)
    # 添加损失函数标量概要
    tf.summary.scalar('loss', cost)

# 使用随机梯度下降算法优化器来最小化代价，系统自动构建反向传播部分的计算图
train_op = tf.train.GradientDescentOptimizer(0.0002).minimize(cost)

with tf.name_scope('accuracy'):
    correct_pred = tf.equal(tf.argmax(y, 1), tf.argmax(y_pred, 1))
    acc_op = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.summary.scalar('accuracy', acc_op)

# 保存模型
saver = tf.train.Saver()

with tf.Session() as sess:
    # 创建概要写入操作
    # Tensorboard可通过命令‘tensorboard --logdir=./logs’来启动
    writer = tf.summary.FileWriter('./logs', sess.graph)
    merged = tf.summary.merge_all()
    
    # 初始化所有变量，必须最先执行
    tf.global_variables_initializer().run()
    # 以下为训练迭代，迭代10轮
    for epoch in range(100):
        total_loss = 0.
        for i in range(len(X_train)):
            # 通过session.run接口触发运行
            _, loss = sess.run([train_op, cost], feed_dict = {X: [X_train[i]], y: [y_train[i]]})
            total_loss += loss
        
        summary, accuracy = sess.run([merged, acc_op], feed_dict = {X: X_train, y: y_train})
        writer.add_summary(summary, epoch)
        
        if(epoch%10 == 0):
            print('Epoch: %04d, total loss = %.9f' % (epoch, total_loss))
            saver.save(sess,"./mymodel.ckpt", global_step=epoch)
    
    writer.close()
    saver.save(sess,"./mymodel.ckpt")
    print('Training complete!')
    
    # 评估校验数据集上的准确率
    pred = sess.run(y_pred, feed_dict={X: X_test})
    correct = np.equal(np.argmax(pred, 1), np.argmax(y_test, 1))
    accuracy = np.mean(correct.astype(np.float32))
    print("Accuracy on validation set: %.9f" % accuracy)

Epoch: 0000, total loss = 4713.340104389
Epoch: 0010, total loss = 3237.428724157
Epoch: 0020, total loss = 845.970993774
Epoch: 0030, total loss = 661.420603499
Epoch: 0040, total loss = 585.708687306
Epoch: 0050, total loss = 539.824730469
Epoch: 0060, total loss = 506.740099301
Epoch: 0070, total loss = 483.490554612
Epoch: 0080, total loss = 467.280953822
Epoch: 0090, total loss = 455.878746660
Training complete!
Accuracy on validation set: 0.670391083


In [46]:
# 读入测试数据集并完成预处理
testdata = pd.read_csv('../../../../dataset/titanic/test.csv')

mean_age = testdata["Age"].mean()
testdata.loc[testdata.Age.isnull(), "Age"] = mean_age

testdata = testdata.fillna(0)
testdata['Sex'] = testdata['Sex'].apply(lambda s:1 if s == 'male' else 0)

testdata['Title'] = testdata['Name'].apply(lambda name: titles.get(get_title(name)))
testdata['Honor'] = testdata['Title'].apply(lambda title: 1 if title == 4 or title == 5 else 0)

X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Honor']]

# 开启session进行预测
with tf.Session() as sess:
    saver.restore(sess, './mymodel.ckpt')
    # 正向传播计算
    predictions = np.argmax(sess.run(y_pred, feed_dict={X: X_test}), 1)
    # 构建提交结果的数据结构，并将结果存为csv文件
    submission = pd.DataFrame({"PassengerId": testdata["PassengerId"], "Survived": predictions})
    submission.to_csv("../../../../dataset/titanic/titanic_submission.csv", index=False)

INFO:tensorflow:Restoring parameters from ./mymodel.ckpt
