In [None]:
# -*- coding: utf-8 -*-
import os
import sys
import csv
import time
import json
import datetime
import pickle as pkl
import tensorflow as tf
from tensorflow.contrib import learn
import wandb
wandb.init(project="snliclstm")

import data_helper
from rnn_classifier import rnn_clf
from cnn_classifier import cnn_clf
from clstm_classifier import clstm_clf

# scikit-learn is an optional install but is required here for
# train_test_split; fail early with an installation hint when it is missing.
try:
    from sklearn.model_selection import train_test_split
except ImportError as e:
    error = "Please install scikit-learn."
    print(str(e) + ': ' + error)
    # Exit with a non-zero status so the failure is visible to the caller;
    # a bare sys.exit() would report success.
    sys.exit(1)

# Limit TensorFlow's native (C++) logging through TF_CPP_MIN_LOG_LEVEL.
# NOTE(review): level '2' is generally documented as hiding INFO and WARNING
# messages (errors only) - confirm against the TensorFlow version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Parameters
# =============================================================================
# Every command-line flag is registered as (name, default, help text).

# Model choices
tf.app.flags.DEFINE_string('clf', 'cnn', "Type of classifiers. Default: cnn. You have four choices: [cnn, lstm, blstm, clstm]")

# Data parameters
tf.app.flags.DEFINE_string('data_file', 'D:\\code\\Project_jly\\snli\\data\\train_data.csv', 'Data file path')
tf.app.flags.DEFINE_string('stop_word_file', None, 'Stop word file path')
tf.app.flags.DEFINE_string('language', 'en', "Language of the data file. You have two choices: [ch, en]")
tf.app.flags.DEFINE_integer('min_frequency', 0, 'Minimal word frequency')
tf.app.flags.DEFINE_integer('num_classes', 3, 'Number of classes')
# max_length and vocab_size default to 0 here and are overwritten once the
# vocabulary processor has been built from the data.
tf.app.flags.DEFINE_integer('max_length', 0, 'Max document length')
tf.app.flags.DEFINE_integer('vocab_size', 0, 'Vocabulary size')
# Passed as test_size to train_test_split for the validation hold-out.
tf.app.flags.DEFINE_float('test_size', 0.1, 'Cross validation test size')

# Model hyperparameters
tf.app.flags.DEFINE_integer('embedding_size', 256, 'Word embedding size. For CNN, C-LSTM.')
tf.app.flags.DEFINE_string('filter_sizes', '3, 4, 5', 'CNN filter sizes. For CNN, C-LSTM.')
tf.app.flags.DEFINE_integer('num_filters', 128, 'Number of filters per filter size. For CNN, C-LSTM.')
tf.app.flags.DEFINE_integer('hidden_size', 128, 'Number of hidden units in the LSTM cell. For LSTM, Bi-LSTM')
tf.app.flags.DEFINE_integer('num_layers', 2, 'Number of the LSTM cells. For LSTM, Bi-LSTM, C-LSTM')
# The next three flags apply to every classifier type.
tf.app.flags.DEFINE_float('keep_prob', 0.5, 'Dropout keep probability')
tf.app.flags.DEFINE_float('learning_rate', 1e-3, 'Learning rate')
tf.app.flags.DEFINE_float('l2_reg_lambda', 0.001, 'L2 regularization lambda')

# Training parameters
tf.app.flags.DEFINE_integer('batch_size', 32, 'Batch size')
tf.app.flags.DEFINE_integer('num_epochs', 10, 'Number of epochs')
# decay_rate and decay_steps feed the exponential learning-rate decay.
tf.app.flags.DEFINE_float('decay_rate', 1, 'Learning rate decay rate. Range: (0, 1]')
tf.app.flags.DEFINE_integer('decay_steps', 100000, 'Learning rate decay steps')
tf.app.flags.DEFINE_integer('evaluate_every_steps', 100, 'Evaluate the model on validation set after this many steps')
tf.app.flags.DEFINE_integer('save_every_steps', 1000, 'Save the model after this many steps')
tf.app.flags.DEFINE_integer('num_checkpoint', 10, 'Number of models to store')

# NOTE(review): presumably this dummy flag absorbs the `-f <connection file>`
# argument that a Jupyter kernel passes on argv, which flag parsing would
# otherwise reject as unknown - confirm before removing.
tf.app.flags.DEFINE_string('f', '', 'kernel')

# Shared flag container; each flag defined above is read as FLAGS.<name>.
FLAGS = tf.app.flags.FLAGS

# Tie dependent hyperparameters to the selected classifier.
if FLAGS.clf == 'clstm':
    # hidden size = (number of filter sizes) * (filters per size)
    FLAGS.hidden_size = FLAGS.num_filters * len(FLAGS.filter_sizes.split(","))
elif FLAGS.clf == 'lstm':
    # the plain LSTM uses the hidden size as its embedding size
    FLAGS.embedding_size = FLAGS.hidden_size

# Every run writes its artifacts to ./runs/<unix timestamp>/
timestamp = str(int(time.time()))
outdir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
if not os.path.exists(outdir):
    os.makedirs(outdir)

# Load and save data
# =============================================================================
# data_helper.load_data is a project-local helper; it returns the encoded
# documents, their labels, their lengths and the vocabulary processor used
# to encode them.
data, labels, lengths, vocab_processor = data_helper.load_data(
    file_path=FLAGS.data_file,
    sw_path=FLAGS.stop_word_file,
    min_frequency=FLAGS.min_frequency,
    max_length=FLAGS.max_length,
    language=FLAGS.language,
    shuffle=True)

# Save the vocabulary processor alongside the other artifacts of this run
vocab_processor.save(os.path.join(outdir, 'vocab'))

# vocab_size and max_length are only known once the data has been processed,
# so they are written back onto FLAGS here.
# len() on the vocabulary replaces reading its private `_mapping` attribute.
FLAGS.vocab_size = len(vocab_processor.vocabulary_)
FLAGS.max_length = vocab_processor.max_document_length

# Snapshot all flag values (including the vocab_size / max_length set from
# the data) as a plain dict.
params = FLAGS.flag_values_dict()

# Keep only the entries that are meaningful for the selected classifier.
model = params['clf']
if model == 'cnn':
    del params['hidden_size']
    del params['num_layers']
elif model in ('lstm', 'blstm'):
    del params['num_filters']
    del params['filter_sizes']
    params['embedding_size'] = params['hidden_size']
elif model == 'clstm':
    # int() also validates that every filter size is numeric
    params['hidden_size'] = len([int(size) for size in params['filter_sizes'].split(",")]) * params['num_filters']

# Print parameters, sorted by name (dict keys are unique, so sorting the
# (name, value) pairs never has to compare values)
params_dict = sorted(params.items())
print('Parameters:')
for item in params_dict:
    print('{}: {}'.format(item[0], item[1]))
print('')

# Save parameters to file; the context manager closes the file even if
# pickling raises. The positional True is kept from the original call and
# is interpreted by pickle as protocol 1.
with open(os.path.join(outdir, 'params.pkl'), 'wb') as params_file:
    pkl.dump(params, params_file, True)


# Simple hold-out validation: scikit-learn's train_test_split divides the
# documents, labels and lengths returned by data_helper.load_data with the
# same split, using a fixed seed so the split is reproducible.
(x_train, x_valid,
 y_train, y_valid,
 train_lengths, valid_lengths) = train_test_split(
    data, labels, lengths,
    random_state=22,
    test_size=FLAGS.test_size)

# Batch iterator over the training portion
train_data = data_helper.batch_iter(
    x_train, y_train, train_lengths,
    FLAGS.batch_size, FLAGS.num_epochs)

with tf.Graph().as_default():
    with tf.Session() as sess:
        # Build the classifier selected by the `clf` flag. The constructors
        # only receive FLAGS; data is supplied later through feed_dict.
        if FLAGS.clf == 'cnn':
            classifier = cnn_clf(FLAGS)
        elif FLAGS.clf == 'lstm' or FLAGS.clf == 'blstm':
            classifier = rnn_clf(FLAGS)
        elif FLAGS.clf == 'clstm':
            classifier = clstm_clf(FLAGS)
        else:
            raise ValueError('clf should be one of [cnn, lstm, blstm, clstm]')

        # Train procedure
        # global_step is a non-trainable counter; it is passed to
        # apply_gradients below and also drives the learning-rate decay.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        # Learning rate decay (with the default decay_rate of 1 the rate
        # stays at its starting value)
        starter_learning_rate = FLAGS.learning_rate
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   global_step,
                                                   FLAGS.decay_steps,
                                                   FLAGS.decay_rate,
                                                   staircase=True)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(classifier.cost)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Summaries: scalar loss and accuracy, viewable with
        # `tensorboard --logdir=<outdir>/summaries`
        loss_summary = tf.summary.scalar('Loss', classifier.cost)
        accuracy_summary = tf.summary.scalar('Accuracy', classifier.accuracy)

        # Train summary: merge_all() bundles the summaries defined above
        train_summary_op = tf.summary.merge_all()
        train_summary_dir = os.path.join(outdir, 'summaries', 'train')
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Validation summary: same merged op, written to a separate directory
        valid_summary_op = tf.summary.merge_all()
        valid_summary_dir = os.path.join(outdir, 'summaries', 'valid')
        valid_summary_writer = tf.summary.FileWriter(valid_summary_dir, sess.graph)

        # Saver writes checkpoints during training; max_to_keep limits how
        # many of the most recent checkpoint files are retained.
        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoint)

        # Initialize all global variables before the first training step
        sess.run(tf.global_variables_initializer())


        def run_step(input_data, is_training=True):
            """Run one training or validation step and log its metrics.

            Args:
                input_data: 3-tuple ``(input_x, input_y, sequence_length)``.
                is_training: when True, the optimizer update is run, the
                    dropout keep probability is taken from FLAGS.keep_prob
                    and summaries go to the train writer; when False, no
                    update is run, keep probability is 1.0 and summaries go
                    to the validation writer.

            Returns:
                The accuracy value fetched for this step.
            """
            input_x, input_y, sequence_length = input_data

            # Tensors to fetch from the graph in a single sess.run call
            fetches = {'step': global_step,
                       'cost': classifier.cost,
                       'accuracy': classifier.accuracy,
                       'learning_rate': learning_rate}

            # Placeholder values common to every classifier type
            feed_dict = {classifier.input_x: input_x,
                         classifier.input_y: input_y}

            # The non-CNN classifiers additionally take the batch size and
            # the sequence lengths, and expose a final state.
            if FLAGS.clf != 'cnn':
                fetches['final_state'] = classifier.final_state
                feed_dict[classifier.batch_size] = len(input_x)
                feed_dict[classifier.sequence_length] = sequence_length

            if is_training:
                fetches['train_op'] = train_op
                fetches['summaries'] = train_summary_op
                feed_dict[classifier.keep_prob] = FLAGS.keep_prob
            else:
                fetches['summaries'] = valid_summary_op
                feed_dict[classifier.keep_prob] = 1.0

            # sess.run returns a dict with the same keys as `fetches`
            # (named `results` so the builtin `vars` is not shadowed)
            results = sess.run(fetches, feed_dict)
            step = results['step']
            cost = results['cost']
            accuracy = results['accuracy']
            summaries = results['summaries']

            # Write summaries to file
            if is_training:
                train_summary_writer.add_summary(summaries, step)
            else:
                valid_summary_writer.add_summary(summaries, step)

            time_str = datetime.datetime.now().isoformat()
            print("{}: step: {}, loss: {:g}, accuracy: {:g}".format(time_str, step, cost, accuracy))

            return accuracy


        print('Start training ...')

        try:
            for train_input in train_data:
                run_step(train_input, is_training=True)
                # Read the current value of the global step counter
                current_step = tf.train.global_step(sess, global_step)

                # Periodically evaluate on the whole validation split
                # (fed as a single batch)
                if current_step % FLAGS.evaluate_every_steps == 0:
                    print('\nValidation')
                    run_step((x_valid, y_valid, valid_lengths), is_training=False)
                    print('')
                # Periodically write a checkpoint
                if current_step % FLAGS.save_every_steps == 0:
                    save_path = saver.save(sess, os.path.join(outdir, 'model/clf'), current_step)
        finally:
            # Close the writers so buffered summary events are flushed to
            # disk even when training is interrupted.
            train_summary_writer.close()
            valid_summary_writer.close()

        print('\nAll the files have been saved to {}\n'.format(outdir))


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
wandb: Currently logged in as: bamboo912 (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.28 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Building dataset ...
0
1
[]
[]
[2 0 2 ... 0 2 0]
[16 15 14 ... 22 47 11]
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
549361
[[  1   2   3 ...   0   0   0]
 [ 14  15   7 ...   0   0   0]
 [ 12  25  23 ...   0   0   0]
 ...
 [  1  32   9 ...   0   0   0]
 [ 14 963   5 ...   0   0   0]
 [  1  92  26 ...   0   0   0]]
[[   1  184   88 ...    0    0    0]
 [2592 1250    1 ...    0    0    0]
 [   1 1181   19 ...    0    0    0]
 ...
 [   1   32  528 ...    0    0    0]
 [   1    5   91 ...    0    0    0]
 [   1   51   92 ...    0    0    0]]
[2 2 2 ... 1 1 2]
[24 24 17 ... 12 14 16]
<tensorflow.contrib.learn.python.learn.preprocessing.text.VocabularyProcessor object at 0x000001C28F481A90>
Dataset has been built successfully.
Run time: 42.925567388534546
Number of sentences: 549361
Vocabulary size: 39606
Max document length:

2021-05-02T12:06:38.732069: step: 60, loss: 3.92238, accuracy: 0.3125
2021-05-02T12:06:38.892967: step: 61, loss: 3.10083, accuracy: 0.53125
2021-05-02T12:06:39.057864: step: 62, loss: 5.50546, accuracy: 0.28125
2021-05-02T12:06:39.223758: step: 63, loss: 4.07832, accuracy: 0.3125
2021-05-02T12:06:39.394651: step: 64, loss: 4.83514, accuracy: 0.46875
2021-05-02T12:06:39.558546: step: 65, loss: 3.70266, accuracy: 0.40625
2021-05-02T12:06:39.725440: step: 66, loss: 3.42017, accuracy: 0.40625
2021-05-02T12:06:39.881341: step: 67, loss: 4.54962, accuracy: 0.3125
2021-05-02T12:06:40.044238: step: 68, loss: 2.92068, accuracy: 0.40625
2021-05-02T12:06:40.203138: step: 69, loss: 3.32219, accuracy: 0.40625
2021-05-02T12:06:40.374030: step: 70, loss: 4.53483, accuracy: 0.40625
2021-05-02T12:06:40.569906: step: 71, loss: 5.13761, accuracy: 0.3125
2021-05-02T12:06:40.748792: step: 72, loss: 3.17266, accuracy: 0.40625
2021-05-02T12:06:40.917685: step: 73, loss: 2.45354, accuracy: 0.46875
2021-05-02