In [1]:
# Author DYF :2019-10-16-15:03
import os
import sys
import json
import time
import shutil
import pickle
import logging
#import data_helper
import numpy as np
import pandas as pd
import Data_preprocess
import tensorflow as tf
from text_cnn_rnn import TextCNNRNN
from sklearn.model_selection import train_test_split

# Lower the root logger threshold to INFO so the logging.info() progress
# messages emitted during training are not filtered out.
logging.root.setLevel(logging.INFO)

In [None]:
def train_model():
    """Train a TextCNNRNN classifier on 'toxic_comments.csv' and save its artifacts.

    Workflow:
      1. Load the data set and the word embeddings through ``Data_preprocess``.
      2. Hold out 10% of the data as a test set, then 10% of the remainder
         as a dev set.
      3. Train with RMSProp (learning rate 1e-3, decay 0.9). Every
         ``evaluate_every`` steps the dev set is scored and a checkpoint is
         written whenever dev accuracy is at least as good as the best so far.
      4. Restore the best checkpoint, export it as ``best_model.ckpt`` and
         report accuracy on the test set.
      5. Dump the vocabulary, embedding matrix, labels and hyper-parameters
         next to the exported model.

    Outputs are written to ``./trained_results_<timestamp>/`` and
    ``./checkpoints_<timestamp>/``. The function takes no arguments and
    returns ``None``.
    """
    datafile = 'toxic_comments.csv'

    # x_: data set, y_: labels, vocabulary: words and their indices,
    # vocabulary_inv: iterated below to build the embedding matrix row by row,
    # df: pandas data, labels: label names.
    x_, y_, vocabulary, vocabulary_inv, df, labels = Data_preprocess.load_data(datafile)
    params = {
        "batch_size": 128,
        "dropout_keep_prob": 0.5,
        # NOTE(review): this must equal the width of the vectors returned by
        # Data_preprocess.load_embeddings — confirm 149998 is really intended.
        "embedding_dim": 149998,
        "evaluate_every": 200,
        "filter_sizes": "3,4,5",
        "hidden_unit": 300,
        "l2_reg_lambda": 0.0,
        "max_pool_size": 4,
        "non_static": False,
        "num_epochs": 1,
        "num_filters": 32,
    }

    # Look up one embedding vector per word, in vocabulary_inv order.
    word_embeddings = Data_preprocess.load_embeddings(vocabulary)
    embedding_mat = np.array(
        [word_embeddings[word] for word in vocabulary_inv], dtype=np.float32)

    # Split the original data set into a train set and a test set.
    # (Author's note: y_ here only covers one category — the first one, "toxic".)
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set again into a train set and a dev (validation) set.
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    # Log shapes only; printing the full arrays floods the notebook output.
    logging.info('x_train shape: {}, y_train shape: {}'.format(x_train.shape, y_train.shape))
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to this training run is saved in it.
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # Using the session as a context manager installs it as the default
        # session and also closes it when the block exits (even on error).
        with tf.Session(config=session_conf) as sess:
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                # A real list, not a one-shot Python 3 `map` iterator, so the
                # model may iterate over it or take its length more than once.
                filter_sizes=[int(size) for size in params['filter_sizes'].split(",")],
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)  # optimizer
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files are saved in this directory during training.
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                """Return, per row, ceil(position of first 0 / max_pool_size).

                A 0 sentinel is appended so a row without any 0 yields its
                full length. `list(batch) + [0]` is used because `batch + [0]`
                on a NumPy row is an elementwise add, not an append.
                Assumes index 0 is the padding id — TODO confirm against
                Data_preprocess.
                """
                return [np.ceil(np.argmin(list(batch) + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def make_feed(x_batch, y_batch, keep_prob):
                """Build the feed dict shared by the training and evaluation steps."""
                return {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: keep_prob,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }

            def train_step(x_batch, y_batch):
                """Run one optimisation step with dropout enabled."""
                sess.run(train_op, make_feed(x_batch, y_batch, params['dropout_keep_prob']))

            def dev_step(x_batch, y_batch):
                """Score one batch with dropout disabled.

                Returns (accuracy, loss, num_correct, predictions).
                """
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                     cnn_rnn.num_correct, cnn_rnn.predictions],
                    make_feed(x_batch, y_batch, 1.0))
                return accuracy, loss, num_correct, predictions

            def count_correct(x_data, y_data):
                """Return the number of correct predictions over one pass of the data."""
                batches = Data_preprocess.batch_iter(
                    list(zip(x_data, y_data)), params['batch_size'], 1)
                total_correct = 0
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    _, _, num_correct, _ = dev_step(x_batch, y_batch)
                    total_correct += int(num_correct)
                return total_correct

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here.
            train_batches = Data_preprocess.batch_iter(
                list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train.
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev.
                if current_step % params['evaluate_every'] == 0:
                    accuracy = float(count_correct(x_dev, y_dev)) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Restore the best checkpoint BEFORE exporting, so that
            # best_model.ckpt really holds the best weights and not simply
            # the weights of the last training step.
            if best_at_step > 0:
                saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            else:
                # Training ended before the first dev evaluation, so no
                # checkpoint exists to restore; keep the final weights.
                logging.warning('No checkpoint was written during training; using the final weights')

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test. Batch order does not affect the
            # total number of correct predictions, so the same three-argument
            # batch_iter call as for the dev set is used.
            total_test_correct = count_correct(x_test, y_test)
            logging.critical('Accuracy on test set: {}'.format(float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them.
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)

if __name__ == '__main__':
    # Entry point: train_model() takes no arguments. The data file name and
    # the hyper-parameters are hard-coded inside it, so no command-line
    # paths are read here.
    train_model()

            
            
            
            
            
            
            
    

cilumn_sum 1405
xraw [['explanation', '+', 'why', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted', 'they', 'were', 'not', 'vandalisms', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fac', 'and', 'please', 'do', 'not', 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', 'i', 'am', 'retired', 'now', '89', '205', '38', '27'], ['d', 'aww', '!', 'he', 'matches', 'this', 'background', 'colour', 'i', 'am', 'seemingly', 'stuck', 'with', 'thanks', 'talk', '21', ':', '51', 'january', '11', '2016', 'utc', ''], ['hey', 'man', 'i', 'am', 'really', 'not', 'trying', 'to', 'edit', 'war', 'it', 'just', 'that', 'this', 'guy', 'is', 'constantly', 'removing', 'relevant', 'information', 'and', 'talking', 'to', 'me', 'through', 'edits', 'instead', 'of', 'my', 'talk', 'page', 'he', 'seems', 'to', 'care', 'more', 'about', 'the', 'formatting', 'than', 'the', 'actual', 'info', ''], ['', '+', 'm

E1016 19:02:43.921569  7272 Data_preprocess.py:128] CRITICAL - This is prediction, reading the trained sequence length


y_raw [[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0]]


E1016 19:02:43.925043  7272 Data_preprocess.py:130] CRITICAL - The maximum length is 1000
I1016 19:02:43.949848  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:43.950836  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:43.974612  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:43.981093  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:43.984036  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:43.995979  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:43.997924  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer t

I1016 19:02:45.158069  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:45.176916  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:45.208659  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:45.248367  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:45.292516  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:45.323235  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:45.324753  7272 Data_preprocess.py:138] This sentence has to be cut off because it is longer than trained sequence length
I1016 19:02:45.329694  7272 Data_preprocess.py:138] This sente

