In [1]:
import sys
from google.colab import drive
# Mount Google Drive at /content/drive; every project path used further down
# in this notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ! cp -r /content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/data/data/. /content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/data

In [3]:
# Add the project's src directory to the import path (the copyattention,
# input_data and feed_dicts modules imported in the next cell presumably
# live there -- verify against the repository layout).
sys.path.append('/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/src')

In [4]:
%tensorflow_version 1.x
from __future__ import division

import time
import numpy as np
import os
import json
import tensorflow as tf

from pprint import pprint
from copyattention import CopyAttention
from input_data import DataSet
from input_data import setup

from feed_dicts import placeholder_inputs
from feed_dicts import fill_feed_dict
from feed_dicts import placeholder_inputs_single
from feed_dicts import fill_feed_dict_single
from feed_dicts import do_eval

TensorFlow 1.x selected.


In [5]:
# os.chdir('/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/datas/rlebret-wikipedia-biography-dataset-d0d6c78')
# os.getcwd()
# !cat wikipedia-biography-dataset.z?? > tmp.zip
# !unzip tmp.zip
# !rm tmp.zip

In [6]:
# Make the project root the working directory for the rest of the notebook.
os.chdir('/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master')

## Hyperparameters

In [7]:
# Command-line style configuration for the whole notebook. The value in
# square brackets at the end of each help string is the flag's default.
flags = tf.app.flags

# Model parameters
# The help text of "n" previously advertised [11] although the default is 14.
flags.DEFINE_integer("n", 14, "n-gram model parameter [14]")
flags.DEFINE_integer("d", 64, "Dimension of word embeddings [64]")
flags.DEFINE_integer("g", 128, "Dimension of global embedding [128]")
flags.DEFINE_integer("nhu", 256, "Number of hidden units [256]")
flags.DEFINE_integer("l", 10, "Max number of words per field [10]")
flags.DEFINE_float("learning_rate", 0.0025, "Learning rate parameter [0.0025]")

# Dataset related parameters
flags.DEFINE_integer("max_fields", 10, "Maximum number of fields in an infobox [10]")
flags.DEFINE_integer("word_max_fields", 10, "Maximum of fields a word from an infobox can appear in [10]")
flags.DEFINE_integer("nW", 20000, "Size of the sentence vocabulary") # 20000
flags.DEFINE_integer("min_field_freq", 100, "Minimum frequency of occurance of a field [100]")

# Temporary flags - To be fixed
flags.DEFINE_integer("nQ", 20000, "Size of the table vocabulary") # 20000
flags.DEFINE_integer("nQpr", 1000, "Dummy") # 1000

# Experiment parameters
flags.DEFINE_integer("num_epochs", 10, "Number of epochs [10]")
flags.DEFINE_integer("batch_size", 32, "Batch size for SGD [32]")
flags.DEFINE_integer("print_every", 100, "Print out the training loss every #steps [100]")
flags.DEFINE_integer("sample_every", 1000, "Sample sentences every #steps [1000]")
flags.DEFINE_integer("test_every", 1000, "Test after every #steps [1000]")
flags.DEFINE_integer("valid_every", 1000, "Validate after every #steps [1000]")
flags.DEFINE_string("data_dir", '/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/data', "Path to the data directory [../data]")
flags.DEFINE_string("checkpoint_dir", '/content/drive/MyDrive/GM5/DEEP/Paper/checkpoint', "Directory to save checkpoints")
flags.DEFINE_string("experiment_dir", '/content/drive/MyDrive/GM5/DEEP/Paper/experiment', "Directory to store current experiment results")
# Handle through which the rest of the notebook reads the flag values.
FLAGS = flags.FLAGS

In [8]:
### Handle the JSONEncoder problem: custom encoder for objects that json cannot serialize natively
from json import JSONEncoder
class MyEncoder(JSONEncoder):
    """JSON encoder that serializes arbitrary objects through their ``__dict__``.

    Objects the stock encoder cannot handle are replaced by their attribute
    dictionary. Objects that have no ``__dict__`` are passed on to the base
    class, which raises ``TypeError`` as the ``json`` protocol requires.
    """

    def default(self, o):
        """Return a JSON-serializable stand-in for ``o``.

        Args:
            o: Object the standard encoder could not serialize.

        Returns:
            The attribute dictionary ``o.__dict__``.

        Raises:
            TypeError: If ``o`` has no ``__dict__`` (raised by the base class).
        """
        try:
            return o.__dict__
        except AttributeError:
            # No attribute dictionary: let the base class report the
            # unsupported type with the standard TypeError.
            return JSONEncoder.default(self, o)

# Encode the current flag values with the custom encoder; as the last
# expression of the cell, the resulting JSON string is displayed.
MyEncoder().encode(flags.FLAGS.__flags)



## Fonction implémentée pour calculer BLEU

In [9]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu

def split(sentence):
    """Tokenise ``sentence`` on runs of whitespace and return the token list."""
    tokens = sentence.split()
    return tokens

def compute_BLEU(sentence, ref):
    """Score ``sentence`` against the single reference ``ref`` with sentence-level BLEU.

    Both arguments are strings that get tokenised on whitespace. For each of
    the bracket placeholders '-lrb-' and '-rrb-' that occurs in the candidate
    string, the placeholder is appended to the reference tokens before
    scoring.

    Args:
        sentence: Candidate (generated) sentence.
        ref: Reference sentence.

    Returns:
        The value returned by ``sentence_bleu`` for the candidate tokens
        against the (possibly extended) reference tokens.
    """
    reference_tokens = split(ref)
    # Substring test on the raw candidate string, not on its token list.
    for bracket in ('-lrb-', '-rrb-'):
        if bracket in sentence:
            reference_tokens.append(bracket)

    candidate_tokens = split(sentence)
    return sentence_bleu([reference_tokens], candidate_tokens)

## Première version de main
<br>
Nous ne calculons ni le BLEU ni la perplexité. Nous enregistrons les pertes d'entraînement et de validation dans des fichiers.

In [10]:
def main_v1(_):
    """Train CopyAttention and log the training / validation loss per epoch.

    First version of the training driver: neither BLEU nor perplexity is
    computed. Each call

      * creates a new numbered run directory under FLAGS.experiment_dir and
        under FLAGS.checkpoint_dir and dumps the flag values to params.json
        in both;
      * builds the train / valid / test DataSet objects and the model graph;
      * runs FLAGS.num_epochs epochs of int(num_examples() / 100) training
        steps each, appending one generated sentence to sample<epoch>.txt
        after every step except step 0;
      * after each epoch appends the mean training loss to train_loss.txt,
        saves a checkpoint and appends the validation loss to valid_loss.txt.

    Args:
        _: Ignored (presumably the argv list when launched through
            tf.app.run -- the call site is not shown here).

    Raises:
        OSError: If the run directory for the chosen run number already
            exists under FLAGS.checkpoint_dir.
    """
    #### Pick the run number: one past the highest numbered run directory.
    if not os.path.exists(FLAGS.experiment_dir):
        os.makedirs(FLAGS.experiment_dir)
    previous_runs = []
    for entry in os.listdir(FLAGS.experiment_dir):
        try:
            previous_runs.append(int(entry))
        except ValueError:
            # Not a run directory (e.g. '.ipynb_checkpoints'); ignore it.
            continue
    # A missing or empty experiment directory starts at run 1.
    expt_num = str(max(previous_runs) + 1) if previous_runs else "1"
    expt_result_path = os.path.join(FLAGS.experiment_dir, expt_num)
    os.makedirs(expt_result_path)
    ####

    #### Matching run directory for the checkpoints
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    chkpt_result_path = os.path.join(FLAGS.checkpoint_dir, expt_num)
    os.makedirs(chkpt_result_path)
    ####

    #### Record the flag values in params.json in both run directories
    params_e_path = os.path.join(expt_result_path, "params.json")
    params_c_path = os.path.join(chkpt_result_path, "params.json")
    with open(params_e_path, 'w') as params_e, open(params_c_path, 'w') as params_c:
        json.dump(flags.FLAGS.__flags, params_e, cls=MyEncoder)
        json.dump(flags.FLAGS.__flags, params_c, cls=MyEncoder)
    ####

    #### Generate the indexes used to create the train/valid/test dataset objects
    word2idx, field2idx, qword2idx, nF, max_words_in_table, word_set = \
        setup(FLAGS.data_dir, '/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/data/embeddings', FLAGS.n, FLAGS.batch_size, FLAGS.nW, FLAGS.min_field_freq, FLAGS.nQ)

    train_dataset = DataSet(FLAGS.data_dir, 'train', FLAGS.n, FLAGS.nW, nF,
                            FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                            field2idx, qword2idx,
                            FLAGS.max_fields, FLAGS.word_max_fields,
                            max_words_in_table, word_set)

    # Only 1% of the training set is used per epoch.
    num_train_examples = int(train_dataset.num_examples()/100)

    valid_dataset = DataSet(FLAGS.data_dir, 'valid', FLAGS.n, FLAGS.nW, nF,
                            FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                            field2idx, qword2idx,
                            FLAGS.max_fields, FLAGS.word_max_fields,
                            max_words_in_table, word_set)

    test_dataset = DataSet(FLAGS.data_dir, 'test', FLAGS.n, FLAGS.nW, nF,
                           FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                           field2idx, qword2idx,
                           FLAGS.max_fields, FLAGS.word_max_fields,
                           max_words_in_table, word_set)

    print('num_train_examples: ', num_train_examples)
    print('num_valid_examples: ', int(.1*valid_dataset.num_examples()))
    print('num_test_examples: ',int(.1*test_dataset.num_examples()))
    ####

    #### The sizes of respective conditioning variables for placeholder generation
    context_size = (FLAGS.n - 1)
    zp_size = context_size * FLAGS.word_max_fields
    zm_size = context_size * FLAGS.word_max_fields
    gf_size = FLAGS.max_fields
    gw_size = max_words_in_table
    copy_size = FLAGS.word_max_fields
    proj_size = FLAGS.nW + max_words_in_table
    ####

    # Upper bound on decoding attempts (and therefore tokens) per sample.
    max_sample_len = 50

    #### Generate the TensorFlow graph
    with tf.Graph().as_default():

        ## Create the CopyAttention model
        start_c = time.time()
        model = CopyAttention(FLAGS.n, FLAGS.d, FLAGS.g, FLAGS.nhu,
                              FLAGS.nW, nF, FLAGS.nQ, FLAGS.l,
                              FLAGS.learning_rate, max_words_in_table,
                              FLAGS.max_fields, FLAGS.word_max_fields)
        duration_c = time.time() - start_c
        print("======= CopyAttention model done in %.3f minutes. ======="%(duration_c/60))
        ##

        ## Placeholders for train and validation (batched)
        context_pl, zp_pl, zm_pl, gf_pl, gw_pl, next_pl, copy_pl, proj_pl = \
            placeholder_inputs(FLAGS.batch_size, context_size, zp_size,
                               zm_size, gf_size, gw_size, copy_size,
                               proj_size)
        # Placeholders for test (single example)
        context_plt, zp_plt, zm_plt, gf_plt, gw_plt, copy_plt, proj_plt, next_plt = \
            placeholder_inputs_single(context_size, zp_size, zm_size,
                                      gf_size, gw_size, copy_size,
                                      proj_size)

        print("======= Verify placeholders: context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl: =======")
        for placeholder in [context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl]:
            print(placeholder.get_shape())
        ##

        ## Train and validation part of the CopyAttention model
        predict = model.inference(FLAGS.batch_size, context_pl, zp_pl, zm_pl,
                                  gf_pl, gw_pl, copy_pl, proj_pl)
        loss = model.loss(predict, next_pl)
        train_op = model.training(loss)
        # Built as in the original graph construction; not run below.
        evaluate = model.evaluate(predict, next_pl)
        ##

        ## Test component of the model (batch size 1)
        pred_single = model.inference(1, context_plt, zp_plt, zm_plt,
                                      gf_plt, gw_plt, copy_plt,
                                      proj_plt)
        predicted_label = model.predict(pred_single)
        ##

        ## Initialize the variables and start the session
        # tf.initialize_all_variables() is deprecated in favour of this call.
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        # The context manager closes the session even when training is
        # interrupted or fails.
        with tf.Session() as sess:
            # To resume from a checkpoint, call saver.restore(sess, <path>)
            # here instead of running the initializer.
            sess.run(init)

            for epoch in range(1, FLAGS.num_epochs + 1):
                train_loss_tot = []

                train_dataset.generate_permutation()
                start_e = time.time()
                for i in range(num_train_examples):
                    try:
                        feed_dict = fill_feed_dict(train_dataset, i,
                                                   context_pl, zp_pl, zm_pl,
                                                   gf_pl, gw_pl, next_pl,
                                                   copy_pl, proj_pl)
                        _, loss_value = sess.run([train_op, loss],
                                                 feed_dict=feed_dict)

                        train_loss_tot.append(loss_value)

                        if i % FLAGS.print_every == 0:
                            print("Epoch : %d\tStep : %d\tLoss : %0.3f" % (epoch, i, loss_value))

                        # Mid-epoch validation is switched off on purpose:
                        # i is never -1, so this branch does not run.
                        if i == -1 and i % FLAGS.valid_every == 0:
                            print("Validation starting")
                            valid_loss = do_eval(sess, train_op, loss,
                                                 valid_dataset,
                                                 context_pl, zp_pl, zm_pl,
                                                 gf_pl, gw_pl, next_pl,
                                                 copy_pl, proj_pl)
                            print("Epoch : %d\tValidation loss: %0.5f" % (i, valid_loss))

                        # A sentence is sampled after every step except step 0
                        # (the FLAGS.sample_every filter is disabled).
                        if i != 0:  # and i % FLAGS.sample_every == 0:
                            test_dataset.reset_context()
                            prev_predict = word2idx['<start>']
                            res_path = os.path.join(expt_result_path, 'sample%d.txt'%epoch)
                            print("Write every 1000 sample in txt")
                            failed_attempts = 0
                            last_error = None
                            with open(res_path, 'a') as exp:
                                # Bounded loop: a decoding error that keeps
                                # recurring can no longer spin forever.
                                for _attempt in range(max_sample_len):
                                    try:
                                        feed_dict_t, idx2wq = fill_feed_dict_single(test_dataset,
                                                                                    prev_predict,
                                                                                    0, context_plt,
                                                                                    zp_plt, zm_plt,
                                                                                    gf_plt, gw_plt,
                                                                                    next_plt,
                                                                                    copy_plt,
                                                                                    proj_plt)
                                        fetched = sess.run([predicted_label],
                                                           feed_dict=feed_dict_t)
                                        prev = fetched[0][0][0]
                                        if prev in idx2wq:
                                            exp.write(idx2wq[prev] + ' ')
                                        else:
                                            exp.write('<unk> ')
                                        if prev == word2idx['.']:
                                            # End of sentence reached.
                                            exp.write('\n')
                                            break
                                        prev_predict = prev
                                    except Exception as err:
                                        # Best effort: retry, but count the
                                        # failure against the attempt budget.
                                        failed_attempts += 1
                                        last_error = err
                                else:
                                    # Truncated sample: still end the line so
                                    # samples do not run together in the file.
                                    exp.write('\n')
                            if failed_attempts:
                                print("Epoch : %d\tStep : %d\t%d sampling attempt(s) failed, last error: %r"
                                      % (epoch, i, failed_attempts, last_error))
                    except Exception as err:
                        # Best effort: skip the failing step but report it.
                        # KeyboardInterrupt is no longer swallowed.
                        print("Epoch : %d\tStep : %d\tskipped after error: %r" % (epoch, i, err))

                duration_e = time.time() - start_e
                print("Time taken for epoch : %d is %.3f minutes" % (epoch, duration_e/60))

                # Every step may have been skipped; avoid np.mean([]).
                mean_train_loss = np.mean(train_loss_tot) if train_loss_tot else float('nan')
                train_res = os.path.join(expt_result_path, 'train_loss.txt')
                with open(train_res, 'a') as tloss_f:
                    tloss_f.write("Epoch : %d\tTrain loss: %0.5f\tComputation time: %0.3f\n" % (epoch, mean_train_loss, duration_e))

                print("Saving checkpoint for epoch %d" % (epoch))
                checkpoint_file = os.path.join(chkpt_result_path, str(epoch) + '.ckpt')
                saver.save(sess, checkpoint_file)

                print("Validation starting")
                start = time.time()
                valid_loss = do_eval(sess, train_op, loss,
                                     valid_dataset, context_pl,
                                     zp_pl, zm_pl, gf_pl, gw_pl,
                                     next_pl, copy_pl, proj_pl)
                duration = time.time() - start
                print("Epoch : %d\tValidation loss: %0.5f" % (epoch, valid_loss))
                print("Time taken for validating epoch %d : %0.3f" % (epoch, duration))
                valid_res = os.path.join(expt_result_path, 'valid_loss.txt')

                with open(valid_res, 'a') as vloss_f:
                    vloss_f.write("Epoch : %d\tValidation loss: %0.5f\tComputation time: %0.3f\n" % (epoch, valid_loss, duration))
            

## Deuxième version de main
<br>
Nous ajoutons le mécanisme d'arrêt anticipé (« early stopping ») dans le main. Nous enregistrons les pertes d'entraînement et de validation dans des fichiers.

In [11]:
def main_v2(_):
    """Train CopyAttention with early stopping on the validation loss.

    Second version of the training driver. Each call

      * creates a new numbered run directory under FLAGS.experiment_dir and
        under FLAGS.checkpoint_dir and dumps the flag values to params.json
        in both;
      * builds the train / valid / test DataSet objects and the model graph;
      * runs at most FLAGS.num_epochs epochs of int(num_examples() / 100)
        training steps each, appending one generated sentence to
        sample<epoch>.txt after every step except step 0;
      * after each epoch appends the mean training loss to train_loss.txt,
        saves a checkpoint and appends the validation loss to valid_loss.txt;
      * stops early once the validation loss has not improved for 4
        consecutive epochs.

    Args:
        _: Ignored (presumably the argv list when launched through
            tf.app.run -- the call site is not shown here).

    Raises:
        OSError: If the run directory for the chosen run number already
            exists under FLAGS.checkpoint_dir.
    """
    #### Pick the run number: one past the highest numbered run directory.
    if not os.path.exists(FLAGS.experiment_dir):
        os.makedirs(FLAGS.experiment_dir)
    previous_runs = []
    for entry in os.listdir(FLAGS.experiment_dir):
        try:
            previous_runs.append(int(entry))
        except ValueError:
            # Not a run directory (e.g. '.ipynb_checkpoints'); ignore it.
            continue
    # A missing or empty experiment directory starts at run 1.
    expt_num = str(max(previous_runs) + 1) if previous_runs else "1"
    expt_result_path = os.path.join(FLAGS.experiment_dir, expt_num)
    os.makedirs(expt_result_path)
    ####

    #### Matching run directory for the checkpoints
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    chkpt_result_path = os.path.join(FLAGS.checkpoint_dir, expt_num)
    os.makedirs(chkpt_result_path)
    ####

    #### Record the flag values in params.json in both run directories
    params_e_path = os.path.join(expt_result_path, "params.json")
    params_c_path = os.path.join(chkpt_result_path, "params.json")
    with open(params_e_path, 'w') as params_e, open(params_c_path, 'w') as params_c:
        json.dump(flags.FLAGS.__flags, params_e, cls=MyEncoder)
        json.dump(flags.FLAGS.__flags, params_c, cls=MyEncoder)
    ####

    #### Generate the indexes used to create the train/valid/test dataset objects
    word2idx, field2idx, qword2idx, nF, max_words_in_table, word_set = \
        setup(FLAGS.data_dir, '/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/data/embeddings', FLAGS.n, FLAGS.batch_size, FLAGS.nW, FLAGS.min_field_freq, FLAGS.nQ)

    train_dataset = DataSet(FLAGS.data_dir, 'train', FLAGS.n, FLAGS.nW, nF,
                            FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                            field2idx, qword2idx,
                            FLAGS.max_fields, FLAGS.word_max_fields,
                            max_words_in_table, word_set)

    # Only 1% of the training set is used per epoch.
    num_train_examples = int(train_dataset.num_examples()/100)

    valid_dataset = DataSet(FLAGS.data_dir, 'valid', FLAGS.n, FLAGS.nW, nF,
                            FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                            field2idx, qword2idx,
                            FLAGS.max_fields, FLAGS.word_max_fields,
                            max_words_in_table, word_set)

    test_dataset = DataSet(FLAGS.data_dir, 'test', FLAGS.n, FLAGS.nW, nF,
                           FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                           field2idx, qword2idx,
                           FLAGS.max_fields, FLAGS.word_max_fields,
                           max_words_in_table, word_set)

    print('num_train_examples: ', num_train_examples)
    print('num_valid_examples: ', int(.1*valid_dataset.num_examples()))
    print('num_test_examples: ',int(.1*test_dataset.num_examples()))
    ####

    #### The sizes of respective conditioning variables for placeholder generation
    context_size = (FLAGS.n - 1)
    zp_size = context_size * FLAGS.word_max_fields
    zm_size = context_size * FLAGS.word_max_fields
    gf_size = FLAGS.max_fields
    gw_size = max_words_in_table
    copy_size = FLAGS.word_max_fields
    proj_size = FLAGS.nW + max_words_in_table
    ####

    # Upper bound on decoding attempts (and therefore tokens) per sample.
    max_sample_len = 50
    # Number of consecutive epochs without improvement that triggers the stop.
    patience = 4

    #### Generate the TensorFlow graph
    with tf.Graph().as_default():

        ## Create the CopyAttention model
        start_c = time.time()
        model = CopyAttention(FLAGS.n, FLAGS.d, FLAGS.g, FLAGS.nhu,
                              FLAGS.nW, nF, FLAGS.nQ, FLAGS.l,
                              FLAGS.learning_rate, max_words_in_table,
                              FLAGS.max_fields, FLAGS.word_max_fields)
        duration_c = time.time() - start_c
        print("======= CopyAttention model done in %.3f minutes. ======="%(duration_c/60))
        ##

        ## Placeholders for train and validation (batched)
        context_pl, zp_pl, zm_pl, gf_pl, gw_pl, next_pl, copy_pl, proj_pl = \
            placeholder_inputs(FLAGS.batch_size, context_size, zp_size,
                               zm_size, gf_size, gw_size, copy_size,
                               proj_size)
        # Placeholders for test (single example)
        context_plt, zp_plt, zm_plt, gf_plt, gw_plt, copy_plt, proj_plt, next_plt = \
            placeholder_inputs_single(context_size, zp_size, zm_size,
                                      gf_size, gw_size, copy_size,
                                      proj_size)

        print("======= Verify placeholders: context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl: =======")
        for placeholder in [context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl]:
            print(placeholder.get_shape())
        ##

        ## Train and validation part of the CopyAttention model
        predict = model.inference(FLAGS.batch_size, context_pl, zp_pl, zm_pl,
                                  gf_pl, gw_pl, copy_pl, proj_pl)
        loss = model.loss(predict, next_pl)
        train_op = model.training(loss)
        # Built as in the original graph construction; not run below.
        evaluate = model.evaluate(predict, next_pl)
        ##

        ## Test component of the model (batch size 1)
        pred_single = model.inference(1, context_plt, zp_plt, zm_plt,
                                      gf_plt, gw_plt, copy_plt,
                                      proj_plt)
        predicted_label = model.predict(pred_single)
        ##

        ## Initialize the variables and start the session
        # tf.initialize_all_variables() is deprecated in favour of this call.
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        ## Early-stopping state
        epoch = 1
        # Start from +inf so the first epoch always counts as an improvement;
        # a finite start value would stop runs whose loss stays above it.
        best_loss = float('inf')
        best_epoch = 0
        diff_epoch = 0

        # The context manager closes the session even when training is
        # interrupted or fails.
        with tf.Session() as sess:
            # To resume from a checkpoint, call saver.restore(sess, <path>)
            # here instead of running the initializer.
            sess.run(init)

            while (epoch < FLAGS.num_epochs + 1) and (diff_epoch < patience):
                train_loss_tot = []
                train_dataset.generate_permutation()
                start_e = time.time()
                for i in range(num_train_examples):
                    try:
                        feed_dict = fill_feed_dict(train_dataset, i,
                                                   context_pl, zp_pl, zm_pl,
                                                   gf_pl, gw_pl, next_pl,
                                                   copy_pl, proj_pl)
                        _, loss_value = sess.run([train_op, loss],
                                                 feed_dict=feed_dict)

                        train_loss_tot.append(loss_value)

                        if i % FLAGS.print_every == 0:
                            print("Epoch : %d\tStep : %d\tLoss : %0.3f" % (epoch, i, loss_value))

                        # Mid-epoch validation is switched off on purpose:
                        # i is never -1, so this branch does not run.
                        if i == -1 and i % FLAGS.valid_every == 0:
                            print("Validation starting")
                            valid_loss = do_eval(sess, train_op, loss,
                                                 valid_dataset,
                                                 context_pl, zp_pl, zm_pl,
                                                 gf_pl, gw_pl, next_pl,
                                                 copy_pl, proj_pl)
                            print("Epoch : %d\tValidation loss: %0.5f" % (i, valid_loss))

                        # A sentence is sampled after every step except step 0
                        # (the FLAGS.sample_every filter is disabled).
                        if i != 0:  # and i % FLAGS.sample_every == 0:
                            test_dataset.reset_context()
                            prev_predict = word2idx['<start>']
                            res_path = os.path.join(expt_result_path, 'sample%d.txt'%epoch)
                            print("Write every 1000 sample in txt")
                            failed_attempts = 0
                            last_error = None
                            with open(res_path, 'a') as exp:
                                # Bounded loop: a decoding error that keeps
                                # recurring can no longer spin forever.
                                for _attempt in range(max_sample_len):
                                    try:
                                        feed_dict_t, idx2wq = fill_feed_dict_single(test_dataset,
                                                                                    prev_predict,
                                                                                    0, context_plt,
                                                                                    zp_plt, zm_plt,
                                                                                    gf_plt, gw_plt,
                                                                                    next_plt,
                                                                                    copy_plt,
                                                                                    proj_plt)
                                        fetched = sess.run([predicted_label],
                                                           feed_dict=feed_dict_t)
                                        prev = fetched[0][0][0]
                                        if prev in idx2wq:
                                            exp.write(idx2wq[prev] + ' ')
                                        else:
                                            exp.write('<unk> ')
                                        if prev == word2idx['.']:
                                            # End of sentence reached.
                                            exp.write('\n')
                                            break
                                        prev_predict = prev
                                    except Exception as err:
                                        # Best effort: retry, but count the
                                        # failure against the attempt budget.
                                        failed_attempts += 1
                                        last_error = err
                                else:
                                    # Truncated sample: still end the line so
                                    # samples do not run together in the file.
                                    exp.write('\n')
                            if failed_attempts:
                                print("Epoch : %d\tStep : %d\t%d sampling attempt(s) failed, last error: %r"
                                      % (epoch, i, failed_attempts, last_error))
                    except Exception as err:
                        # Best effort: skip the failing step but report it.
                        # KeyboardInterrupt is no longer swallowed.
                        print("Epoch : %d\tStep : %d\tskipped after error: %r" % (epoch, i, err))

                duration_e = time.time() - start_e
                print("Time taken for epoch : %d is %.3f minutes" % (epoch, duration_e/60))

                # Every step may have been skipped; avoid np.mean([]).
                mean_train_loss = np.mean(train_loss_tot) if train_loss_tot else float('nan')
                train_res = os.path.join(expt_result_path, 'train_loss.txt')
                with open(train_res, 'a') as tloss_f:
                    tloss_f.write("Epoch : %d\tTrain loss: %0.5f\tComputation time: %0.3f\n" % (epoch, mean_train_loss, duration_e))

                print("Saving checkpoint for epoch %d" % (epoch))
                checkpoint_file = os.path.join(chkpt_result_path, str(epoch) + '.ckpt')
                saver.save(sess, checkpoint_file)

                print("Validation starting")
                start = time.time()
                valid_loss = do_eval(sess, train_op, loss,
                                     valid_dataset, context_pl,
                                     zp_pl, zm_pl, gf_pl, gw_pl,
                                     next_pl, copy_pl, proj_pl)
                duration = time.time() - start
                print("Epoch : %d\tValidation loss: %0.5f" % (epoch, valid_loss))
                print("Time taken for validating epoch %d : %0.3f" % (epoch, duration))
                valid_res = os.path.join(expt_result_path, 'valid_loss.txt')

                with open(valid_res, 'a') as vloss_f:
                    vloss_f.write("Epoch : %d\tValidation loss: %0.5f\tComputation time: %0.3f\n" % (epoch, valid_loss, duration))

                # Early-stopping bookkeeping: remember the best epoch and
                # count how many epochs have passed since then.
                if valid_loss < best_loss:
                    best_loss = valid_loss
                    best_epoch = epoch
                diff_epoch = epoch - best_epoch
                epoch += 1
                if diff_epoch >= patience:
                    print("====Early Stopping====")


## Troisième version de main
<br>
Nous calculons le BLEU et la perplexité. Nous enregistrons les pertes d'entraînement et de validation dans des fichiers.

In [12]:
def _write_sample(sess, predicted_label, test_dataset, word2idx,
                  single_placeholders, res_path, max_len=50):
    """Generate one sentence with the single-example graph and append it to a file.

    Starting from ``word2idx['<start>']``, each predicted index is fed back as
    the previous word until the index of '.' is produced or ``max_len`` tokens
    have been written. Indexes missing from ``idx2wq`` are written as '<unk>'.

    Args:
        sess: session used to run ``predicted_label``.
        predicted_label: tensor fetched to obtain the next word index.
        test_dataset: dataset object passed to ``fill_feed_dict_single``.
        word2idx: mapping that contains at least '<start>' and '.'.
        single_placeholders: tuple in the order returned by
            ``placeholder_inputs_single`` (context, zp, zm, gf, gw, copy,
            proj, next).
        res_path: text file the sentence is appended to.
        max_len: maximum number of tokens written for one sentence.
    """
    (context_plt, zp_plt, zm_plt, gf_plt, gw_plt,
     copy_plt, proj_plt, next_plt) = single_placeholders

    test_dataset.reset_context()
    len_sent = 0
    prev_predict = word2idx['<start>']
    with open(res_path, 'a') as exp:
        while True:
            try:
                feed_dict_t, idx2wq = fill_feed_dict_single(test_dataset,
                                                            prev_predict,
                                                            0, context_plt,
                                                            zp_plt, zm_plt,
                                                            gf_plt, gw_plt,
                                                            next_plt,
                                                            copy_plt,
                                                            proj_plt)
                prediction = sess.run([predicted_label],
                                      feed_dict=feed_dict_t)
                prev = prediction[0][0][0]
                if prev in idx2wq:
                    exp.write(idx2wq[prev] + ' ')
                else:
                    exp.write('<unk> ')
                len_sent = len_sent + 1
                end_of_sentence = prev == word2idx['.']
            except Exception as err:
                # The previous bare `except: pass` retried forever with the
                # same inputs; stop instead and keep the file line-oriented.
                print("Sampling aborted: %r" % (err,))
                exp.write('\n')
                break
            if end_of_sentence or len_sent == max_len:
                # Always terminate the line, including for truncated sentences,
                # so that every sample occupies exactly one line of the file.
                exp.write('\n')
                break
            prev_predict = prev


def main_v3(_):
    """Train the CopyAttention model and log loss and perplexity per epoch.

    Creates a new numbered folder under ``FLAGS.experiment_dir`` and
    ``FLAGS.checkpoint_dir``, dumps the flags to ``params.json`` in both,
    builds the train/valid/test datasets and the graph, then for each epoch:
    runs the training steps, periodically appends a generated sentence to
    ``sample.txt``, appends the mean train loss/perplexity to
    ``train_loss.txt``, saves a checkpoint, and appends the validation
    loss/perplexity to ``valid_loss.txt``.

    Args:
        _: unused positional argument (presumably the argv list handed over by
            a ``tf.app.run``-style caller).
    """
    #### experiment_dir set : choose the next free experiment number.
    if not os.path.exists(FLAGS.experiment_dir):
        os.makedirs(FLAGS.experiment_dir)
        expt_num = "1"
    else:
        # Only purely numeric entries are experiments; anything else (hidden
        # folders, stray files) used to make int() raise, and an empty
        # directory used to make max() raise.
        previous = [int(folder) for folder in os.listdir(FLAGS.experiment_dir)
                    if folder.isdigit()]
        expt_num = str(max(previous) + 1) if previous else "1"
    expt_result_path = os.path.join(FLAGS.experiment_dir, expt_num)
    os.makedirs(expt_result_path)
    ####

    #### checkpoint_dir path set
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    chkpt_result_path = os.path.join(FLAGS.checkpoint_dir, expt_num)
    os.makedirs(chkpt_result_path)
    ####

    #### write the flags into params.json (one copy per output folder)
    params_e_path = os.path.join(expt_result_path, "params.json")
    params_c_path = os.path.join(chkpt_result_path, "params.json")
    with open(params_e_path, 'w') as params_e, open(params_c_path, 'w') as params_c:
        json.dump(flags.FLAGS.__flags, params_e,cls= MyEncoder)
        json.dump(flags.FLAGS.__flags, params_c,cls= MyEncoder)
    ####

    #### Generate the indexes used to create train/valid/test dataset objects
    word2idx, field2idx, qword2idx, nF, max_words_in_table, word_set = \
        setup(FLAGS.data_dir, '/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/data/embeddings', FLAGS.n, FLAGS.batch_size,FLAGS.nW, FLAGS.min_field_freq, FLAGS.nQ)

    train_dataset = DataSet(FLAGS.data_dir, 'train', FLAGS.n, FLAGS.nW, nF,
                            FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                            field2idx, qword2idx,
                            FLAGS.max_fields, FLAGS.word_max_fields,
                            max_words_in_table, word_set)

    # Only 1/20 (5%) of train_dataset.num_examples() is used as the number of
    # training steps per epoch.
    num_train_examples = int(train_dataset.num_examples()/20)

    valid_dataset = DataSet(FLAGS.data_dir, 'valid', FLAGS.n, FLAGS.nW, nF,
                            FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                            field2idx, qword2idx,
                            FLAGS.max_fields, FLAGS.word_max_fields,
                            max_words_in_table, word_set)

    test_dataset = DataSet(FLAGS.data_dir, 'test', FLAGS.n, FLAGS.nW, nF,
                            FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
                            field2idx, qword2idx,
                            FLAGS.max_fields, FLAGS.word_max_fields,
                            max_words_in_table, word_set)

    print('num_train_examples: ', num_train_examples)
    print('num_valid_examples: ', int(.1*valid_dataset.num_examples()))
    print('num_test_examples: ',int(.1*test_dataset.num_examples()))
    ####

    #### The sizes of respective conditioning variables for placeholder generation
    context_size = (FLAGS.n - 1)
    zp_size = context_size * FLAGS.word_max_fields
    zm_size = context_size * FLAGS.word_max_fields
    gf_size = FLAGS.max_fields
    gw_size = max_words_in_table
    copy_size = FLAGS.word_max_fields
    proj_size = FLAGS.nW + max_words_in_table
    ####

    #### Generate the TensorFlow graph
    with tf.Graph().as_default():

        ## Create the CopyAttention model
        start_c = time.time()
        model = CopyAttention(FLAGS.n, FLAGS.d, FLAGS.g, FLAGS.nhu,
                              FLAGS.nW, nF, FLAGS.nQ, FLAGS.l,
                              FLAGS.learning_rate, max_words_in_table,
                              FLAGS.max_fields, FLAGS.word_max_fields)
        duration_c = time.time() - start_c
        print("======= CopyAttention model done in %.3f minutes. ======="%(duration_c/60))
        ##

        ## Placeholders for train and validation (batched)
        context_pl, zp_pl, zm_pl, gf_pl, gw_pl, next_pl, copy_pl, proj_pl = \
            placeholder_inputs(FLAGS.batch_size, context_size, zp_size,
                                zm_size, gf_size, gw_size, copy_size,
                                proj_size)
        # Placeholders for test (single example); kept as a tuple in the order
        # returned by placeholder_inputs_single.
        single_placeholders = \
            placeholder_inputs_single(context_size, zp_size, zm_size,
                                      gf_size, gw_size, copy_size,
                                      proj_size)
        context_plt, zp_plt, zm_plt, gf_plt, gw_plt, copy_plt, proj_plt, next_plt = \
            single_placeholders

        print("======= Verify placeholders: context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl: =======")
        for i in [context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl]:
            print(i.get_shape())
        ##

        ## Train and validation part of the CopyAttention model
        predict = model.inference(FLAGS.batch_size, context_pl, zp_pl, zm_pl,
                                  gf_pl, gw_pl, copy_pl, proj_pl)
        loss = model.loss(predict, next_pl) # cross_entropy
        train_op = model.training(loss) # optimizer
        # Not fetched anywhere below; kept because the call builds ops in the
        # graph. NOTE(review): confirm whether it can be dropped.
        evaluate = model.evaluate(predict, next_pl)
        ##

        ## Test component of the model (batch size 1)
        pred_single = model.inference(1, context_plt, zp_plt, zm_plt,
                                      gf_plt, gw_plt, copy_plt,
                                      proj_plt)
        predicted_label = model.predict(pred_single)
        ##

        ## Initialize the variables and start the session
        # tf.initialize_all_variables is deprecated in favour of
        # tf.global_variables_initializer.
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        # The context manager closes the session when training ends or fails.
        with tf.Session() as sess:
            sess.run(init)

            for epoch in range(1, FLAGS.num_epochs + 1):
                train_loss_tot = []
                train_perplex_tot = []
                skipped_steps = 0
                train_dataset.generate_permutation()
                start_e = time.time()
                for i in range(num_train_examples):
                    try:
                        feed_dict = fill_feed_dict(train_dataset, i,
                                                   context_pl, zp_pl, zm_pl,
                                                   gf_pl, gw_pl, next_pl,
                                                   copy_pl, proj_pl)
                        _, loss_value = sess.run([train_op, loss],
                                                 feed_dict=feed_dict)
                    except Exception as err:
                        # Best effort as before: a failing step is skipped,
                        # but it is now counted and the first error is shown.
                        skipped_steps += 1
                        if skipped_steps == 1:
                            print("Epoch : %d\tStep : %d\tskipped: %r" % (epoch, i, err))
                        continue

                    # Perplexity = exp(loss). Computed with NumPy: building a
                    # tf.exp op per step kept adding nodes to the graph.
                    train_perplexity = np.exp(loss_value)

                    train_loss_tot.append(loss_value)
                    train_perplex_tot.append(train_perplexity)

                    if i % FLAGS.print_every == 0:
                        print("Epoch : %d\tStep : %d\tLoss : %0.3f\tPerplexity : %3f" % (epoch, i, loss_value, train_perplexity))

                    # The former `if i == -1 and ...` mid-epoch validation
                    # branch could never run for i >= 0 and was removed;
                    # validation happens once per epoch below.

                    if i != 0 and i % FLAGS.sample_every == 0:
                        res_path = os.path.join(expt_result_path, 'sample.txt')
                        print("Write every 1000 sample in txt")
                        try:
                            _write_sample(sess, predicted_label, test_dataset,
                                          word2idx, single_placeholders,
                                          res_path)
                        except Exception as err:
                            # Sampling is diagnostic only; never stop training.
                            print("Sampling failed at step %d: %r" % (i, err))

                duration_e = time.time() - start_e
                print("Time taken for epoch : %d is %.3f minutes" % (epoch, duration_e/60))
                if skipped_steps:
                    print("Epoch : %d\tSkipped training steps : %d" % (epoch, skipped_steps))

                train_res = os.path.join(expt_result_path, 'train_loss.txt')
                with open(train_res, 'a') as tloss_f:
                    tloss_f.write("Epoch : %d\tTrain loss: %0.5f\tTrain perplexity: %0.5f\tComputation time: %0.3f\n" % (epoch, np.mean(train_loss_tot), np.mean(train_perplex_tot), duration_e))

                print("Saving checkpoint for epoch %d" % (epoch))
                checkpoint_file = os.path.join(chkpt_result_path, str(epoch) + '.ckpt')
                saver.save(sess, checkpoint_file)

                print("Validation starting")
                start = time.time()
                valid_loss = do_eval(sess, train_op, loss,
                                     valid_dataset, context_pl,
                                     zp_pl, zm_pl, gf_pl, gw_pl,
                                     next_pl, copy_pl, proj_pl)

                # Validation perplexity, again without touching the graph.
                valid_perplexity = np.exp(valid_loss)

                duration = time.time() - start
                print("Epoch : %d\tValidation loss: %0.5f\tValidation perplexity: %0.5f" % (epoch, valid_loss,valid_perplexity))
                print("Time taken for validating epoch %d : %0.3f" % (epoch, duration))
                valid_res = os.path.join(expt_result_path, 'valid_loss.txt')

                with open(valid_res, 'a') as vloss_f:
                    vloss_f.write("Epoch : %d\tValidation loss: %0.5f\tValidation perplexity: %0.5f\tComputation time: %0.3f\n" % (epoch, valid_loss, valid_perplexity, duration))
            

In [None]:
# Entry point: only runs when this cell executes as the __main__ module.
if __name__ == "__main__":
    # NOTE(review): no `main` argument is passed, so tf.app.run() presumably
    # looks up a function named `main` in __main__ — confirm that it is bound
    # to the intended main_v* version before launching a training run.
    tf.app.run()

Done creating w_index
Creating field indexes
Field vocabulary size : 1689
Processed fields in 51.519 s
Creating table word index
Created the table word index in 24.915 s
num_train_examples:  35766
num_valid_examples:  8951
num_test_examples:  8935
Initializing the CopyAttention model
Instructions for updating:
Use `tf.cast` instead.


W1208 14:09:18.651765 140454726940544 deprecation.py:323] From /content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/src/copyattention.py:55: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.





W1208 14:09:18.654552 140454726940544 module_wrapper.py:139] From /content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/src/copyattention.py:67: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



W1208 14:09:18.657124 140454726940544 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

I1208 14:09:19.145890 140454726940544 utils.py:141] NumExpr defaulting to 4 threads.


Done initializing the CopyAttention model



W1208 14:09:20.164796 140454726940544 module_wrapper.py:139] From /content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/src/feed_dicts.py:26: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



(32, 13)
(32, 130)
(32, 130)
(32, 10)
(32, 337)
(?, 10)
(?, 20337)



W1208 14:09:20.204188 140454726940544 module_wrapper.py:139] From /content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/src/copyattention.py:335: The name tf.nn.xw_plus_b is deprecated. Please use tf.compat.v1.nn.xw_plus_b instead.



ADAGRAD op



W1208 14:09:20.229916 140454726940544 module_wrapper.py:139] From /content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/src/copyattention.py:378: The name tf.train.AdagradOptimizer is deprecated. Please use tf.compat.v1.train.AdagradOptimizer instead.



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


W1208 14:09:20.525364 140454726940544 deprecation.py:506] From /tensorflow-1.15.2/python3.6/tensorflow_core/python/training/adagrad.py:76: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Use `tf.global_variables_initializer` instead.


W1208 14:09:20.659737 140454726940544 deprecation.py:323] From /tensorflow-1.15.2/python3.6/tensorflow_core/python/util/tf_should_use.py:198: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


random seed 42
Epoch : 1	Step : 0	Loss : 9.901	Perplexity : 19950.787109
Epoch : 1	Step : 100	Loss : 9.494	Perplexity : 13277.446289
Epoch : 1	Step : 200	Loss : 7.652	Perplexity : 2103.806396
Epoch : 1	Step : 300	Loss : 8.493	Perplexity : 4880.969727
Epoch : 1	Step : 500	Loss : 7.387	Perplexity : 1614.537231
Epoch : 1	Step : 600	Loss : 4.267	Perplexity : 71.321556
Epoch : 1	Step : 700	Loss : 7.295	Perplexity : 1473.339844
Epoch : 1	Step : 800	Loss : 6.488	Perplexity : 657.512878
Epoch : 1	Step : 900	Loss : 3.979	Perplexity : 53.461407
Epoch : 1	Step : 1000	Loss : 5.509	Perplexity : 246.948563
Write every 1000 sample in txt
Epoch : 1	Step : 1100	Loss : 3.190	Perplexity : 24.284740
Epoch : 1	Step : 1200	Loss : 5.839	Perplexity : 343.587311
Epoch : 1	Step : 1300	Loss : 6.716	Perplexity : 825.382202
Epoch : 1	Step : 1400	Loss : 2.631	Perplexity : 13.892287
Epoch : 1	Step : 1500	Loss : 3.064	Perplexity : 21.414185
Epoch : 1	Step : 1700	Loss : 2.036	Perplexity : 7.659359
Epoch : 1	Step : 180

### BLEU

In [None]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
import os
def split(sentence):
    """Return the whitespace-separated tokens of ``sentence`` as a list."""
    tokens = sentence.split()
    return tokens

def compute_BLEU(sentence, ref):
  """Return the sentence-level BLEU score of ``sentence`` against ``ref``.

  Both strings are tokenized with ``split``. For each of the bracket tokens
  '-lrb-' and '-rrb-' that occurs in ``sentence``, one extra copy is appended
  to the reference tokens before scoring. The two token lists are printed.
  """
  ref_tokens = split(ref)
  for bracket in ('-lrb-', '-rrb-'):
      if bracket in sentence:
          ref_tokens.append(bracket)

  hyp_tokens = split(sentence)
  print(ref_tokens)
  print(hyp_tokens)
  # sentence_bleu takes a list of references, hence the one-element list.
  return sentence_bleu([ref_tokens], hyp_tokens)

# Average the BLEU score of every sampled sentence of one epoch against a
# single hand-written reference sentence.
epoch = 1
BLEU_per_epoch = np.zeros(10)
expt_result_path = '/content/drive/MyDrive/GM5/DEEP/Paper/experiment/39'
reference = 'leonard shenoff randle -lrb- born february 12 , 1949 -rrb- is a former major league baseball player .'
res_path = os.path.join(expt_result_path, 'sample%d.txt'%epoch)
Bleu = 0.
nb = 0
# `with` closes the sample file once all lines are scored (it used to stay open).
with open(res_path, "r") as file:
    for sentence in file:
        bleu = compute_BLEU(sentence, reference)
        print("bleu: %4f" % bleu)
        Bleu += bleu
        nb += 1

if nb:
    BLEU_per_epoch[epoch-1] = Bleu /  nb
else:
    # An empty sample file used to raise ZeroDivisionError; report "no score".
    BLEU_per_epoch[epoch-1] = np.nan
    print("No sampled sentence found in %s" % res_path)
print("BLEU metric for epoch %d : %4f" % (epoch, BLEU_per_epoch[epoch-1]))


## Back UP

In [None]:

# def main(_):
#     # pprint(flags.FLAGS.__flags)

#     #### experiment_dir set : choose the last experiment file.
#     if not os.path.exists(FLAGS.experiment_dir):
#         os.makedirs(FLAGS.experiment_dir)
#         expt_num = "1"
#     else:
#         expts = os.listdir(FLAGS.experiment_dir)
#         last_expr = max([int(folder) for folder in expts])
#         expt_num = str(last_expr + 1)
#     expt_result_path = os.path.join(FLAGS.experiment_dir, expt_num)
#     os.makedirs(expt_result_path)
#     ####

#     #### checkpoint_dir path set
#     if not os.path.exists(FLAGS.checkpoint_dir):
#         os.makedirs(FLAGS.checkpoint_dir)
#     chkpt_result_path = os.path.join(FLAGS.checkpoint_dir, expt_num)
#     os.makedirs(chkpt_result_path)
#     ####

#     #### Record the flags.FLAGS parameters --> write them into params.json
#     params_e_path = os.path.join(expt_result_path, "params.json")
#     params_c_path = os.path.join(chkpt_result_path, "params.json")
#     with open(params_e_path, 'w') as params_e, open(params_c_path, 'w') as params_c:
#         json.dump(flags.FLAGS.__flags, params_e,cls= MyEncoder)
#         json.dump(flags.FLAGS.__flags, params_c,cls= MyEncoder)
#     ####

#     #### Generate the indexes for create train/valid/test dataset objects
#     word2idx, field2idx, qword2idx, nF, max_words_in_table, word_set = \
#         setup(FLAGS.data_dir, '/content/drive/MyDrive/GM5/DEEP/Paper/NeuralTextGeneration-master/data/embeddings', FLAGS.n, FLAGS.batch_size,FLAGS.nW, FLAGS.min_field_freq, FLAGS.nQ)

#     train_dataset = DataSet(FLAGS.data_dir, 'train', FLAGS.n, FLAGS.nW, nF,
#                             FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
#                             field2idx, qword2idx,
#                             FLAGS.max_fields, FLAGS.word_max_fields,
#                             max_words_in_table, word_set)

    
#     num_train_examples = int(train_dataset.num_examples()/10)  ##### on prend seulement n% de dataset

#     valid_dataset = DataSet(FLAGS.data_dir, 'valid', FLAGS.n, FLAGS.nW, nF,
#                             FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
#                             field2idx, qword2idx,
#                             FLAGS.max_fields, FLAGS.word_max_fields,
#                             max_words_in_table, word_set)

#     test_dataset = DataSet(FLAGS.data_dir, 'test', FLAGS.n, FLAGS.nW, nF,
#                             FLAGS.nQ, FLAGS.l, FLAGS.batch_size, word2idx,
#                             field2idx, qword2idx,
#                             FLAGS.max_fields, FLAGS.word_max_fields,
#                             max_words_in_table, word_set)
#     # print(test_dataset._xs[:10])
#     # print(test_dataset._ys[:10])
    
#     # print('num_train_examples: ', num_train_examples)
#     # print('num_valid_examples: ', int(.1*valid_dataset.num_examples()))
#     # print('num_test_examples: ',int(.1*test_dataset.num_examples()))
#     ####


#     #### The sizes of respective conditioning variables for placeholder generation
#     # context length = len(n_gram) - 1
#     context_size = (FLAGS.n - 1)
#     # local conditioning z+(-) = context length * max number of fields a word can appear in (10)
#     zp_size = context_size * FLAGS.word_max_fields
#     zm_size = context_size * FLAGS.word_max_fields
#     # global conditioning gf = max number of fields appearing in one infobox
#     gf_size = FLAGS.max_fields
#     # global conditioning gw_size = total number of words appearing in the infobox (not a FLAG)
#     gw_size = max_words_in_table
#     # Index into the copy action embedding matrix
#     copy_size = FLAGS.word_max_fields
#     # Projection matrix to project out the copy action score to output vocabulary. Size = sentence vocabulary size + total number of words appearing in the infobox (not a FLAG)
#     proj_size = FLAGS.nW + max_words_in_table
#     ####

#     ######## ADDED ########
#     reference = 'leonard shenoff randle -lrb- born february 12 , 1949 -rrb- is a former major league baseball player .'
#     ################
    
#     #### Generate the TensorFlow graph
#     with tf.Graph().as_default():

#         ## Create the CopyAttention model
#         start_c = time.time()
#         model = CopyAttention(FLAGS.n, FLAGS.d, FLAGS.g, FLAGS.nhu,
#                               FLAGS.nW, nF, FLAGS.nQ, FLAGS.l,
#                               FLAGS.learning_rate, max_words_in_table,
#                               FLAGS.max_fields, FLAGS.word_max_fields)
#         duration_c = time.time() - start_c
#         print("======= CopyAttention model done in %.3f minutes. ======="%(duration_c/60))
#         ##


#         ## Placeholders for train and validation with known shape per batch
#         # context_pl (32, context_size) ; zp_pl (32, zp_size); zm_pl (32, zm_size)
#         # gf_pl (32, gf_size) ; gw_pl (32, gw_size); copy_pl (none, copy_size)
#         # proj_pl (32, proj_size); next_pl True next word tensor
#         context_pl, zp_pl, zm_pl, gf_pl, gw_pl, next_pl, copy_pl, proj_pl = \
#             placeholder_inputs(FLAGS.batch_size, context_size, zp_size,
#                                 zm_size, gf_size, gw_size, copy_size,
#                                 proj_size)
#         # Placeholders for test
#         context_plt, zp_plt, zm_plt, gf_plt, gw_plt, copy_plt, proj_plt, next_plt = \
#             placeholder_inputs_single(context_size, zp_size, zm_size,
#                                       gf_size, gw_size, copy_size,
#                                       proj_size)
            
#         print("======= Verify placeholders: context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl: =======")
#         for i in [context_pl, zp_pl, zm_pl, gf_pl, gw_pl, copy_pl, proj_pl]:
#             print(i.get_shape())
#         ##

        
#         ## Train and validation part of the CopyAttention model
#         # print("======= Training with batch size = %d ======="%FLAGS.batch_size)
#         predict = model.inference(FLAGS.batch_size, context_pl, zp_pl, zm_pl,
#                                   gf_pl, gw_pl, copy_pl, proj_pl)
#         loss = model.loss(predict, next_pl) # cross_entropy
#         train_op = model.training(loss) # optimizer (gradient descent)
#         evaluate = model.evaluate(predict, next_pl)
#         # print("Train Accuracy: ", evaluate)
#         # print("======= Stop Training =======")
#         ##



#         ## Test component of the model
#         # print("======= Testing model with batch size = 1 =======")
#         pred_single = model.inference(1, context_plt, zp_plt, zm_plt,
#                                       gf_plt, gw_plt, copy_plt,
#                                       proj_plt)
#         predicted_label = model.predict(pred_single) # predicted label, softmax pred_single
#         ##



#         ## Initialize the variables and start the session
#         init = tf.initialize_all_variables()
#         saver = tf.train.Saver()
#         sess = tf.Session()
#         # ckpt_file = os.path.join('/content/drive/MyDrive/GM5/DEEP/Paper/checkpoint','15', '16.ckpt')
#         # saver.restore(sess, r'/content/drive/MyDrive/GM5/DEEP/Paper/checkpoint/15/16.ckpt')
#         sess.run(init)
        
#         ######## ADDED ########
#         BLEU_per_epoch = np.zeros(FLAGS.num_epochs)
#         ################
        
#         # train_loss_epoch = []
        
#         for epoch in range(1, FLAGS.num_epochs + 1):
#             train_loss_tot = []

#             train_dataset.generate_permutation()
#             start_e = time.time()
#             for i in range(num_train_examples):
#                 try:
#                     feed_dict = fill_feed_dict(train_dataset, i,
#                                               context_pl, zp_pl, zm_pl,
#                                               gf_pl, gw_pl, next_pl,
#                                               copy_pl, proj_pl)
#                     _, loss_value = sess.run([train_op, loss],
#                                             feed_dict=feed_dict)

#                     train_loss_tot.append(loss_value)

#                     if i % FLAGS.print_every == 0:
#                         print("Epoch : %d\tStep : %d\tLoss : %0.3f" % (epoch, i, loss_value))

#                     if i == -1 and i % FLAGS.valid_every == 0:
#                         print("Validation starting")
#                         #### TEST
#                         valid_loss = do_eval(sess, train_op, loss,
#                                             valid_dataset,
#                                             context_pl, zp_pl, zm_pl,
#                                             gf_pl, gw_pl, next_pl,
#                                             copy_pl, proj_pl)
#                         print("Epoch : %d\tValidation loss: %0.5f" % (i, valid_loss))

#                     if i != 0 and i % FLAGS.sample_every == 0:
#                         # print("Test starting")
#                         test_dataset.reset_context()
#                         pos = 0
#                         len_sent = 0
#                         prev_predict = word2idx['<start>']
#                         res_path = os.path.join(expt_result_path, 'sample%d.txt'%epoch)
#                         # print("Write every 1000 sample in txt")
#                         with open(res_path, 'a') as exp:
#                             while pos != 1:
#                                 try:
#                                   #### TEST
#                                     feed_dict_t, idx2wq = fill_feed_dict_single(test_dataset,
#                                                                                 prev_predict,
#                                                                                 0, context_plt,
#                                                                                 zp_plt, zm_plt,
#                                                                                 gf_plt, gw_plt,
#                                                                                 next_plt,
#                                                                                 copy_plt,
#                                                                                 proj_plt)
#                                     prev_predict = sess.run([predicted_label],
#                                                             feed_dict=feed_dict_t)
#                                     prev = prev_predict[0][0][0]
#                                     if prev in idx2wq:
#                                         exp.write(idx2wq[prev] + ' ')
#                                         len_sent = len_sent + 1
#                                     else:
#                                         exp.write('<unk> ')
#                                         len_sent = len_sent + 1
#                                     if prev == word2idx['.']:
#                                         pos = 1
#                                         exp.write('\n')
#                                     if len_sent == 50:
#                                         break
#                                     prev_predict = prev
#                                 except:
#                                     pass
#                 except:
#                     pass

#             # res_path = os.path.join(expt_result_path, 'sample%d.txt'%epoch)
#             # file =  open(res_path, "r")
#             # Bleu = 0.
#             # for sentence in file:
#             #     bleu = compute_BLEU(sentence, reference)
#             #     # print("bleu: %4f" % (bleu))
#             #     Bleu += bleu

#             # train_loss_epoch.append(train_loss_epoch.mean())
#             duration_e = time.time() - start_e
#             print("Time taken for epoch : %d is %.3f minutes" % (epoch, duration_e/60))

#             ######## ADDED ########
#             # BLEU_per_epoch[epoch-1] = Bleu /  num_train_examples  
#             # print("BLEU metric for epoch %d : %4f" % (epoch, BLEU_per_epoch[epoch-1]))
#             ################

#             # bleu_res = os.path.join(expt_result_path, 'bleu.txt')
#             # with open(bleu_res, 'a') as bleu_f:
#             #     bleu_f.write("Epoch : %d\tBlue: %0.5f\n" % (epoch, BLEU_per_epoch[epoch-1]))

#             train_res = os.path.join(expt_result_path, 'train_loss.txt')
#             with open(train_res, 'a') as tloss_f:
#                 tloss_f.write("Epoch : %d\tTrain loss: %0.5f\tComputation time: %0.3f\n" % (epoch, np.mean(train_loss_tot), duration_e))

#             print("Saving checkpoint for epoch %d" % (epoch))
#             checkpoint_file = os.path.join(chkpt_result_path, str(epoch) + '.ckpt')
#             saver.save(sess, checkpoint_file)

#             print("Validation starting")
#             start = time.time()
#             valid_loss = do_eval(sess, train_op, loss,
#                                  valid_dataset, context_pl,
#                                  zp_pl, zm_pl, gf_pl, gw_pl,
#                                  next_pl, copy_pl, proj_pl)
#             duration = time.time() - start
#             print("Epoch : %d\tValidation loss: %0.5f" % (epoch, valid_loss))
#             print("Time taken for validating epoch %d : %0.3f" % (epoch, duration))
#             valid_res = os.path.join(expt_result_path, 'valid_loss.txt')
            
#             with open(valid_res, 'a') as vloss_f:
#                 vloss_f.write("Epoch : %d\tValidation loss: %0.5f\tComputation time: %0.3f\n" % (epoch, valid_loss, duration))
            

In [None]:
from nltk.translate.bleu_score import sentence_bleu

# The BLEU score requires lists of words;
# this helper takes a sentence and turns it into a list of words.
def split(sentence):
    """Return the whitespace-separated tokens of ``sentence`` as a list."""
    tokens = sentence.split()
    return tokens

# Build the word lists for the reference sentence and for the generated sentence
def inputs(path, sentence):
  """Build the candidate and reference token lists used for a BLEU score.

  The candidate is the last line of the file at ``path`` that contains
  '-lrb-' within its first 8 characters, split on whitespace (an empty list
  when no line matches). The reference is ``sentence`` split on whitespace,
  with '-lrb-' always appended and '-rrb-' appended when the selected line
  contains it.

  Args:
    path: text file holding one generated sentence per line.
    sentence: reference sentence as a single string.

  Returns:
    A ``(candidate, reference)`` tuple of token lists.
  """
  reference = sentence.split()
  last_line = ' '
  # `with` closes the file once scanned; the handle previously leaked. The
  # loop variable no longer shadows the `sentence` parameter.
  with open(path, "r") as file:
    for line in file:
      if '-lrb-' in line[:8]:
        last_line = line

  candidate = last_line.split()
  reference.append('-lrb-')
  if '-rrb-' in last_line:
      reference.append('-rrb-')
  return candidate, reference

# Path to the sample file (replace this literal with the `res_path` variable
# when scoring the current experiment).
path_for_test_good_sentence = '/content/drive/MyDrive/GM5/DEEP/Paper/experiment/4/sample.txt'
# path_for_test_bad_sentence = '/content/drive/MyDrive/Université_de_Rouen_2020/Deep_Learning_Projet/paper_code/NeuralTextGeneration-master/experiment/0/sample.txt'

# test examples
#reference = split('lenny randle (born February 12, 1949) is a former Major League Baseball player.')
#candidate = split('born february 12 , 1949 is a former professional football player. is an american beach in long beach california .') 

sentence = 'lenny randle (born February 12, 1949) is a former Major League Baseball player.'
candidate, reference = inputs(path_for_test_good_sentence, sentence)
print(reference)
print(candidate)

# sentence_bleu expects a LIST of references, each one a list of tokens.
# Passing the flat token list made every token string count as its own
# reference, so matching happened against single characters.
BLEU_score = sentence_bleu([reference], candidate)
print(BLEU_score)


# OUTPUT (recorded with the previous call `sentence_bleu(reference, candidate)`;
# the score below is therefore stale — re-run the cell to refresh it)
#['lenny', 'randle', '(born', 'February', '12,', '1949)', 'is', 'a', 'former', 'Major', 'League', 'Baseball', 'player.', '-lrb-', '-rrb-']
#[',', '-lrb-', 'born', 'february', '12', ',', '1949', '-rrb-', 'is', 'a', 'former', 'professional', 'football', 'player', '.']
#0.668740304976422