In [None]:
import gc
import glob
import hashlib
import itertools
import json
import os
import random
import re
import subprocess
from collections import Counter
from os.path import join as pjoin

import torch
from multiprocess import Pool

from others.logging import logger
from others.tokenization import BertTokenizer
from pytorch_transformers import XLNetTokenizer

from others.utils import clean
from prepro.utils import _get_word_ngrams

import xml.etree.ElementTree as ET

nyt_remove_words = ["photo", "graph", "chart", "map", "table", "drawing"]

def load_json(p, lower):
    """Split one tokenized JSON document into source and target sentences.

    The file must hold a top-level ``'sentences'`` list whose items carry a
    ``'tokens'`` list of dicts with a ``'word'`` key. Sentences read before
    the first ``@highlight`` marker form the source. Each marker opens a new
    target sentence, and every sentence that follows is appended to the most
    recently opened one.

    Args:
        p: Path of the JSON file to read.
        lower: When truthy, every token is lowercased before it is inspected.

    Returns:
        A ``(source, tgt)`` pair of lists of token lists. Each sentence is
        joined, passed through ``clean`` and split on whitespace again.
    """
    # Close the handle deterministically instead of relying on the GC.
    with open(p) as handle:
        sentences = json.load(handle)['sentences']

    source = []
    tgt = []
    in_highlights = False
    for sent in sentences:
        tokens = [t['word'] for t in sent['tokens']]
        if not tokens:
            # A sentence without tokens carries no text; skipping it avoids
            # the IndexError that ``tokens[0]`` would raise below.
            continue
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@highlight':
            in_highlights = True
            tgt.append([])
            continue
        if in_highlights:
            tgt[-1].extend(tokens)
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt


def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Greedily pick the document sentences that best overlap the abstract.

    At every step the sentence whose addition yields the highest combined
    unigram + bigram F-score against the abstract is added. Selection stops
    once ``summary_size`` sentences were chosen or no remaining sentence
    improves the score.

    Args:
        doc_sent_list: Document as a list of token lists.
        abstract_sent_list: Abstract as a list of token lists.
        summary_size: Maximum number of sentences to select.

    Returns:
        The selected sentence indices in ascending order. The early-exit
        path is sorted as well, so both exits return the same shape.
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    def _rouge_f(evaluated_ngrams, reference_ngrams):
        # The visible file never defines or imports ``cal_rouge``, so the
        # n-gram F-score is computed locally from the two n-gram sets.
        overlap = len(evaluated_ngrams & reference_ngrams)
        precision = overlap / len(evaluated_ngrams) if evaluated_ngrams else 0.0
        recall = overlap / len(reference_ngrams) if reference_ngrams else 0.0
        # The small epsilon keeps the division defined when both terms are 0.
        return 2.0 * ((precision * recall) / (precision + recall + 1e-8))

    # chain.from_iterable flattens in linear time; sum(list, []) is quadratic.
    abstract = list(itertools.chain.from_iterable(abstract_sent_list))
    abstract = _rouge_clean(' '.join(abstract)).split()
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]
    evaluated_1grams = [set(_get_word_ngrams(1, [sent])) for sent in sents]
    reference_1grams = set(_get_word_ngrams(1, [abstract]))
    evaluated_2grams = [set(_get_word_ngrams(2, [sent])) for sent in sents]
    reference_2grams = set(_get_word_ngrams(2, [abstract]))

    selected = []
    # N-grams already covered by the selected sentences, kept incrementally
    # instead of being rebuilt for every candidate.
    covered_1, covered_2 = set(), set()
    max_rouge = 0.0
    for _ in range(summary_size):
        cur_max_rouge = max_rouge
        cur_id = -1
        for i in range(len(sents)):
            if i in selected:
                continue
            rouge_1 = _rouge_f(covered_1 | evaluated_1grams[i], reference_1grams)
            rouge_2 = _rouge_f(covered_2 | evaluated_2grams[i], reference_2grams)
            rouge_score = rouge_1 + rouge_2
            if rouge_score > cur_max_rouge:
                cur_max_rouge = rouge_score
                cur_id = i
        if cur_id == -1:
            # No remaining sentence improves the score.
            break
        selected.append(cur_id)
        covered_1 |= evaluated_1grams[cur_id]
        covered_2 |= evaluated_2grams[cur_id]
        max_rouge = cur_max_rouge

    return sorted(selected)


class BertData():
    """Converts tokenized source/target sentences into BERT id sequences."""

    def __init__(self, args):
        """Load the uncased BERT tokenizer and cache special tokens and ids."""
        self.args = args
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        self.sep_token, self.cls_token, self.pad_token = '[SEP]', '[CLS]', '[PAD]'
        self.tgt_bos, self.tgt_eos, self.tgt_sent_split = '[unused0]', '[unused1]', '[unused2]'

        vocab = self.tokenizer.vocab
        self.sep_vid = vocab[self.sep_token]
        self.cls_vid = vocab[self.cls_token]
        self.pad_vid = vocab[self.pad_token]

    def preprocess(self, src, tgt, sent_labels, use_bert_basic_tokenizer=False, is_test=False):
        """Build the model inputs for one document.

        Args:
            src: Source sentences as a list of token lists.
            tgt: Target sentences as a list of token lists.
            sent_labels: Indices into ``src`` that are labelled 1.
            use_bert_basic_tokenizer: Forwarded to the tokenizer for targets.
            is_test: When true, the length-based rejections are skipped.

        Returns:
            ``None`` when a non-test example is rejected, otherwise the tuple
            ``(src ids, sentence labels, tgt ids, segment ids, [CLS]
            positions, source texts, target text)``.
        """
        cfg = self.args
        training = not is_test

        if training and len(src) == 0:
            return None

        original_src_txt = [' '.join(words) for words in src]

        # Keep only sentences longer than the configured minimum.
        idxs = [pos for pos, words in enumerate(src)
                if len(words) > cfg.min_src_ntokens_per_sent]

        # Expand the selected indices into a 0/1 flag per source sentence.
        label_mask = [0] * len(src)
        for picked in sent_labels:
            label_mask[picked] = 1

        # Clip each sentence, then cap the number of sentences.
        kept = idxs[:cfg.max_src_nsents]
        src = [src[pos][:cfg.max_src_ntokens_per_sent] for pos in kept]
        sent_labels = [label_mask[pos] for pos in kept]

        if training and len(src) < cfg.min_src_nsents:
            return None

        src_txt = [' '.join(words) for words in src]
        joiner = ' {} {} '.format(self.sep_token, self.cls_token)
        src_subtokens = [self.cls_token] + self.tokenizer.tokenize(joiner.join(src_txt)) + [self.sep_token]
        src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)

        # The distance between consecutive [SEP] positions is the length of a
        # segment; segment ids alternate 0/1 from one segment to the next.
        sep_positions = [-1] + [pos for pos, tok in enumerate(src_subtoken_idxs) if tok == self.sep_vid]
        segments_ids = []
        for seg_no, (prev, cur) in enumerate(zip(sep_positions, sep_positions[1:])):
            segments_ids.extend([seg_no % 2] * (cur - prev))

        cls_ids = [pos for pos, tok in enumerate(src_subtoken_idxs) if tok == self.cls_vid]
        sent_labels = sent_labels[:len(cls_ids)]

        tgt_pieces = []
        for tt in tgt:
            pieces = self.tokenizer.tokenize(' '.join(tt), use_bert_basic_tokenizer=use_bert_basic_tokenizer)
            tgt_pieces.append(' '.join(pieces))
        tgt_subtokens_str = '[unused0] ' + ' [unused2] '.join(tgt_pieces) + ' [unused1]'
        tgt_subtoken = tgt_subtokens_str.split()[:cfg.max_tgt_ntokens]
        if training and len(tgt_subtoken) < cfg.min_tgt_ntokens:
            return None

        tgt_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(tgt_subtoken)

        tgt_txt = '<q>'.join(' '.join(tt) for tt in tgt)
        # The returned texts cover every sentence that passed the length
        # filter, including those beyond the sentence cap.
        src_txt = [original_src_txt[pos] for pos in idxs]

        return src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt


def format_to_bert(args):
    """Convert every matching JSON shard in ``args.raw_path`` to a ``.bert.pt`` file.

    For each corpus type (``args.dataset`` when it is non-empty, otherwise
    train/valid/test), the shards matching ``*<corpus_type>.*.json`` are
    handed to ``_format_to_bert`` through a pool of ``args.n_cpus`` workers.
    Outputs are written to ``args.save_path`` under the shard's own name with
    the trailing ``json`` replaced by ``bert.pt``.

    Args:
        args: Settings object; reads ``dataset``, ``raw_path``, ``save_path``
            and ``n_cpus`` here and is forwarded to the workers.
    """
    datasets = [args.dataset] if args.dataset != '' else ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            # basename works with any path separator, unlike split('/').
            real_name = os.path.basename(json_f)
            # The glob pattern guarantees a trailing '.json'. Only that
            # extension is swapped, so a 'json' elsewhere in the name stays.
            save_name = real_name[:-len('json')] + 'bert.pt'
            a_lst.append((corpus_type, json_f, args, pjoin(args.save_path, save_name)))
        print(a_lst)
        pool = Pool(args.n_cpus)
        try:
            # The workers write their own output; the results are discarded.
            for _ in pool.imap(_format_to_bert, a_lst):
                pass
        finally:
            # Always release the worker processes, even if a task failed.
            pool.close()
            pool.join()


def _format_to_bert(params):
    """Convert one JSON shard into a list of BERT examples saved with ``torch.save``.

    Args:
        params: ``(corpus_type, json_file, args, save_file)``. Nothing is done
            when ``save_file`` already exists.

    Each record of the shard must provide ``'src'`` and ``'tgt'``. Sentence
    labels come from ``greedy_selection`` on the first ``args.max_src_nsents``
    source sentences (computed before optional lowercasing); records rejected
    by ``BertData.preprocess`` are skipped.
    """
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    if os.path.exists(save_file):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
    # Close the shard as soon as it is parsed instead of leaking the handle.
    with open(json_file) as handle:
        jobs = json.load(handle)
    datasets = []
    for d in jobs:
        source, tgt = d['src'], d['tgt']

        sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
        if args.lower:
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer,
                                 is_test=is_test)

        if b_data is None:
            continue
        src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        b_data_dict = {"src": src_subtoken_idxs, "tgt": tgt_subtoken_idxs,
                       "src_sent_labels": sent_labels, "segs": segments_ids, 'clss': cls_ids,
                       'src_txt': src_txt, "tgt_txt": tgt_txt}
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    # Drop the reference and collect so the examples can be freed before
    # this worker picks up its next shard.
    datasets = []
    gc.collect()


def format_to_lines(args):
    """Group tokenized documents by split and write them out as JSON shards.

    The files ``mapping_{valid,test,train}.txt`` in ``args.map_path`` list one
    entry per line; each entry is hashed and compared with the part of every
    ``*.json`` file name in ``args.raw_path`` that precedes the first dot. A
    file goes to the first split (valid, test, train) whose mapping contains
    it. Each split is then loaded through a pool of ``args.n_cpus`` workers
    and written to ``<save_path>.<split>.<n>.json``.

    Args:
        args: Settings object; reads ``map_path``, ``raw_path``, ``save_path``,
            ``n_cpus`` and ``shard_size`` here and is forwarded to the workers.
    """
    def _hashhex(s):
        # The visible file calls ``hashhex`` without defining it. This is the
        # SHA-1 hex digest of the UTF-8 encoded string.
        # NOTE(review): confirm it matches how the raw files were named.
        digest = hashlib.sha1()
        digest.update(s.encode('utf-8'))
        return digest.hexdigest()

    def _write_shard(corpus_type, shard_id, items):
        # One JSON array per shard file.
        pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, shard_id)
        with open(pt_file, 'w') as save:
            save.write(json.dumps(items))

    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        with open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')) as mapping:
            corpus_mapping[corpus_type] = {_hashhex(line.strip()) for line in mapping}

    corpora = {'train': [], 'valid': [], 'test': []}
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = os.path.basename(f).split('.')[0]
        # Same precedence as before: valid, then test, then train.
        for corpus_type in ('valid', 'test', 'train'):
            if real_name in corpus_mapping[corpus_type]:
                corpora[corpus_type].append(f)
                break

    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        dataset = []
        p_ct = 0
        pool = Pool(args.n_cpus)
        try:
            for d in pool.imap_unordered(_format_to_lines, a_lst):
                dataset.append(d)
                # Unchanged threshold: a shard is flushed once it holds more
                # than ``shard_size`` items, i.e. shard_size + 1 records.
                if len(dataset) > args.shard_size:
                    _write_shard(corpus_type, p_ct, dataset)
                    p_ct += 1
                    dataset = []
        finally:
            # Always release the worker processes, even if a task failed.
            pool.close()
            pool.join()

        if dataset:
            _write_shard(corpus_type, p_ct, dataset)
                
def _format_to_lines(params):
    """Load one tokenized file and wrap its sentences in a ``src``/``tgt`` record.

    ``params`` is a ``(path, args)`` pair; only ``args.lower`` is read here.
    """
    path, options = params
    src_sents, tgt_sents = load_json(path, options.lower)
    return dict(src=src_sents, tgt=tgt_sents)

In [None]:
def custom_format_to_lines(args, id_start=36, id_end=-11):
    """Write one single-record JSON file per tokenized document.

    Every ``*.json`` file in ``args.raw_path`` is loaded with
    ``_format_to_lines`` and saved as ``<save_path>.test.<id>.json``, where the
    file content is a one-element JSON list.

    Args:
        args: Settings object; reads ``raw_path`` and ``save_path`` here and is
            forwarded to ``_format_to_lines``.
        id_start: Index in the globbed path where the document id starts.
        id_end: Index in the globbed path where the document id ends.

    The defaults reproduce the original hard-coded ``[36:-11]`` slice, which
    only fits paths of one particular length. Pass offsets that match your
    own ``raw_path`` and file suffix.
    """
    corpus_type = 'test'
    for path in glob.glob(pjoin(args.raw_path, '*.json')):
        record = [_format_to_lines([path, args])]
        pt_file = "{:s}.{:s}.{:s}.json".format(args.save_path, corpus_type, path[id_start:id_end])
        with open(pt_file, 'w') as save:
            save.write(json.dumps(record))

# Preprocess

In [None]:
import argparse
import time

# Download the `src` folder from the repo https://github.com/nlpyang/PreSumm/tree/master/src,
# placing it in the current directory in order to access the `others` and `prepro` modules
from others.logging import init_logger
from prepro import data_builder

def do_tokenize(args):
    """Run ``data_builder.tokenize(args)``, printing a timer reading before and after.

    ``time.clock`` was removed in Python 3.8; ``time.perf_counter`` is its
    documented replacement.
    """
    print(time.perf_counter())
    data_builder.tokenize(args)
    print(time.perf_counter())
    
def do_format_to_lines(args):
    """Run ``custom_format_to_lines(args)``, printing a timer reading before and after.

    ``time.clock`` was removed in Python 3.8; ``time.perf_counter`` is its
    documented replacement.
    """
    print(time.perf_counter())
    custom_format_to_lines(args)
    print(time.perf_counter())

def do_format_to_bert(args):
    """Run ``format_to_bert(args)``, printing a timer reading before and after.

    ``time.clock`` was removed in Python 3.8; ``time.perf_counter`` is its
    documented replacement.
    """
    print(time.perf_counter())
    format_to_bert(args)
    print(time.perf_counter())

In [None]:
import time
import sys

# Settings for the tokenization step, held as plain class attributes.
class args:
    mode = 'tokenize'
    oracle_mode = 'greedy'
    map_path = './urls'
    save_path = './tokenized/'
    raw_path = './raw_data/'
    shard_size = 2000
    min_nsents = 3
    max_nsents = 100
    min_src_ntokens = 5
    max_src_ntokens = 200
    lower = True
    log_file = './logs/cnndm.log'
    dataset = 'test'
    n_cpus = 1


do_tokenize(args)

In [None]:
import time
import sys

# Settings for the tokenized -> JSON step, held as plain class attributes.
class args:
    mode = 'do_format_to_lines'
    oracle_mode = 'greedy'
    map_path = '../urls'
    save_path = './json_data/'
    raw_path = './tokenized/'
    shard_size = 2000
    min_nsents = 3
    max_nsents = 100
    min_src_ntokens = 5
    max_src_ntokens = 200
    lower = True
    log_file = './logs/cnndm.log'
    dataset = 'test'
    n_cpus = 1


do_format_to_lines(args)

In [None]:
# Settings for the JSON -> BERT tensors step, held as plain class attributes.
class args:
    mode = 'do_format_to_bert'
    oracle_mode = 'greedy'
    map_path = '../urls'
    save_path = './bert_data/'
    raw_path = './json_data/'
    shard_size = 2000
    min_src_nsents = 3
    max_src_nsents = 100
    min_src_ntokens_per_sent = 5
    max_src_ntokens_per_sent = 200
    lower = True
    log_file = './logs/cnndm.log'
    use_bert_basic_tokenizer = False
    dataset = 'test'
    n_cpus = 2
    min_tgt_ntokens = 5
    max_tgt_ntokens = 500


do_format_to_bert(args)

# Test

### BertSumExt

In [None]:
!python3 src/train.py -report_rouge=False -mode test -encoder bert -task ext -test_from ./models/bertext_cnndm_transformer.pt -bert_data_path ./bert_data -log_file ./logs/bertext_cnndm_transformer -result_path='./results/cnndm'

### BertSumExtAbs

In [None]:
!python3 src/train.py -report_rouge=False -mode test -task abs -test_from ./models/cnn_abs/model_step_148000.pt -model_path ./models/cnn_abs -bert_data_path ./bert_data -log_file ./logs/bertext_cnndm_abs_transformer -result_path='./results/cnndm_abs'

### TransformerAbs

In [None]:
!python3 src/train.py -report_rouge=False -mode test -task abs -test_from ./models/cnn_abs_baseline/cnndm_baseline_best.pt -model_path ./models/cnn_abs_baseline -bert_data_path ./bert_data -log_file ./logs/cnn_abs_baseline -result_path='./results/cnndm_abs_baseline'

# Validation

In [None]:
# One-time setup (kept commented out): clone the NUBIA repository.
#!git clone https://github.com/wl-research/nubia.git
# `nubia_score` is imported from inside the clone, so the working directory
# is switched first. NOTE(review): presumably because the package is not
# installed on the import path; confirm.
os.chdir('nubia')
# One-time setup (kept commented out): install NUBIA's requirements.
#!pip install -r requirements.txt
from nubia_score import Nubia
# Module-level scorer instance; `eval` reads this global for the NUBIA metric.
nubia = Nubia()
# Step back out of the clone to the notebook's previous working directory.
%cd ..

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
from sumeval.metrics.rouge import RougeCalculator
from xml.etree import ElementTree
from xml.dom import minidom
from functools import reduce
from xml.etree.ElementTree import Element, SubElement, Comment


def eval(
    reference_summary, model_summary, metrics=("ROUGE_1", "ROUGE_2", "ROUGE_L", "NUBIA", "BLEURT")):
    """Score one generated summary against its reference.

    Note that this function shadows the built-in ``eval`` inside this module.

    Args:
        reference_summary: Reference text.
        model_summary: Generated text to evaluate.
        metrics: Names of the metrics to compute. Recognised names are
            ``ROUGE_1``, ``ROUGE_2``, ``ROUGE_L``, ``NUBIA`` and ``BLEURT``;
            any other name is ignored.

    Returns:
        ``(rouge_1, rouge_2, rouge_l, nubia_score, bleurt_score)`` where every
        metric that was not requested is ``None``. ``bleurt_score`` is the
        one-element list returned by the scorer.
    """
    rouge_1 = rouge_2 = rouge_l = nubia_score = bleurt_score = None

    # Only build the ROUGE calculator when a ROUGE metric is requested.
    if any(name in metrics for name in ("ROUGE_1", "ROUGE_2", "ROUGE_L")):
        rouge = RougeCalculator(stopwords=True, lang="en")
        if "ROUGE_1" in metrics:
            # The reference is wrapped in a list, as for ROUGE-2 and ROUGE-L.
            rouge_1 = rouge.rouge_n(summary=model_summary, references=[reference_summary], n=1)
        if "ROUGE_2" in metrics:
            rouge_2 = rouge.rouge_n(summary=model_summary, references=[reference_summary], n=2)
        if "ROUGE_L" in metrics:
            rouge_l = rouge.rouge_l(summary=model_summary, references=[reference_summary])

    if "NUBIA" in metrics:
        # `nubia` is the module-level scorer created in the setup cell.
        nubia_score = nubia.score(reference_summary, model_summary)

    if "BLEURT" in metrics:
        # NOTE(review): `scorer` is not defined anywhere in the visible file,
        # so this raises NameError unless a BLEURT scorer is created first.
        bleurt_score = scorer.score([reference_summary], [model_summary])
        assert isinstance(bleurt_score, list) and len(bleurt_score) == 1

    return rouge_1, rouge_2, rouge_l, nubia_score, bleurt_score

def prettify(elem):
    """Serialize *elem* and return it as an indented, human-readable XML string."""
    raw_bytes = ElementTree.tostring(elem, 'utf-8')
    return minidom.parseString(raw_bytes).toprettyxml(indent="  ")
  
def create_report_valid(
    summary_array, references_summary, article, name_file,
    metrics=("ROUGE_1", "ROUGE_2", "ROUGE_L", "NUBIA", "BLEURT")):
    """Score every summary and write an XML report to ``name_file``.

    One ``<example>`` element is emitted per summary, holding the article,
    the reference, the summary and an ``<eval>`` element with one child per
    computed metric. The mean of each metric is stored as an attribute on the
    root element.

    Args:
        summary_array: Generated summaries.
        references_summary: Reference texts, indexed like ``summary_array``.
        article: Source texts, indexed like ``summary_array``.
        name_file: Path of the XML file to write.
        metrics: Metric names forwarded to ``eval``.

    An empty summary scores 0 on every requested metric and is not passed to
    ``eval``. Previously that branch raised TypeError, because its chained
    assignment tried to unpack the integer 0.
    """
    # (name accepted in `metrics`, XML tag of the score, root attribute)
    layout = (
        ("ROUGE_1", "ROUGE_1", "rouge_1"),
        ("ROUGE_2", "ROUGE_2", "rouge_2"),
        ("ROUGE_L", "ROUGE_l", "rouge_L"),
        ("NUBIA", "NUBIA", "NUBIA"),
        ("BLEURT", "BLEURT", "BLEURT"),
    )
    collected = {key: [] for key, _, _ in layout}

    top = Element('ZakSum')
    top.append(Comment('Generated by Amr Zaki'))

    for i, summ in enumerate(summary_array):
        example = SubElement(top, 'example')
        SubElement(example, 'article').text = article[i]
        SubElement(example, 'reference').text = references_summary[i]
        SubElement(example, 'summary').text = summ

        if len(summ) != 0:
            rouge_1, rouge_2, rouge_L, nubia_score, bleurt_score = eval(
                references_summary[i], summ, metrics=metrics)
            if bleurt_score is not None:
                # `eval` returns BLEURT as a one-element list.
                bleurt_score = bleurt_score[0]
            scores = {"ROUGE_1": rouge_1, "ROUGE_2": rouge_2, "ROUGE_L": rouge_L,
                      "NUBIA": nubia_score, "BLEURT": bleurt_score}
        else:
            scores = {key: (0 if key in metrics else None) for key, _, _ in layout}

        eval_element = SubElement(example, 'eval')
        for key, tag, _ in layout:
            value = scores[key]
            if value is not None:
                SubElement(eval_element, tag, {'score': str(value)})
                collected[key].append(value)

    # Plain-Python mean: `np` is never imported in the visible file.
    for key, _, attr in layout:
        values = collected[key]
        if values:
            top.set(attr, str(sum(values) / len(values)))

    # The XML declaration written by `prettify` names no encoding, which XML
    # readers interpret as UTF-8, so the file is written as UTF-8.
    with open(name_file, "w", encoding="utf-8") as f:
        print(prettify(top), file=f)

In [5]:
def evaluation(candidates, references, sources, algorithm):
    """Write the validation report for one algorithm to ``./validation/<algorithm>1.xml``.

    Args:
        candidates: Generated summaries.
        references: Reference texts, aligned with ``candidates``.
        sources: Source texts, aligned with ``candidates``.
        algorithm: Label used to build the report file name.
    """
    # "BLEU" matches no branch of `eval` in this file, so it has no effect.
    metrics = ["ROUGE_1", "ROUGE_2", "ROUGE_L", "BLEU", "NUBIA"]
    name_file = "./validation/{}1.xml".format(algorithm)
    # Create the report directory up front so opening the file cannot fail
    # just because it is missing.
    os.makedirs(os.path.dirname(name_file), exist_ok=True)
    # `create_report_valid` is defined in this notebook; no `rouge` module is
    # imported in the visible file, so it is called directly.
    create_report_valid(
            candidates, references, sources,
            name_file=name_file,
            metrics=metrics)

In [None]:
def sort_candidate_files(file, json_dir="./json_data", id_start=15, id_end=-5):
    """Rewrite a candidates file so its lines follow numeric patent-id order.

    The candidate texts are generated in the lexicographic order of the JSON
    file names (0, 1, 10, 100, ..., 2) rather than in numeric order
    (0, 1, 2, 3, ...), so they do not line up with the original summaries and
    titles. Passing them to `evaluation` as they are would compare the wrong
    texts and give wrong results.

    This function lists the JSON files in sorted (lexicographic) order,
    extracts the patent id from each name, and pairs the i-th id with the
    i-th candidate line: [[0, <patent0>], [1, <patent1>], [10, <patent2>], ...].
    The pairs are then sorted by the integer id, which yields the same order
    as the original summaries and titles:
    [[0, <patent0>], [1, <patent1>], [2, <patent2>], ...].

    The result is written to the original name + ".sorted", e.g.
    "bert_cnn.candidate.sorted". The output is overwritten on every call, so
    running the cell again does not duplicate lines.

    Args:
        file: Path of the candidates file, one candidate per line.
        json_dir: Directory whose file names carry the patent ids.
        id_start: Index in each file name where the id starts.
        id_end: Index in each file name where the id ends.

    Raises:
        ValueError: If the number of JSON files differs from the number of
            candidate lines, since the pairing would then be misaligned.
    """
    ids = [name[id_start:id_end] for name in sorted(os.listdir(json_dir))]
    with open(file, "r") as src:
        candidates = src.readlines()

    if len(ids) != len(candidates):
        raise ValueError(
            "found {} JSON files but {} candidate lines".format(len(ids), len(candidates)))

    ordered = sorted(zip(ids, candidates), key=lambda pair: int(pair[0]))

    with open(file + ".sorted", "w") as dst:
        dst.writelines(text for _, text in ordered)

In [None]:
# The function is defined as `sort_candidate_files` (plural).
sort_candidate_files("./results/cnndm_baseline.0.candidate")

In [None]:
# Read the three aligned inputs, closing each file once it is loaded.
with open("./resumo.valid.txt") as patents_file:
    patents = patents_file.readlines()
with open("./titulo.valid.txt") as titles_file:
    titles = titles_file.readlines()
# NOTE(review): the sorting cell writes
# "./results/cnndm_baseline.0.candidate.sorted", but this path points at the
# current directory. Confirm the file is copied there before running.
with open("cnndm_baseline.0.candidate.sorted", "r") as candidates_file:
    candidates_texts = candidates_file.readlines()
# The titles are the references and the patents are the sources.
evaluation(candidates_texts, titles, patents, 'bert_cnn_baseline')