In [1]:
import json
from nltk.stem.porter import PorterStemmer

['inspec', 'kp20k', 'krapivin', 'nus', 'semeval']
['train', 'valid', 'test']

dataset = 'kp20k'
dataset_type = 'test'
dataset_file = '%s/%s.json' % (dataset, dataset_type)
with open(dataset_file, 'r') as f:
    lines = f.readlines()
    
obj = json.loads(lines[0])
dataset_file, len(lines), obj.keys(), obj.values()

('kp20k/test.json',
 19987,
 dict_keys(['abstract', 'keywords', 'title']),
 dict_values(['A feedback vertex set of a graph G is a set S  of its vertices such that the subgraph induced by V(G)?S is a forest. The cardinality of a minimum feedback vertex set of G  is denoted by ?(G). A graph G is 2-degenerate  if each subgraph G? of G has a vertex v  such that dG?(v)?2. In this paper, we prove that ?(G)?2n/5 for any 2-degenerate n-vertex graph G and moreover, we show that this bound is tight. As a consequence, we derive a polynomial time algorithm, which for a given 2-degenerate n-vertex graph returns its feedback vertex set of cardinality at most 2n/5.', 'feedback vertex set;decycling set;2-degenerate graphs', 'A feedback vertex set of 2-degenerate graphs']))

In [2]:
obj['abstract'], \
obj['keywords'], \
';'.join(obj['keywords']) \
    if dataset == 'kp20k' and dataset_type == 'train' \
    else r'''type of obj['keywords'] is str'''

('A feedback vertex set of a graph G is a set S  of its vertices such that the subgraph induced by V(G)?S is a forest. The cardinality of a minimum feedback vertex set of G  is denoted by ?(G). A graph G is 2-degenerate  if each subgraph G? of G has a vertex v  such that dG?(v)?2. In this paper, we prove that ?(G)?2n/5 for any 2-degenerate n-vertex graph G and moreover, we show that this bound is tight. As a consequence, we derive a polynomial time algorithm, which for a given 2-degenerate n-vertex graph returns its feedback vertex set of cardinality at most 2n/5.',
 'feedback vertex set;decycling set;2-degenerate graphs',
 "type of obj['keywords'] is str")

In [3]:
import string

def replace_punc_with_space(src: str, keep_puc=r""",.'()"""):
    punc = ''.join(set(string.punctuation) - set(keep_puc))
    return src.translate(str.maketrans(punc, ' ' * len(punc)))

tmp_s = '2-Source Dispersers for Sub-Polynomial Entropy and Ramsey Graphs Beating the Frankl-Wilson Construction. The main result of this paper is an explicit disperser for two independent sources on n bits, each of entropy k = n o(1). Put differently, setting N = 2n and K = 2k ,'.lower()

tmp_s = replace_punc_with_space(tmp_s, r""",.'()""")
tmp_s

'2 source dispersers for sub polynomial entropy and ramsey graphs beating the frankl wilson construction. the main result of this paper is an explicit disperser for two independent sources on n bits, each of entropy k   n o(1). put differently, setting n   2n and k   2k ,'

In [4]:
def add_space_besides_punc(src: str):
    src = src.replace(',', ' , ').replace(')', ' ) ').replace('(', ' ( ').replace('\'', ' \' ')

    src_word_list = src.split(' ')
    new_src_word_list = []
    for word in src_word_list:
        if '.' in word:
            if word[-1] == '.' and word[0] != '.' and word.count('.') > 1 and word.replace('.', '').isalpha():  # 单词末尾和内部均有句号且单词仅由句号和字母构成，即e.g.，i.e.一类的词
                new_src_word_list.append(word)
            elif word[-1] == '.' and word[0] == '.':  # 如.net.一类的词
                new_src_word_list.append('.' + word[1:].replace('.', ' . '))
            else:  # 不是缩写，如1.2，abc.，2.，1.2.，one2set.等
                new_src_word_list.append(word.replace('.', ' . '))
        else:
            new_src_word_list.append(word)
    return ' '.join(new_src_word_list)

tmp_s = add_space_besides_punc(tmp_s)
tmp_s

'2 source dispersers for sub polynomial entropy and ramsey graphs beating the frankl wilson construction .  the main result of this paper is an explicit disperser for two independent sources on n bits ,  each of entropy k   n o ( 1 ) . put differently ,  setting n   2n and k   2k  , '

In [5]:
def replace_number_with_token(src: str, replacement='<digit>'):
    src_str_list = src.split(' ')
    new_src_str_list = []
    for src_str in src_str_list:
        if src_str.isnumeric():
            new_src_str_list.append(replacement)
        else:
            new_src_str_list.append(src_str)
    return ' '.join(new_src_str_list)

tmp_s = replace_number_with_token(tmp_s)
replace_number_with_token('1 123123 1231aaa 1 1 132 s1'), tmp_s, ' '.join(tmp_s.split())

('<digit> <digit> 1231aaa <digit> <digit> <digit> s1',
 '<digit> source dispersers for sub polynomial entropy and ramsey graphs beating the frankl wilson construction .  the main result of this paper is an explicit disperser for two independent sources on n bits ,  each of entropy k   n o ( <digit> ) . put differently ,  setting n   2n and k   2k  , ',
 '<digit> source dispersers for sub polynomial entropy and ramsey graphs beating the frankl wilson construction . the main result of this paper is an explicit disperser for two independent sources on n bits , each of entropy k n o ( <digit> ) . put differently , setting n 2n and k 2k ,')

In [6]:
import re

def process_text(one_piece_data):
    one_piece_src = one_piece_data['title'] + ' . <eos> ' + one_piece_data['abstract'] + '.'
    one_piece_src = re.sub('\.\.$', '.', one_piece_src)
    one_piece_src = one_piece_src.replace('. . <eos>', '. <eos>')
    one_piece_src = one_piece_src.lower()
    one_piece_src = replace_punc_with_space(one_piece_src, r""",.'()""")  # 把除','，'.'，'('，')'，'\''以外的符号都替换为空格
    one_piece_src = add_space_besides_punc(one_piece_src)  # 在标点周围放上空格
    one_piece_src = replace_number_with_token(one_piece_src)  # 将单独的数字都替换为 <digit>
    one_piece_src = ' '.join(one_piece_src.split())  # 连续空格保留一个
    one_piece_src = one_piece_src.replace(' . eos ', ' . <eos> ', 1)
    one_piece_src = re.sub('\. \.$', '.', one_piece_src)
    return one_piece_src

processed_src_sample = process_text(obj)
processed_src_sample

'a feedback vertex set of <digit> degenerate graphs . <eos> a feedback vertex set of a graph g is a set s of its vertices such that the subgraph induced by v ( g ) s is a forest . the cardinality of a minimum feedback vertex set of g is denoted by ( g ) . a graph g is <digit> degenerate if each subgraph g of g has a vertex v such that dg ( v ) <digit> . in this paper , we prove that ( g ) 2n <digit> for any <digit> degenerate n vertex graph g and moreover , we show that this bound is tight . as a consequence , we derive a polynomial time algorithm , which for a given <digit> degenerate n vertex graph returns its feedback vertex set of cardinality at most 2n <digit> .'

In [7]:
x = '12314... 3123..'
re.sub('\.\.$', '.', x)

'12314... 3123.'

In [8]:
def process_kp_list(kp_list: list):
    res = []
    for kp in kp_list:
        kp = kp.lower()
        kp = replace_punc_with_space(kp)
        kp = add_space_besides_punc(kp)
        kp = replace_number_with_token(kp)
        kp = ' '.join(kp.split())
        kp = kp.strip(' ')
        res.append(kp)
    return res

processed_trg_sample = process_kp_list(obj['keywords'] 
                                       if dataset == 'kp20k' and dataset_type == 'train' 
                                       else obj['keywords'].split(';'))
processed_trg_sample

['feedback vertex set', 'decycling set', '<digit> degenerate graphs']

In [9]:
def separate_p_and_a(src: str, kp_list: list):
    porter_stemmer = PorterStemmer()

    src_stemmed_word_list = [porter_stemmer.stem(word) for word in src.split(' ')]
    present = []
    absent = []
    for kp in kp_list:
        kp_stemmed_word_list = [porter_stemmer.stem(word) for word in kp.split(' ')]
        if not kp_stemmed_word_list:  # kp为空
            absent.append(kp)
        else:
            is_present = False
            for src_start_idx in range(len(src_stemmed_word_list) - len(kp_stemmed_word_list) + 1):
                is_present = True
                for kp_i, kp_w in enumerate(kp_stemmed_word_list):
                    src_w = src_stemmed_word_list[src_start_idx + kp_i]
                    if src_w != kp_w:
                        is_present = False
                        break
                if is_present:
                    break
            if is_present:
                present.append(kp)
            else:
                absent.append(kp)
    return present, absent

separate_p_and_a(processed_src_sample, processed_trg_sample) \
    if dataset == 'kp20k' and (dataset_type == 'train' or 'valid') \
    else 'func `separate_p_and_a` will not be used.'

(['feedback vertex set', '<digit> degenerate graphs'], ['decycling set'])

In [10]:
src = []
trg = []
for i, line in enumerate(lines):
    one_piece_data = json.loads(line)

    one_piece_src = process_text(one_piece_data)
    src.append(one_piece_src + '\n')

    if dataset == 'kp20k' and dataset_type == 'train':
        one_piece_trg = process_kp_list(one_piece_data['keywords'])
        pre_kp, ab_kp = separate_p_and_a(one_piece_src, one_piece_trg)
        new_kp_list = pre_kp + ['<peos>'] + ab_kp
        trg.append(';'.join(new_kp_list) + '\n')
    elif dataset == 'kp20k' and dataset_type == 'valid':
        one_piece_trg = process_kp_list(one_piece_data['keywords'].split(';'))
        pre_kp, ab_kp = separate_p_and_a(one_piece_src, one_piece_trg)
        new_kp_list = pre_kp + ['<peos>'] + ab_kp
        trg.append(';'.join(new_kp_list) + '\n')
    else:
        one_piece_trg = process_kp_list(one_piece_data['keywords'].split(';'))
        trg.append(';'.join(one_piece_trg) + '\n')
    print('{} / {}'.format(i+1, len(lines)))

len(src)

1 / 19987
2 / 19987
3 / 19987
4 / 19987
5 / 19987
6 / 19987
7 / 19987
8 / 19987
9 / 19987
10 / 19987
11 / 19987
12 / 19987
13 / 19987
14 / 19987
15 / 19987
16 / 19987
17 / 19987
18 / 19987
19 / 19987
20 / 19987
21 / 19987
22 / 19987
23 / 19987
24 / 19987
25 / 19987
26 / 19987
27 / 19987
28 / 19987
29 / 19987
30 / 19987
31 / 19987
32 / 19987
33 / 19987
34 / 19987
35 / 19987
36 / 19987
37 / 19987
38 / 19987
39 / 19987
40 / 19987
41 / 19987
42 / 19987
43 / 19987
44 / 19987
45 / 19987
46 / 19987
47 / 19987
48 / 19987
49 / 19987
50 / 19987
51 / 19987
52 / 19987
53 / 19987
54 / 19987
55 / 19987
56 / 19987
57 / 19987
58 / 19987
59 / 19987
60 / 19987
61 / 19987
62 / 19987
63 / 19987
64 / 19987
65 / 19987
66 / 19987
67 / 19987
68 / 19987
69 / 19987
70 / 19987
71 / 19987
72 / 19987
73 / 19987
74 / 19987
75 / 19987
76 / 19987
77 / 19987
78 / 19987
79 / 19987
80 / 19987
81 / 19987
82 / 19987
83 / 19987
84 / 19987
85 / 19987
86 / 19987
87 / 19987
88 / 19987
89 / 19987
90 / 19987
91 / 19987
92 / 199

19987

In [11]:
with open('%s/process-out/%s_src.txt' % (dataset, dataset_type), 'w') as f:
    f.writelines(src)

with open('%s/process-out/%s_trg.txt' % (dataset, dataset_type), 'w') as f:
    f.writelines(trg)