In [6]:
def filter_news(string):
    """Tell whether a text contains one of the markers used to reject an entry.

    The match is case-insensitive: the text is lower-cased and then searched
    for each marker as a plain substring.

    Args:
        string: text to inspect.

    Returns:
        True if any marker occurs in the lower-cased text, False otherwise.
    """
    lowered = string.lower()
    markers = (
        'javascript is disabled',
        'requires javascript',
        'javascript',
        # NOTE(review): presumably meant to flag French-language text -- confirm.
        'président',
    )
    return any(marker in lowered for marker in markers)

import re

def cleaning(string):
    """Flatten a text onto a single line with single-space separation.

    Every run of spaces, tabs, carriage returns and newlines is collapsed
    into one space, and leading/trailing whitespace is stripped.

    Tabs are normalised as well as newlines because the cleaned text is
    written out as one field of a tab-separated file: an embedded tab would
    add an extra column to that row.

    Args:
        string: raw text.

    Returns:
        The cleaned, single-line text (possibly empty).
    """
    return re.sub(r'[ \t\r\n]+', ' ', string).strip()

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin

# Parse the gin operative config file into gin's global configuration.
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# Path of the SentencePiece model file; it is passed as
# `sentencepiece_model_path` when the 'news_dataset' task is registered.
vocab = 'sp10m.cased.t5.model'

In [13]:
import json

# Read the raw data. The encoding is given explicitly so that non-ASCII text
# does not depend on the platform's default encoding.
with open('news-data.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# data['before'] and data['after'] are index-aligned lists; entry i of each
# forms one (before, after) pair.
befores, afters = data['before'], data['after']
results = []
for i in range(len(befores)):
    # Entries whose 'before' text matches a filter_news marker are dropped.
    if filter_news(befores[i]):
        continue
    before, after = cleaning(befores[i]), cleaning(afters[i])
    # Emptiness is tested AFTER cleaning: a whitespace-only entry is non-empty
    # in its raw form but cleans to '', which would otherwise be written out
    # as an empty field.
    if before and after:
        results.append((before, after))

# One pair per line, tab-separated: "<before>\t<after>".
with tf.io.gfile.GFile('news-title.tsv', "w") as outfile:
    for before, after in results:
        outfile.write("%s\t%s\n" % (before, after))

In [14]:
def news_dataset(split, shuffle_files = False):
    """Build a dataset of {'question', 'answer'} examples from 'news-title.tsv'.

    Each line of the file is parsed as two tab-separated string fields
    (quote characters are not treated specially); the first field becomes
    'question' and the second 'answer'.

    Args:
        split: accepted for the dataset-function signature; not used here,
            the same file is read regardless of its value.
        shuffle_files: accepted for the dataset-function signature; discarded.

    Returns:
        A tf.data dataset whose elements are dicts with the keys 'question'
        and 'answer'.
    """
    del shuffle_files

    parse_row = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def to_example(*fields):
        # Pair the parsed columns with their field names, in order.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.data.TextLineDataset(['news-title.tsv'])
    rows = lines.map(
        parse_row,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(to_example)


def news_preprocessor(ds):
    """Map {'question', 'answer'} examples to {'inputs', 'targets'} examples.

    'inputs' is the 'question' string with the literal prefix 'tajuk: '
    prepended; 'targets' is the 'answer' string unchanged.

    Args:
        ds: dataset whose elements are dicts with 'question' and 'answer'.

    Returns:
        The mapped dataset, with elements holding 'inputs' and 'targets'.
    """
    def _prefix_example(example):
        prefixed = tf.strings.join(['tajuk: ', example['question']])
        return {'inputs': prefixed, 'targets': example['answer']}

    parallelism = tf.data.experimental.AUTOTUNE
    return ds.map(_prefix_example, num_parallel_calls = parallelism)

In [15]:
# Drop any existing 'news_dataset' registration before adding it again
# (presumably so that re-running this cell does not clash with an earlier
# registration under the same name -- confirm against the t5 version in use).
t5.data.TaskRegistry.remove('news_dataset')
# Register the task: examples come from news_dataset(), are converted to
# 'inputs'/'targets' by news_preprocessor(), and use the SentencePiece model
# at `vocab`. Only a 'train' split is declared.
t5.data.TaskRegistry.add(
    'news_dataset',
    dataset_fn = news_dataset,
    splits = ['train'],
    text_preprocessor = [news_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [16]:
from tqdm import tqdm

# Fetch the registered task and materialise its dataset.
nq_task = t5.data.TaskRegistry.get("news_dataset")
# The task is registered with splits = ['train'], so request that split by
# name. The dataset function ignores the split value and always reads
# 'news-title.tsv', so the data returned is unchanged, but a file name is not
# a declared split.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})

# Collect each example's 'inputs' and 'targets' arrays as plain Python lists
# so they can be serialised to JSON.
results = []
for ex in tqdm(tfds.as_numpy(ds)):
    results.append((ex['inputs'].tolist(), ex['targets'].tolist()))

with open('news-title.tsv.parse', 'w') as fopen:
    json.dump(results, fopen)

120410it [01:13, 1641.00it/s]
