forked from aalok-sathe/wikifactcheck-english
-
Notifications
You must be signed in to change notification settings - Fork 1
/
loadwfc-en.py
executable file
·117 lines (102 loc) · 4.04 KB
/
loadwfc-en.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
import os
import json
from urllib.request import urlopen
from glob import glob
from tqdm import tqdm
from pathlib import Path
from functools import partial
from argparse import ArgumentParser
# Project identifier; used to build both the local cache path and the repo URL.
PROJECT = 'wikifactcheck-english'
# Default local cache directory: ~/.wikifactcheck-english
BASEDIR = '~/.{}'.format(PROJECT)
# Raw-file CDN base URL for the project's GitHub repository (master branch).
REPOURL = 'https://rawcdn.githack.com/{prj}/{prj}/master/'.format(prj=PROJECT)
def download(full=True, dest=BASEDIR, force=False):
    '''
    Download the wikifactcheck-english .jsonl files into `dest`.

    Fetches the 'train' and 'test' splits, and (by default) also the five
    numbered "full" part files, which are concatenated into a single
    wikifactcheck-english_full.jsonl.

    full:  also fetch and combine the 5 full{0..4} part files (default True).
    dest:  target directory; created (with parents) if missing.
    force: re-download files even if they already exist locally.
    '''
    parent = Path(dest).expanduser()
    # parents=True so a custom nested dest doesn't crash on a missing parent
    parent.mkdir(parents=True, exist_ok=True)
    filename = 'wikifactcheck-english_{}.jsonl'
    url = REPOURL + filename
    for part in tqdm(['train', 'test'], desc='train and test'):
        filepath = parent / filename.format(part)
        if filepath.exists() and not force:
            print(filepath, 'already exists')
            continue
        # stream the HTTP response to disk line by line; 'wb' (not 'wb+'):
        # we only ever write, never read back
        with filepath.open('wb') as fp, urlopen(url.format(part)) as web:
            for line in web:
                fp.write(line)
    # if full data is requested (default), also download the 5 full{} part
    # files and concatenate them into one combined file
    if full:
        filename = 'wikifactcheck-english_full{}.jsonl'
        url = REPOURL + filename
        filepath = parent / filename.format('')
        if filepath.exists() and not force:
            print(filepath, 'already exists')
            return
        with filepath.open('wb') as combined:
            for part in tqdm(range(5), desc='full'):
                with urlopen(url.format(part)) as web:
                    for line in web:
                        combined.write(line)
def load_(pattern, lines=None, path=BASEDIR):
    '''
    Load from all .jsonl files whose names contain `pattern`.

    Returns a generator over parsed JSON entries (dicts). If no matching
    files are found, interactively offers to download the dataset first.

    pattern: substring matched against filenames, e.g. 'train', 'test', 'full'
    lines:   if set, stop after yielding this many entries in total
    path:    directory to search; falls back to '.' if it does not exist
    '''
    parent = Path(path).expanduser()
    if not parent.exists():
        path = '.'
        parent = Path(path)
    # glob inside `parent`, not the CWD, so files in ~/.wikifactcheck-english
    # are found regardless of where the script is invoked from
    globpat = str(parent / '*{}*.jsonl'.format(pattern))
    filenames = sorted(glob(globpat))
    if not filenames:
        inp = input('Data not found at {}. Download? [Y/n]'.format(path))
        if inp.lower() in ('', 'y', 'yes', 'yep', 'yeah'):
            download(dest=path, full='full' in pattern)
            # re-glob so the freshly downloaded files are actually yielded
            filenames = sorted(glob(globpat))
        else:
            # PEP 479: `raise StopIteration` inside a generator is a
            # RuntimeError on Python 3.7+; a bare return ends the generator
            return
    ctr = 0
    for fname in filenames:
        with Path(fname).open('r') as f:
            for line in f:
                # json.loads tolerates the trailing newline; slicing line[:-1]
                # would corrupt a final line that lacks one
                yield json.loads(line)
                ctr += 1
                if lines and ctr >= lines:
                    return
# Convenience aliases: each returns a generator of parsed JSON entries
# from the .jsonl files whose names contain the given pattern.
load = load_
load_train = partial(load_, 'train') # train set
load_test = partial(load_, 'test') # held-out test set
load_full = partial(load_, 'full') # full dataset, including non-annot.
if __name__ == '__main__':
    # Command-line entry point: optionally download the dataset and/or
    # stream entries from chosen splits to stdout.
    parser = ArgumentParser('wikifactcheck-english')
    parser.add_argument('-d', '--download', help='download dataset',
                        action='store_true', default=False)
    parser.add_argument('-f', '--force', help='force re-download?',
                        action='store_true', default=False)
    datasets = ['train', 'test', 'full']
    parser.add_argument('-r', '--read', type=str, nargs='*',
                        choices=datasets,
                        help='read from particular datasets (default: none)')
    parser.add_argument('-n', '--numlines', type=int, default=None,
                        help='numlines to read from each one')
    parser.add_argument('-t', '--fmt', help='output format for --read option',
                        default='json', choices=['json', 'python'])
    # parser.add_argument('-z', '--shuffle',
    #                     help='shuffle the split to output from')
    args = parser.parse_args()
    if args.download:
        download(force=args.force)
    if args.read:
        for name in args.read:
            ctr = 0
            for item in load(name):
                if args.numlines and ctr >= args.numlines:
                    break
                # '==' not 'is': identity comparison of string literals is
                # implementation-dependent and a SyntaxWarning in CPython 3.8+
                if args.fmt == 'json':
                    print(json.dumps(item))
                else:
                    print(item)
                ctr += 1
    else:
        print('no data split chosen to read, exiting')