-
Notifications
You must be signed in to change notification settings - Fork 16
/
history_dirs_reports.py
61 lines (50 loc) · 2 KB
/
history_dirs_reports.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os, json, csv
# Root folder holding the assembled history trees; each BASE_DIRS entry is
# looked up as a sub-directory of this path.
WORKING_DIR = "/home/richard/tmp/doaj/history.20200306/assembled"

# Sub-directories to report on. Entries that are commented out are simply
# not processed; uncomment a line to include that directory in the next run.
BASE_DIRS = [
    # --- articles ---
    # "article_2016_web-pre2019",
    # "article_2017_background-pre2019",
    # "article_2018_background-pre2019",
    # "article_2018_web-pre2019",
    "article_2019_background-2019",
    # "article_2019_background-pre2019",
    "article_2019_web-2019",
    # "article_2019_web-pre2019",
    # "article_2020_background-2019",
    # "article_2020_web-2019",
    # --- journals ---
    # "journal_2016_background-pre2019",
    # "journal_2017_background-pre2019",
    # "journal_2017_web-pre2019",
    # "journal_2018_background-pre2019",
    # "journal_2018_web-pre2019",
    "journal_2019_background-2019",
    # "journal_2019_background-pre2019",
    # "journal_2019_web-pre2019",
    # "journal_2019_web-2019",
    # "journal_2020_background-2019",
    # "journal_2020_web-2019",
]

# Folder that receives one "<base dir>.csv" report per processed entry.
# Currently the same location as WORKING_DIR.
OUT_DIR = "/home/richard/tmp/doaj/history.20200306/assembled"
def history_dirs_reports(working_dir, base_dirs, out_dir):
    """Write one CSV report for every base directory that exists.

    For each name in ``base_dirs`` that exists beneath ``working_dir``, a
    file ``<name>.csv`` is (re)created in ``out_dir``. It starts with the
    header row ``ID, Date, Path, File ID`` and is then filled by
    ``_walk_history_tree``. Names with no matching path are skipped.

    Args:
        working_dir: Folder that contains the base directories.
        base_dirs: Iterable of sub-directory names to report on.
        out_dir: Folder that receives the CSV files; created on demand.

    Returns:
        None. The reports on disk are the only result.
    """
    for bd in base_dirs:
        tree_root = os.path.join(working_dir, bd)
        if not os.path.exists(tree_root):
            continue

        # Only create the output folder once there is something to write.
        os.makedirs(out_dir, exist_ok=True)
        out = os.path.join(out_dir, bd + ".csv")

        # newline="" lets the csv writer control line endings itself, as the
        # csv module requires for file objects passed to csv.writer.
        with open(out, "w", encoding="utf-8", newline="") as o:
            writer = csv.writer(o)
            writer.writerow(["ID", "Date", "Path", "File ID"])
            _walk_history_tree(tree_root, writer, working_dir)
def _walk_history_tree(dir, writer, working_dir):
    """Emit one report row per file found anywhere beneath ``dir``.

    Every file is parsed as JSON and written to ``writer`` as
    ``[id, date, path, file id]``:

    * id      - the document's ``"about"`` value, or ``"no id"`` when the
                key is absent or the document is not a JSON object
    * date    - name of the folder that directly contains the file
    * path    - the file path with the first ``len(working_dir)`` characters
                removed (``dir`` is expected to live under ``working_dir``)
    * file id - the file name up to its first ``"."``

    Folders and files are visited in sorted name order so that repeated runs
    over the same tree produce identical output.

    Args:
        dir: Root of the tree to walk. The name shadows the builtin but is
            kept so existing callers keep working.
        writer: Object with a ``writerow(list)`` method, e.g. ``csv.writer``.
        working_dir: Prefix stripped from each reported path.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    prefix_len = len(working_dir)
    for dirpath, dirnames, filenames in os.walk(dir):
        # Sorting in place steers the (top-down) walk itself, which otherwise
        # follows the arbitrary order of the directory listing.
        dirnames.sort()
        date = os.path.basename(dirpath)

        for filename in sorted(filenames):
            path = os.path.join(dirpath, filename)
            with open(path, "r", encoding="utf-8") as handle:
                data = json.load(handle)

            # Only JSON objects can carry an "about" key; anything else gets
            # the same placeholder as an object without the key.
            if isinstance(data, dict):
                record_id = data.get("about", "no id")
            else:
                record_id = "no id"

            file_id = filename.split(".")[0]
            writer.writerow([record_id, date, path[prefix_len:], file_id])
if __name__ == "__main__":
    # Script entry point: build the reports using the module-level settings.
    history_dirs_reports(
        working_dir=WORKING_DIR,
        base_dirs=BASE_DIRS,
        out_dir=OUT_DIR,
    )