-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathdans_analysis.py
129 lines (98 loc) · 5.01 KB
/
dans_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import datetime
import json
import logging
import os
from argparse import ArgumentParser
from typing import List, Tuple
import numpy as np
from matplotlib import pyplot as plt
from ruamel.yaml import CommentedMap
from analysis.config import load_config
from analysis.shared_parsers import PeriodicFiletypeCount, plot_counts, to_sorted_yearly, SortedFileCount, \
all_filetype_counts, extract_year_ticks, add_cumulative_counts
def main(config: CommentedMap) -> int:
    """Run the DANS (Data Archiving and Networked Services) file-metadata analysis.

    Reads the monthly per-filetype aggregate counts from the path configured
    under ``config['data']['dans']``, logs summary statistics, plots the
    combined yearly counts for all file types, and finally plots the yearly
    counts for the selected (declining) file types via ``plot_counts``.

    :param config: Application configuration; only the 'data'/'dans' section is read.
    :return: 0 on success (used as the process exit code by the caller).
    """
    start = datetime.datetime.now()
    dans_cfg = config['data']['dans']

    with open(dans_cfg['filetype_monthly_aggregate_path'], 'rt') as f:
        monthly_stats: PeriodicFiletypeCount = json.load(f)

    filecount_sum = sum(sum(counts.values()) for counts in monthly_stats.values())
    logging.info(f'Total file count: {filecount_sum}')

    logging.info(f'DANS analysis has {len(monthly_stats.keys())} file types (reverse-sorted by count):')
    filetype_counts: List[Tuple[str, int]] = [
        (filetype, sum(monthly_counts.values()))
        for filetype, monthly_counts in monthly_stats.items()
    ]
    for filetype, counts_for_type in sorted(filetype_counts, key=lambda x: x[1], reverse=True):
        logging.info(f'{filetype} has a total of {counts_for_type} files')

    # Aggregate to counts per year
    yearly_stats = to_sorted_yearly(monthly_stats)

    # Aggregate to periodic counts for all file types combined
    all_filetypes_yearly_counts = all_filetype_counts(yearly_stats)
    all_counts = [period_count['count'] for period_count in all_filetypes_yearly_counts['all']]
    all_periods = [period_count['period'] for period_count in all_filetypes_yearly_counts['all']]

    # Filter periods to the configured [min_year, max_year] range.
    # BUG FIX: the slice previously ended at last_valid_idx, silently dropping
    # max_year itself. filter_stats treats max_year as inclusive
    # (it removes only year > max_year), so do the same here.
    # NOTE(review): .index() raises ValueError if min_year/max_year is not
    # present in the data — presumably the config is kept in range; verify.
    first_valid_idx = all_periods.index(str(dans_cfg['min_year']))
    last_valid_idx = all_periods.index(str(dans_cfg['max_year']))
    all_counts = all_counts[first_valid_idx:last_valid_idx + 1]
    all_periods = all_periods[first_valid_idx:last_valid_idx + 1]

    # Generate plot for all file types combined
    plt.plot(list(range(len(all_counts))), all_counts)
    x_axis_labels = extract_year_ticks(all_periods, separator=' ', index=0)
    plt.title('Tellingen per periode voor alle bestandstypen gecombineerd')
    plt.xticks(list(range(len(all_counts))), x_axis_labels, rotation=45)
    plt.legend(['Aantal bestanden'])
    output_dir = dans_cfg['img_output_dir']
    plt.savefig(os.path.join(output_dir, 'all-filetypes.png'))
    plt.show()

    # Keep only file types with more than the configured number of measurements
    # and which are part of the selection
    keep_filetypes = filter_stats(yearly_stats, dans_cfg)

    # Add a cumulative count for types configured with a ' cumulatief' suffix
    for filetype in dans_cfg['mime_plots']:
        if filetype.endswith(' cumulatief'):
            original_type = filetype.replace(' cumulatief', '')
            yearly_stats = add_cumulative_counts(yearly_stats, original_type)
            keep_filetypes.append(filetype)

    logging.info(f'Keeping {len(keep_filetypes)} filetypes for analysis: {keep_filetypes}')
    kept_counts = {filetype: counts for filetype, counts in yearly_stats.items() if filetype in keep_filetypes}
    plot_counts(kept_counts, dans_cfg)

    end = datetime.datetime.now()
    logging.info(f'Script took {end - start}')

    return 0
def filter_stats(yearly_stats: SortedFileCount, dans_cfg: CommentedMap) -> List[str]:
    """Select the file types worth plotting: configured in 'mime_plots',
    measured over enough periods, and declining over the most recent periods.

    NOTE: this also mutates ``yearly_stats`` in place — for every kept file
    type, counts for years outside [min_year, max_year] are removed, which the
    caller relies on when plotting.

    :param yearly_stats: per-filetype yearly counts, sorted by period.
    :param dans_cfg: the DANS section of the configuration ('mime_plots',
        'minimum_time_periods', 'decline_periods', 'min_year', 'max_year').
    :return: the names of the file types to keep for analysis.
    """
    keep_filetypes: List[str] = []

    for filetype, yearly_counts in yearly_stats.items():
        # The mime types in the "mime_plots" list were decided based on the
        # filters below, so only those are considered at all.
        if filetype not in dans_cfg['mime_plots']:
            continue

        counts_reversed = list(reversed(yearly_counts))
        if not counts_reversed:
            continue

        # Prune the most recent 0-count periods.
        # BUG FIX: guard against the list emptying out — a file type whose
        # counts are all zero previously raised IndexError here.
        while counts_reversed and counts_reversed[0]['count'] == 0:
            counts_reversed.pop(0)

        if len(counts_reversed) < dans_cfg['minimum_time_periods']:
            continue

        # Take the last periods to assess whether the file type is in decline
        maybe_declining_period = yearly_counts[-dans_cfg['decline_periods']:]
        yearly_changes = np.diff([p['count'] for p in maybe_declining_period])
        # Ignore file types with a stable or increasing yearly count
        if yearly_changes.mean() >= 0:
            continue

        # Drop counts for years outside the configured range, in place.
        count_idx = 0
        while count_idx < len(yearly_counts):
            # The leading characters of 'period' hold the year; int() tolerates
            # trailing whitespace, so '[:5]' works for bare 'YYYY' periods too.
            year = int(yearly_counts[count_idx]['period'][:5])
            if year < dans_cfg['min_year'] or year > dans_cfg['max_year']:
                yearly_counts.pop(count_idx)
            else:
                count_idx += 1

        # Keep the rest
        keep_filetypes.append(filetype)

    return keep_filetypes
if __name__ == '__main__':
    # Command-line entry point: load the YAML config and run the analysis,
    # propagating main()'s return value as the process exit code.
    arg_parser = ArgumentParser('Performs the Data Archiving and Networked Services file metadata analysis')
    arg_parser.add_argument('-c', '--config', default='config.yaml')
    parsed_args = arg_parser.parse_args()
    exit_code = main(load_config(parsed_args.config))
    raise SystemExit(exit_code)