-
Notifications
You must be signed in to change notification settings - Fork 4
/
headers_checker.py
executable file
·127 lines (96 loc) · 4.97 KB
/
headers_checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import copy
import datetime
import glob
import numpy
import os
import pandas
from wfdb_helpers import wfdb_read_header
from date_utils import *
from file_utils import *
from mimic_utils import *
# Root directory of the waveform database; check_headers appends the patient
# identifier to it to locate that patient's header files.
dir_path = 'data/mimic3wdb/'
# CSV file loaded into matched_df below.
input_matched_csv = 'matched.csv'
# File handed to get_patients() when check_headers is called without an
# explicit patients mapping.
tmp_patients_file = 'patients_files.json'
# NOTE: the CSV is read at import time; matched_df is not referenced by the
# code visible in this file section.
matched_df = pandas.read_csv(input_matched_csv, delimiter=',')
# NOTE(review): record_id is not referenced by the code visible in this file
# section -- confirm whether it is still needed.
record_id = 0
def _year_out_of_range(moment):
    """Return True when ``moment.year`` is outside the accepted 2080-2220 window."""
    return moment.year < 2080 or moment.year > 2220


def _read_header_file(path):
    """Parse the header file at ``path`` and close the file afterwards.

    Assumes wfdb_read_header fully consumes the file object before returning
    (its result is only used through key lookups here).
    """
    with open(path, 'r') as handle:
        return wfdb_read_header(handle)


def check_headers(patients=None):
    """Cross-check record header contents against their file names and dates.

    For every record header of every patient, the start datetime read from the
    header is compared with the datetime encoded in the file name, and the
    start/end years are checked against the 2080-2220 window. For waveform
    records, every segment header is additionally checked against the sample
    counts and days implied by the record header. Results are printed; nothing
    is returned.

    Args:
        patients: mapping of patient identifier to an iterable of header file
            names (without the '.hea' extension). When None, the mapping is
            loaded with get_patients(tmp_patients_file).

    Raises:
        AssertionError: if a numerics record references more than one file or
            segment, or a layout segment has a non-zero sample count.
        SystemExit: if a segment header disagrees with the record header on
            its sample count or on the day of its start/end.
    """
    if patients is None:
        patients = get_patients(tmp_patients_file)

    nb_h_nomatch_n = 0
    nb_h_nomatch_w = 0
    nb_h_nomatch = []
    incorrect_date_n_s = 0
    incorrect_date_n_e = 0
    incorrect_date_w_s = 0
    incorrect_date_w_e = 0
    incorrect_date = []

    for patient, files in patients.items():
        s_files = [f for f in sorted(files) if 's' in f]  # sxxxxx-date{n}.hea
        subdir_path = dir_path + patient

        for s_file in s_files:
            type_wn = 'n' if 'n' in s_file else 'w'
            header = _read_header_file(subdir_path + '/' + s_file + '.hea')
            # Tracks whether s_file is already listed in incorrect_date so a
            # record with both a bad start and a bad end is listed only once.
            date_flagged = False

            if type_wn == 'n':
                # A numerics record must reference exactly one file and
                # consist of a single segment.
                assert len(set(header['filename'])) == 1
                assert header['nseg'] == 1

                # Extract datetime
                start = get_datetime(header)
                end = start + get_timedelta_from_nb_samp(header['nsamp'], header['fs'])

                tmp = patient + start.strftime('-%Y-%m-%d-%H-%Mn')
                if s_file != tmp:
                    print('Header file contents does not match filename: {} VS {}'.format(s_file, tmp))
                    nb_h_nomatch_n += 1
                    nb_h_nomatch.append(s_file)

                if _year_out_of_range(start):
                    print('Incorrect start date found in header contents: 2080 < {} < 2220 ({})'.format(start.year, s_file))
                    incorrect_date_n_s += 1
                    incorrect_date.append(s_file)
                    date_flagged = True

                # The end date was computed but never validated, which left
                # the "invalid end dates" summary permanently at zero.
                if _year_out_of_range(end):
                    print('Incorrect end date found in header contents: 2080 < {} < 2220 ({})'.format(end.year, s_file))
                    incorrect_date_n_e += 1
                    if not date_flagged:
                        incorrect_date.append(s_file)

            elif type_wn == 'w':
                # Many .dat files, we need to iterate over each segment
                start = get_datetime(header)

                tmp = patient + start.strftime('-%Y-%m-%d-%H-%M')
                if s_file != tmp:
                    print('Header file contents does not match filename: {} VS {}'.format(s_file, tmp))
                    nb_h_nomatch_w += 1
                    nb_h_nomatch.append(s_file)

                nsampseg_from_start = 0
                last_segment_end = None
                for segment, nsampseg in zip(header['filename'], header['nsampseg']):
                    if 'layout' in segment:
                        # Layout segments carry no samples.
                        assert nsampseg == 0
                        continue
                    if segment == '~':
                        # '~' entries have no header file to read; only their
                        # sample count advances the running offset.
                        nsampseg_from_start += nsampseg
                        continue

                    segment_header = _read_header_file(subdir_path + '/' + segment + '.hea')
                    frequency = segment_header['fs']

                    # Expected segment boundaries, derived from the record
                    # start and the cumulative sample count.
                    supposed_start = start + get_timedelta_from_nb_samp(nsampseg_from_start, frequency)
                    nsampseg_from_start += nsampseg
                    supposed_end = start + get_timedelta_from_nb_samp(nsampseg_from_start, frequency)
                    last_segment_end = supposed_end

                    if segment_header['nsamp'] != nsampseg:
                        print(segment_header['nsamp'], nsampseg)
                        raise SystemExit(1)

                    # NOTE(review): the second argument presumably supplies a
                    # reference date to get_datetime -- confirm against date_utils.
                    start_date = get_datetime(segment_header, supposed_start)
                    end_date = start_date + get_timedelta_from_nb_samp(nsampseg, frequency)

                    if start_date.day != supposed_start.day:
                        print('Different days:', start_date, supposed_start, 'aborting...')
                        raise SystemExit(1)
                    if end_date.day != supposed_end.day:
                        print('Different days:', end_date, supposed_end, 'aborting...')
                        raise SystemExit(1)

                # The record start is validated once per record, not per segment.
                if _year_out_of_range(start):
                    print('Incorrect start date found in header contents: 2080 < {} < 2220 ({})'.format(start.year, s_file))
                    incorrect_date_w_s += 1
                    incorrect_date.append(s_file)
                    date_flagged = True

                # End of the last segment that has its own header; skipped
                # when the record contains no such segment.
                if last_segment_end is not None and _year_out_of_range(last_segment_end):
                    print('Incorrect end date found in header contents: 2080 < {} < 2220 ({})'.format(last_segment_end.year, s_file))
                    incorrect_date_w_e += 1
                    if not date_flagged:
                        incorrect_date.append(s_file)

    print('[' + ','.join(nb_h_nomatch) + ']')
    print('[' + ','.join(incorrect_date) + ']')

    print('Found {} numerics headers with a date that is not matching header contents'.format(nb_h_nomatch_n))
    print('Found {} numerics headers with invalid start dates (2080 < not here < 2220)'.format(incorrect_date_n_s))
    print('Found {} numerics headers with invalid end dates (2080 < not here < 2220)'.format(incorrect_date_n_e))

    print('Found {} waveform headers with a date that is not matching header contents'.format(nb_h_nomatch_w))
    print('Found {} waveform headers with invalid start dates (2080 < not here < 2220)'.format(incorrect_date_w_s))
    print('Found {} waveform headers with invalid end dates (2080 < not here < 2220)'.format(incorrect_date_w_e))
# Run the check as soon as this file is executed or imported; with no argument
# the patient mapping is loaded via get_patients(tmp_patients_file).
check_headers()