-
Notifications
You must be signed in to change notification settings - Fork 21
/
definition.py
247 lines (192 loc) · 8.8 KB
/
definition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import pandas as pd
from cardea.data_loader import DataLoader
class ProblemDefinition:
    """Base class that defines a prediction problem.

    Specifies cutoff times for the prediction problem and generates the
    target label if it does not already exist in the entityset.
    """

    def check_target_label(self, entity_set, target_entity, target_label):
        """Check whether the target label exists in the entity set.

        Args:
            entity_set: fhir entityset.
            target_entity: The entity name which contains the target label.
            target_label: The target label of the prediction problem.

        Returns:
            True if the target label exists.
        """
        return DataLoader().check_column_existence(entity_set, target_entity, target_label)

    def check_for_missing_values_in_target_label(
            self, entity_set, target_entity, target_label_column_name):
        """Check whether the target label contains missing values.

        Args:
            entity_set: fhir entityset.
            target_entity: The entity name which contains the target label.
            target_label_column_name: The target label of the prediction problem.

        Returns:
            False if the target label does not contain a missing value.
        """
        return DataLoader().check_for_missing_values(entity_set,
                                                     target_entity,
                                                     target_label_column_name)

    def generate_target_label(self, entity_set, target_entity, target_label):
        """Generate target labels if the entityset is missing labels.

        Implemented by problem-specific subclasses; the base implementation
        is a no-op.

        Args:
            entity_set: fhir entityset.
            target_entity: The entity name which contains the target label.
            target_label: The target label of the prediction problem.

        Returns:
            Target entity with the generated label.
        """

    def generate_cutoff_times(self, entity_set):
        """Generate cutoff times for the prediction problem.

        Implemented by problem-specific subclasses; the base implementation
        is a no-op.

        Args:
            entity_set: fhir entityset.

        Returns:
            entity_set, target_entity, series of target_labels and a
            dataframe of cutoff_times.

        Raises:
            ValueError: An error occurs if the cutoff variable does not exist.
        """

    def unify_cutoff_times_hours_admission_time(self, df, cutoff_time_label):
        """Unify record cutoff times that overlap in time.

        Walks the records in ascending cutoff-time order; whenever a record
        starts inside the previous record's [ct, end) window, both records
        take the later start as their shared cutoff time 'ct'.

        Args:
            df: cutoff_entity dataframe with 'end', 'ct' and 'checked' columns.
            cutoff_time_label: Name of the column holding the raw cutoff time.

        Returns:
            The dataframe (sorted, index reset) with unified 'ct' values and
            'checked' flags set.
        """
        df = df.sort_values(by=[cutoff_time_label])
        df = df.reset_index()
        for i in df.index:
            # NOTE(review): the original used `... is not True`, which is
            # always true for numpy booleans (np.True_ is not the Python True
            # singleton), so already-checked rows were re-processed. Use
            # truthiness instead so the 'checked' flag is honored.
            if i == 0:
                if not df.at[i, 'checked']:
                    df.at[i, 'ct'] = df.at[i, cutoff_time_label]
                    df.at[i, 'checked'] = True
            elif not df.at[i, 'checked']:
                prev_ct = df.at[i - 1, 'ct']
                prev_end = df.at[i - 1, 'end']
                curr_start = df.at[i, cutoff_time_label]
                if prev_ct < curr_start < prev_end:
                    # Overlapping stays: snap both cutoffs to the later start.
                    df.at[i - 1, 'ct'] = curr_start
                    df.at[i, 'ct'] = curr_start
                    df.at[i, 'checked'] = True
                else:
                    df.at[i, 'ct'] = df.at[i, cutoff_time_label]
                    df.at[i, 'checked'] = True
        return df

    def unify_cutoff_times_days_admission_time(self, df, cutoff_time_label):
        """Unify record cutoff times that share the same admission day.

        For each calendar day, records whose stay spans more than one day
        (duration > 0) all receive the latest cutoff time of that day;
        same-day records (duration <= 0) are deferred for the hours pass
        (ct = NaT, checked = False).

        Args:
            df: cutoff_entity dataframe with 'date', 'duration', 'ct' and
                'checked' columns.
            cutoff_time_label: Name of the column holding the raw cutoff time.

        Returns:
            The dataframe sorted by cutoff time with unified 'ct' values.
        """
        frames = []
        for day in set(df['date']):
            sub_day = df[df['date'] == day]
            multi_day = sub_day[sub_day['duration'] > 0]
            same_day = sub_day[sub_day['duration'] <= 0]
            frames.append(same_day)
            multi_day = multi_day.sort_values(by=[cutoff_time_label])
            if len(multi_day) != 0:
                final_date = multi_day.iloc[-1][cutoff_time_label]
                for i in multi_day.index:
                    multi_day.at[i, 'ct'] = final_date
                    multi_day.at[i, 'checked'] = True
                frames.append(multi_day)
                # Defer same-day records: mark them unprocessed. The mutation
                # is visible through the frame appended above; the original's
                # second append of the same object only produced exact
                # duplicate rows for drop_duplicates to strip.
                for i in same_day.index:
                    same_day.at[i, 'ct'] = pd.NaT
                    same_day.at[i, 'checked'] = False
        result = pd.concat(frames)
        result = result.drop_duplicates()
        # NOTE(review): assumes cutoff_time_label refers to the 'start'
        # column -- confirm with callers.
        result[cutoff_time_label] = pd.to_datetime(result.start)
        result = result.sort_values(by=[cutoff_time_label])
        result = result.reset_index()
        return result

    def unify_cutoff_time_admission_time(self, es, cutoff_entity, cutoff_time_label):
        """Process records in the entity that contains cutoff times
        based on shared days and time.

        Args:
            es: fhir entityset.
            cutoff_entity: Name of the entity containing the cutoff times.
            cutoff_time_label: Name of the column holding the raw cutoff time.

        Returns:
            processed entity
        """
        df = es[cutoff_entity].df
        df[cutoff_time_label] = pd.to_datetime(df[cutoff_time_label])
        df['end'] = pd.to_datetime(df['end'])
        # Stay length in whole days; > 0 means the visit spans multiple days.
        df['duration'] = (df['end'] - df[cutoff_time_label]).dt.days
        df['date'] = df[cutoff_time_label].dt.date
        df['ct'] = ''
        df['checked'] = False
        result = self.unify_cutoff_times_days_admission_time(df, cutoff_time_label)
        result = self.unify_cutoff_times_hours_admission_time(result, cutoff_time_label)
        # The two passes each reset the index, so a 'level_0' column can
        # appear; drop it to keep the output schema clean.
        if 'level_0' in result.columns:
            result = result.drop(columns=['level_0'])
        return result

    def unify_cutoff_times_days_discharge_time(self, df, cutoff_time_label):
        """Unify record cutoff times that share the same discharge day.

        For each calendar day, records with a positive stay duration all
        receive the earliest cutoff time of that day; the remaining records
        are deferred (ct = NaT, checked = False).

        Args:
            df: cutoff_entity dataframe with 'end_date', 'duration', 'ct' and
                'checked' columns.
            cutoff_time_label: Name of the column holding the raw cutoff time.

        Returns:
            The dataframe (index reset) with unified 'ct' values.
        """
        frames = []
        for day in set(df['end_date']):
            sub_day = df[df['end_date'] == day]
            multi_day = sub_day[sub_day['duration'] > 0]
            same_day = sub_day[sub_day['duration'] <= 0]
            frames.append(same_day)
            multi_day = multi_day.sort_values(by=[cutoff_time_label])
            if len(multi_day) != 0:
                first_date = multi_day.iloc[0][cutoff_time_label]
                for i in multi_day.index:
                    multi_day.at[i, 'ct'] = first_date
                    multi_day.at[i, 'checked'] = True
                frames.append(multi_day)
                # Mutation is visible through the frame appended above; the
                # original's second append only produced exact duplicates.
                for i in same_day.index:
                    same_day.at[i, 'ct'] = pd.NaT
                    same_day.at[i, 'checked'] = False
        result = pd.concat(frames)
        result = result.drop_duplicates()
        # NOTE(review): assumes cutoff_time_label refers to the 'end'
        # column -- confirm with callers.
        result[cutoff_time_label] = pd.to_datetime(result.end)
        result = result.reset_index()
        return result

    def unify_cutoff_times_hours_discharge_time(self, df, cutoff_time_label):
        """Unify record cutoff times that share the same discharge hour.

        Records discharged within the same hour of the same day all receive
        the earliest cutoff time of that hour.

        Args:
            df: cutoff_entity dataframe with 'end_date', 'hour', 'ct' and
                'checked' columns.
            cutoff_time_label: Name of the column holding the raw cutoff time.

        Returns:
            The dataframe with unified 'ct' values.
        """
        frames = []
        for day in set(df['end_date']):
            sub_day = df[df['end_date'] == day]
            for hour in set(sub_day['hour']):
                sub_hour = sub_day[sub_day['hour'] == hour]
                sub_hour = sub_hour.sort_values(by=[cutoff_time_label])
                if len(sub_hour) != 0:
                    first_date = sub_hour.iloc[0][cutoff_time_label]
                    for i in sub_hour.index:
                        sub_hour.at[i, 'ct'] = first_date
                        sub_hour.at[i, 'checked'] = True
                    frames.append(sub_hour)
        result = pd.concat(frames)
        result = result.drop_duplicates()
        return result

    def unify_cutoff_time_discharge_time(self, es, cutoff_entity, cutoff_time_label):
        """Process records in the entity that contains cutoff times
        based on shared days and time.

        Args:
            es: fhir entityset.
            cutoff_entity: Name of the entity containing the cutoff times.
            cutoff_time_label: Name of the column holding the raw cutoff time.

        Returns:
            processed entity
        """
        df = es[cutoff_entity].df
        df['end_date'] = df[cutoff_time_label].dt.date
        df['hour'] = df.end.apply(lambda x: x.hour)
        # Stay length in whole days relative to the admission 'start' column.
        df['duration'] = (df[cutoff_time_label] - df['start']).dt.days
        df['ct'] = ''
        df['checked'] = False
        result = self.unify_cutoff_times_days_discharge_time(df, cutoff_time_label)
        result = self.unify_cutoff_times_hours_discharge_time(result, cutoff_time_label)
        # Drop the residual index column left by the repeated reset_index.
        if 'level_0' in result.columns:
            result = result.drop(columns=['level_0'])
        return result