# Aims
- Test the use of featuretools to create a morning entity set
- import pickle  data and put it into ft entity sets
- check that can select only records that are active for a window in a morning

In [1]:
#%cd ./flosp
import pandas as pd
import numpy as np
import seaborn as sns
import featuretools as ft
#import flosp
#import bradlib as bl
sns.set()

# from core import create_timeseries_from_events, make_callender_columns

# get raw data

In [2]:
ED = pd.read_pickle('./../../../3_Data/EDclassification/cleanED.pkl')

In [3]:
ED.head(2)

Unnamed: 0,pseudo_patient_number,attendance_number,age,gender,department,site_attended,arrival,arrival_mode,arrival_mode_desc,1st_time_triaged_[datetime],...,speciality_referred_to,time_into_cdu,time_leaving_dept_inc_cdu,admission_flag,majorsminorsresus_patient,triage_category,flag_admission,flag_bedreq,flag_specreq,flag_cdu
0,4101045105,90902325901,94.0,F,Main ED,Main ED,2009-04-13 09:16:00,,BROUGHT IN BY AMBULANCE,,...,,NaT,2009-04-13 12:24:00,N,Majors,VERY URGENT,0,1,1,1
1,644868441371,90902326101,47.0,M,Main ED,Main ED,2009-04-13 09:22:00,,BROUGHT IN BY AMBULANCE,,...,,NaT,2009-04-13 12:55:00,Y,Majors,VERY URGENT,1,1,1,1


In [4]:
ED.shape

(913837, 23)

In [5]:
#### reduce number of records for simplicity
ED = ED[ED.arrival > pd.datetime(2013,1,1)]

In [6]:
ED.shape

(571244, 23)

In [7]:
ED.isnull().sum()

pseudo_patient_number                 1
attendance_number                     0
age                                  10
gender                               10
department                            0
site_attended                         0
arrival                               0
arrival_mode                          0
arrival_mode_desc                     0
1st_time_triaged_[datetime]       10249
1st_time_seen_by_doctor           50376
time_of_inpatient_bed_request    341898
time_of_speciality_referral      394712
speciality_referred_to           394713
time_into_cdu                    484301
time_leaving_dept_inc_cdu             0
admission_flag                        0
majorsminorsresus_patient         64660
triage_category                  427604
flag_admission                        0
flag_bedreq                           0
flag_specreq                          0
flag_cdu                              0
dtype: int64

# clean ED

In [8]:
ED['wait_time_total'] = (ED['time_leaving_dept_inc_cdu'] - ED['arrival'])/pd.Timedelta(1,'m')

In [9]:
ED['arr_date'] = ED.arrival.apply(lambda x: x.date())

# create es

In [10]:
es = ft.EntitySet(id='activity')
es

Entityset: activity
  Entities:
  Relationships:
    No relationships

#### select columns

In [11]:
ED.columns

Index(['pseudo_patient_number', 'attendance_number', 'age', 'gender',
       'department', 'site_attended', 'arrival', 'arrival_mode',
       'arrival_mode_desc', '1st_time_triaged_[datetime]',
       '1st_time_seen_by_doctor', 'time_of_inpatient_bed_request',
       'time_of_speciality_referral', 'speciality_referred_to',
       'time_into_cdu', 'time_leaving_dept_inc_cdu', 'admission_flag',
       'majorsminorsresus_patient', 'triage_category', 'flag_admission',
       'flag_bedreq', 'flag_specreq', 'flag_cdu', 'wait_time_total',
       'arr_date'],
      dtype='object')

In [12]:
df_ed = ED[['arrival','arr_date','attendance_number','pseudo_patient_number',
            'age','gender',
                    'time_leaving_dept_inc_cdu',
            'flag_admission',
            'flag_cdu',
            'wait_time_total',
#             'flag_bedreq',
            'flag_specreq',
                   ]]

for i in ED.columns:
    print(i)
    print(ED[i].dtype)

In [13]:
df_ed.head(2)

Unnamed: 0,arrival,arr_date,attendance_number,pseudo_patient_number,age,gender,time_leaving_dept_inc_cdu,flag_admission,flag_cdu,wait_time_total,flag_specreq
330791,2013-03-17 13:06:00,2013-03-17,91301942801,596656963303,8.0,F,2013-03-17 16:38:00,0,1,212.0,1
330792,2013-03-23 16:20:00,2013-03-23,91302100301,133313334304,31.0,M,2013-03-24 23:55:00,0,1,1895.0,1


#### make attendances

In [14]:
es = es.entity_from_dataframe(entity_id='EDatt',
                        dataframe=df_ed,
                         # dataframe=df_ed[:],
                    index='attendance_number',
                        time_index='arrival',
                              secondary_time_index={'time_leaving_dept_inc_cdu':['wait_time_total','flag_admission','flag_cdu','flag_specreq']}, #,,'flag_bedreq']},
                             variable_types={'arrival':ft.variable_types.Datetime,
                                            'arr_date':ft.variable_types.Datetime,
                                             'pseudo_patient_number':ft.variable_types.Id,
                                             'age':ft.variable_types.Numeric,
                                             'gender':ft.variable_types.Categorical,
                                             'attendance_number':ft.variable_types.Numeric,
                                             'flag_admission':ft.variable_types.Numeric,
#                                              'flag_bedreq':ft.variable_types.Categorical,
#                                              'flag_specreq':ft.variable_types.Categorical,
                                             'flag_cdu':ft.variable_types.Categorical,
})

#### look at time index's

es['EDatt'].last_time_index = es['EDatt'].df['time_leaving_dept_inc_cdu'] # necessary? or should set for now?

In [15]:
es['EDatt'].df['arrival'].head()

91300000201   2013-01-01 00:06:00
91300000301   2013-01-01 00:16:00
91300000401   2013-01-01 00:16:00
91300000501   2013-01-01 00:30:00
91300000601   2013-01-01 00:34:00
Name: arrival, dtype: datetime64[ns]

In [16]:
es['EDatt'].secondary_time_index

{'time_leaving_dept_inc_cdu': ['wait_time_total',
  'flag_admission',
  'flag_cdu',
  'flag_specreq',
  'time_leaving_dept_inc_cdu']}

es['EDatt'].last_time_index.head()

In [17]:
es['EDatt']

Entity: EDatt
  Variables:
    attendance_number (dtype: index)
    time_leaving_dept_inc_cdu (dtype: datetime)
    wait_time_total (dtype: numeric)
    flag_specreq (dtype: numeric)
    arrival (dtype: datetime_time_index)
    arr_date (dtype: datetime)
    pseudo_patient_number (dtype: id)
    age (dtype: numeric)
    gender (dtype: categorical)
    flag_admission (dtype: numeric)
    flag_cdu (dtype: categorical)
  Shape:
    (Rows: 571244, Columns: 11)

##### make patient table - (removed for time being to make simpler features, possible to have info on readmissions directly in EDatt enitity - calced manually)

es = es.normalize_entity(base_entity_id='EDatt',
                   new_entity_id ='EDpatients',
                   index='pseudo_patient_number',
                         make_time_index=True,
                   additional_variables=['gender'])

In [18]:
es = es.normalize_entity(base_entity_id='EDatt',
                   new_entity_id ='days',
                   index='arr_date',
#                          make_time_index=True,
#                    additional_variables=['gender'])
                        )


In [19]:
es['days'].df.head()

Unnamed: 0,arr_date,first_EDatt_time
2013-01-01,2013-01-01,2013-01-01 00:06:00
2013-01-02,2013-01-02,2013-01-02 00:49:00
2013-01-03,2013-01-03,2013-01-03 00:08:00
2013-01-04,2013-01-04,2013-01-04 00:03:00
2013-01-05,2013-01-05,2013-01-05 00:11:00


### make date es

start = ED.arrival.min()
end = ED.arrival.max()
days = pd.DataFrame(pd.DatetimeIndex(freq='d',start=start,end=end),columns=['arr_date'])

# feature gen

In [20]:
es

Entityset: activity
  Entities:
    EDatt [Rows: 571244, Columns: 11]
    days [Rows: 2099, Columns: 2]
  Relationships:
    EDatt.arr_date -> days.arr_date

fm, features = ft.dfs(entityset=es,
                       target_entity='EDatt',
                     trans_primitives=['day'])

ft.list_primitives().head(10)

ft.list_primitives().query('type == "transform"')

fm, features = ft.dfs(entityset=es,
                       target_entity='days',
                     trans_primitives=['day','month'],
                     agg_primitives=['count','mean','num_true'])

In [21]:
fm, features = ft.dfs(entityset=es,
                       target_entity='days',
                     trans_primitives=['day','month','year','weekend'],
                     agg_primitives=[],
                     )
#                      agg_primitives=['count','mean','num_true'])
fm.head(3)

Unnamed: 0_level_0,DAY(first_EDatt_time),MONTH(first_EDatt_time),YEAR(first_EDatt_time),WEEKEND(first_EDatt_time)
arr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,1,1,2013,False
2013-01-02,2,1,2013,False
2013-01-03,3,1,2013,False


In [22]:
def save_es_as_pickle(fm, features, name):
    " Take fm df and rename columns. Save as pickle."
    for col in fm.columns:
        fm.rename(columns={col:name + '_' + col},inplace=True)
    
    fm.to_pickle('./../../../3_Data/EDclassification/processed/' + name + '.pkl')

In [23]:
save_es_as_pickle(fm, features, 'callender')

In [24]:
start = pd.datetime(2011,1,1,7)
end = ED.arrival.max()
ct = pd.DataFrame(pd.DatetimeIndex(freq='d',start=start,end=end),columns=['time'])
ct['arr_date'] = ct.time.apply(lambda x: x.date())
ct.head()

Unnamed: 0,time,arr_date
0,2011-01-01 07:00:00,2011-01-01
1,2011-01-02 07:00:00,2011-01-02
2,2011-01-03 07:00:00,2011-01-03
3,2011-01-04 07:00:00,2011-01-04
4,2011-01-05 07:00:00,2011-01-05


ft.list_primitives().query('type == "aggregation"')

In [25]:
fm, features = ft.dfs(entityset=es,
                       target_entity='days',
                     trans_primitives=[],
                     agg_primitives=['count','mean','std','num_true','sum'],
                     cutoff_time=ct)

save_es_as_pickle(fm, features, 'EDmorn')

fm.head(3)

Unnamed: 0_level_0,EDmorn_COUNT(EDatt),EDmorn_MEAN(EDatt.wait_time_total),EDmorn_MEAN(EDatt.flag_specreq),EDmorn_MEAN(EDatt.age),EDmorn_MEAN(EDatt.flag_admission),EDmorn_STD(EDatt.wait_time_total),EDmorn_STD(EDatt.flag_specreq),EDmorn_STD(EDatt.age),EDmorn_STD(EDatt.flag_admission),EDmorn_SUM(EDatt.wait_time_total),EDmorn_SUM(EDatt.flag_specreq),EDmorn_SUM(EDatt.age),EDmorn_SUM(EDatt.flag_admission)
arr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-01,0,,,,,,,,,0.0,0.0,0.0,0.0
2011-01-02,0,,,,,,,,,0.0,0.0,0.0,0.0
2011-01-03,0,,,,,,,,,0.0,0.0,0.0,0.0


In [70]:
fm, features = ft.dfs(entityset=es,
                       target_entity='days',
                     trans_primitives=[],
                     agg_primitives=['count','mean','std','num_true','sum'],
#                      cutoff_time=ct,
                     )

fm.head(3)

Unnamed: 0_level_0,COUNT(EDatt),MEAN(EDatt.wait_time_total),MEAN(EDatt.flag_specreq),MEAN(EDatt.age),MEAN(EDatt.flag_admission),STD(EDatt.wait_time_total),STD(EDatt.flag_specreq),STD(EDatt.age),STD(EDatt.flag_admission),SUM(EDatt.wait_time_total),SUM(EDatt.flag_specreq),SUM(EDatt.age),SUM(EDatt.flag_admission)
arr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-01,312,209.041667,0.737179,39.262821,0.375,230.640702,0.440873,26.373184,0.484901,65221.0,230.0,12250.0,117.0
2013-01-02,257,260.194553,0.677043,41.264591,0.385214,227.621004,0.468519,26.95352,0.487595,66870.0,174.0,10605.0,99.0
2013-01-03,238,282.92437,0.693277,41.819328,0.436975,257.883045,0.462105,27.278779,0.497057,67336.0,165.0,9953.0,104.0


In [71]:
fm.index = fm.index.shift(1,'d')

In [72]:
save_es_as_pickle(fm, features, 'EDprevday')

In [73]:
break

SyntaxError: 'break' outside loop (<ipython-input-73-6aaf1f276005>, line 1)

In [57]:
fm.head(3)

Unnamed: 0_level_0,COUNT(EDatt),MEAN(EDatt.wait_time_total),MEAN(EDatt.age),STD(EDatt.wait_time_total),STD(EDatt.age),SUM(EDatt.wait_time_total),SUM(EDatt.age),DAY(first_EDatt_time),MONTH(first_EDatt_time)
arr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-09-01,50,147.842105,43.1,91.707423,25.1601,2809.0,2155.0,1,9
2018-09-02,47,161.533333,46.212766,85.856913,25.208639,2423.0,2172.0,2,9
2018-09-03,37,200.133333,51.945946,88.566252,31.102828,3002.0,1922.0,3,9


In [57]:
fm.head(3)

Unnamed: 0_level_0,COUNT(EDatt),MEAN(EDatt.wait_time_total),MEAN(EDatt.age),STD(EDatt.wait_time_total),STD(EDatt.age),SUM(EDatt.wait_time_total),SUM(EDatt.age),DAY(first_EDatt_time),MONTH(first_EDatt_time)
arr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-09-01,50,147.842105,43.1,91.707423,25.1601,2809.0,2155.0,1,9
2018-09-02,47,161.533333,46.212766,85.856913,25.208639,2423.0,2172.0,2,9
2018-09-03,37,200.133333,51.945946,88.566252,31.102828,3002.0,1922.0,3,9


In [67]:
ct2 = ct.copy()
ct2.drop('time',axis=1,inplace=True)
ct2['time'] = pd.to_datetime(ct['arr_date'])

ct2.head(2)

Unnamed: 0,arr_date,time
0,2018-09-01,2018-09-01
1,2018-09-02,2018-09-02


In [69]:
fm, features = ft.dfs(entityset=es,
                       target_entity='days',
                     trans_primitives=['day','month'],
                     agg_primitives=['count','mean','std','num_true','sum'],
                     cutoff_time=ct2,
                     training_window = '24h')

fm.head(3)



Unnamed: 0_level_0,COUNT(EDatt),MEAN(EDatt.wait_time_total),MEAN(EDatt.age),STD(EDatt.wait_time_total),STD(EDatt.age),SUM(EDatt.wait_time_total),SUM(EDatt.age),DAY(first_EDatt_time),MONTH(first_EDatt_time)
arr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-09-01,0,,,,,0.0,0.0,,
2018-09-02,0,,,,,0.0,0.0,,
2018-09-03,0,,,,,0.0,0.0,,


In [58]:
es['EDatt'].secondary_time_index

{'time_leaving_dept_inc_cdu': ['wait_time_total', 'time_leaving_dept_inc_cdu']}

fm, features = ft.dfs(entityset=es,
                       target_entity='days')

fm.head()

Cutoff times on attendance ES

es.add_last_time_indexes()

In [21]:
ED[ED.attendance_number == 91807209801]

Unnamed: 0,pseudo_patient_number,attendance_number,age,gender,department,site_attended,arrival,arrival_mode,arrival_mode_desc,1st_time_triaged_[datetime],...,time_leaving_dept_inc_cdu,admission_flag,majorsminorsresus_patient,triage_category,flag_admission,flag_bedreq,flag_specreq,flag_cdu,wait_time_total,arr_date
1078924,400848001421,91807209801,52.0,M,Main ED,Main ED,2018-09-01 00:07:00,Patient arranged own transport / walk-in,OTHER,01-SEP-18 00:26,...,2018-09-01 00:43:00,N,,URGENT,0,1,1,1,36.0,2018-09-01


In [22]:
ct = pd.DataFrame()

# ct['arr_date'] = [pd.datetime(2018,9,3,0,0)]
ct['attendance_number'] = [91807209801]

ct['time'] = pd.to_datetime(['2018-09-3 00:07'])

In [23]:
es['EDatt']

Entity: EDatt
  Variables:
    attendance_number (dtype: index)
    time_leaving_dept_inc_cdu (dtype: datetime)
    wait_time_total (dtype: numeric)
    arrival (dtype: datetime_time_index)
    arr_date (dtype: datetime)
    pseudo_patient_number (dtype: id)
    age (dtype: numeric)
    gender (dtype: categorical)
    admission_flag (dtype: categorical)
    flag_bedreq (dtype: categorical)
    flag_specreq (dtype: categorical)
    flag_cdu (dtype: categorical)
  Shape:
    (Rows: 9025, Columns: 12)

In [32]:
fm, features = ft.dfs(entityset=es,
                       target_entity='EDatt',
#                        cutoff_time=ct,
#                       cutoff_time_in_index=True, # just adds the cuttoff time to fm, if non given in ct df then its taken as now
#                      training_window="48 hours",
                     )
 

fm.head()

Unnamed: 0_level_0,pseudo_patient_number,age,gender,flag_cdu,DAY(time_leaving_dept_inc_cdu),DAY(arrival),DAY(arr_date),YEAR(time_leaving_dept_inc_cdu),YEAR(arrival),YEAR(arr_date),MONTH(time_leaving_dept_inc_cdu),MONTH(arrival),MONTH(arr_date),WEEKDAY(time_leaving_dept_inc_cdu),WEEKDAY(arrival),WEEKDAY(arr_date)
attendance_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
91807184702,231525311101,30.0,F,0,5,5,5,2018,2018,2018,9,9,9,2,2,2
91807209801,400848001421,52.0,M,1,1,1,1,2018,2018,2018,9,9,9,5,5,5
91807209901,874787747747,41.0,M,1,1,1,1,2018,2018,2018,9,9,9,5,5,5
91807210001,644767444724,31.0,F,1,1,1,1,2018,2018,2018,9,9,9,5,5,5
91807210101,774474742172,70.0,M,1,1,1,1,2018,2018,2018,9,9,9,5,5,5


In [34]:
fm, features = ft.dfs(entityset=es,
                       target_entity='EDatt',
                       cutoff_time=ct,
                      cutoff_time_in_index=True, # just adds the cuttoff time to fm
                     training_window="48 hours",
                     )
 

fm

Unnamed: 0_level_0,Unnamed: 1_level_0,pseudo_patient_number,age,gender,flag_cdu,DAY(time_leaving_dept_inc_cdu),DAY(arrival),DAY(arr_date),YEAR(time_leaving_dept_inc_cdu),YEAR(arrival),YEAR(arr_date),MONTH(time_leaving_dept_inc_cdu),MONTH(arrival),MONTH(arr_date),WEEKDAY(time_leaving_dept_inc_cdu),WEEKDAY(arrival),WEEKDAY(arr_date)
attendance_number,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
91807209801,2018-09-03 00:07:00,400848001421,52.0,M,1,1,1,1,2018,2018,2018,9,9,9,5,5,5


In [132]:
ct = pd.DataFrame()

ct['arr_date'] = [pd.datetime(2018,9,3,0,0)]

ct['time'] = pd.to_datetime(['2018-09-02 06:00'])

In [134]:
ct

Unnamed: 0,arr_date,time
0,2018-09-03,2018-09-02 06:00:00


In [131]:
fm, features = ft.dfs(entityset=es,
                       target_entity='EDatt',
                       cutoff_time=ct,
                      cutoff_time_in_index=True,
                     training_window="24 hours",
                     )
 

fm

Unnamed: 0_level_0,Unnamed: 1_level_0,SUM(EDatt.wait_time_total),SUM(EDatt.age),STD(EDatt.wait_time_total),STD(EDatt.age),MAX(EDatt.wait_time_total),MAX(EDatt.age),SKEW(EDatt.wait_time_total),SKEW(EDatt.age),MIN(EDatt.wait_time_total),MIN(EDatt.age),...,NUM_UNIQUE(EDatt.WEEKDAY(time_leaving_dept_inc_cdu)),NUM_UNIQUE(EDatt.WEEKDAY(arrival)),MODE(EDatt.DAY(time_leaving_dept_inc_cdu)),MODE(EDatt.DAY(arrival)),MODE(EDatt.YEAR(time_leaving_dept_inc_cdu)),MODE(EDatt.YEAR(arrival)),MODE(EDatt.MONTH(time_leaving_dept_inc_cdu)),MODE(EDatt.MONTH(arrival)),MODE(EDatt.WEEKDAY(time_leaving_dept_inc_cdu)),MODE(EDatt.WEEKDAY(arrival))
arr_date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-09-03,2018-09-02 06:00:00,0,0,,,,,,,,,...,,,,,,,,,,


In [108]:
es['days']

Entity: days
  Variables:
    arr_date (dtype: index)
  Shape:
    (Rows: 30, Columns: 1)

# save out basic entities as pkl

es.to_pickle(pickle_path)

In [33]:
break

SyntaxError: 'break' outside loop (<ipython-input-33-6aaf1f276005>, line 1)

# Create day features with cuttoffs for different times of day

In [None]:
es

### create hourly - atten_no realationship

In [None]:
hrly_atten_rel = es['EDatt'].df[['arrival','time_leaving_dept_inc_cdu']]
hrly_atten_rel['arrival'] = hrly_atten_rel['arrival'].apply(lambda x : x.replace(second=0,minute=0)) # round arrival hour down
hrly_atten_rel['time_leaving_dept_inc_cdu'] = hrly_atten_rel['time_leaving_dept_inc_cdu'].apply(lambda x : x.replace(second=0,minute=0)) +pd.Timedelta(hours=1) # round leaving tim up
hrly_atten_rel.head()

In [None]:
hrly_atten_rel.head()

In [None]:
hrly_atten_rel['n_hours'] = ((hrly_atten_rel['time_leaving_dept_inc_cdu'] - hrly_atten_rel['arrival'])/pd.Timedelta(1,'h')).astype(int)

In [None]:
hrly_atten_rel.head()

In [None]:
import itertools

In [None]:
#### create a (long format) list of links between attendance numbers and 
#%%timeit



# function for list comp which finds list of datetimes (for each hour)
date_func = lambda datetime , offset : datetime + pd.Timedelta(offset,'h')

# iterate over rows in df
sample = hrly_atten_rel.head()
sample = sample.reset_index()

ids = np.empty(shape=(sample['n_hours'].sum()),dtype='int64') # initilise array - change to np.empty() to speed up
timestamps = np.empty(shape=(sample['n_hours'].sum()),dtype='datetime64[s]')
row_count = 0

for row in sample.itertuples():
    atten_id = [row[1]]
    hour_list = [date_func(row[2],i) for i in np.arange(row[4])] # creates list of hour datetimes
    
    # create array of list for all combinations of timestamp
    for i in itertools.product(atten_id,hour_list):
        ids[row_count] = i[0] # assign patient numbers
        timestamps[row_count] = i[1]
        row_count += 1 # add to row count for new array
        
print(ids)
print(timestamps)
ids.shape

In [None]:
data = {'ids':ids,
       'timestamp':timestamps}

pd.DataFrame(data=data)

In [None]:
fm, features = ft.dfs(entityset=es,
                       target_entity='EDatt',
                       cutoff_time=ct,
                       cutoff_time_in_index=True,
                     max_depth=1,
                     training_window="1 hours")

fm

In [None]:
#### put back into df
pd.DataFrame(data=[ids,timestamps],columns=['atten_id','timestamp'])

In [None]:
sample.reset_index(inplace=True)

In [None]:

#X = np.array()
for i in itertools.product(['1'],['2','3','4']):
    print(list(i))
    X[row[0]] = i
    
X

In [None]:
import pandas as pd

In [None]:

X.append()

In [None]:
X = np.array([[1,2,3]])
X = np.append(X,[[3,4,5]],axis=0)
X

In [None]:
ct=pd.DataFrame()

ct['attendance_number'] = [90902042301,90902042301,90902042301,90902042301,90902043501,90902043501]

ct['time'] = pd.to_datetime(['2009-4-1 02:00',
                             '2009-4-1 03:00',
                             '2009-4-1 04:00',
                             '2009-4-1 05:00',
                              '2009-4-1 07:00',
                              '2014-1-1 04:00'])

In [None]:
ct

In [None]:
es['EDpatients'].last_time_index = es['EDpatients'].df['last_time']

In [None]:
fm, features = ft.dfs(entityset=es,
                       target_entity='EDatt',
                       cutoff_time=ct,
                       cutoff_time_in_index=True,
                     max_depth=1,
                     training_window="1 hours")

fm

In [None]:
es

In [None]:
ct2 = es['EDatt'].df[['attendance_number','arrival','arrival_mode_desc_bool']].head(5)

In [None]:
ft.__version__

In [None]:
fm, features = ft.dfs(entityset=es,
                       target_entity='EDatt',
                       cutoff_time=ct2,
                       cutoff_time_in_index=True,
                     max_depth=2)

fm

In [None]:
break

#### create a day table - by normalising a new feature

In [None]:
es = es.normalize_entity(base_entity_id='EDatt',
                   new_entity_id="days",
                    index="arr_date")

#### import hourly table

In [None]:
hrly = pd.read_pickle('./../3_Data/processed/sgh/sghHOURLY.pkl')

In [None]:
hrly.columns

In [None]:
hrly2 = hrly[['dt_date','EDocc_nonadmit', 'EDocc_admit', 'EDocc_total', 'EDocc_awaitingadm',
       'EDocc_nonbreach', 'EDocc_breach', 'IPocc_NonElective',
       'IPocc_Day_Case', 'IPocc_Elective', 'IPocc_total', 'IPocc_elec_nonelec',
       'IPadm', 'IPadm_nonelec', 'IPadm_daycase', 'IPadm_elective', 'IPdis',
       'IPdis_nonelec', 'IPdis_daycase', 'IPdis_elective',
       'IPadm_elec_nonelec', 'IPdis_elec_nonelec', 'EDarrive',
       'EDarrive_breach', 'EDarrive_adm', 'EDdepart', 'EDdepart_breach',
       'EDdepart_adm']].reset_index()

In [None]:
es.entity_from_dataframe(entity_id='hourly',dataframe=hrly2,index='index')

In [None]:
new_relationship = ft.Relationship(es["days"]["arr_date"],
                                    es["hourly"]["dt_date"])

es = es.add_relationship(new_relationship)

In [None]:
es['hourly'].df.head(2)

# summary of esets

In [None]:
es

In [None]:
es['days'].df.head()

In [None]:
es['EDatt'].df.head(2)

In [None]:
es['patients'].df.head(3)

In [None]:
es['hourly'].df.head(2)

In [None]:
es['days'].df.head(3)

# make features

feature_matrix, feature_defs = ft.dfs(entityset=es,target_entity='days',
      agg_primitives=['count','mean','num_true','sum'],
      trans_primitives=['month','year','day'],
    max_depth=1,n_jobs=2)

feature_matrix

In [None]:
feature_matrix1, feature_defs = ft.dfs(entityset=es,target_entity='days',
      agg_primitives=['count','mean','num_true','sum','max','last'],
      trans_primitives=['month','year','weekend','weekday'],
    max_depth=1,n_jobs=1)

feature_matrix1.shape

In [None]:
feature_matrix2, feature_defs = ft.dfs(entityset=es,target_entity='days',
      agg_primitives=['count','mean','num_true','sum','max','last'],
      trans_primitives=['month','year','weekend','weekday'],
    max_depth=2,n_jobs=1)

feature_matrix2.shape

In [None]:
feature_matrix, feature_defs = ft.dfs(entityset=es,target_entity='days',
      agg_primitives=['count','mean','num_true','sum','max','last'],
      trans_primitives=['month','year','weekend','weekday'],
    max_depth=3,n_jobs=1)

feature_matrix.shape

In [None]:
set(feature_matrix.columns) - set(feature_matrix1.columns)

In [None]:
for i in feature_matrix.columns:
    print(i)

In [None]:
feature_matrix.shape

In [None]:
feature_matrix.shape

## do some basic drops

In [None]:
uniques = feature_matrix.apply(lambda x: x.nunique())

In [None]:
uniques[uniques==1].index

In [None]:
feature_matrix = feature_matrix.drop(uniques[uniques==1].index, axis=1)

#### drop columns which have all same values

In [None]:
feature_matrix.shape

In [None]:
cols = list(feature_matrix)
nunique = feature_matrix.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
feature_matrix.drop(cols_to_drop, axis=1,inplace=True)

In [None]:
feature_matrix.shape

#### drop cols that are duplicated 

In [None]:
from utils import duplicate_columns

In [None]:
dups = duplicate_columns(feature_matrix)
dups

In [None]:
feature_matrix.drop(dups,axis=1,inplace=True)

In [None]:
feature_matrix.shape

### save out

In [None]:
pd.to_pickle(feature_matrix,'./activity.pkl')

In [None]:
ft.list_primitives().head(40)

In [None]:
ft.list_primitives().tail(40).loc[22].description

# Dev

In [None]:
break

In [None]:
ED.columns

#### create and link day table - by adding a new df

from core import create_timeseries_from_events

ED.columns

start = ED.tail(i).arrival.min().round('D')
end = ED.tail(i).arrival.max().round('D')

occED = create_timeseries_from_events(ED.tail(i),'arrival','time_leaving_dept_inc_cdu',col_to_split='admission_flag',start=start,end=end,freq='H')

occED['EDocc_MEAN'] = occED.sum(axis=1) # make agg col

occED['day'] = occED.index.round('D')

occEDday = occED.groupby(['day']).mean()

occEDday.index

day = occEDday['EDocc_MEAN'].reset_index()

day.head(2)

#### add to es

es = es.entity_from_dataframe(entity_id="days",dataframe=day,
                              index="day",
                              time_index="day")

new_relationship = ft.Relationship(es["days"]["day"],
                                   es["ED_atten"]["arr_date"])

es = es.add_relationship(new_relationship)

#### Look at tables

In [None]:
es['days'].df.head()

es['days'].df['day'].head()

es['days'].df['day'].dtype

es['ED_atten'].index

es['ED_atten'].df.arrival.dtype

es['patient'].df.columns #.dtype

es['patient'].df['first_ED_atten_time'].dtype

In [None]:
es['ED_atten']

In [None]:
es['patient']

In [None]:
es['days']

es['days']

In [None]:
es

# DFS

### create range of datetimes for arrival in df

In [None]:
feature_matrix, feature_defs = ft.dfs(entityset=es,target_entity="days")

In [None]:
feature_matrix

In [None]:
window_fm, window_features = ft.dfs(entityset=es,target_entity='days',cutoff_time=ct,
       cutoff_time_in_index=True,
      training_window = '24 hours')

In [None]:
pd.DatetimeIndex(start=start,end=end,freq='D')

In [None]:
ft.list_primitives().head(40)