# 02 predictions for time intervals

Relating patient/attendance transactional dataframes to a daily or hourly dataframe requires a many-to-many link, which is not possible directly in ft. A linking dataframe can be used which does not contain unique keys, and can be joined on both sides with a many relationship.

In [1]:
import pandas as pd
import numpy as np
import featuretools as ft
from create_data import make_attendances_dataframe

In [2]:
df = make_attendances_dataframe(15)

#### create all dataframes we need

In [3]:
from create_data import make_timeindex_dataframe, make_HourlyTimeAttenNum_dataframe

In [6]:
df_ActiveVisits = make_HourlyTimeAttenNum_dataframe(df,'arrival_datetime','departure_datetime')
df_ActiveVisits.head()

Unnamed: 0,atten_id,hour
0,5,2018-01-01 02:00:00
1,5,2018-01-01 03:00:00
2,5,2018-01-01 04:00:00
3,5,2018-01-01 05:00:00
4,5,2018-01-01 06:00:00


In [5]:
df_hours = make_timeindex_dataframe(df,'hour','h')
df_hours.head(3)

Unnamed: 0,hour
0,2018-01-01 00:00:00
1,2018-01-01 01:00:00
2,2018-01-01 02:00:00


In [4]:
df_days = make_timeindex_dataframe(df,'day','D')
df_days.head(3)

Unnamed: 0,day
0,2018-01-01
1,2018-01-02
2,2018-01-03


#### Make entitity sets - as before

In [7]:
import featuretools.variable_types as vtypes
data_variable_types = {'atten_id': vtypes.Id,
                       'pat_id': vtypes.Id,
                       'arrival_datetime': vtypes.Datetime,
                      'time_in_department': vtypes.Numeric,
                       'departure_datetime': vtypes.Datetime,
                       'gender': vtypes.Boolean,
                      'ambulance_arrival': vtypes.Boolean}
es = ft.EntitySet('Hospital')
es = es.entity_from_dataframe(entity_id='attendances',
                               dataframe=df,
                               index='atten_id',
                               time_index='arrival_datetime',
                              secondary_time_index={'departure_datetime':['time_in_department']}, # dictionary here!
                               variable_types=data_variable_types)

#### make entity with each attendance and hour it is active 

In [8]:
df_ActiveVisits.head(3)

Unnamed: 0,atten_id,hour
0,5,2018-01-01 02:00:00
1,5,2018-01-01 03:00:00
2,5,2018-01-01 04:00:00


In [9]:
# Make linking-es (active_visits)
es = es.entity_from_dataframe(entity_id='active_visits',
                               dataframe=df_ActiveVisits,
                              make_index=True,
                              index='index',
                               variable_types={'atten_id':vtypes.Id,
                                              'hour':vtypes.Datetime})

#### make entity with hourly index

In [10]:
df_hours.head(3)

Unnamed: 0,hour
0,2018-01-01 00:00:00
1,2018-01-01 01:00:00
2,2018-01-01 02:00:00


In [11]:
# Make hours eset
es = es.entity_from_dataframe(entity_id='hours',
                               dataframe=df_hours,
                               index='hour',
                               variable_types={'hour':vtypes.Datetime})

As we have made more entities with dataframes (and not normalised them from existing entities) we must explicitly tell the entity set the relationships:

In [11]:
# add es relationships
rel_Atten_ActiveVisits = ft.Relationship(es["attendances"]["atten_id"],
                                    es["active_visits"]["atten_id"])
rel_Hours_ActiveVisits = ft.Relationship(es["hours"]["hour"],
                                    es["active_visits"]["hour"])
es = es.add_relationships([rel_Atten_ActiveVisits,rel_Hours_ActiveVisits]) 

#### creating features for individual hours

Now we have all our entities linked we can run DFS on the entity "hours". This will generate features like  "COUNT(active_visits)" -> in other words -> "Occupancy" for that particlar time of day.

In [25]:
fm, features = ft.dfs(entityset=es,
                     target_entity='hours',
#                     agg_primitives=[],
#                     trans_primitives=[],
                     verbose=True,
                     max_depth = 5)
fm.head(5)

Built 41 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|███████████| Calculated: 5/5 chunks


Unnamed: 0_level_0,COUNT(active_visits),NUM_UNIQUE(active_visits.atten_id),MODE(active_visits.atten_id),SUM(active_visits.attendances.time_in_department),STD(active_visits.attendances.time_in_department),MAX(active_visits.attendances.time_in_department),SKEW(active_visits.attendances.time_in_department),MIN(active_visits.attendances.time_in_department),MEAN(active_visits.attendances.time_in_department),NUM_UNIQUE(active_visits.attendances.pat_id),...,NUM_UNIQUE(active_visits.attendances.WEEKDAY(departure_datetime)),MODE(active_visits.attendances.MODE(active_visits.hour)),MODE(active_visits.attendances.DAY(arrival_datetime)),MODE(active_visits.attendances.DAY(departure_datetime)),MODE(active_visits.attendances.YEAR(arrival_datetime)),MODE(active_visits.attendances.YEAR(departure_datetime)),MODE(active_visits.attendances.MONTH(arrival_datetime)),MODE(active_visits.attendances.MONTH(departure_datetime)),MODE(active_visits.attendances.WEEKDAY(arrival_datetime)),MODE(active_visits.attendances.WEEKDAY(departure_datetime))
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01 00:00:00,0.0,,,0.0,,,,,,,...,,,,,,,,,,
2018-01-01 01:00:00,0.0,,,0.0,,,,,,,...,,,,,,,,,,
2018-01-01 02:00:00,1.0,1.0,5.0,0.0,,,,,,0.0,...,0.0,,,,,,,,,
2018-01-01 03:00:00,1.0,1.0,5.0,0.0,,,,,,0.0,...,0.0,,,,,,,,,
2018-01-01 04:00:00,1.0,1.0,5.0,0.0,,,,,,0.0,...,0.0,,,,,,,,,


In [26]:
features

[<Feature: COUNT(active_visits)>,
 <Feature: NUM_UNIQUE(active_visits.atten_id)>,
 <Feature: MODE(active_visits.atten_id)>,
 <Feature: SUM(active_visits.attendances.time_in_department)>,
 <Feature: STD(active_visits.attendances.time_in_department)>,
 <Feature: MAX(active_visits.attendances.time_in_department)>,
 <Feature: SKEW(active_visits.attendances.time_in_department)>,
 <Feature: MIN(active_visits.attendances.time_in_department)>,
 <Feature: MEAN(active_visits.attendances.time_in_department)>,
 <Feature: NUM_UNIQUE(active_visits.attendances.pat_id)>,
 <Feature: MODE(active_visits.attendances.pat_id)>,
 <Feature: SUM(active_visits.attendances.COUNT(active_visits))>,
 <Feature: SUM(active_visits.attendances.NUM_UNIQUE(active_visits.hour))>,
 <Feature: STD(active_visits.attendances.COUNT(active_visits))>,
 <Feature: STD(active_visits.attendances.NUM_UNIQUE(active_visits.hour))>,
 <Feature: MAX(active_visits.attendances.COUNT(active_visits))>,
 <Feature: MAX(active_visits.attendances.

In [31]:
fm.dropna(axis=1).head()

Unnamed: 0_level_0,COUNT(active_visits),SUM(active_visits.attendances.time_in_department),SUM(active_visits.attendances.COUNT(active_visits)),SUM(active_visits.attendances.NUM_UNIQUE(active_visits.hour))
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:00:00,0.0,0.0,0.0,0.0
2018-01-01 01:00:00,0.0,0.0,0.0,0.0
2018-01-01 02:00:00,1.0,0.0,0.0,0.0
2018-01-01 03:00:00,1.0,0.0,0.0,0.0
2018-01-01 04:00:00,1.0,0.0,0.0,0.0


# Further notes

#### adding "Intesting" variables

In [32]:
df.head()

Unnamed: 0,atten_id,pat_id,arrival_datetime,time_in_department,ambulance_arrival,departure_datetime,gender
5,1005,992,2018-01-01 02:49:00,197,1,2018-01-01 06:06:00,0
9,1009,10281,2018-01-01 05:39:00,122,0,2018-01-01 07:41:00,1
4,1004,4471,2018-01-01 08:07:00,89,1,2018-01-01 09:36:00,1
0,1000,8416,2018-01-01 08:15:00,59,0,2018-01-01 09:14:00,0
6,1006,472,2018-01-01 11:21:00,303,1,2018-01-01 16:24:00,0


In [48]:
es["attendances"]["ambulance_arrival"].interesting_values = [True, False]

In [51]:
fm, features = ft.dfs(entityset=es,
                     target_entity='hours',
                     agg_primitives=['count','mean','percent_true'],
                     trans_primitives=[],
                      where_primitives=['count'],
                     verbose=True,
                     max_depth = 8)
fm.head(5)

Built 3 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|███████████| Calculated: 5/5 chunks


Unnamed: 0_level_0,COUNT(active_visits),MEAN(active_visits.attendances.time_in_department),MEAN(active_visits.attendances.COUNT(active_visits))
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01 00:00:00,0.0,,
2018-01-01 01:00:00,0.0,,
2018-01-01 02:00:00,1.0,,
2018-01-01 03:00:00,1.0,,
2018-01-01 04:00:00,1.0,,


In [50]:
ft.list_primitives()

Unnamed: 0,name,type,description
0,skew,aggregation,Computes the skewness of a data set.
1,median,aggregation,Finds the median value of any feature with wel...
2,avg_time_between,aggregation,Computes the average time between consecutive ...
3,time_since_last,aggregation,Time since last related instance.
4,mode,aggregation,Finds the most common element in a categorical...
5,count,aggregation,Counts the number of non null values.
6,any,aggregation,Test if any value is 'True'.
7,all,aggregation,Test if all values are 'True'.
8,min,aggregation,Finds the minimum non-null value of a numeric ...
9,last,aggregation,Returns the last value.
