### Objective

Write an original ETL code that combines files and identifies cases that have a low blood pressure based on logic below. 

A successful project includes:

1. Original code file (Python)

2. Final report (excel or csv)

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [86]:
# Read the dataset BloodPressure into a DataFrame
bloodpressure = pd.read_csv('BloodPressure.csv')

bloodpressure.head()

Unnamed: 0,PERSON_ID,TIME,SYSTOLIC_BLOOD_PRESSURE
0,123.0,1/5/2016 7:16,33
1,123.0,1/5/2016 7:17,75
2,123.0,1/5/2016 7:18,58
3,123.0,1/5/2016 7:19,93
4,123.0,1/5/2016 7:20,35


In [87]:
# examine the number of rows and columns present in our dataframe

bloodpressure.shape

(1593, 3)

In [88]:
# examine the data type of each Series/columns
bloodpressure.dtypes

PERSON_ID                  float64
TIME                        object
SYSTOLIC_BLOOD_PRESSURE     object
dtype: object

In [89]:
# count the number of missing values in each Series
bloodpressure.isnull().sum()

PERSON_ID                  12
TIME                       12
SYSTOLIC_BLOOD_PRESSURE    13
dtype: int64

In [90]:
# use the 'isnull' Series method to filter the DataFrame rows where all the 'nan' is present
bloodpressure[bloodpressure.SYSTOLIC_BLOOD_PRESSURE.isnull()]

Unnamed: 0,PERSON_ID,TIME,SYSTOLIC_BLOOD_PRESSURE
1514,987.0,10/17/2016 18:40,
1581,,,
1582,,,
1583,,,
1584,,,
1585,,,
1586,,,
1587,,,
1588,,,
1589,,,


In [91]:
# We can use bloodpressure.dropna(how='any').shape to drops row, if 'any' values are missing in a row

# But if 'all' values are missing in a row, then we drop these rows with the code below (none are dropped in this case)
bloodpressure.dropna(how='all', inplace=True)

In [92]:
bloodpressure.isnull().sum()

PERSON_ID                  0
TIME                       0
SYSTOLIC_BLOOD_PRESSURE    1
dtype: int64

In [93]:
# use the 'isnull' Series method to filter the DataFrame rows where all nan is still present
bloodpressure[bloodpressure.SYSTOLIC_BLOOD_PRESSURE.isnull()]

Unnamed: 0,PERSON_ID,TIME,SYSTOLIC_BLOOD_PRESSURE
1514,987.0,10/17/2016 18:40,


--

In [95]:
# coverting the PERSON_ID float data type to an integer data type

bloodpressure['PERSON_ID'] = bloodpressure.PERSON_ID.astype(int)

In [96]:
#To convert the SYSTOLIC_BLOOD_PRESSURE data type, we shall either drop the string 'nan' or fill it with zero 
bloodpressure['SYSTOLIC_BLOOD_PRESSURE'] = pd.to_numeric(bloodpressure.SYSTOLIC_BLOOD_PRESSURE, errors='coerce').fillna(0)

In [97]:
# It's now easy to convert the SYSTOLIC_BLOOD_PRESSURE' data type from float to an integer
bloodpressure['SYSTOLIC_BLOOD_PRESSURE'] = bloodpressure.SYSTOLIC_BLOOD_PRESSURE.astype(int)

# convert Time data type from object to a datetime datatype
bloodpressure['TIME'] = pd.to_datetime(bloodpressure['TIME'])

In [98]:
bloodpressure.dtypes

PERSON_ID                           int32
TIME                       datetime64[ns]
SYSTOLIC_BLOOD_PRESSURE             int32
dtype: object

In [106]:
bloodpressure.head()

Unnamed: 0,PERSON_ID,TIME,SYSTOLIC_BLOOD_PRESSURE
0,123,2016-01-05 07:16:00,33
1,123,2016-01-05 07:17:00,75
2,123,2016-01-05 07:18:00,58
3,123,2016-01-05 07:19:00,93
4,123,2016-01-05 07:20:00,35


In [107]:
bloodpressure.tail()

Unnamed: 0,PERSON_ID,TIME,SYSTOLIC_BLOOD_PRESSURE
1576,111,2015-08-15 09:34:00,31
1577,111,2015-08-15 09:35:00,33
1578,111,2015-08-15 09:36:00,58
1579,111,2015-08-15 09:37:00,46
1580,111,2015-08-15 09:38:00,60


In [111]:
bloodpressure[(bloodpressure.PERSON_ID==987) & (bloodpressure.SYSTOLIC_BLOOD_PRESSURE == 0)].count()

PERSON_ID                  22
TIME                       22
SYSTOLIC_BLOOD_PRESSURE    22
dtype: int64

--

Creating a new column called Date, Hours, Minutes, and from the Time column

In [120]:
time = bloodpressure.loc[0, 'TIME']

print('Hour:', time.hour)
print('Minute:', time.minute)
print('Year:', time.year)

Hour: 7
Minute: 16
Year: 2016


In [121]:
# The apply alongside Lambda function take "Time" column and return Date, hour, minutes, and day of week attribute
bloodpressure['Date'] = bloodpressure['TIME'].apply(lambda t: t.date())

bloodpressure['Hour'] = bloodpressure['TIME'].apply(lambda time: time.hour)

bloodpressure['Minutes'] = bloodpressure['TIME'].apply(lambda time: time.minute)

bloodpressure['Day of week'] = bloodpressure['TIME'].apply(lambda time: time.dayofweek)

dmap = {0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}

bloodpressure['Day of week'] = bloodpressure['Day of week'].map(dmap)

In [122]:
bloodpressure.head()

Unnamed: 0,PERSON_ID,TIME,SYSTOLIC_BLOOD_PRESSURE,Date,Hour,Minutes,Day of week
0,123,2016-01-05 07:16:00,33,2016-01-05,7,16,Tue
1,123,2016-01-05 07:17:00,75,2016-01-05,7,17,Tue
2,123,2016-01-05 07:18:00,58,2016-01-05,7,18,Tue
3,123,2016-01-05 07:19:00,93,2016-01-05,7,19,Tue
4,123,2016-01-05 07:20:00,35,2016-01-05,7,20,Tue


In [124]:
new_bloodpressure = bloodpressure.loc[:, ['PERSON_ID', 'SYSTOLIC_BLOOD_PRESSURE', 'Date', 'Minutes', 'Day of week']]

new_bloodpressure.head()

Unnamed: 0,PERSON_ID,SYSTOLIC_BLOOD_PRESSURE,Date,Minutes,Day of week
0,123,33,2016-01-05,16,Tue
1,123,75,2016-01-05,17,Tue
2,123,58,2016-01-05,18,Tue
3,123,93,2016-01-05,19,Tue
4,123,35,2016-01-05,20,Tue


In [127]:
demographics = pd.read_csv('Demographics.csv', parse_dates=[1])

demographics.head()

Unnamed: 0,PERSON_ID,SERVICE_DATE,AGE_MONTHS
0,123,2016-01-05,46
1,123,2016-02-13,47
2,456,2015-06-08,40
3,456,2015-08-15,42
4,456,2015-05-21,39


In [128]:
demographics.dtypes

PERSON_ID                int64
SERVICE_DATE    datetime64[ns]
AGE_MONTHS               int64
dtype: object

In [129]:
# Join both dataframes using a left join

bloodpressure_demographics = pd.merge(new_bloodpressure, demographics, how='left')

bloodpressure_demographics.head()

Unnamed: 0,PERSON_ID,SYSTOLIC_BLOOD_PRESSURE,Date,Minutes,Day of week,SERVICE_DATE,AGE_MONTHS
0,123,33,2016-01-05,16,Tue,2016-01-05,46
1,123,33,2016-01-05,16,Tue,2016-02-13,47
2,123,75,2016-01-05,17,Tue,2016-01-05,46
3,123,75,2016-01-05,17,Tue,2016-02-13,47
4,123,58,2016-01-05,18,Tue,2016-01-05,46


In [130]:
bloodpressure_demographics.dtypes

PERSON_ID                           int32
SYSTOLIC_BLOOD_PRESSURE             int32
Date                               object
Minutes                             int64
Day of week                        object
SERVICE_DATE               datetime64[ns]
AGE_MONTHS                          int64
dtype: object

In [131]:
bloodpressure_demographics.isnull().sum()

PERSON_ID                  0
SYSTOLIC_BLOOD_PRESSURE    0
Date                       0
Minutes                    0
Day of week                0
SERVICE_DATE               0
AGE_MONTHS                 0
dtype: int64

Question 1: 

Steps

Identify cases during which blood pressure dropped below the norm for the age (see below) for 14 continuous minutes or longer.
Assume, that the PERSON_ID is the identifier for the patient, and SERVICE_DATE is the date of the surgery that they had. Surgeries don’t span over 1 day. The Age is given for that patient, for the surgery date.  The Blood pressure is only taken during the surgery duration. 

If the child reached 44 months, systolic blood pressure is considered low at 55 mmHg and below. Before 44 months of age, 46 mmHg and below is considered low.


In [134]:
bloodpressure_demographics[(bloodpressure_demographics.Minutes>=14) & 
                           (bloodpressure_demographics.SYSTOLIC_BLOOD_PRESSURE <=46)]\
.groupby(['PERSON_ID'], as_index = False).agg({'Minutes':'count'})

Unnamed: 0,PERSON_ID,Minutes
0,111,23
1,123,102
2,234,144
3,345,82
4,456,436
5,567,48
6,789,22
7,987,72
