# Data interchange project COVID-19
Author: Alla Topp

In [1]:
import pandas as pd

In [2]:
# Load the New York Times state-level COVID-19 table (columns: date, state, fips, cases, deaths).
# NOTE: despite the variable name, the frame holds rows for many states, not only New York.
NYT_US_STATES_CSV = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv'
new_york_data = pd.read_csv(NYT_US_STATES_CSV)

In [3]:
# Preview the first five rows (head() defaults to n=5)
new_york_data.head()

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


In [4]:
# Number of rows in the NYT frame
new_york_data.shape[0]

19429

In [5]:
# Inspect the column dtypes as loaded by read_csv (date is still a plain object column here)
new_york_data.dtypes

date      object
state     object
fips       int64
cases      int64
deaths     int64
dtype: object

In [6]:
# Convert the `date` strings to datetime64; errors='coerce' turns unparseable values into NaT
# instead of raising, so the null check that follows doubles as a parse check.
new_york_data = new_york_data.assign(
    date=pd.to_datetime(new_york_data['date'], errors='coerce')
)

In [7]:
# Per-column count of null entries in the NYT frame
missing_values_count = new_york_data.isna().sum()
missing_values_count

date      0
state     0
fips      0
cases     0
deaths    0
dtype: int64

In [8]:
# Re-check dtypes to confirm `date` was converted to datetime64[ns]
new_york_data.dtypes

date      datetime64[ns]
state             object
fips               int64
cases              int64
deaths             int64
dtype: object

## Estimated Inpatient Beds Occupied by State Timeseries

In [9]:
# Load the healthdata.gov estimates of inpatient beds occupied by all patients.
# thousands=',' tells pandas to treat ',' as a thousands separator in numeric fields.
INPATIENT_ALL_CSV = 'https://healthdata.gov/sites/default/files/estimated_inpatient_all_20210102_1640.csv'
patient_all = pd.read_csv(INPATIENT_ALL_CSV, thousands=',')
patient_all.head()

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
0,CW,2020-12-02,529819,529480,530158,74.98,74.9,75.05,706920,706592,707248
1,CW,2020-12-03,531844,531528,532159,75.04,74.97,75.11,708908,708557,709260
2,CW,2020-12-04,528558,528238,528879,74.62,74.55,74.69,708497,708157,708837
3,CW,2020-12-05,512030,511673,512386,72.69,72.59,72.8,704537,704132,704943
4,CW,2020-12-06,498694,498341,499047,71.21,71.12,71.31,700470,700103,700837


In [10]:
# Number of rows in the all-patient inpatient frame
patient_all.shape[0]

1612

In [11]:
# Inspect the column dtypes as loaded (collection_date is still an object column here)
patient_all.dtypes

state                                               object
collection_date                                     object
Inpatient Beds Occupied Estimated                    int64
Count LL                                             int64
Count UL                                             int64
Percentage of Inpatient Beds Occupied Estimated    float64
Percentage LL                                      float64
Percentage UL                                      float64
Total Inpatient Beds                                 int64
Total LL                                             int64
Total UL                                             int64
dtype: object

In [12]:
# NOTE: intentionally left disabled. The dtypes output above already reports these count
# columns as int64 (read_csv was called with thousands=','), so this explicit numeric
# conversion appears to be unnecessary.
#cols = ['Inpatient Beds Occupied Estimated', 'Count LL', 'Count UL', 'Total Inpatient Beds', 'Total LL', 'Total UL']
#patient_all[cols] = patient_all[cols].apply(pd.to_numeric, errors='ignore', axis=1)

In [13]:
# Convert `collection_date` strings to datetime64; unparseable values become NaT (errors='coerce')
patient_all = patient_all.assign(
    collection_date=pd.to_datetime(patient_all['collection_date'], errors='coerce')
)

In [14]:
# Re-check dtypes to confirm `collection_date` was converted to datetime64[ns]
patient_all.dtypes

state                                                      object
collection_date                                    datetime64[ns]
Inpatient Beds Occupied Estimated                           int64
Count LL                                                    int64
Count UL                                                    int64
Percentage of Inpatient Beds Occupied Estimated           float64
Percentage LL                                             float64
Percentage UL                                             float64
Total Inpatient Beds                                        int64
Total LL                                                    int64
Total UL                                                    int64
dtype: object

In [15]:
# Per-column count of null entries in the all-patient inpatient frame
missing_values_all = patient_all.isna().sum()
missing_values_all

state                                              0
collection_date                                    0
Inpatient Beds Occupied Estimated                  0
Count LL                                           0
Count UL                                           0
Percentage of Inpatient Beds Occupied Estimated    0
Percentage LL                                      0
Percentage UL                                      0
Total Inpatient Beds                               0
Total LL                                           0
Total UL                                           0
dtype: int64

## Estimated Inpatient Beds Occupied by COVID-19 Patients by State Timeseries

In [16]:
# Load the healthdata.gov estimates of inpatient beds occupied by COVID-19 patients only.
# thousands=',' tells pandas to treat ',' as a thousands separator in numeric fields.
INPATIENT_COVID_CSV = 'https://healthdata.gov/sites/default/files/estimated_inpatient_covid_20210102%201640.csv'
beds_covid = pd.read_csv(INPATIENT_COVID_CSV, thousands=',')
beds_covid.head()

Unnamed: 0,state,collection_date,Inpatient Beds Occupied by COVID-19 Patients Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied by COVID-19 Patients Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
0,CW,2020-12-02,105059,104916,105202,14.95,14.91,14.98,706920,706592,707248
1,CW,2020-12-03,104627,104483,104772,14.84,14.8,14.88,708908,708557,709260
2,CW,2020-12-04,104541,104398,104684,14.84,14.8,14.88,708497,708157,708837
3,CW,2020-12-05,103896,103742,104050,14.83,14.77,14.89,704537,704132,704943
4,CW,2020-12-06,105182,105032,105331,15.1,15.05,15.15,700470,700103,700837


In [17]:
# Number of rows in the COVID-19 inpatient frame
beds_covid.shape[0]

1612

In [18]:
# Convert `collection_date` strings to datetime64; unparseable values become NaT (errors='coerce')
beds_covid = beds_covid.assign(
    collection_date=pd.to_datetime(beds_covid['collection_date'], errors='coerce')
)

In [19]:
# Check dtypes after the date conversion (collection_date should now be datetime64[ns])
beds_covid.dtypes

state                                                                           object
collection_date                                                         datetime64[ns]
Inpatient Beds Occupied by COVID-19 Patients Estimated                           int64
Count LL                                                                         int64
Count UL                                                                         int64
Percentage of Inpatient Beds Occupied by COVID-19 Patients Estimated           float64
Percentage LL                                                                  float64
Percentage UL                                                                  float64
Total Inpatient Beds                                                             int64
Total LL                                                                         int64
Total UL                                                                         int64
dtype: object

## Estimated ICU Beds Occupied by State Timeseries 

In [20]:
# Load the healthdata.gov estimates of staffed adult ICU beds occupied.
# thousands=',' tells pandas to treat ',' as a thousands separator in numeric fields.
ICU_BEDS_CSV = 'https://healthdata.gov/sites/default/files/estimated_icu_20210102_1640.csv'
icu_beds = pd.read_csv(ICU_BEDS_CSV, thousands=',')
icu_beds.head()

Unnamed: 0,state,collection_date,Staffed Adult ICU Beds Occupied Estimated,Count LL,Count UL,Percentage of Staffed Adult ICU Beds Occupied Estimated,Percentage LL,Percentage UL,Total Staffed Adult ICU Beds,Total LL,Total UL
0,CW,2020-12-02,65922,65894,65950,77.32,77.25,77.38,85282,85263,85301
1,CW,2020-12-03,66429,66394,66463,77.72,77.62,77.82,85486,85455,85516
2,CW,2020-12-04,66448,66384,66512,77.6,77.43,77.77,85638,85618,85659
3,CW,2020-12-05,65819,65759,65880,77.03,76.85,77.21,85454,85408,85501
4,CW,2020-12-06,64681,64622,64739,75.98,75.79,76.18,85132,85073,85191


In [21]:
# Number of rows in the ICU frame
icu_beds.shape[0]

1612

In [22]:
# Convert `collection_date` strings to datetime64; unparseable values become NaT (errors='coerce')
icu_beds = icu_beds.assign(
    collection_date=pd.to_datetime(icu_beds['collection_date'], errors='coerce')
)

In [23]:
# Check dtypes after the date conversion (collection_date should now be datetime64[ns])
icu_beds.dtypes

state                                                              object
collection_date                                            datetime64[ns]
Staffed Adult ICU Beds Occupied Estimated                           int64
Count LL                                                            int64
Count UL                                                            int64
Percentage of Staffed Adult ICU Beds Occupied Estimated           float64
Percentage LL                                                     float64
Percentage UL                                                     float64
Total Staffed Adult ICU Beds                                        int64
Total LL                                                            int64
Total UL                                                            int64
dtype: object

# Merging data sets

### First, merging 3 healthdata.gov datasets

In [24]:
# Merge the all-patient and COVID-patient inpatient frames on (state, collection_date).
# Both frames share the column names 'Count LL', 'Count UL', 'Percentage LL',
# 'Percentage UL', 'Total Inpatient Beds', 'Total LL' and 'Total UL', so pandas appends
# its default suffixes: `_x` for patient_all (left) and `_y` for beds_covid (right).
# validate='one_to_one' raises pandas.errors.MergeError if either frame repeats a
# (state, collection_date) pair, instead of silently multiplying rows.
mrd = pd.merge(
    patient_all,
    beds_covid,
    on=['state', 'collection_date'],
    validate='one_to_one',
)
mrd.head()

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL_x,Count UL_x,Percentage of Inpatient Beds Occupied Estimated,Percentage LL_x,Percentage UL_x,Total Inpatient Beds_x,Total LL_x,Total UL_x,Inpatient Beds Occupied by COVID-19 Patients Estimated,Count LL_y,Count UL_y,Percentage of Inpatient Beds Occupied by COVID-19 Patients Estimated,Percentage LL_y,Percentage UL_y,Total Inpatient Beds_y,Total LL_y,Total UL_y
0,CW,2020-12-02,529819,529480,530158,74.98,74.9,75.05,706920,706592,707248,105059,104916,105202,14.95,14.91,14.98,706920,706592,707248
1,CW,2020-12-03,531844,531528,532159,75.04,74.97,75.11,708908,708557,709260,104627,104483,104772,14.84,14.8,14.88,708908,708557,709260
2,CW,2020-12-04,528558,528238,528879,74.62,74.55,74.69,708497,708157,708837,104541,104398,104684,14.84,14.8,14.88,708497,708157,708837
3,CW,2020-12-05,512030,511673,512386,72.69,72.59,72.8,704537,704132,704943,103896,103742,104050,14.83,14.77,14.89,704537,704132,704943
4,CW,2020-12-06,498694,498341,499047,71.21,71.12,71.31,700470,700103,700837,105182,105032,105331,15.1,15.05,15.15,700470,700103,700837


In [25]:
# Add the ICU estimates to the combined inpatient frame on (state, collection_date).
# mrd's bound columns already carry `_x`/`_y` suffixes, so the ICU frame's 'Count LL',
# 'Count UL', 'Percentage LL', 'Percentage UL', 'Total LL' and 'Total UL' columns come
# through without a suffix.
# validate='one_to_one' raises pandas.errors.MergeError if either side repeats a
# (state, collection_date) pair, instead of silently multiplying rows.
all_beds = pd.merge(
    mrd,
    icu_beds,
    on=['state', 'collection_date'],
    validate='one_to_one',
)
all_beds

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL_x,Count UL_x,Percentage of Inpatient Beds Occupied Estimated,Percentage LL_x,Percentage UL_x,Total Inpatient Beds_x,Total LL_x,...,Total UL_y,Staffed Adult ICU Beds Occupied Estimated,Count LL,Count UL,Percentage of Staffed Adult ICU Beds Occupied Estimated,Percentage LL,Percentage UL,Total Staffed Adult ICU Beds,Total LL,Total UL
0,CW,2020-12-02,529819,529480,530158,74.98,74.90,75.05,706920,706592,...,707248,65922,65894,65950,77.32,77.25,77.38,85282,85263,85301
1,CW,2020-12-03,531844,531528,532159,75.04,74.97,75.11,708908,708557,...,709260,66429,66394,66463,77.72,77.62,77.82,85486,85455,85516
2,CW,2020-12-04,528558,528238,528879,74.62,74.55,74.69,708497,708157,...,708837,66448,66384,66512,77.60,77.43,77.77,85638,85618,85659
3,CW,2020-12-05,512030,511673,512386,72.69,72.59,72.80,704537,704132,...,704943,65819,65759,65880,77.03,76.85,77.21,85454,85408,85501
4,CW,2020-12-06,498694,498341,499047,71.21,71.12,71.31,700470,700103,...,700837,64681,64622,64739,75.98,75.79,76.18,85132,85073,85191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607,WI,2020-12-28,7350,7349,7351,58.93,58.93,58.93,12473,12473,...,12473,900,900,900,55.83,55.83,55.83,1612,1612,1612
1608,WI,2020-12-29,7720,7661,7780,61.71,60.49,62.94,12510,12499,...,12521,948,942,954,58.55,57.57,59.54,1619,1619,1619
1609,WI,2020-12-30,7830,7779,7882,62.55,61.08,64.02,12518,12437,...,12599,972,962,981,59.90,57.65,62.14,1622,1605,1639
1610,WI,2020-12-31,7823,7760,7886,62.49,60.73,64.26,12518,12421,...,12615,943,931,955,58.39,55.63,61.15,1615,1594,1636


In [26]:
# Align the key column name with the NYT frame, which calls it `date`
all_beds = all_beds.rename(columns={'collection_date': 'date'})
all_beds.head()

Unnamed: 0,state,date,Inpatient Beds Occupied Estimated,Count LL_x,Count UL_x,Percentage of Inpatient Beds Occupied Estimated,Percentage LL_x,Percentage UL_x,Total Inpatient Beds_x,Total LL_x,...,Total UL_y,Staffed Adult ICU Beds Occupied Estimated,Count LL,Count UL,Percentage of Staffed Adult ICU Beds Occupied Estimated,Percentage LL,Percentage UL,Total Staffed Adult ICU Beds,Total LL,Total UL
0,CW,2020-12-02,529819,529480,530158,74.98,74.9,75.05,706920,706592,...,707248,65922,65894,65950,77.32,77.25,77.38,85282,85263,85301
1,CW,2020-12-03,531844,531528,532159,75.04,74.97,75.11,708908,708557,...,709260,66429,66394,66463,77.72,77.62,77.82,85486,85455,85516
2,CW,2020-12-04,528558,528238,528879,74.62,74.55,74.69,708497,708157,...,708837,66448,66384,66512,77.6,77.43,77.77,85638,85618,85659
3,CW,2020-12-05,512030,511673,512386,72.69,72.59,72.8,704537,704132,...,704943,65819,65759,65880,77.03,76.85,77.21,85454,85408,85501
4,CW,2020-12-06,498694,498341,499047,71.21,71.12,71.31,700470,700103,...,700837,64681,64622,64739,75.98,75.79,76.18,85132,85073,85191


### Remapping the NYT dataframe: translating US state names to two-letter codes

In [27]:
# Lookup table from full US state / territory / district names to their two-letter codes.
# Used to translate the NYT `state` column (full names) into the two-letter form used by
# the healthdata.gov frames so the two sources can be joined on `state`.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [28]:
# Translate full state names to two-letter codes so `state` matches the healthdata.gov frames.
# Values that are not keys of us_state_abbrev (including codes produced by an earlier run of
# this cell) are kept unchanged rather than becoming NaN, so re-running the cell is safe and
# unknown names stay visible instead of being silently blanked.
new_york_data['state'] = new_york_data['state'].map(
    lambda name: us_state_abbrev.get(name, name)
)

In [29]:
# Show the frame via the notebook's rich display (bare last expression) rather than print()
new_york_data

            date state  fips   cases  deaths
0     2020-01-21    WA    53       1       0
1     2020-01-22    WA    53       1       0
2     2020-01-23    WA    53       1       0
3     2020-01-24    IL    17       1       0
4     2020-01-24    WA    53       1       0
...          ...   ...   ...     ...     ...
19424 2021-02-18    VA    51  557896    7090
19425 2021-02-18    WA    53  335338    4868
19426 2021-02-18    WV    54  128760    2236
19427 2021-02-18    WI    55  610056    6820
19428 2021-02-18    WY    56   53531     662

[19429 rows x 5 columns]


### Final merge

In [30]:
# Join the NYT case/death counts with the hospital-capacity estimates on (state, date).
# The default inner join keeps only keys present in both frames, so all_beds rows whose
# state code has no counterpart in new_york_data (e.g. 'CW', which is not a value of
# us_state_abbrev) are dropped, as are NYT rows for dates missing from all_beds.
# validate='one_to_one' raises pandas.errors.MergeError if either side repeats a
# (state, date) pair, instead of silently multiplying rows.
final_dataset = pd.merge(
    new_york_data,
    all_beds,
    on=['state', 'date'],
    validate='one_to_one',
)
final_dataset

Unnamed: 0,date,state,fips,cases,deaths,Inpatient Beds Occupied Estimated,Count LL_x,Count UL_x,Percentage of Inpatient Beds Occupied Estimated,Percentage LL_x,...,Total UL_y,Staffed Adult ICU Beds Occupied Estimated,Count LL,Count UL,Percentage of Staffed Adult ICU Beds Occupied Estimated,Percentage LL,Percentage UL,Total Staffed Adult ICU Beds,Total LL,Total UL
0,2020-12-02,AL,1,256828,3711,11036,11036,11036,77.89,77.89,...,14168,1312,1312,1312,88.53,88.53,88.53,1482,1482,1482
1,2020-12-02,AK,2,33614,116,933,933,933,65.02,65.02,...,1435,103,103,103,81.10,81.10,81.10,127,127,127
2,2020-12-02,AZ,4,342925,6755,10620,10613,10627,76.94,76.75,...,13815,1389,1389,1389,69.83,69.83,69.83,1989,1989,1989
3,2020-12-02,AR,5,161521,2522,5531,5531,5531,73.53,73.53,...,7522,822,822,822,80.99,80.99,80.99,1015,1015,1015
4,2020-12-02,CA,6,1272041,19440,47713,47665,47761,75.98,75.91,...,62919,5862,5855,5869,79.70,79.70,79.70,7369,7365,7373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1576,2021-01-01,VA,51,354766,5081,11402,10991,11813,69.44,61.40,...,16947,1413,1356,1470,72.54,63.50,81.57,1948,1881,2015
1577,2021-01-01,WA,53,251585,3523,8450,8331,8569,67.61,64.25,...,12683,951,950,952,70.50,69.96,71.03,1349,1345,1353
1578,2021-01-01,WV,54,87820,1361,3745,3745,3745,74.90,74.90,...,5000,509,509,509,83.17,83.17,83.17,612,596,628
1579,2021-01-01,WI,55,522523,5257,7484,7365,7603,60.08,56.91,...,12617,917,901,933,56.71,53.15,60.27,1617,1591,1643


## Changing LL and UL column names 

In [31]:
# Replace the `_x` merge suffix (columns that came from the all-patient inpatient frame)
# with a descriptive label, in a single rename pass.
# NOTE(review): 'Total Inpatient Beds_x' is not covered here and keeps its merge suffix.
final_dataset = final_dataset.rename(columns={
    'Count LL_x': 'Count LL_inpatient beds occupied',
    'Count UL_x': 'Count UL_inpatient beds occupied',
    'Percentage LL_x': 'Percentage LL_inpatient beds occupied',
    'Percentage UL_x': 'Percentage UL_inpatient beds occupied',
    'Total LL_x': 'Total LL_inpatient beds occupied',
    'Total UL_x': 'Total UL_inpatient beds occupied',
})

In [32]:
# Replace the `_y` merge suffix (columns that came from the COVID-19 inpatient frame)
# with a descriptive label, in a single rename pass.
# NOTE(review): 'Total Inpatient Beds_y' is not covered here and keeps its merge suffix.
final_dataset = final_dataset.rename(columns={
    'Count LL_y': 'Count LL_occupied by COVID-19 patients',
    'Count UL_y': 'Count UL_occupied by COVID-19 patients',
    'Percentage LL_y': 'Percentage LL_occupied by COVID-19 patients',
    'Percentage UL_y': 'Percentage UL_occupied by COVID-19 patients',
    'Total LL_y': 'Total LL_occupied by COVID-19 patients',
    'Total UL_y': 'Total UL_occupied by COVID-19 patients',
})

In [33]:
# Label the unsuffixed bound columns (the ones contributed by the ICU frame) in a single
# rename pass so they are distinguishable from the inpatient bounds.
final_dataset = final_dataset.rename(columns={
    'Count LL': 'Count LL_ICU Beds Occupied',
    'Count UL': 'Count UL_ICU Beds Occupied',
    'Percentage LL': 'Percentage LL_ICU Beds Occupied',
    'Percentage UL': 'Percentage UL_ICU Beds Occupied',
    'Total LL': 'Total LL_ICU Beds Occupied',
    'Total UL': 'Total UL_ICU Beds Occupied',
})

## Final dataset column names 

In [34]:
# Column names of the final dataset as a plain Python list
final_dataset.columns.tolist()

['date',
 'state',
 'fips',
 'cases',
 'deaths',
 'Inpatient Beds Occupied Estimated',
 'Count LL_inpatient beds occupied',
 'Count UL_inpatient beds occupied',
 'Percentage of Inpatient Beds Occupied Estimated',
 'Percentage LL_inpatient beds occupied',
 'Percentage UL_inpatient beds occupied',
 'Total Inpatient Beds_x',
 'Total LL_inpatient beds occupied',
 'Total UL_inpatient beds occupied',
 'Inpatient Beds Occupied by COVID-19 Patients Estimated',
 'Count LL_occupied by COVID-19 patients',
 'Count UL_occupied by COVID-19 patients',
 'Percentage of Inpatient Beds Occupied by COVID-19 Patients Estimated',
 'Percentage LL_occupied by COVID-19 patients',
 'Percentage UL_occupied by COVID-19 patients',
 'Total Inpatient Beds_y',
 'Total LL_occupied by COVID-19 patients',
 'Total UL_occupied by COVID-19 patients',
 'Staffed Adult ICU Beds Occupied Estimated',
 'Count LL_ICU Beds Occupied',
 'Count UL_ICU Beds Occupied',
 'Percentage of Staffed Adult ICU Beds Occupied Estimated',
 'Perce

## Checking the final dataset

In [35]:
# Display the merged frame with its renamed columns (the notebook elides the middle rows/columns)
final_dataset

Unnamed: 0,date,state,fips,cases,deaths,Inpatient Beds Occupied Estimated,Count LL_inpatient beds occupied,Count UL_inpatient beds occupied,Percentage of Inpatient Beds Occupied Estimated,Percentage LL_inpatient beds occupied,...,Total UL_occupied by COVID-19 patients,Staffed Adult ICU Beds Occupied Estimated,Count LL_ICU Beds Occupied,Count UL_ICU Beds Occupied,Percentage of Staffed Adult ICU Beds Occupied Estimated,Percentage LL_ICU Beds Occupied,Percentage UL_ICU Beds Occupied,Total Staffed Adult ICU Beds,Total LL_ICU Beds Occupied,Total UL_ICU Beds Occupied
0,2020-12-02,AL,1,256828,3711,11036,11036,11036,77.89,77.89,...,14168,1312,1312,1312,88.53,88.53,88.53,1482,1482,1482
1,2020-12-02,AK,2,33614,116,933,933,933,65.02,65.02,...,1435,103,103,103,81.10,81.10,81.10,127,127,127
2,2020-12-02,AZ,4,342925,6755,10620,10613,10627,76.94,76.75,...,13815,1389,1389,1389,69.83,69.83,69.83,1989,1989,1989
3,2020-12-02,AR,5,161521,2522,5531,5531,5531,73.53,73.53,...,7522,822,822,822,80.99,80.99,80.99,1015,1015,1015
4,2020-12-02,CA,6,1272041,19440,47713,47665,47761,75.98,75.91,...,62919,5862,5855,5869,79.70,79.70,79.70,7369,7365,7373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1576,2021-01-01,VA,51,354766,5081,11402,10991,11813,69.44,61.40,...,16947,1413,1356,1470,72.54,63.50,81.57,1948,1881,2015
1577,2021-01-01,WA,53,251585,3523,8450,8331,8569,67.61,64.25,...,12683,951,950,952,70.50,69.96,71.03,1349,1345,1353
1578,2021-01-01,WV,54,87820,1361,3745,3745,3745,74.90,74.90,...,5000,509,509,509,83.17,83.17,83.17,612,596,628
1579,2021-01-01,WI,55,522523,5257,7484,7365,7603,60.08,56.91,...,12617,917,901,933,56.71,53.15,60.27,1617,1591,1643


### Export of the files 

In [29]:
# Export is currently disabled; uncomment the line(s) below to write the merged dataset to disk.
# final_dataset.to_csv('final_covid.csv')
# NOTE(review): `fnl_covid` is not defined anywhere in the visible notebook — presumably
# `final_dataset` was intended; confirm before enabling the JSON export.
# fnl_covid.to_json('file.json', orient = 'split', compression = 'infer', index = 'true') 
# https://towardsdatascience.com/automatically-update-data-sources-in-python-e424dbea68d0

### Reference:
* Data Import (https://www.datacamp.com/community/tutorials/importing-data-into-pandas)
* Dict map (https://gist.github.com/rogerallen/1583593)
* Pandas Indexing (https://www.shanelynn.ie/merge-join-dataframes-python-pandas-index-1/)
* Remap (https://www.geeksforgeeks.org/using-dictionary-to-remap-values-in-pandas-dataframe-columns/)
* NYT data (https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv)
* Healthdata.gov data (https://healthdata.gov/dataset/covid-19-estimated-patient-impact-and-hospital-capacity-state)