# COVID Hospitalization Weekly Scraper

This script downloads the COVID-19 hospitalization data from the Oregon Health Authority (OHA) Tableau dashboard and reformats it so that it can be appended to the existing data record.

In [1]:
#import libraries and packages
import pandas as pd
import numpy as np
import datetime
import os

In [2]:
# The link below points at a session-specific temp file, so update it with the latest download link from
# https://public.tableau.com/app/profile/oregon.health.authority.covid.19/viz/OregonCOVID-19HospitalCapacitySummaryTables_15965754787060/HospitalizationsByFacility
# before running this cell.
tableau_download_url = 'https://public.tableau.com/vizql/w/OregonCOVID-19HospitalCapacitySummaryTables_15965754787060/v/HospitalizationsByFacility/tempfile/sessions/300C3B28D8CC4165BA2ED421F58A214C-0:0/?key=1798539847&keepfile=yes&attachment=yes'

# The export is read as tab-separated UTF-16 (little endian); its first line is skipped.
downloaded_data = pd.read_csv(
    tableau_download_url,
    sep='\t',
    encoding='utf-16le',
    skiprows=1,
)

In [3]:
# Preview the downloaded table before any reshaping is applied.
downloaded_data

Unnamed: 0,Hospital Name,County,Region,10,11,13,20,<10
0,Adventist Health Portland,Multnomah,Region 1,,,,,
1,Adventist Health Tillamook,Tillamook,Region 1,,,,,
2,Asante Ashland Community Hospital,Jackson,Region 5,,,,,
3,Asante Rogue Regional Medical Center,Jackson,Region 5,,,,,
4,Asante Three Rivers Medical Center,Josephine,Region 5,,,,,
5,Bay Area Hospital,Coos,Region 3,,,,,
6,Columbia Memorial Hospital,Clatsop,Region 1,,,,,
7,Curry General Hospital,Curry,Region 3,,,,,
8,Good Samaritan Regional Medical Center,Benton,Region 2,,,,,
9,Good Shepherd Health Care System,Umatilla,Region 9,,,,,


In [4]:
# Columns to leave out when searching a row for its hospitalization value
excluded_columns = ['Hospital Name', 'County', 'Region', '<10']

# Everything else in the download is treated as a candidate hospitalization column;
# the result is kept as a plain list, in the original column order.
is_excluded = downloaded_data.columns.isin(excluded_columns)
hospitalization_columns = downloaded_data.columns[~is_excluded].tolist()

In [5]:
# Function to apply across each row
def get_hospitalization_value(row, columns=None):
    """Return the first non-missing hospitalization value found in a row.

    Parameters
    ----------
    row : pandas.Series
        One row of the downloaded table, indexed by column name.
    columns : iterable of str, optional
        Column names to search, in priority order. When omitted, the
        notebook-level ``hospitalization_columns`` list is used, so the
        function can still be passed directly to ``DataFrame.apply``.

    Returns
    -------
    object
        The first value among ``columns`` that is not NaN, or the string
        ``'<10'`` when every searched column is missing (or there are no
        columns to search).
    """
    if columns is None:
        # Fall back to the list built earlier in the notebook
        columns = hospitalization_columns

    for col in columns:
        value = row[col]
        # pd.notna is required because NaN never compares equal to itself
        if pd.notna(value):
            return value

    # Default value if all relevant columns are NaN
    return '<10'

In [6]:
# Collapse the candidate columns into one value per row
hospitalization_cases = downloaded_data.apply(get_hospitalization_value, axis=1)
downloaded_data['Hospitalization Cases'] = hospitalization_cases

# The candidate columns and the '<10' column are now redundant, so remove them in place
columns_to_drop = [*hospitalization_columns, '<10']
downloaded_data.drop(columns=columns_to_drop, inplace=True)

In [7]:
# Turn the '<10' placeholder into a numeric stand-in: '9' for the MAX column
# and '1' for the MIN column. All other values are converted to int as they are.
reported_cases = downloaded_data['Hospitalization Cases']
placeholder_stand_ins = {
    'MAX_7 Day Average of COVID-19 Positive Patients_Num': '9',
    'MIN_7 Day Average of COVID-19 Positive Patients_Num': '1',
}
for bound_column, stand_in in placeholder_stand_ins.items():
    downloaded_data[bound_column] = reported_cases.replace('<10', stand_in).astype(int)

In [8]:
# Sum the MAX and MIN columns per county, keeping 'County' as a regular column
summed_columns = [
    'MAX_7 Day Average of COVID-19 Positive Patients_Num',
    'MIN_7 Day Average of COVID-19 Positive Patients_Num',
]
county_totals = downloaded_data.groupby('County')[summed_columns].sum()
aggregated_data = county_totals.reset_index()

In [9]:
# Ask the user for a date, repeating the question until it parses
def get_user_date_input(prompt):
    """Prompt until the user types a date in month/day/4-digit-year form.

    Parameters
    ----------
    prompt : str
        Text shown to the user each time input is requested.

    Returns
    -------
    str
        The accepted date re-formatted with '%m/%d/%Y'
        (for example '2/6/2024' comes back as '02/06/2024').
    """
    date_format = '%m/%d/%Y'
    parsed_date = None

    # Keep asking while nothing valid has been entered
    while parsed_date is None:
        raw_text = input(prompt)
        try:
            parsed_date = datetime.datetime.strptime(raw_text, date_format)
        except ValueError:
            # Parsing failed: tell the user and go around again
            print("The date format should be m/d/YYYY. Please try again.")

    return parsed_date.strftime(date_format)

In [10]:
# Ask for the dashboard's last-updated date; it becomes the DateStamp of the new row
last_updated_prompt = "Enter the date when the Tableau dashboard was last updated (m/d/YYYY): "
date_input = get_user_date_input(last_updated_prompt)

# Echo the accepted value so it can be checked before continuing
print(f"The entered date is: {date_input}")


The entered date is: 02/20/2024


In [11]:
# Stamp every county row with the date entered above
aggregated_data = aggregated_data.assign(DateStamp=date_input)

In [12]:
# Reshape to a single row per DateStamp with two column levels.
# pivot() puts the measure name on the outer level, so the levels are swapped to
# make County the outer level, then the columns are sorted.
pivot_value_columns = [
    'MAX_7 Day Average of COVID-19 Positive Patients_Num',
    'MIN_7 Day Average of COVID-19 Positive Patients_Num',
]
df_pivot = (
    aggregated_data
    .pivot(index='DateStamp', columns='County', values=pivot_value_columns)
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1, level=0)
)

In [13]:
# Inspect the reshaped frame: indexed by DateStamp, with (County, measure) column pairs.
df_pivot

County,Benton,Benton,Clackamas,Clackamas,Clatsop,Clatsop,Coos,Coos,Crook,Crook,...,Union,Union,Wallowa,Wallowa,Wasco,Wasco,Washington,Washington,Yamhill,Yamhill
Unnamed: 0_level_1,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,...,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num
DateStamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
02/20/2024,9,1,36,4,18,2,9,1,9,1,...,9,1,9,1,9,1,28,12,9,1


## Read Existing File and Append New Data

In [14]:
# Switch the working directory to Z:/ so the relative Excel file name used below resolves there
data_directory = 'Z:/'
os.chdir(data_directory)
os.getcwd()  # show the working directory to confirm the change

'Z:\\'

In [15]:
# Load the historical record: the first two rows form the two-level column header
# and the first column becomes the index.
existing_data = pd.read_excel(
    "COVID_Hospitalized.xlsx",
    header=[0, 1],
    index_col=0,
)


In [16]:
# Put the new row's columns in the same order as the existing workbook.
# NOTE(review): reindexing to existing_data.columns also discards any column that is
# not already in the workbook and creates NaN for workbook columns missing this week.
df_pivot = df_pivot.reindex(existing_data.columns, axis='columns')

In [18]:
# Append the new row to the historical record.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. ignore_index=False keeps the existing
# index labels alongside the new DateStamp label.
combined_data = pd.concat([existing_data, df_pivot], ignore_index=False)

  combined_data = existing_data.append(df_pivot, ignore_index=False)


In [19]:
# Any cell still missing after the append is recorded as 0
combined_data = combined_data.fillna(0)

In [20]:
# Cast every column to int, apart from the object-dtype (non-numeric) ones.
# First collect the object-dtype columns, if there are any.
non_numeric_columns = combined_data.select_dtypes(include=['object']).columns

# Whatever is left over gets converted one column at a time
numeric_columns = combined_data.columns.difference(non_numeric_columns)
for column_key in numeric_columns:
    numeric_series = combined_data[column_key]
    combined_data[column_key] = numeric_series.astype(int)

In [21]:
# Show the last rows to confirm that the new row was appended at the bottom.
combined_data.tail()

County,Baker,Baker,Benton,Benton,Clackamas,Clackamas,Clatsop,Clatsop,Coos,Coos,...,Jefferson,Jefferson,Lake,Lake,Tillamook,Tillamook,Wallowa,Wallowa,Polk,Polk
Unnamed: 0_level_1,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,...,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num,MAX_7 Day Average of COVID-19 Positive Patients_Num,MIN_7 Day Average of COVID-19 Positive Patients_Num
1/23/2024,9,1,9,1,37,13,9,1,9,1,...,0,0,0,0,0,0,0,0,0,0
1/30/2024,9,1,9,1,39,15,18,2,9,1,...,9,1,0,0,9,1,0,0,0,0
2/6/2024,9,1,9,1,36,4,9,1,9,1,...,9,1,0,0,9,1,0,0,0,0
02/13/2024,9,1,9,1,36,4,0,0,18,2,...,9,1,0,0,9,1,9,1,0,0
02/20/2024,0,0,9,1,36,4,18,2,9,1,...,0,0,0,0,9,1,9,1,0,0


In [22]:
# Save the updated DataFrame back to the Excel file.
# This writes to the same file name that was read into existing_data, so the previous
# contents are overwritten; re-running the append and save cells adds the row again.
combined_data.to_excel("COVID_Hospitalized.xlsx")