In [None]:
!pip install gcsfs

In [1]:
import gcsfs
import os
import pandas as pd

# Path to the GCP credential JSON, exported through the environment variable
# GOOGLE_APPLICATION_CREDENTIALS before any GCS client is created.
CREDENTIAL = "../gcp-credential.json"
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=CREDENTIAL)

# Bucket that the cells below write to and read from.
BUCKET_NAME = "electedoffice_covid19_indicators"

# GCS filesystem handle scoped to the project.
gs = gcsfs.GCSFileSystem(project="ita-datalakepoc")

In [3]:
# Download URL of the CHHS "vaccines by county" CSV, assembled from its
# dataset id, resource id and file name.
COUNTY_VACCINE_URL = "/".join(
    [
        "https://data.chhs.ca.gov/dataset",
        "e283ee5a-cf18-4f20-a92c-ee94a2866ccd",
        "resource",
        "130d7ba2-b6eb-438d-a412-741bde207e1c",
        "download",
        "covid19vaccinesbycounty.csv",
    ]
)

#---------------------------------------------------------------#
# Vaccines Administered
#---------------------------------------------------------------#
def clean_vaccines_by_county(vaccine_url=None, pop_url=None):
    """Load county vaccine data, attach county population, and reshape to long.

    Parameters
    ----------
    vaccine_url : str or path-like, optional
        Location of the vaccines-by-county CSV; anything ``pd.read_csv``
        accepts (URL or local path). Defaults to the module-level
        ``COUNTY_VACCINE_URL``.
    pop_url : str or path-like, optional
        Location of the county population crosswalk CSV. Defaults to the
        crosswalk hosted in the CityOfLosAngeles/covid19-indicators repo.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the identifier columns ``county``,
        ``administered_date``, ``date``, ``county_fips``, ``county_pop2020``
        and ``california_flag``, plus ``variable`` / ``value`` (one row per
        melted column) and ``proportion`` (``value / county_pop2020``).

    Raises
    ------
    pandas.errors.MergeError
        If ``county`` is not unique in the population crosswalk
        (enforced by ``validate="m:1"``).

    Notes
    -----
    The merge is an inner join on ``county``, so rows whose county is absent
    from the crosswalk are dropped. Every column that is not an identifier is
    melted into ``value`` and divided by population, so those columns are
    expected to be numeric.
    """
    # Resolve defaults lazily so callers (and tests) can substitute local
    # snapshots without touching the network.
    if vaccine_url is None:
        vaccine_url = COUNTY_VACCINE_URL
    if pop_url is None:
        pop_url = (
            "https://raw.githubusercontent.com/CityOfLosAngeles/covid19-indicators/"
            "master/data/ca_county_pop_crosswalk.csv"
        )

    vaccines = pd.read_csv(vaccine_url)
    # Read FIPS codes as strings so they are not parsed as integers.
    population = pd.read_csv(pop_url, dtype={"county_fips": "str"})

    merged = pd.merge(
        vaccines,
        population,
        on="county",
        how="inner",
        validate="m:1",
    )
    merged = merged.assign(date=pd.to_datetime(merged["administered_date"]))

    # Reshape and make long
    id_vars = ["county", "administered_date", "date",
               "county_fips", "county_pop2020", "california_flag"]
    long_df = merged.melt(id_vars=id_vars)

    # Proportion relative to that county's population. Ultimately only the
    # partially/fully vaccinated figures are of interest, but it is generated
    # for every melted variable.
    long_df = long_df.assign(
        proportion=long_df["value"].divide(long_df["county_pop2020"])
    )

    return long_df


# Build the long-format vaccine table; this call reads both source CSVs
# from their remote URLs.
df = clean_vaccines_by_county()
# Preview the first rows of the result.
df.head()

Unnamed: 0,county,administered_date,date,county_fips,county_pop2020,california_flag,variable,value,proportion
0,Alameda,2020-12-15,2020-12-15,6001,1685886,California,total_doses,37,2.2e-05
1,Alameda,2020-12-16,2020-12-16,6001,1685886,California,total_doses,208,0.000123
2,Alameda,2020-12-17,2020-12-17,6001,1685886,California,total_doses,397,0.000235
3,Alameda,2020-12-18,2020-12-18,6001,1685886,California,total_doses,1791,0.001062
4,Alameda,2020-12-19,2020-12-19,6001,1685886,California,total_doses,1437,0.000852


In [4]:
# Export as parquet: write `df` to gcs://<BUCKET_NAME>/<FILE_NAME>.
# NOTE(review): presumably pandas resolves the gcs:// scheme through gcsfs
# using the credentials configured in the setup cell — confirm.
FILE_NAME = "test_file.parquet"
df.to_parquet(f"gcs://{BUCKET_NAME}/{FILE_NAME}")

In [5]:
# Import parquet: read the same gcs://<BUCKET_NAME>/<FILE_NAME> object back
# into a new frame and preview it, to check the round trip.
df2 = pd.read_parquet(f"gcs://{BUCKET_NAME}/{FILE_NAME}")
df2.head()

Unnamed: 0,county,administered_date,date,county_fips,county_pop2020,california_flag,variable,value,proportion
0,Alameda,2020-12-15,2020-12-15,6001,1685886,California,total_doses,37,2.2e-05
1,Alameda,2020-12-16,2020-12-16,6001,1685886,California,total_doses,208,0.000123
2,Alameda,2020-12-17,2020-12-17,6001,1685886,California,total_doses,397,0.000235
3,Alameda,2020-12-18,2020-12-18,6001,1685886,California,total_doses,1791,0.001062
4,Alameda,2020-12-19,2020-12-19,6001,1685886,California,total_doses,1437,0.000852
