## Upload COVID-19 data to Socrata

References:

* [socrata-py package](https://github.com/socrata/socrata-py)
* [sodapy package](https://pypi.org/project/sodapy/)
* [Socrata Data Management experience](https://support.socrata.com/hc/en-us/articles/115016067067-Using-the-Socrata-Data-Management-Experience)
* [sodapy usage examples (GitHub README)](https://github.com/xmunoz/sodapy)

In [1]:
import os
import pandas as pd
import sodapy

import neighborhood_utils
import utils
import credentials

# S3 location that the parquet/CSV inputs are read from.
BUCKET_NAME = "public-health-dashboard"
S3_FILE_PATH = f"s3://{BUCKET_NAME}/jhu_covid19/"

# Shared Socrata client for data.lacity.org. It authenticates with the
# username/password kept in the local `credentials` module; no app token is
# supplied.
client = sodapy.Socrata(
    "data.lacity.org",
    app_token=None,
    username=credentials.SOCRATA_USERNAME,
    password=credentials.SOCRATA_PASSWORD,
)



In [3]:
def extra_processing(csv_file):
    """
    Build the dataframe to upload for one of the known Socrata CSV files.

    Each supported file name has its own preparation steps:

    * ``us-county-time-series.csv``: read the matching parquet file from
      ``S3_FILE_PATH`` and reduce ``date`` to plain dates (no time part).
    * ``la-county-neighborhood-time-series.csv``: take the output of
      ``neighborhood_utils.clean_data()`` and keep a fixed list of columns.
    * ``vaccinations-by-county.csv``: read ``utils.COUNTY_VACCINE_URL``,
      inner-merge the county population crosswalk on ``county``, and add a
      ``date`` column parsed from ``administered_date``.
    * ``vaccinations-by-demographics-county.csv``: read
      ``utils.COUNTY_DEMOGRAPHICS_URL`` and add a ``date`` column parsed
      from ``administered_date``.

    Parameters
    ----------
    csv_file : str
        Exact name of the CSV file being prepared.

    Returns
    -------
    pandas.DataFrame
        The prepared table.

    Raises
    ------
    ValueError
        If ``csv_file`` is not one of the file names listed above.
    """
    if csv_file == "us-county-time-series.csv":
        parquet_file = csv_file.replace('.csv', '.parquet')
        df = pd.read_parquet(f"{S3_FILE_PATH}{parquet_file}")
        df = df.assign(
            date = pd.to_datetime(df.date).dt.date,
        )

    elif csv_file == "la-county-neighborhood-time-series.csv":
        df = neighborhood_utils.clean_data()

        keep_cols = [
            "aggregate_region", "population", "date", "date2",
            "cases", "deaths", "new_cases", "new_deaths",
            "cases_per100k", "deaths_per100k",
            "cases_avg7", "deaths_avg7",
            "new_cases_avg7", "new_deaths_avg7",
            "cases_per100k_avg7", "deaths_per100k_avg7",
        ]

        df = df[keep_cols]

    elif csv_file == "vaccinations-by-county.csv":
        df = pd.read_csv(utils.COUNTY_VACCINE_URL)

        population = pd.read_parquet(f"{S3_FILE_PATH}ca_county_pop_crosswalk.parquet")

        # Inner merge: rows whose county is missing from the crosswalk are
        # dropped. validate="m:1" makes pandas raise if the crosswalk has
        # more than one row per county.
        df = pd.merge(df, population,
                      on = "county",
                      how = "inner", validate = "m:1")

        df = df.assign(
            date = pd.to_datetime(df.administered_date),
        )

    elif csv_file == "vaccinations-by-demographics-county.csv":
        df = pd.read_csv(utils.COUNTY_DEMOGRAPHICS_URL)
        df = df.assign(
            date = pd.to_datetime(df.administered_date),
        )

    else:
        # Without this branch an unknown name reached `return df` with `df`
        # never assigned and failed with an unhelpful UnboundLocalError.
        raise ValueError(f"No processing defined for csv_file: {csv_file!r}")

    return df


def overwrite_socrata_table(csv_file, socrata_dataset_id, NUM_MINUTES=20):
    """
    Replace the contents of a Socrata dataset with a local CSV file.

    The local CSV is deleted after the upload call returns.

    Parameters
    ----------
    csv_file : str
        Path to the local CSV file to upload.
    socrata_dataset_id : str
        Identifier of the Socrata dataset to replace.
    NUM_MINUTES : int, default 20
        Request timeout, in minutes, set on the shared ``client`` before
        uploading.
    """
    client.timeout = (NUM_MINUTES * 60)

    # Use a context manager so the file handle is closed before the file is
    # deleted (and is closed even if the upload raises).
    with open(csv_file) as data:
        client.replace(socrata_dataset_id, data)

    print(f"{csv_file} updated")
    os.remove(csv_file)
    
    
def upsert_socrata_rows(csv_file, socrata_dataset_id, NUM_MINUTES=5):
    """
    Upsert only the rows of a local CSV that are newer than the Socrata table.

    The latest ``date`` already present in the Socrata dataset is looked up,
    the local CSV is rewritten so it holds only rows with a later ``date``,
    and that trimmed file is upserted. The local CSV is deleted after the
    upload call returns.

    Parameters
    ----------
    csv_file : str
        Path to the local CSV file; it must contain a ``date`` column.
    socrata_dataset_id : str
        Identifier of the Socrata dataset to upsert into.
    NUM_MINUTES : int, default 5
        Request timeout, in minutes, set on the shared ``client`` before
        uploading.
    """
    # Ask Socrata for the latest date with an aggregate query. A plain
    # client.get() only returns the first page of rows (1,000 unless a limit
    # is passed), so taking the max of that page gives the wrong cut-off on
    # any larger table.
    result = client.get(socrata_dataset_id, select="max(date) AS max_date")
    latest = result[0].get("max_date") if result else None

    df = pd.read_csv(csv_file)
    df = df.assign(
        date = pd.to_datetime(df.date)
    )

    # An empty Socrata table has no max date; in that case every row is new.
    if latest is not None:
        df = df[df.date > pd.to_datetime(latest)]

    # Overwrite the local CSV with just the rows we need to upsert
    df.to_csv(csv_file, index=False)

    client.timeout = (NUM_MINUTES * 60)

    # Close the handle before deleting the file, even if the upload raises.
    with open(csv_file) as data:
        client.upsert(socrata_dataset_id, data)

    print(f"{csv_file} updated")
    os.remove(csv_file)

In [4]:
# Maps each Socrata dataset id to the CSV file that feeds it.
DATAFRAME_DICT = {
    "jsff-uc6b": "us-county-time-series.csv",
    "fvye-93wd": "la-county-neighborhood-time-series.csv",
    "rpp7-mevy": "vaccinations-by-county.csv",
    "iv7a-6rrq": "vaccinations-by-demographics-county.csv",
}

for socrata_id, filename in DATAFRAME_DICT.items():
    # Every dataset so far goes through extra_processing so the columns
    # match the Socrata table schema; anything else is read from S3 as-is.
    if any(tag in filename
           for tag in ("us-county", "neighborhood", "vaccinations")):
        df = extra_processing(filename)
    else:
        df = pd.read_csv(f"{S3_FILE_PATH}{filename}")

    # Dump the full table to a local CSV; the upload step reads this file.
    df.to_csv(filename, index=False)

    # Incremental upload. For a full replace, call
    # overwrite_socrata_table(filename, socrata_dataset_id=socrata_id) instead.
    upsert_socrata_rows(filename, socrata_dataset_id=socrata_id)
    
    
#s3_to_socrata("la-county-neighborhood-time-series.csv", 
#              socrata_dataset_id = "wrj6-vjm7")

us-county-time-series.csv updated
la-county-neighborhood-time-series.csv updated
vaccinations-by-county.csv updated
vaccinations-by-demographics-county.csv updated


In [33]:
# Disabled scratch cell: everything below is inside a string literal, so none
# of it executes. It walks through the same steps as upsert_socrata_rows by
# hand, against dataset "3cst-kzzr", using only the Los Angeles rows of a
# local test CSV.
# NOTE(review): the second to_csv call writes the unfiltered frame to the same
# path, overwriting the date-filtered file written on the line before it.
'''
la = pd.read_csv("../notebooks/test_data/us-county-time-series.csv")
la = la[la.county=="Los Angeles"]
la[la.date < "2021-08-23"].to_csv("../notebooks/test_data/la.csv", index=False)
la.to_csv("../notebooks/test_data/la.csv", index=False)

filename = "../notebooks/test_data/la.csv"
socrata_dataset_id = "3cst-kzzr"
existing_table = client.get(socrata_dataset_id)
existing_table = pd.DataFrame(existing_table)
max_date = pd.to_datetime(existing_table.date.max())

t2 = pd.read_csv(f"{filename}")
t2 = t2.assign(
    date = pd.to_datetime(t2.date)
)

t2[t2.date > max_date].to_csv(filename, index=False)
data = open(f"{filename}")
    
client.timeout = (5 * 60)
client.upsert(socrata_dataset_id, data)

'''

{'Errors': 0, 'Rows Created': 2, 'Rows Deleted': 0, 'Rows Updated': 0}