## Upload COVID-19 data to Socrata

References:

* [socrata-py package](https://github.com/socrata/socrata-py)
* [sodapy package](https://pypi.org/project/sodapy/)
* [Socrata Data Management experience](https://support.socrata.com/hc/en-us/articles/115016067067-Using-the-Socrata-Data-Management-Experience)
* [sodapy usage examples](https://github.com/xmunoz/sodapy)

In [1]:
import pandas as pd

from processing_utils import default_parameters
from processing_utils import neighborhood_utils
from processing_utils import utils
from processing_utils import socrata_utils

import credentials

# Path prefix that the loaders below prepend to every parquet/CSV file name they read.
S3_FILE_PATH = default_parameters.S3_FILE_PATH


# Socrata login taken from the local `credentials` module; in this notebook it is
# only referenced by the (currently commented-out) socrata_utils upload calls.
SOCRATA_USER = credentials.SOCRATA_USERNAME
SOCRATA_PASSWORD = credentials.SOCRATA_PASSWORD

In [2]:
def us_county(csv_file, county_list=None):
    """Load the US county time series and keep only the requested counties.

    The parquet twin of `csv_file` (same name, `.parquet` extension) is read
    from `S3_FILE_PATH`.

    Parameters
    ----------
    csv_file : str
        CSV file name; only used to derive the parquet file name.
    county_list : list of str, optional
        Values of the `county` column to keep. Defaults to ["Los Angeles"].

    Returns
    -------
    pandas.DataFrame
        Rows for the requested counties, with `date` converted to
        `datetime.date`, `incident_rate` rounded to 1 decimal place, and
        only dates before `default_parameters.today_date`.
    """
    # Avoid a mutable default argument; None stands for the historical default.
    if county_list is None:
        county_list = ["Los Angeles"]

    parquet_file = csv_file.replace('.csv', '.parquet')

    df = pd.read_parquet(f"{S3_FILE_PATH}{parquet_file}")

    # The new columns are computed with callables so they are derived from the
    # filtered, re-indexed frame. Passing Series built from the unfiltered
    # frame would be aligned on the *new* 0..n-1 index and silently pick up
    # values from unrelated rows.
    df = (df[df.county.isin(county_list)]
          .reset_index(drop=True)
          .assign(
            date = lambda x: pd.to_datetime(x.date).dt.date,
            incident_rate = lambda x: x.incident_rate.round(1),
        )
    )

    # Subsetting by date needs to happen after we manipulate date column
    # Otherwise, datetime.date clashes with pandas.timestamp
    df = df[df.date < default_parameters.today_date]

    return df


def la_county_neighborhood(csv_file):
    """Return the cleaned LA County neighborhood table with normalized dtypes.

    Data comes from `neighborhood_utils.clean_data()`. Only the columns in
    `keep_cols` are retained; count columns are cast to int and every other
    numeric column is cast to float and rounded to 1 decimal place.

    Parameters
    ----------
    csv_file : str
        Not used in the body; present so all loaders share one call signature.

    Returns
    -------
    pandas.DataFrame
    """
    df = neighborhood_utils.clean_data()

    keep_cols = [
        "aggregate_region", "population", "date", "date2",
        "cases", "deaths", "new_cases", "new_deaths",
        "cases_per100k", "deaths_per100k",
        "cases_avg7", "deaths_avg7",
        "new_cases_avg7", "new_deaths_avg7",
        "cases_per100k_avg7", "deaths_per100k_avg7",
    ]

    # Take an explicit copy: the column assignments below would otherwise be
    # made on a selection of another frame (SettingWithCopyWarning).
    df = df[keep_cols].copy()

    # Identifier / date columns are left untouched
    id_cols = ["aggregate_region", "date", "date2"]

    # Count columns are stored as integers
    integrify_me = ["cases", "deaths",
                    "new_cases", "new_deaths", "population"]

    # Everything else (rates and rolling averages) is rounded to 1 decimal place
    round_me = [c for c in df.columns
                if c not in id_cols and c not in integrify_me]

    df[round_me] = df[round_me].astype(float).round(1)
    df[integrify_me] = df[integrify_me].astype(int)

    return df


def ca_vaccinations(csv_file):
    """Build the county vaccination table joined to the county population crosswalk.

    Reads the vaccination CSV from `utils.COUNTY_VACCINE_URL`, inner-joins it
    to `ca_county_pop_crosswalk.parquet` on `county` (many vaccine rows to one
    crosswalk row), and replaces `administered_date` with a parsed `date`
    column. `csv_file` is not used in the body.
    """
    vaccines = pd.read_csv(utils.COUNTY_VACCINE_URL)
    crosswalk = pd.read_parquet(f"{S3_FILE_PATH}ca_county_pop_crosswalk.parquet")

    # validate="m:1" makes pandas raise if a county appears twice in the crosswalk
    merged = vaccines.merge(
        crosswalk,
        on="county",
        how="inner",
        validate="m:1",
    )

    # Swap the raw date column for a parsed datetime column named `date`
    merged["date"] = pd.to_datetime(merged["administered_date"])
    return merged.drop(columns=["administered_date"])

def ca_vaccinations_demographics(csv_file):
    """Build the county vaccination-by-demographics table.

    Reads the CSV from `utils.COUNTY_DEMOGRAPHICS_URL`, replaces
    `administered_date` with a parsed `date` column, and casts every column
    that is not a known label/date column to the nullable "Int64" dtype.
    `csv_file` is not used in the body.
    """
    raw = pd.read_csv(utils.COUNTY_DEMOGRAPHICS_URL)

    # Swap the raw date column for a parsed datetime column named `date`
    raw["date"] = pd.to_datetime(raw["administered_date"])
    table = raw.drop(columns=["administered_date"])

    # Columns that must keep their current dtype
    non_integer_cols = {
        "county", "county_type",
        "demographic_category", "demographic_value",
        "date", "suppress_data",
    }

    # All remaining columns become nullable integers
    integer_cols = [col for col in table.columns if col not in non_integer_cols]
    table[integer_cols] = table[integer_cols].astype("Int64")

    return table

def la_county_testing(csv_file):
    """Load the LA County testing table, truncated at two days ago.

    Reads `county-city-testing.parquet` from `S3_FILE_PATH`, converts `date`
    to `datetime.date`, and keeps rows dated on or before
    `default_parameters.two_days_ago`. `csv_file` is not used in the body.
    """
    testing = pd.read_parquet(f"{S3_FILE_PATH}county-city-testing.parquet")
    testing["date"] = pd.to_datetime(testing["date"]).dt.date

    # The most recent row can be all zeros (even when it is yesterday's),
    # so the series is cut off at two days ago.
    is_settled = testing["date"] <= default_parameters.two_days_ago
    return testing[is_settled]

In [3]:
def extra_processing(csv_file):
    """Return the DataFrame for `csv_file`, cleaned by its dedicated loader.

    File names with a dedicated loader are routed to it; any other file name
    is read unchanged with `pd.read_csv` from `S3_FILE_PATH`.

    Parameters
    ----------
    csv_file : str
        Name of the CSV file to produce.

    Returns
    -------
    pandas.DataFrame
    """
    if csv_file == "us-county-time-series.csv":
        df = us_county(csv_file, county_list=["Los Angeles"])
    elif csv_file == "la-county-neighborhood-time-series.csv":
        df = la_county_neighborhood(csv_file)
    elif csv_file == "vaccinations-by-county.csv":
        df = ca_vaccinations(csv_file)
    elif csv_file == "vaccinations-by-demographics-county.csv":
        df = ca_vaccinations_demographics(csv_file)
    elif csv_file == "la-county-testing-time-series.csv":
        df = la_county_testing(csv_file)
    else:
        # No dedicated loader: read the file as-is
        df = pd.read_csv(f"{S3_FILE_PATH}{csv_file}")

    return df

In [4]:
# Maps each Socrata dataset id to the name of the CSV file that feeds it
DATAFRAME_DICT = {
    "jsff-uc6b": "us-county-time-series.csv",
    "fvye-93wd": "la-county-neighborhood-time-series.csv",
    "rpp7-mevy": "vaccinations-by-county.csv",
    "iv7a-6rrq": "vaccinations-by-demographics-county.csv",
    "w9vh-pj9e": "la-county-testing-time-series.csv",
}

for socrata_id, filename in DATAFRAME_DICT.items():
    # Every dataset currently goes through extra_processing so that the
    # resulting table matches the Socrata table schema
    df = extra_processing(filename)

    # Save the complete table as a local CSV named after the dataset file
    df.to_csv(filename, index=False)
    print(f"{filename} produced")

    # Upload step (currently disabled): overwrite the table or upsert rows
    #socrata_utils.overwrite_socrata_table(SOCRATA_USER, SOCRATA_PASSWORD, 
    #                                      filename, socrata_dataset_id = socrata_id)
    #socrata_utils.upsert_socrata_rows(SOCRATA_USER, SOCRATA_PASSWORD, 
    #                                   filename, socrata_dataset_id = socrata_id)
    
    
#s3_to_socrata("la-county-neighborhood-time-series.csv", 
#              socrata_dataset_id = "wrj6-vjm7")



us-county-time-series.csv csv produced
us-county-time-series.csv updated




la-county-neighborhood-time-series.csv csv produced
la-county-neighborhood-time-series.csv updated




vaccinations-by-county.csv csv produced
vaccinations-by-county.csv updated




vaccinations-by-demographics-county.csv csv produced
vaccinations-by-demographics-county.csv updated




la-county-testing-time-series.csv csv produced


HTTPError: 404 Client Error: Not Found

In [None]:
# Disabled scratch cell: everything below is a single string literal, so none of it runs.
# NOTE(review): `client` is not defined anywhere in the visible notebook -- presumably a
# sodapy Socrata client; confirm and create it before re-enabling this code.
'''
la = pd.read_csv("../notebooks/test_data/us-county-time-series.csv")
la = la[la.county=="Los Angeles"]
la[la.date < "2021-08-23"].to_csv("../notebooks/test_data/la.csv", index=False)
la.to_csv("../notebooks/test_data/la.csv", index=False)

filename = "../notebooks/test_data/la.csv"
socrata_dataset_id = "3cst-kzzr"
existing_table = client.get(socrata_dataset_id)
existing_table = pd.DataFrame(existing_table)
max_date = pd.to_datetime(existing_table.date.max())

t2 = pd.read_csv(f"{filename}")
t2 = t2.assign(
    date = pd.to_datetime(t2.date)
)

t2[t2.date > max_date].to_csv(filename, index=False)
data = open(f"{filename}")
    
client.timeout = (5 * 60)
client.upsert(socrata_dataset_id, data)

'''