In [1]:
import pandas as pd

# CSV that clean_data() downloads with pd.read_csv.
URL = "https://raw.githubusercontent.com/ANRGUSC/lacounty_covid19_data/master/data/Covid-19.csv"
# S3 prefix under which clean_data() writes its parquet output.
S3_FILE_PATH = "s3://public-health-dashboard/jhu_covid19/"

def _rolling_mean_within(df, group_cols, sort_cols, column, window=7):
    # Rolling mean of `column` computed separately inside each group, with rows
    # ordered by `sort_cols`; the result keeps df's index so `assign` aligns it.
    return (
        df.sort_values(sort_cols)
        .groupby(group_cols)[column]
        .transform(lambda series: series.rolling(window=window).mean())
    )


def clean_data():
    """Download the neighborhood case counts, derive time-series columns,
    write the result to S3 as parquet, and return it.

    Reads the CSV at ``URL``. The columns ``Time Stamp``, ``Number of cases``
    and ``Region`` are required; every other column passes through unchanged.

    Columns added (``Time Stamp`` and ``Number of cases`` are dropped):
        date            calendar date of ``Time Stamp`` (``datetime.date``)
        date2           ``Time Stamp`` parsed as a pandas datetime
        cases           ``Number of cases``, missing values as 0, cast to int
        new_cases       change in ``cases`` from the same region's previous row
        cases_avg7      rolling mean of ``cases`` over the region's last 7 rows
        new_cases_avg7  rolling mean of ``new_cases`` over the region's last
                        7 rows
        ptile25, ptile50, ptile75
                        quartiles of ``cases`` over all rows sharing a ``date``

    The rolling means are NaN until a region has 7 non-missing values in the
    window, so ``new_cases_avg7`` starts one row later than ``cases_avg7``.

    Returns:
        pandas.DataFrame: one row per input row, ordered by date then Region.

    Side effects:
        Writes ``lacounty-neighborhood-time-series.parquet`` under
        ``S3_FILE_PATH``.
    """
    raw = pd.read_csv(URL)
    # Parse the timestamps once and reuse them for both date columns.
    timestamps = pd.to_datetime(raw["Time Stamp"])

    df = (
        raw.assign(
            date=timestamps.dt.date,
            date2=timestamps,
            cases=raw["Number of cases"].fillna(0).astype(int),
        )
        .drop(columns=["Time Stamp", "Number of cases"])
        .sort_values(["date", "Region"])
        .reset_index(drop=True)
    )

    # Derive columns
    group_cols = ["Region"]
    sort_cols = ["Region", "date"]

    # diff() runs on a Region-sorted copy; assign re-aligns it on the index.
    df = df.assign(
        new_cases=df.sort_values(sort_cols)
        .groupby(group_cols)["cases"]
        .diff(periods=1),
    )

    # The frame is ordered by (date, Region), so a plain df.cases.rolling(7)
    # would average seven *different regions*. Roll within each region instead.
    df = df.assign(
        cases_avg7=_rolling_mean_within(df, group_cols, sort_cols, "cases"),
        new_cases_avg7=_rolling_mean_within(df, group_cols, sort_cols, "new_cases"),
    )

    # Cross-region distribution of case counts for each date.
    quartiles = (
        df.groupby("date")["cases"]
        .describe()[["25%", "50%", "75%"]]
        .rename(columns={"25%": "ptile25", "50%": "ptile50", "75%": "ptile75"})
        .reset_index()
    )

    # validate="m:1" raises if the quartile table ever has duplicate dates.
    merged = pd.merge(df, quartiles, on="date", how="left", validate="m:1")

    merged.to_parquet(f"{S3_FILE_PATH}lacounty-neighborhood-time-series.parquet")

    return merged

In [2]:
# Run the pipeline. Besides returning the frame, clean_data() downloads the CSV
# and writes the parquet file to S3 on every call.
df = clean_data()

In [3]:
# Preview the first rows of the frame returned by clean_data().
df.head()

Unnamed: 0,Region,Latitude,Longitude,date,date2,cases,new_cases,cases_avg7,new_cases_avg7,ptile25,ptile50,ptile75
0,Alhambra,34.093042,-118.12706,2020-03-16,2020-03-16,2,,,,1.0,2.0,2.0
1,Arcadia,34.136208,-118.04015,2020-03-16,2020-03-16,1,,,,1.0,2.0,2.0
2,Beverly Hills,34.06965,-118.396306,2020-03-16,2020-03-16,1,,,,1.0,2.0,2.0
3,Boyle Heights,34.043689,-118.209768,2020-03-16,2020-03-16,5,,,,1.0,2.0,2.0
4,Carson,33.832204,-118.251755,2020-03-16,2020-03-16,1,,,,1.0,2.0,2.0
