# NYCHealth Coronavirus (COVID-19) data
### Original datasource: https://github.com/nychealth/coronavirus-data

In [None]:
import pandas as pd
import numpy as np
import os
import requests
import json
import re
import csv
from datetime import datetime
import pycountry

In [None]:
# Coordinates of the upstream GitHub repository and the base URLs used to reach it.
REPO_OWNER = "nychealth"
REPO_NAME = "coronavirus-data"
API_ENDPOINT = "https://api.github.com/repos"  # base URL of the repository REST API
RAW_DATA_ENDPOINT = "https://raw.githubusercontent.com/"  # raw file host (note the trailing slash)

In [None]:
# papermill parameters
output_folder = "../output/"
GIT_USER = os.getenv("GIT_USER") 
GIT_TOKEN = os.getenv("GIT_TOKEN")

In [None]:
# Fail fast when the GitHub credentials are missing or empty. An explicit raise
# (rather than a bare `assert`) keeps the check active under `python -O` and
# tells the operator what to fix; the exception type is still AssertionError.
if not (GIT_USER and GIT_TOKEN):
    raise AssertionError("GIT_USER and GIT_TOKEN environment variables must both be set")

In [None]:
# One authenticated HTTP session, reused for every request made below.
session = requests.Session()
session.auth = (GIT_USER, GIT_TOKEN)  # HTTP basic auth: (user name, token)

In [None]:
# Ask the GitHub API for the repository's commit list.
# NOTE(review): no pagination parameters are sent, so only the first page of
# commits the API returns is processed — confirm that this is sufficient.
commits_url = f'{API_ENDPOINT}/{REPO_OWNER}/{REPO_NAME}/commits'
print(commits_url)
response = session.get(commits_url)
# Stop on an HTTP error (bad credentials, rate limiting, ...) instead of letting
# the next step fail with an unrelated error while indexing the error payload.
response.raise_for_status()

In [None]:
# Pair each commit's author date with its SHA: [(date, sha), ...].
commit_shas = [
    (entry['commit']['author']['date'], entry['sha'])
    for entry in json.loads(response.text)
]

In [None]:
# Collect the rows of tests-by-zcta.csv as published at every listed commit and
# tag each row with that commit's date. `df` is a plain list of dicts here; it
# is converted to a DataFrame in a later step.
raw_base = RAW_DATA_ENDPOINT.rstrip('/')  # avoid a '//' when joining the path segments
df = []
for (date, commit) in commit_shas:
    response = session.get(f'{raw_base}/{REPO_OWNER}/{REPO_NAME}/{commit}/tests-by-zcta.csv')
    if response.status_code != 200:
        # Any non-200 answer is skipped (e.g. the file may not exist at that commit).
        continue

    for row in csv.DictReader(response.text.split('\n')):
        # Normalise the alternative column names to the ones used downstream.
        if "modzcta_cum_perc_pos" in row:
            row["zcta_cum.perc_pos"] = row.pop("modzcta_cum_perc_pos")
        if "modzcta" in row:
            row["MODZCTA"] = row.pop("modzcta")
        row['Date'] = date  # add commit_date as field: Date
        df.append(row)


In [None]:
# Turn the collected list of row dicts into a DataFrame.
df = pd.DataFrame(data=df)

In [None]:
# Cells whose whole value is "." are rewritten to the string "NA"
# (literal match, not a regular expression).
df = df.replace(to_replace=".", value="NA", regex=False)
# Remove leading/trailing whitespace from the Total column.
total_column = df["Total"]
df["Total"] = total_column.str.strip()

In [None]:
# Parse the commit timestamps (ISO 8601 with a literal trailing "Z").
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%dT%H:%M:%SZ")  # parse date
# Rows without a ZCTA get the sentinel '99999' so they can be told apart below.
df['MODZCTA'] = df['MODZCTA'].replace(['NA'], '99999')  # parse NA
# Remove literal dots from the ZCTA strings. The pattern is a raw string so that
# `\.` reaches the regex engine without relying on an invalid string escape.
df['MODZCTA'] = df["MODZCTA"].replace(r"\.", value="", regex=True)

In [None]:
# Census ZCTA -> county relationship file, indexed by ZCTA. Only the first
# listed county is kept for each ZCTA.
zcta_to_fips = pd.read_csv('https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt').set_index('ZCTA5')
zcta_to_fips = zcta_to_fips[~zcta_to_fips.index.duplicated(keep='first')]
df['FIPS'] = ''

# Rows carrying the '99999' sentinel have no ZCTA and keep an empty FIPS.
has_zcta = df['MODZCTA'] != '99999'
zcta_keys = [int(zcta) for zcta in df.loc[has_zcta, 'MODZCTA']]
# Assign through a single df.loc call: chained `df['FIPS'].loc[...] = ...` writes
# to an intermediate object and is not guaranteed to update `df`.
df.loc[has_zcta, 'FIPS'] = zcta_to_fips.loc[zcta_keys, 'GEOID'].tolist()
# Raw-string pattern: strip literal dots from string FIPS values.
df['FIPS'] = df["FIPS"].replace(r"\.", value="", regex=True)


In [None]:
# Every row in this dataset is tagged with the same country.
df['Country_Region'] = "United States"
df['ISO3166_1'] = "US"
# County FIPS master table, indexed by FIPS code; used to look up the state abbreviation.
fips_to_state = pd.read_csv('https://raw.githubusercontent.com/kjhealy/fips-codes/master/county_fips_master.csv', encoding ="ISO-8859-1").set_index('fips')
df['ISO3166_2'] = ''

# Rows without a FIPS code keep an empty ISO3166_2.
has_fips = df['FIPS'] != ''
fips_keys = df.loc[has_fips, 'FIPS'].tolist()
# Assign through a single df.loc call: chained `df[col].loc[...] = ...` writes
# to an intermediate object and is not guaranteed to update `df`.
df.loc[has_fips, 'ISO3166_2'] = fips_to_state.loc[fips_keys, 'state_abbr'].tolist()

In [None]:
# The '99999' sentinel has served its purpose; the output shows an empty ZCTA instead.
df['MODZCTA'] = df['MODZCTA'].replace(to_replace='99999', value='')
# 'NA' markers in the percentage column become real NaN values.
df['zcta_cum.perc_pos'] = df['zcta_cum.perc_pos'].replace(to_replace='NA', value=np.nan)

In [None]:
# Turn 'NA' markers in Positive into NaN. A single df.loc call is used because
# chained `df["Positive"].loc[...] = ...` writes to an intermediate object and
# is not guaranteed to update `df`.
df.loc[df["Positive"] == "NA", "Positive"] = np.nan

In [None]:
# Cast the numeric columns to float32; FIPS stays a generic object column.
float_columns = ['Positive', 'Total', 'zcta_cum.perc_pos']
column_dtypes = {name: 'float32' for name in float_columns}
column_dtypes['FIPS'] = 'object'
df = df.astype(column_dtypes)

In [None]:
# Display the column dtypes after the cast (bare expression, shown as the cell's output).
df.dtypes

In [None]:
# Give the cumulative percent-positive column the name used in the exported CSV.
df = df.rename(columns={"zcta_cum.perc_pos": "ZTCA_CUM_PERC_POS"})

In [None]:
from datetime import timezone

# Stamp every row with the current UTC time. datetime.utcnow() is deprecated
# (Python 3.12+), so take an aware UTC "now" and drop tzinfo to keep the same
# naive-UTC value as before.
df["Last_Updated_Date"] = datetime.now(timezone.utc).replace(tzinfo=None)
# Boolean flag: True for rows whose Date equals the most recent Date in the frame.
df['Last_Reported_Date'] = df['Date'] == df['Date'].max()

In [None]:
# Write the selected columns, in this order, to NYC_HEALTH_TESTS.csv inside
# output_folder. os.path.join is used so the path is valid whether or not the
# (papermill-overridable) output_folder ends with a path separator.
df.to_csv(os.path.join(output_folder, "NYC_HEALTH_TESTS.csv"), index=False, columns=[
    "MODZCTA",
    "Positive",
    "Total",
    "ZTCA_CUM_PERC_POS",
    "Date",
    "FIPS",
    "Country_Region",
    "ISO3166_1",
    "ISO3166_2",
    "Last_Updated_Date",
    "Last_Reported_Date",
])