## Data Preprocessing for COVID 19 DSPG activity

In [2]:
%config InlineBackend.figure_format='retina'

import calendar
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams, ticker
from scipy.optimize import curve_fit

from sklearn.neural_network import MLPClassifier

In [3]:
census_raw_url = "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
census_raw_df  = pd.read_csv(census_raw_url, encoding="ISO-8859-1") 
census_raw_df.head()
census_county_df = census_raw_df[census_raw_df["SUMLEV"] == 50]
census_df = census_county_df[['STNAME', 'CTYNAME', 'STATE', 'COUNTY', 'POPESTIMATE2019']].copy()
census_df['COUNTYFP'] = ((census_df['STATE'] * 1000) + (census_df['COUNTY']))
census_df.head()

Unnamed: 0,STNAME,CTYNAME,STATE,COUNTY,POPESTIMATE2019,COUNTYFP
1,Alabama,Autauga County,1,1,55869,1001
2,Alabama,Baldwin County,1,3,223234,1003
3,Alabama,Barbour County,1,5,24686,1005
4,Alabama,Bibb County,1,7,22394,1007
5,Alabama,Blount County,1,9,57826,1009


In [4]:
mask_use_url = 'https://github.com/nytimes/covid-19-data/raw/master/mask-use/mask-use-by-county.csv'
mask_use_raw_df = pd.read_csv(mask_use_url, encoding="ISO-8859-1")
mask_use_raw_df.head()

Unnamed: 0,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.12,0.201,0.491
3,1007,0.02,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.18,0.194,0.459


In [5]:
int_table_01 = pd.merge(census_df, mask_use_raw_df, on="COUNTYFP", how="left")
int_table_01.head()

Unnamed: 0,STNAME,CTYNAME,STATE,COUNTY,POPESTIMATE2019,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS
0,Alabama,Autauga County,1,1,55869,1001,0.053,0.074,0.134,0.295,0.444
1,Alabama,Baldwin County,1,3,223234,1003,0.083,0.059,0.098,0.323,0.436
2,Alabama,Barbour County,1,5,24686,1005,0.067,0.121,0.12,0.201,0.491
3,Alabama,Bibb County,1,7,22394,1007,0.02,0.034,0.096,0.278,0.572
4,Alabama,Blount County,1,9,57826,1009,0.053,0.114,0.18,0.194,0.459


In [6]:
county_covid_case_url = 'https://github.com/nytimes/covid-19-data/raw/master/us-counties.csv'
county_covid_case_raw_df = pd.read_csv(county_covid_case_url, encoding="ISO-8859-1", parse_dates=['date'])

gb_month_year = county_covid_case_raw_df.groupby(
    [county_covid_case_raw_df['date'].dt.year.rename('year'), 
     county_covid_case_raw_df['date'].dt.month.rename('month'),
     'fips', 'county', 'state', 
    ])[['deaths', 'cases']].agg('sum').copy()
gb_month_year.reset_index(inplace=True)
gb_month_year['fips'] = gb_month_year['fips'].astype('int')

county_covid_2020_df = gb_month_year[gb_month_year["year"] == 2020].copy()

In [7]:
eoy_df = county_covid_2020_df[county_covid_2020_df["month"] == 12].copy()
eoy_df = eoy_df.rename(columns={"deaths": "deaths_dec", "cases": "cases_dec"}).copy()
for i in np.arange(1, 12):
    this_month = county_covid_2020_df[county_covid_2020_df["month"] == i].copy()
    this_month = this_month.rename(columns={"deaths": f"deaths_{calendar.month_abbr[i].lower()}", "cases": f"cases_{calendar.month_abbr[i].lower()}"}).copy()
    eoy_df = pd.merge(eoy_df, this_month[['fips', f"cases_{calendar.month_abbr[i].lower()}", f"deaths_{calendar.month_abbr[i].lower()}"]], on="fips", how="left", ).copy()
eoy_df['COUNTYFP'] = eoy_df['fips'] 

In [8]:
eoy_df.head()
eoy_df.fillna(0, inplace=True)

In [9]:
export = pd.merge(int_table_01, eoy_df, on="COUNTYFP", how="inner", ).copy()
export.head()

Unnamed: 0,STNAME,CTYNAME,STATE,COUNTY,POPESTIMATE2019,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,...,cases_jul,deaths_jul,cases_aug,deaths_aug,cases_sep,deaths_sep,cases_oct,deaths_oct,cases_nov,deaths_nov
0,Alabama,Autauga County,1,1,55869,1001,0.053,0.074,0.134,0.295,...,24241.0,551.0,38807.0,699.0,48577.0,733.0,60844.0,893.0,73997.0,1072.0
1,Alabama,Baldwin County,1,3,223234,1003,0.083,0.059,0.098,0.323,...,54475.0,440.0,120283.0,981.0,150651.0,1425.0,197611.0,1996.0,233249.0,2550.0
2,Alabama,Barbour County,1,5,24686,1005,0.067,0.121,0.12,0.201,...,14161.0,92.0,20859.0,189.0,24539.0,210.0,30228.0,265.0,33813.0,285.0
3,Alabama,Bibb County,1,7,22394,1007,0.02,0.034,0.096,0.278,...,7961.0,49.0,14745.0,168.0,18402.0,281.0,23833.0,393.0,30545.0,498.0
4,Alabama,Blount County,1,9,57826,1009,0.053,0.114,0.18,0.194,...,13625.0,37.0,31781.0,166.0,44569.0,397.0,56017.0,639.0,76472.0,981.0


In [10]:
export_csv = export[[
       'COUNTYFP','STATE', 'POPESTIMATE2019', 
       'NEVER', 'RARELY', 'SOMETIMES', 'FREQUENTLY', 'ALWAYS',
       'deaths_dec', 'cases_dec', 'cases_jan',
       'deaths_jan', 'cases_feb', 'deaths_feb', 'cases_mar', 'deaths_mar',
       'cases_apr', 'deaths_apr', 'cases_may', 'deaths_may', 'cases_jun',
       'deaths_jun', 'cases_jul', 'deaths_jul', 'cases_aug', 'deaths_aug',
       'cases_sep', 'deaths_sep', 'cases_oct', 'deaths_oct', 'cases_nov',
       'deaths_nov']]
export_csv.to_csv("covid_19_activity.csv", index=False)
export_csv.head()

Unnamed: 0,COUNTYFP,STATE,POPESTIMATE2019,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,deaths_dec,cases_dec,...,cases_jul,deaths_jul,cases_aug,deaths_aug,cases_sep,deaths_sep,cases_oct,deaths_oct,cases_nov,deaths_nov
0,1001,1,55869,0.053,0.074,0.134,0.295,0.444,1355.0,108652,...,24241.0,551.0,38807.0,699.0,48577.0,733.0,60844.0,893.0,73997.0,1072.0
1,1003,1,223234,0.083,0.059,0.098,0.323,0.436,4502.0,348455,...,54475.0,440.0,120283.0,981.0,150651.0,1425.0,197611.0,1996.0,233249.0,2550.0
2,1005,1,24686,0.067,0.121,0.12,0.201,0.491,931.0,40753,...,14161.0,92.0,20859.0,189.0,24539.0,210.0,30228.0,265.0,33813.0,285.0
3,1007,1,22394,0.02,0.034,0.096,0.278,0.572,1244.0,47009,...,7961.0,49.0,14745.0,168.0,18402.0,281.0,23833.0,393.0,30545.0,498.0
4,1009,1,57826,0.053,0.114,0.18,0.194,0.459,1590.0,121270,...,13625.0,37.0,31781.0,166.0,44569.0,397.0,56017.0,639.0,76472.0,981.0
