In [20]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

#Import Datasets
# All four paths are relative, so the CSV files must sit in the notebook's working directory.
# NOTE(review): 'abridged_couties.csv' is spelled this way in the code; presumably it matches
# the file name on disk -- confirm before "correcting" it.
# counties: per-county attributes, joined later on its 'countyFIPS' column.
counties = pd.read_csv('abridged_couties.csv')
# deaths / cases: time-series tables keyed by 'FIPS' with one column per date (e.g. '4/18/20').
deaths = pd.read_csv('time_series_covid19_deaths_US.csv')
cases = pd.read_csv('time_series_covid19_confirmed_US.csv')
# states: per-state table with 'Country_Region' and 'Province_State' columns.
states = pd.read_csv('4.18states.csv')

In [6]:
def replace_nan_with_mean(country, col, data=None):
    """Fill missing values of one column with that country's mean, in place.

    Only rows whose ``Country_Region`` equals ``country`` are considered: the
    mean of ``col`` over those rows (NaNs excluded) replaces the NaNs in those
    same rows. Rows belonging to other countries are left untouched.

    Parameters
    ----------
    country : str
        Value of ``Country_Region`` selecting the rows to impute.
    col : str
        Name of the numeric column to impute.
    data : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``states`` frame, so
        existing two-argument calls behave as before.

    Returns
    -------
    None
        ``data`` is modified in place.
    """
    if data is None:
        data = states
    in_country = data['Country_Region'] == country
    country_values = data.loc[in_country, col]
    # Write the filled values back with a single .loc assignment instead of
    # calling fillna(inplace=True) on the extracted slice: that slice is a
    # temporary object, and mutating it in place is exactly the pattern the
    # pandas SettingWithCopyWarning is about.
    data.loc[in_country, col] = country_values.fillna(country_values.mean())

# Impute each state-level metric's missing US values with the US mean of that metric.
for us_metric in ('People_Hospitalized', 'Hospitalization_Rate', 'Testing_Rate', 'People_Tested', 'Mortality_Rate'):
    replace_nan_with_mean('US', us_metric)

# Mortality_Rate still has a NaN left over, this time in a different country (Canada),
# so it is filled with the Canadian mean in the same way.
replace_nan_with_mean('Canada', 'Mortality_Rate')

In [8]:
# Dropping out of state FIPS, unassigned FIPS, correctional facilities and the Grand Princess.
# NOTE: these are positional slices applied in place, so this cell is only correct when it is
# run exactly once on the freshly loaded CSVs (re-running it drops a further 106 rows).
cases.drop(cases.index[3149:3255], inplace = True)
deaths.drop(deaths.index[3149:3255], inplace = True)

# Replace missing FIPS for Kansas City with data from census.gov
cases.loc[cases['Admin2']=='Kansas City', 'FIPS'] = 29380.0
deaths.loc[deaths['Admin2']=='Kansas City', 'FIPS'] = 29380.0

# Dukes and Nantucket Counties' numbers have been counted jointly, recorded only under
# 'Dukes and Nantucket'. Ideally, the counts would be redistributed evenly between Dukes and
# Nantucket, however it is not realistic to assign 0.5 deaths to a county. Therefore, we have
# assigned all counts to the FIPS of Nantucket and dropped the separate Nantucket row.
# This is not an ideal solution.
cases.loc[cases['Admin2']=='Dukes and Nantucket', 'FIPS'] = 25019.0
deaths.loc[deaths['Admin2']=='Dukes and Nantucket', 'FIPS'] = 25019.0
cases.drop(cases.loc[cases['Admin2']=='Nantucket',:].index, inplace = True)
deaths.drop(deaths.loc[deaths['Admin2']=='Nantucket',:].index, inplace = True)
# Debugging aid: deaths[deaths['FIPS'].isnull()] lists any rows still lacking a FIPS
# (the int conversion below raises if any remain).

# Convert to string and add padded 0 for consistency with counties data
deaths['FIPS'] = deaths['FIPS'].astype(int).astype(str).str.zfill(5)
cases['FIPS'] = cases['FIPS'].astype(int).astype(str).str.zfill(5)

# Create a dataframe with the total confirmed cases and deaths by region.
# rename() is chained (not inplace=True) so each result is a new, independent frame;
# renaming a column slice in place is what raised the SettingWithCopyWarning.
total_cases = cases[['FIPS', 'Province_State', 'Lat', 'Long_', '4/18/20']].rename(columns={'4/18/20': 'total_confirmed'})
total_deaths = deaths[['FIPS', '4/18/20']].rename(columns={'4/18/20': 'total_deaths'})
total_by_region = total_cases.merge(total_deaths, how = 'left', on = ['FIPS']).rename(columns={'Long_': 'Long'})

# Dropping American Samoa, Guam, Northern Mariana Islands, Puerto Rico and Virgin Islands.
# NOTE: positional -- relies on these being the first five rows of the merged frame.
total_by_region.drop(total_by_region.index[0:5], inplace = True)

# Adding a column with mortality rate (deaths / confirmed cases).
# 0/0 gives NaN and deaths/0 gives inf; both mean "no confirmed cases", so both become 0.
# (np.nan_to_num alone would turn inf into ~1.8e308 instead of 0.)
mortality = total_by_region['total_deaths'] / total_by_region['total_confirmed']
total_by_region['mortality'] = mortality.replace([np.inf, -np.inf], np.nan).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [10]:
county_cases = total_by_region.merge(counties, how = 'left', left_on = 'FIPS', right_on = 'countyFIPS')

# There are 3 rows (FIPS 02158 in Alaska, FIPS 46102 in South Dakota, and FIPS 29380 in Missouri)
# that found no match in the counties table and therefore have multiple null values. Although
# 29380 in Missouri had 412 Covid cases, we decide to drop these 3 rows as we do not have a
# reliable method of filling in the missing values.
county_cases = county_cases.drop(county_cases[county_cases['countyFIPS'].isnull()].index)

# We drop the columns that are either not of interest to our study or that have too many null
# values to be useful or reliable. We also drop redundant identifiers (ie countyFIPS and COUNTYFP).
unused_columns = [
    'dem_to_rep_ratio',
    'PopMale<52010', 'PopFmle<52010', 'PopMale5-92010', 'PopFmle5-92010',
    'PopMale10-142010', 'PopFmle10-142010', 'PopMale15-192010', 'PopFmle15-192010',
    'PopMale20-242010', 'PopFmle20-242010', 'PopMale25-292010', 'PopFmle25-292010',
    'PopMale30-342010', 'PopFmle30-342010', 'PopMale35-442010', 'PopFmle35-442010',
    'PopMale45-542010', 'PopFmle45-542010', 'PopMale55-592010', 'PopFmle55-592010',
    'PopMale60-642010', 'PopFmle60-642010', 'PopMale65-742010', 'PopFmle65-742010',
    'PopMale75-842010', 'PopFmle75-842010', 'PopMale>842010', 'PopFmle>842010',
    'CensusRegionName', 'CensusDivisionName', 'Rural-UrbanContinuumCode2013', 'FracMale2017',
    '3-YrMortalityAge<1Year2015-17', '3-YrMortalityAge1-4Years2015-17',
    '3-YrMortalityAge5-14Years2015-17', '3-YrMortalityAge15-24Years2015-17',
    '3-YrMortalityAge25-34Years2015-17', '3-YrMortalityAge35-44Years2015-17',
    '3-YrMortalityAge45-54Years2015-17', '3-YrMortalityAge55-64Years2015-17',
    '3-YrMortalityAge65-74Years2015-17', '3-YrMortalityAge75-84Years2015-17',
    '3-YrMortalityAge85+Years2015-17', 'mortality2015-17Estimated',
    'HPSAShortage', 'HPSAServedPop', 'HPSAUnderservedPop', '3-YrDiabetes2015-17',
    'MedicareEnrollment,AgedTot2017', '#EligibleforMedicare2018',
    'federal guidelines', 'foreign travel ban',
    'countyFIPS', 'COUNTYFP', 'STATEFP', 'StateName', 'State',
    'lat', 'lon', 'POP_LATITUDE', 'POP_LONGITUDE',
]
county_cases = county_cases.drop(unused_columns, axis = 1)

#The counties with null values for the date of certain bans did not institute such bans, so we filled those values with 0. (https://www.npr.org/2020/05/01/847413697/midwest-coronavirus-related-restrictions-by-state)
county_cases = county_cases.fillna({'>50 gatherings':0, '>500 gatherings':0, 'stay at home':0, 'entertainment/gym':0})

#Lastly, there are 3 columns with null values that are most appropriately accounted for by filling
# each with its OWN column mean. (Previously SVIPercentile was filled with the mean of
# StrokeMortality, a copy/paste slip.) A single DataFrame.fillna with a per-column mapping is used
# instead of column.fillna(..., inplace=True), which acts on a temporary column object.
mean_filled_columns = ['HeartDiseaseMortality', 'StrokeMortality', 'SVIPercentile']
county_cases = county_cases.fillna({column: county_cases[column].mean() for column in mean_filled_columns})

# This prints the number of null values in each column.
# print('Null values in county_cases:\n')
# for column in county_cases.columns.values.tolist():
#     print(column,':', sum(county_cases[column].isnull()) )

#'DiabetesPercentage', 'HeartDiseaseMortality', 'StrokeMortality', 'Smokers_Percentage', 'RespMortalityRate2014'

In [11]:
# Per-state subsets of the county table. They are modified in place further down (policy dates
# are rewritten), so take explicit copies: this keeps county_cases untouched and avoids the
# "value is trying to be set on a copy of a slice" warning.
ca_cases = county_cases.loc[county_cases['Province_State'] == 'California'].copy()
ny_cases = county_cases.loc[county_cases['Province_State'] == 'New York'].copy()

In [12]:
from datetime import date
# Policy columns whose values are passed to date.fromordinal by date_change, i.e. dates stored as ordinal day numbers.
cols = ['stay at home','>50 gatherings','>500 gatherings','public schools','restaurant dine-in','entertainment/gym']
def date_change(df,cols):
    """Rewrite ordinal day numbers in the given columns as ISO date strings, in place.

    Each value ``v`` in ``df[col]`` becomes ``date.fromordinal(int(v)).isoformat()``
    (for example ``737500`` becomes ``'2020-03-16'``).

    Values that cannot be a date are left unchanged instead of raising:
    missing values (NaN), numbers below 1 (such as a ``0`` placeholder), and
    strings (already converted, which makes re-running the cell harmless).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    cols : iterable of str
        Names of the columns to convert.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    def _to_iso(value):
        # Already an ISO string from an earlier run: nothing to do.
        if isinstance(value, str):
            return value
        # date.fromordinal only accepts ordinals >= 1, and int(NaN) raises.
        if pd.isna(value) or value < 1:
            return value
        return date.fromordinal(int(value)).isoformat()

    for col in cols:
        # Assign the converted column back to the frame; calling
        # replace(..., inplace=True) on df[col] mutates a temporary object and
        # is not guaranteed to reach df.
        df[col] = df[col].map(_to_iso)
# Convert the policy-date columns of the California and New York subsets from ordinals to ISO
# date strings. date_change returns nothing; it rewrites the frames it is given.
date_change(ca_cases, cols)
date_change(ny_cases, cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [14]:
# TODO (Morgan's note to self): we don't necessarily know whether the numbers (deaths, cases,
# mortality) in the counties dataframe are the same as what was reported in the states dataframe.
# If we train a model on counties data and then apply it to states data and these numbers differ,
# it could be problematic. MUST EXPLORE!

# First we aggregate the county information by state: count-like columns are summed ...
summed_columns = [
    'Province_State', 'total_confirmed', 'total_deaths', 'PopulationEstimate2018',
    'PopTotalMale2017', 'PopTotalFemale2017', 'PopulationEstimate65+2017',
    'CensusPopulation2010', '#FTEHospitalTotal2017', "TotalM.D.'s,TotNon-FedandFed2017",
    '#HospParticipatinginNetwork2017', '#Hospitals', '#ICU_beds',
]
state_cases_sum = county_cases[summed_columns].groupby(['Province_State']).sum().reset_index()

# ... while the remaining columns are averaged over each state's counties (unweighted mean).
averaged_columns = [
    'Province_State', 'PopulationDensityperSqMile2010', 'MedianAge2010', 'DiabetesPercentage',
    'HeartDiseaseMortality', 'StrokeMortality', 'Smokers_Percentage', 'RespMortalityRate2014',
    'stay at home', '>50 gatherings', '>500 gatherings', 'public schools',
    'restaurant dine-in', 'entertainment/gym', 'SVIPercentile',
]
state_cases_mean = county_cases[averaged_columns].groupby(['Province_State']).mean().reset_index()

# Combine both aggregates, then attach the US rows of the states dataframe.
us_states = states[states['Country_Region'] == 'US']
state_cases = (
    state_cases_mean
    .merge(state_cases_sum, how = 'left', on = 'Province_State')
    .merge(us_states, how = 'left', on = 'Province_State')
)


In [16]:
# Time series data joined with county info, restricted to counties with non-zero cases.
# The identifier/location columns below are removed from the time-series tables before the join,
# leaving 'FIPS' (the join key) and the per-date columns.
time_series_extra_columns = ['UID', 'iso2', 'iso3', 'code3', 'Admin2','Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key']
counties_with_cases = county_cases[county_cases['total_confirmed'] != 0]

time_deaths = counties_with_cases.merge(deaths.drop(time_series_extra_columns, axis = 1), how = 'left', on = 'FIPS')
time_cases = counties_with_cases.merge(cases.drop(time_series_extra_columns, axis = 1), how = 'left', on = 'FIPS')

# Date column labels, 1/22/20 through 4/18/20, one row group per month.
date_array = [
    '1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20', '1/30/20', '1/31/20',
    '2/1/20', '2/2/20', '2/3/20', '2/4/20', '2/5/20', '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20',
    '2/11/20', '2/12/20', '2/13/20', '2/14/20', '2/15/20', '2/16/20', '2/17/20', '2/18/20', '2/19/20', '2/20/20',
    '2/21/20', '2/22/20', '2/23/20', '2/24/20', '2/25/20', '2/26/20', '2/27/20', '2/28/20', '2/29/20',
    '3/1/20', '3/2/20', '3/3/20', '3/4/20', '3/5/20', '3/6/20', '3/7/20', '3/8/20', '3/9/20', '3/10/20',
    '3/11/20', '3/12/20', '3/13/20', '3/14/20', '3/15/20', '3/16/20', '3/17/20', '3/18/20', '3/19/20', '3/20/20',
    '3/21/20', '3/22/20', '3/23/20', '3/24/20', '3/25/20', '3/26/20', '3/27/20', '3/28/20', '3/29/20', '3/30/20', '3/31/20',
    '4/1/20', '4/2/20', '4/3/20', '4/4/20', '4/5/20', '4/6/20', '4/7/20', '4/8/20', '4/9/20', '4/10/20',
    '4/11/20', '4/12/20', '4/13/20', '4/14/20', '4/15/20', '4/16/20', '4/17/20', '4/18/20',
]

# Cases per date, indexed by county name.
cases_by_county = time_cases.set_index('CountyName')
plot_cases = cases_by_county[date_array]

#print(list(time_cases.columns))
# ca = county_cases.loc[county_cases['Province_State'] == 'California']
# sns.scatterplot(x='RespMortalityRate2014',y='total_deaths',data=county_cases)
#sns.scatterplot(x='PopulationDensityperSqMile2010',y='total_deaths',data=county_cases)

<br>

***

<br>

### Exploratory Data Analysis: we can consider the data we have for the counties and group them as follows: 
1. ***Population* Stats**:  'PopulationEstimate2018',  'PopTotalMale2017',  'PopTotalFemale2017',  'PopulationEstimate65+2017',  'PopulationDensityperSqMile2010',  'CensusPopulation2010'
2. **At *Risk* Populations**:  'MedianAge2010',  'DiabetesPercentage',  'HeartDiseaseMortality',  'StrokeMortality',  'Smokers_Percentage', 'RespMortalityRate2014'
3. ***Infrastructure* (Ability to provide care)**:  '#FTEHospitalTotal2017', "TotalM.D.'s,TotNon-FedandFed2017", '#HospParticipatinginNetwork2017', '#Hospitals', '#ICU_beds', 'SVIPercentile'
4. **Pandemic *Policies* Enacted**: 'stay at home', '>50 gatherings', '>500 gatherings', 'public schools', 'restaurant dine-in', 'entertainment/gym'


In [21]:
def rmse(predicted, actual):
    """Return the root-mean-square error between predictions and observed values.

    Both arguments must support element-wise subtraction (e.g. numpy arrays or
    pandas Series of equal length).
    """
    residuals = actual - predicted
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)
def rmse_score(model, X, Y):
    """Return the RMSE of ``model.predict(X)`` against ``Y``.

    The ``(model, X, Y)`` signature lets this be passed as the ``scoring``
    callable of ``cross_val_score``.
    """
    predictions = model.predict(X)
    squared_errors = (Y - predictions) ** 2
    return np.sqrt(np.mean(squared_errors))

In [29]:
# Population feature columns, followed by the response column ('mortality').
population_stats = [
    'PopulationEstimate2018',
    'PopTotalMale2017',
    'PopTotalFemale2017',
    'PopulationEstimate65+2017',
    'PopulationDensityperSqMile2010',
    'CensusPopulation2010',
    'mortality',
]

def select_columns(data, columns):
    """Return ``data`` restricted to ``columns``, keeping every row."""
    subset = data.loc[:, columns]
    return subset

def process_data(data, cols, metric):
    """Split ``data`` into a feature matrix and a response vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table.
    cols : list of str
        Columns to keep; must include ``metric``.
    metric : str
        Name of the response column.

    Returns
    -------
    (X, Y)
        ``X`` holds the columns of ``cols`` other than ``metric``; ``Y`` is the
        ``metric`` column.
    """
    selected = select_columns(data, cols)

    # Everything except the response column is a feature.
    features = selected.drop(columns=[metric])
    response = selected[metric]

    return features, response

def test_models(model, X, Y, cv=5):
    """Fit ``model`` on the full data and report in-sample and cross-validated RMSE.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, Y)`` and ``predict(X)``. It is fitted in place
        on all of ``X``/``Y``; the caller keeps the fitted model through its own
        reference (the model is not part of the return value).
    X : features matrix
    Y : response variable
    cv : int or cross-validation splitter, default 5
        Passed straight to ``cross_val_score``.

    Returns
    -------
    (Y_pred, error, cv_error)
        ``Y_pred``   -- predictions of the fitted model on ``X``.
        ``error``    -- RMSE of those predictions against ``Y`` (in-sample).
        ``cv_error`` -- mean of the per-fold RMSEs returned by
                        ``cross_val_score``; an average of fold RMSEs, not one
                        RMSE pooled over all held-out rows.
    """
    # Fit a Model
    model.fit(X, Y)

    # Predict our Response Variable (Y)
    Y_pred = model.predict(X)

    # In-sample accuracy using RMSE
    error = rmse(Y_pred, Y)

    # Cross-validated RMSE, averaged over the folds
    fold_errors = cross_val_score(model, X, Y, scoring=rmse_score, cv=cv)
    cv_error = np.mean(fold_errors)

    return Y_pred, error, cv_error

In [30]:
# California counties without outliers: keep rows with a 2018 population estimate under
# 10 million and a mortality (deaths / confirmed cases) under 0.8.
population_below_limit = ca_cases['PopulationEstimate2018'] < 10000000
mortality_below_limit = ca_cases['mortality'] < 0.8
ca_cases_no_outlier = ca_cases[population_below_limit & mortality_below_limit]

In [31]:
# Features (the population_stats columns other than 'mortality') and response ('mortality')
# for California, once on all counties and once on the outlier-filtered subset.
X_ca_pop, Y_ca_pop = process_data(ca_cases, population_stats, 'mortality')
X_ca_pop_no_outlier, Y_ca_pop_no_outlier = process_data(ca_cases_no_outlier, population_stats, 'mortality')

***

In [35]:
# Linear regression of mortality on the population stats, all California counties.
# The first figure is the in-sample RMSE; the second is the mean of the per-fold RMSEs from
# cross-validation, so the two are not directly comparable.
pop_model = LinearRegression(fit_intercept=True)
Y_ca_pop_pred, ca_pop_error, ca_pop_cv_error = test_models(pop_model,X_ca_pop,Y_ca_pop)
print("CA Population Stats RMSE: {}".format(ca_pop_error))
print("CA Population Stats CV RMSE: {}".format(ca_pop_cv_error))

CA Population Stats RMSE: 0.12921425396580136
CA Population Stats CV RMSE: 0.09947370581970065


In [36]:
# Same model and metrics as above, fitted on the outlier-filtered California counties.
pop_model_no_outlier = LinearRegression(fit_intercept=True)
Y_ca_pop_pred_no_outlier, ca_pop_error_no_outlier, ca_pop_cv_error_no_outlier = test_models(pop_model_no_outlier, X_ca_pop_no_outlier, Y_ca_pop_no_outlier)
print("CA Population Stats without Outliers RMSE: {}".format(ca_pop_error_no_outlier))
print("CA Population Stats CV without Outliers RMSE: {}".format(ca_pop_cv_error_no_outlier))


CA Population Stats without Outliers RMSE: 0.024427608147709605
CA Population Stats CV without Outliers RMSE: 0.033457584394095415
