# CSCI-B 455 Final Project
## Author: Ben Duggan
## Goal:

Determine the total number of covid cases on a certain day for a given state.

In [None]:
# Import libraries

import numpy as np
import pandas as pd

import os, urllib, tarfile, requests, datetime

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [None]:
# Fetch the raw COVID-19 and population tables used by the rest of the notebook

# NY Times county-level COVID data
covid_data_url="https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
dtypes = {
  'date': 'string',
  'county': 'string',
  'state': 'string',
  'fips': 'float64',
  'cases': 'int64',
  'deaths': 'float64',
}
covid_county_raw = pd.read_csv(covid_data_url, dtype=dtypes)

# US Census population estimates per county
population_county_url="https://gist.githubusercontent.com/BenSDuggan/58c3e0e1cefaa4b8ac49209ef54ffa75/raw/ac6ed2cfabc0ee954e75db4014ddc2b3de677035/pop_2010-19.csv"
population_county_raw = pd.read_csv(population_county_url)

# US Census population estimates per state
population_state_url="https://gist.githubusercontent.com/BenSDuggan/4a5d357fc224247651692a8641b84530/raw/3d750e7acc00c024332966c09037edb6746845b9/pop_state_2010-19.csv"
population_state_raw = pd.read_csv(population_state_url)

# Combined Hopkins data that has population built in.
# `dtypes` is deliberately rebound here, as it only describes the file being read.
dtypes = {
  'Case_Type': 'string',
  'People_Total_Tested_Count': 'float64',
  'Cases': 'int64',
  'Difference': 'int64',
  'Date': 'string',
  'Combined_Key': 'string',
  'Country_Region': 'string',
  'Province_State': 'string',
  'Admin2': 'string',
  'iso2': 'string',
  'iso3': 'string',
  'FIPS': 'float64',
  'Lat': 'float64',
  'Long': 'float64',
  'Population_Count': 'float64',
  'People_Hospitalized_Cumulative_Count': 'float64',
  'Data_Source': 'string',
  'Prep_Flow_Runtime': 'string',
}
combined_data_raw = pd.read_csv(
  'https://query.data.world/s/sadhsnfk2xnj5fwpkawqxyx5j3qtf4',
  dtype=dtypes,
)

In [None]:
# Create helpful dictionaries

# Find when each state had its first case.
# Keep the first row that appears for every state (file order) and map
# state -> date of that row.  drop_duplicates does this in a single vectorised
# pass instead of one Python-level .loc lookup per county row.
# NOTE(review): this treats the first row seen as the earliest date, i.e. it
# assumes the NY Times file is ordered by date -- confirm.
first_rows = covid_county_raw.drop_duplicates(subset='state', keep='first')
state_first_case = dict(zip(first_rows['state'], first_rows['date']))

# Get state population: state name -> POPESTIMATE2019 value
state_pop = dict(zip(population_state_raw['NAME'], population_state_raw['POPESTIMATE2019']))

In [None]:
# Process combined data (not actually used)

# Drop results outside the US: only rows that carry a FIPS value are kept.
# The filtered frame is copied so combined_data_raw itself stays untouched.
combined_data = combined_data_raw.loc[combined_data_raw['FIPS'].notna()].copy()

In [None]:
# Generate list of state data from Johns Hopkins data
#
# For every day in the requested range this downloads that day's US report,
# adds population, first-case date, days since first case and the day-over-day
# change in cases/deaths, and finally stacks all days into one dataframe.

# Range of dates to get
state_date = datetime.datetime(2020, 4, 12) # Inclusive
curr_date = state_date
end_date = '05-05-2020' # Exclusive
# Parse the end marker once so the loop compares dates instead of strings; a
# `<` test also terminates when the end date is not after the start date.
end_datetime = datetime.datetime.strptime(end_date, '%m-%d-%Y')

# Totals from the most recently processed day, keyed by state name
last_cases, last_deaths = {}, {}

# The dataframe we want to create
header = ['date', 'state', 'fips', 'cases', 'ncases', 'deaths', 'ndeaths', 'first_case_date', 'days_from_first_case', 'pop', 'Incident_Rate', 'People_Tested', 'People_Hospitalized', 'Mortality_Rate', 'Testing_Rate', 'Hospitalization_Rate']

# Columns copied unchanged from each daily report
passthrough_columns = ['Incident_Rate', 'People_Tested', 'People_Hospitalized', 'Mortality_Rate', 'Testing_Rate', 'Hospitalization_Rate']


def _daily_change(states, totals, previous):
  """Return the change of each total versus the previously stored total.

  states   -- state names for the current day, one per row
  totals   -- cumulative counts for the current day, aligned with `states`
  previous -- dict of state -> last stored total; updated in place

  A state that has no stored total yet gets a change of 0, because there is
  no earlier day to diff against.
  NOTE(review): those zeros are placeholders rather than observed values and
  they do end up in the `ncases`/`ndeaths` columns for the first day.
  """
  changes = []
  for state, total in zip(states, totals):
    changes.append(total - previous[state] if state in previous else 0)
    previous[state] = total
  return changes


# One dataframe per day; concatenated once after the loop so the growing
# result is not re-copied on every iteration.
daily_frames = []

while curr_date < end_datetime:
  curr_raw = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/%s.csv' % curr_date.strftime("%m-%d-%Y"))

  rows = curr_raw.shape[0]
  states = curr_raw['Province_State'].tolist()
  row = {}

  # Four-digit year, i.e. the same format as the first_case_date values
  row['date'] = [curr_date.strftime('%Y-%m-%d')] * rows
  row['state'] = states
  row['fips'] = curr_raw['FIPS'].tolist()
  row['cases'] = curr_raw['Confirmed'].tolist()
  row['deaths'] = curr_raw['Deaths'].tolist()
  for column in passthrough_columns:
    row[column] = curr_raw[column]

  # Lookups fall back to NaN for names missing from the dictionaries
  row['pop'] = [state_pop.get(state, float("NaN")) for state in states]
  row['first_case_date'] = [state_first_case.get(state, float("NaN")) for state in states]
  row['days_from_first_case'] = [
    int((curr_date - datetime.datetime.strptime(state_first_case[state], '%Y-%m-%d')).days)
    if state in state_first_case else float("NaN")
    for state in states
  ]

  # Day-over-day changes; also stores today's totals for the next iteration
  row['ncases'] = _daily_change(states, row['cases'], last_cases)
  row['ndeaths'] = _daily_change(states, row['deaths'], last_deaths)

  try:
    daily_frames.append(pd.DataFrame(row, columns = header))
  except Exception as err:
    # Best effort: report the day that could not be assembled and carry on
    # with the remaining days (KeyboardInterrupt is no longer swallowed).
    print('Error with : ')
    print(curr_date)
    print(repr(err))

  curr_date += datetime.timedelta(days=1)

if daily_frames:
  covid_state_raw = pd.concat(daily_frames, ignore_index = True)
else:
  # Nothing was loaded: keep an empty frame with the expected columns
  covid_state_raw = pd.DataFrame(None, columns = header)

combined_state_data = covid_state_raw.copy()

In [None]:
# Start of model code

In [None]:
# Create data for Model 1
#
# Target: total number of cases ('cases').  Features: every other column
# selected below.

# Create an X and Y variable

csd = combined_state_data.copy()
csd = csd.loc[:, ['deaths', 'ndeaths', 'pop', 'days_from_first_case', 'Testing_Rate', 'cases']]

# Remove nan values
csd = csd.dropna()

# Randomize (seeded so the row order is the same on every run)
np.random.seed(0)
csd = csd.reindex(np.random.permutation(csd.index))

# Get main vars
y = csd['cases'].tolist()
# astype returns a new array, so its result has to be assigned; calling it
# as a bare statement leaves X unchanged.
X = csd.drop('cases', axis=1).to_numpy().astype(float)

# Normalize
#row_sums = X.sum(axis=1)
#X = X / row_sums[:, np.newaxis]

# Break into testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)

In [None]:
# Train and test regression model

lm = linear_model.LinearRegression()

# With no `scoring` argument cross_val_score uses the estimator's own score(),
# which for LinearRegression is the coefficient of determination (R^2), not a
# classification accuracy.  Request it explicitly and label the output as such.
scores = cross_val_score(lm, X, y, cv=5, scoring='r2')
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Create data for Model 2
#
# Target: number of new cases ('ncases').  Features: every other column
# selected below.

# Create an X and Y variable

csd = combined_state_data.copy()
csd = csd.loc[:, ['deaths', 'ndeaths', 'pop', 'days_from_first_case', 'Testing_Rate', 'ncases']]

# Remove nan values
csd = csd.dropna()

# Randomize (seeded so the row order is the same on every run)
np.random.seed(0)
csd = csd.reindex(np.random.permutation(csd.index))

# Get main vars
y = csd['ncases'].tolist()
# astype returns a new array, so its result has to be assigned; calling it
# as a bare statement leaves X unchanged.
X = csd.drop('ncases', axis=1).to_numpy().astype(float)

# Normalize
#row_sums = X.sum(axis=1)
#X = X / row_sums[:, np.newaxis]

# Break into testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)

In [None]:
# Train and test regression model

lm = linear_model.LinearRegression()

# With no `scoring` argument cross_val_score uses the estimator's own score(),
# which for LinearRegression is the coefficient of determination (R^2), not a
# classification accuracy.  Request it explicitly and label the output as such.
scores = cross_val_score(lm, X, y, cv=5, scoring='r2')
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


# CSCI-B 455 Final Project

## Introduction

COVID-19 is a worldwide pandemic affecting almost everyone around the world in some way.  The United States of America has been hit hard, with 1,237,633 cases at the time of submission (worldometer, https://www.worldometers.info/coronavirus/country/us/).  No one knows when the pandemic will end, or what that will even look like (no more social distancing, for example).  Some think that the pandemic will only end when we get a vaccine.  Regardless, it is useful to be able to predict the number of coronavirus cases in a state.

The total number of cases will obviously be increasing (or at least not decreasing), and the rate of increase is quite easy to see.  However, putting an exact number on this is more difficult.  This number can be estimated quite well by using the number of deaths, which lags the number of cases (Tomas Pueyo, https://medium.com/@tomaspueyo/coronavirus-act-today-or-people-will-die-f4d3d9cd99ca)(Khan Academy, https://www.youtube.com/watch?v=mCa0JXEwDEk).

Testing capability is also likely related to the number of cases.  If you can't test, then you can't confirm a coronavirus case.  The date that a state first had a coronavirus case is likely important as well.  Lastly, the population of the state is important for estimating the current number of cases in a state.

## Datasets

The five features I used are the number of deaths, the number of new deaths, the population of the state, the days since the first case, and the testing rate.  Each datapoint is from a particular state on a particular day.  To create this dataset I assembled data from three sources.

The data primarily came from the "COVID-19 Data Repository" by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University, hosted on GitHub at <https://github.com/CSSEGISandData/COVID-19>.  I'm using the `csse_covid_19_daily_reports_us` folder, which contains data about each state collected each day; I used the reports from 4/12/2020 until today, 5/4/2020.  There is a lot of useful data, but I'm using the confirmed cases, deaths, and the testing rate.  To find the first case date, I used the New York Times county data to find the first date that a state recorded a coronavirus case.  I then compute the number of days since that date when creating the dataset.  The dataset can be found here: <https://github.com/nytimes/covid-19-data/blob/master/us-counties.csv>.  I needed to use this data because it goes back all the way to when the virus entered the USA.  The last dataset I used was the state population from 2019, found here: <https://www.census.gov/data/tables/time-series/demo/popest/2010s-state-total.html>.


## Data preparation

Some of the datasets were uploaded to a GitHub Gist to make them easier to load into Google Colab.  After downloading the data, I found the first day COVID-19 was recorded in each state.  Then, for each day in the Johns Hopkins data, I extracted the total cases, total deaths, and testing rate for that day.  I calculated the days since the state's first coronavirus case and added that to the datapoint.  Finally, I calculated the number of new COVID cases and deaths (using the previous day as a base) and added these to the dataset.

While this may bias the data somehow, I think this method of data processing allows a lot of data to be produced from a small amount of time-series information.  After creating the dataset, I dropped all rows that had missing data and randomized the order of the data.
  
## Models and analysis

Scikit Learn was used to construct the models.  I built two different regression models.  Both models were evaluated using 5-fold cross validation.  I chose to use regression as my output attribute is continuous.  Additionally, it is quick to train models.

The first model used Linear Regression and the total number of cases as the output attribute.  I didn't have great hope for this model, as I only had 1,173 observations.  I initially normalized the data and got a result of around 73% with a standard deviation of around 10% (the "accuracy" figures in this report are the $R^2$ scores returned by `cross_val_score`).  When I tried the model without normalization, I got an accuracy of 98% with a standard deviation of 1%.  I didn't think I could get a better accuracy than this, but I was curious how the model would train with the number of new cases as the output feature.

The second model used the same regression model but the number of new cases as the output attribute.  I thought this model would do better, as predicting the total number of cases seemed more difficult to me; predicting the day-to-day change seemed like it would be easier.  When I trained the model using the `LinearRegression` estimator, without normalizing the data, I got an accuracy of 63% with a standard deviation of 30%.  This model doesn't perform well at all.  

## Discussion

I was surprised by a lot of parts of this project.  The data collection and processing definitely took me the longest amount of time.  It was hard to find a good source of COVID-19 data that was large enough to train a model with.  Initially, I wanted to look at county data; however, there was nowhere near enough data for that.  I settled on using the states, which likely helped me because outlier counties with high case counts were averaged out.

I'm surprised that Model 1 worked as well as it did.  This might have been from incorporating deaths as one of the features.  I don't think this is bad, but the numbers are very much related.  Still, 98% accuracy seems too good to be true.  However, I think that you can predict the total number of cases in a state, for a certain day, using deaths, number of new deaths, population, testing rate, and days from first case.  My method seems to be very successful in predicting the total number of cases.  The number of new cases, however, cannot be accurately predicted using the same features.

