# Description

This Jupyter Notebook contains the code to download the JHU CSSE COVID-19 dataset from GitHub and import it into a Pandas DataFrame to be manipulated using Python.

This notebook is intended to be run in [Google Colaboratory](https://colab.research.google.com/).

Please note that this notebook's default kernel is Python. If you would like to run R code in it, you must pass the R code to an R kernel using Python magic functions: the cell magic function `%%R` or the line magic function `%R`.

# Set up notebook

In [1]:
# Import packages
import pandas as pd
import os
from collections import OrderedDict

# Enable interactive display of tabular data in Colab
%load_ext google.colab.data_table

# Download and import data

In [2]:
# Download JHU CSSE COVID-19 Dataset from GitHub
%%shell
cd /usr/local/bin
git clone https://github.com/CSSEGISandData/COVID-19.git

Cloning into 'COVID-19'...
remote: Enumerating objects: 39943, done.[K
remote: Total 39943 (delta 0), reused 0 (delta 0), pack-reused 39943[K
Receiving objects: 100% (39943/39943), 294.52 MiB | 8.54 MiB/s, done.
Resolving deltas: 100% (24430/24430), done.
Checking out files: 100% (500/500), done.




In [3]:
# Define function to import and clean .csv files
def import_csvs(csv_directory):
  '''
  Import all .csv files in a directory into a Pandas DataFrame. Filenames are interpreted as dates.

  input:
    csv_directory: path to directory containing .csv files (string)
  '''

  # Find all .csv files in directory
  csv_filenames = {}
  for csv_filename in os.listdir(csv_directory):
    if csv_filename.endswith('.csv'):
      csv_filenames[pd.to_datetime(csv_filename.rstrip('.csv'))] = os.path.join(csv_directory, csv_filename)
  csv_filenames = OrderedDict(sorted(csv_filenames.items()))

  # Import each .csv file as a Pandas DataFrame
  dataframes_separate = []
  for (csv_date, csv_filename) in csv_filenames.items():
    print('Importing %s ...' % (csv_filename))
    df = pd.read_csv(csv_filename, engine='c', header=0)
    # Replace old versions of column names with new ones for consistency
    for old, new in {'Province/State': 'Province_State', 'Country/Region': 'Country_Region', 'Last Update': 'Last_Update', 'Latitude':'Lat', 'Longitude':'Long_', 'Admin2':'County'}.items():
      df.columns = df.columns.str.replace(old, new)
    df['Last_Update'] = pd.to_datetime(df['Last_Update']) # format dates as such
    df.replace({' ':'_'}, regex=True, inplace=True) # Replace any spaces with underscores for use in Python
    dataframes_separate.append(df)
  print('Imported all CSVs in %s.' % (csv_directory))

  # Concatenate all DataFrames into one, sort by date, and remove duplicate rows
  dataframes_combined = pd.concat(dataframes_separate, ignore_index=True).sort_values('Last_Update', ignore_index=True).drop_duplicates(ignore_index=True)

  return (dataframes_separate, dataframes_combined)


# Call functions
if __name__ == '__main__':
  csv_directory = '/usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports'
  (separated_by_date, all_data) = import_csvs(csv_directory)

Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-23-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-24-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-25-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-26-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-27-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-28-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-29-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-30-2020.csv ...
Importing /usr/local/bin/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-31-2020.csv ...


In [5]:
# Show imported DataFrame
all_data

Unnamed: 0,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Lat,Long_,FIPS,County,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio
0,Anhui,Mainland_China,2020-01-22 17:00:00,1.0,,,,,,,,,,
1,Ningxia,Mainland_China,2020-01-22 17:00:00,1.0,,,,,,,,,,
2,Qinghai,Mainland_China,2020-01-22 17:00:00,,,,,,,,,,,
3,Shaanxi,Mainland_China,2020-01-22 17:00:00,,,,,,,,,,,
4,Shandong,Mainland_China,2020-01-22 17:00:00,2.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398803,Texas,US,2020-07-16 04:44:59,192.0,0.0,0.0,31.319260,-95.422262,48225.0,Houston,192.0,"Houston,_Texas,_US",835.945664,0.000000
398804,Arkansas,US,2020-07-16 04:44:59,157.0,1.0,0.0,34.090074,-93.993487,5061.0,Howard,156.0,"Howard,_Arkansas,_US",1189.213755,0.636943
398805,Indiana,US,2020-07-16 04:44:59,621.0,58.0,0.0,40.485277,-86.113519,18067.0,Howard,563.0,"Howard,_Indiana,_US",752.326032,9.339775
398806,Kentucky,US,2020-07-16 04:44:59,321.0,33.0,0.0,37.309390,-87.546704,21107.0,Hopkins,288.0,"Hopkins,_Kentucky,_US",718.345791,10.280374
