In [132]:
from __future__ import print_function
import pandas as pd
import numpy as np
import re
import os
import pickle
import os.path
from datetime import datetime, date, time 
from dateutil.parser import parse
from time import strftime
import pyarrow
import json
import git

# Allow up to 1000 rows to be rendered when a DataFrame is displayed.
pd.set_option('display.max_rows', 1000)

# Remote repository holding the daily reports, and where it is checked out locally.
REPO = 'https://github.com/CSSEGISandData/COVID-19.git'
TMP_FOLDER = '/tmp/corona/'
TMP_GIT = os.path.join(TMP_FOLDER, 'COVID-19')
# Folder inside the checkout that contains one CSV per day.
DATA = os.path.join(TMP_GIT, 'csse_covid_19_data/csse_covid_19_daily_reports/')


In [133]:
def clean_sheet_names(new_ranges):
    '''
    Keep only the sheet names that contain at least one digit.

    Names without any digit (for example a name like ``README.md``) are
    dropped. Duplicates are NOT removed and the input order is preserved.

    :param new_ranges: iterable of sheet / file names (strings)
    :return: list with the names that contain a digit
    '''
    # Remove all sheets that don't have a digit in their name
    return [name for name in new_ranges if re.search(r'\d', name)]

def clone_repo(TMP_FOLDER, REPO):
    '''
    Run ``git clone`` for REPO with TMP_FOLDER as the working directory.

    :param TMP_FOLDER: directory handed to ``git.Git`` as its working directory
    :param REPO: URL of the repository to clone
    '''
    print('Cloning Data Repo...')
    workdir = git.Git(TMP_FOLDER)
    workdir.clone(REPO)

# Create Tmp Folder
if not os.path.isdir(TMP_FOLDER):
    print('Creating folder...')
    print('...', TMP_FOLDER)
    # makedirs (unlike mkdir) also creates missing parent directories
    os.makedirs(TMP_FOLDER)

# Check if repo exists:
# clone it on first use, otherwise git pull to refresh the checkout
if not os.path.isdir(TMP_GIT):
    clone_repo(TMP_FOLDER, REPO)
else:
    try:
        print('git pull from', REPO)
        rep = git.Repo(TMP_GIT)
        rep.remotes.origin.pull()
    except Exception as err:
        # `sys` is not among the notebook's imports, so the former
        # sys.exit() call would have raised NameError; raise SystemExit
        # directly and show the underlying error instead of hiding it.
        print('Could not pull from', REPO)
        print('...', err)
        raise SystemExit(1)

# Every entry found in the daily-reports folder
sheets = os.listdir(DATA)

# Clean the result to the sheet tabs we want
print('Cleaning sheets...')
cleaned_sheets = clean_sheet_names(sheets)


git pull from https://github.com/CSSEGISandData/COVID-19.git
Cleaning sheets...


In [134]:


'''
Helpers for normalising 'Last Update' timestamps and for deriving dates from them and from the sheet (file) names
'''

def clean_last_updates(last_update):
    '''
    Normalise a "Last Update" value to the text form ``YYYY-MM-DD HH:MM:SS``.

    The value is converted with ``str()`` and parsed in one go, so inputs
    without a time part (time becomes ``00:00:00``), with a ``T`` separator,
    or with an AM/PM marker are handled. Parsing the date and the time as
    two separate space-delimited tokens raised IndexError for the first two
    and silently dropped AM/PM for the last.

    :param last_update: timestamp-like value (string, datetime, pandas Timestamp)
    :return: string formatted as ``%Y-%m-%d %H:%M:%S``
    :raises ValueError: if the text cannot be parsed as a date/time
    '''
    return parse(str(last_update)).strftime('%Y-%m-%d %H:%M:%S')

def get_date(last_update):
    '''
    Return the date part of a "Last Update" value as ``YYYY-MM-DD``.

    Only the text before the first space of ``str(last_update)`` is parsed.
    '''
    date_text, _, _ = str(last_update).partition(' ')
    return parse(date_text).strftime("%Y-%m-%d")

def get_csv_date(file):
    '''
    Return the date encoded in a report file name as ``YYYY-MM-DD``.

    Everything before the first ``.`` of the name is treated as the date
    text (e.g. ``01-22-2020.csv`` -> ``01-22-2020``) and handed to get_date.
    '''
    stem, _, _ = file.partition('.')
    # get_date() reads the text in front of the first space
    return get_date(stem + ' ')

def drop_duplicates(df_raw):
    '''
    For every update date keep one row per province: the one coming from
    the most recent report file.

    For each unique value of ``df_raw.date`` the rows with that date are
    selected, rows whose ``file_date`` equals that date are discarded, and
    of the remaining rows only the one with the greatest ``file_date`` per
    ``Province/State`` is kept.

    NOTE(review): why rows from the same-day file are excluded is not
    evident from this code -- confirm the intent before re-enabling it.

    :param df_raw: DataFrame with ``date``, ``file_date`` and
        ``Province/State`` columns
    :return: list of DataFrames, one per unique date, in order of first
        appearance of the date in ``df_raw``
    '''
    days_list = []

    # `day` rather than `datetime`, so the imported datetime class is not shadowed
    for day in df_raw.date.unique():
        same_day = df_raw[df_raw.date == day]
        # Build the mask from `same_day` itself; a mask taken from the full
        # frame has a different index and forces pandas to re-align it.
        other_files = same_day[same_day.file_date != day]
        latest = other_files.sort_values(['file_date']).drop_duplicates('Province/State', keep='last')
        days_list.append(latest)

    return days_list


In [140]:

# Columns retained from every daily report file.
keep_cols = [
    'Confirmed',
    'Country/Region',
    'Deaths',
    'Last Update',
    'Province/State',
    'Recovered',
]
# Count columns: nulls are replaced with 0 and the values cast to int.
numeric_cols = [
    'Confirmed',
    'Deaths',
    'Recovered',
]

def get_data(cleaned_sheets):
    '''
    Load the daily report CSVs and stack them into one DataFrame.

    Each file named in ``cleaned_sheets`` that ends in ``.csv`` is read from
    the ``DATA`` folder and reduced to ``keep_cols``. The count columns get
    nulls replaced by 0 and are cast to int, a missing ``Province/State`` is
    filled with the row's ``Country/Region``, ``Last Update`` is normalised
    to text, and two columns are added: ``date`` (date part of the update)
    and ``file_date`` (date taken from the file name).

    :param cleaned_sheets: iterable of file names inside ``DATA``
    :return: DataFrame of all rows from all files, sorted by ``Last Update``
    :raises ValueError: if no ``.csv`` file name was supplied
    '''
    all_csv = []
    # Import all CSV's (use the argument, not the notebook-level `sheets`)
    for file in sorted(cleaned_sheets):
        if not file.endswith('.csv'):
            continue

        print('...', file)
        tmp_df = pd.read_csv(os.path.join(DATA, file), index_col=None, header=0, parse_dates=['Last Update'])
        # .copy() so the assignments below act on an independent frame
        # and not on a slice of the frame returned by read_csv
        tmp_df = tmp_df[keep_cols].copy()
        tmp_df[numeric_cols] = tmp_df[numeric_cols].fillna(0).astype(int)
        # Assign the result back instead of fillna(inplace=True) on a column slice
        tmp_df['Province/State'] = tmp_df['Province/State'].fillna(tmp_df['Country/Region'])

        tmp_df['Last Update'] = tmp_df['Last Update'].apply(clean_last_updates)
        tmp_df['date'] = tmp_df['Last Update'].apply(get_date)
        tmp_df['file_date'] = get_csv_date(file)

        all_csv.append(tmp_df)

    if not all_csv:
        # pd.concat would raise ValueError as well, with a less helpful message
        raise ValueError('No CSV files found in {}'.format(DATA))

    df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
    df_raw = df_raw.sort_values(by=['Last Update'])

    #Get the last entry per region by date
#     frames = drop_duplicates(df_raw)
#     tmp = pd.concat(frames, axis=0, ignore_index=True, sort=True)

    return df_raw


# Read the daily report CSVs into a single raw DataFrame
df = get_data(cleaned_sheets)


... 01-22-2020.csv
... 01-23-2020.csv
... 01-24-2020.csv
... 01-25-2020.csv
... 01-26-2020.csv
... 01-27-2020.csv
... 01-28-2020.csv
... 01-29-2020.csv
... 01-30-2020.csv
... 01-31-2020.csv
... 02-01-2020.csv
... 02-02-2020.csv
... 02-03-2020.csv
... 02-04-2020.csv
... 02-05-2020.csv
... 02-06-2020.csv
... 02-07-2020.csv
... 02-08-2020.csv
... 02-09-2020.csv
... 02-10-2020.csv
... 02-11-2020.csv
... 02-12-2020.csv
... 02-13-2020.csv
... 02-14-2020.csv
... 02-15-2020.csv
... 02-16-2020.csv
... 02-17-2020.csv
... 02-18-2020.csv
... 02-19-2020.csv
... 02-20-2020.csv
... 02-21-2020.csv
... 02-22-2020.csv
... 02-23-2020.csv
... 02-24-2020.csv
... 02-25-2020.csv
... 02-26-2020.csv


In [141]:

# Now that we have all the data we need to clean it:
# - fill null values
# - remove suspected values
# - change column names
def clean_data(tmp_df):
    '''
    Return a cleaned copy of the combined report frame.

    Steps: rename ``Demised`` -> ``Deaths``, ``Country/Region`` ->
    ``country``, ``Province/State`` -> ``province`` and ``Last Update`` ->
    ``datetime`` (whichever are present); drop the ``Suspected`` column if
    present; replace every null with 0; lower-case all column names.

    The input frame is left untouched; use the returned frame.

    :param tmp_df: DataFrame as produced by get_data
    :return: new, cleaned DataFrame
    '''
    renames = {
        'Demised': 'Deaths',
        'Country/Region': 'country',
        'Province/State': 'province',
        'Last Update': 'datetime',
    }
    # rename() returns a new frame and ignores keys that are not columns
    cleaned = tmp_df.rename(columns=renames)

    if 'Suspected' in cleaned.columns:
        cleaned = cleaned.drop(columns='Suspected')

    cleaned = cleaned.fillna(0)

    #Lower case all col names
    cleaned.columns = [name.lower() for name in cleaned.columns]
    return cleaned

print('Cleaning dataframes...')
# Rebind df to the cleaned frame returned by clean_data (renamed, lower-cased columns; nulls set to 0)
df  = clean_data(df)


Cleaning dataframes...


In [142]:
# Rows need to be sorted by their datetime value.
# Rows whose date equals today's date are removed first
# (NOTE(review): presumably because the current day is still incomplete -- confirm).
print('Sorting by datetime...')
current_date = str(datetime.now().date())

# The former if/else compared df.date.max() with current_date, but both
# branches applied this very same filter, so a single statement suffices.
df = df[df.date != current_date]

df = df.sort_values('datetime')


Sorting by datetime...


In [196]:

def get_new_cases(tmp, col, verbose=True):
    '''
    Get the difference of the sum totals for each date, so that they
    can be plotted on a trendline graph.

    For every unique value of ``tmp.date`` (in order of first appearance,
    so the frame should already be in chronological order) the values of
    ``col`` are summed. The first entry of the result is the first day's
    sum; every later entry is that day's sum minus the previous day's sum.

    NOTE(review): if the same province row appears in several daily files
    with an unchanged update date, it is summed once per file here --
    confirm that duplicates are removed upstream.

    :param tmp: DataFrame with a ``date`` column and the column ``col``
    :param col: name of the numeric column to total per day
    :param verbose: print the per-day progress lines (default True, which
        keeps the previous output)
    :return: list with one delta per unique date; empty list for an empty frame
    '''
    daily_totals = []
    diff_list = []

    for i, day in enumerate(tmp.date.unique()):
        # Sum once per day and reuse the value below
        day_total = tmp.loc[tmp.date == day, col].sum()

        if verbose:
            print('\n', day, i)
            print('... sum for day:', day_total)

        if i == 0:
            delta = day_total
        else:
            previous_total = daily_totals[i - 1]
            delta = day_total - previous_total
            # A falling cumulative total is suspicious, so report it
            if verbose and day_total < previous_total:
                print('... previous day vals:', previous_total)
                print('... Delta current - prev:', delta)

        daily_totals.append(day_total)
        diff_list.append(delta)

    return diff_list

# def get_moving_average(tmp, col):
#     df = tmp.copy()
#     return df[col].rolling(window=2).mean()

# def get_exp_moving_average(tmp, col):
#     df = tmp.copy()
#     return df[col].ewm(span=2, adjust=True).mean()

# def get_confirmed(tmp_df, col):
#     return tmp_df.groupby('date').confirmed.sum().values
    

print('Calculating dataframe for new cases...')
# One row per unique date: the day-over-day change in confirmed cases
daily_cases_df = pd.DataFrame({
    # 'confirmed': get_confirmed(df, 'confirmed'),
    'new_confirmed_cases': get_new_cases(df, 'confirmed'),
    # 'new_deaths': get_new_cases(df, 'deaths'),
    # 'new_recoveries': get_new_cases(df, 'recovered'),
    'date': df.date.unique(),
})

# #Moving average
# daily_cases_df['confirmed_MA'] = get_moving_average(daily_cases_df, 'new_confirmed_cases')
# daily_cases_df['deaths_MA'] = get_moving_average(daily_cases_df, 'new_deaths')
# daily_cases_df['recovered_MA'] = get_moving_average(daily_cases_df, 'new_recoveries')

# #Exponential moving average
# daily_cases_df['confirmed_exp_MA'] = get_exp_moving_average(daily_cases_df, 'new_confirmed_cases')
# daily_cases_df['deaths_exp_MA'] = get_exp_moving_average(daily_cases_df, 'new_deaths')
# daily_cases_df['recovered_exp_MA'] = get_exp_moving_average(daily_cases_df, 'new_recoveries')
# # daily_cases_df.reset_index()


Calculating dataframe for new cases...

 2020-01-22 0
... sum for day: 555

 2020-01-23 1
... sum for day: 653

 2020-01-24 2
... sum for day: 941

 2020-01-25 3
... sum for day: 1438

 2020-01-26 4
... sum for day: 2118

 2020-01-27 5
... sum for day: 2927

 2020-01-28 6
... sum for day: 5578

 2020-01-29 7
... sum for day: 6165

 2020-01-30 8
... sum for day: 8235

 2020-01-31 9
... sum for day: 10151

 2020-02-01 10
... sum for day: 12274

 2020-02-02 11
... sum for day: 16804

 2020-02-03 12
... sum for day: 19878

 2020-02-04 13
... sum for day: 24027

 2020-02-05 14
... sum for day: 27582

 2020-02-06 15
... sum for day: 30798

 2020-02-07 16
... sum for day: 34485

 2020-02-08 17
... sum for day: 37066

 2020-02-09 18
... sum for day: 40318

 2020-02-10 19
... sum for day: 42696

 2020-02-11 20
... sum for day: 44584

 2020-02-12 21
... sum for day: 45512

 2020-02-13 22
... sum for day: 60604

 2020-02-14 23
... sum for day: 66634

 2020-02-15 24
... sum for day: 69112

 2020-0

######################################################### 

In [200]:
# Inspect the rows that came from the 02-22 .. 02-25 report files
df[df.file_date.isin(['2020-02-22', '2020-02-23', '2020-02-24', '2020-02-25'])].sort_values(['country', 'province', 'date'])

Unnamed: 0,confirmed,country,deaths,datetime,province,recovered,date,file_date
2258,1,Afghanistan,0,2020-02-24 23:33:02,Afghanistan,0,2020-02-24,2020-02-24
2351,1,Afghanistan,0,2020-02-24 23:33:02,Afghanistan,0,2020-02-24,2020-02-25
2352,1,Algeria,0,2020-02-25 23:43:03,Algeria,0,2020-02-25,2020-02-25
2157,7,Australia,0,2020-02-22 17:03:05,From Diamond Princess,0,2020-02-22,2020-02-23
2242,7,Australia,0,2020-02-22 17:03:05,From Diamond Princess,0,2020-02-22,2020-02-24
2073,7,Australia,0,2020-02-22 17:03:05,From Diamond Princess,0,2020-02-22,2020-02-22
2334,7,Australia,0,2020-02-22 17:03:05,From Diamond Princess,0,2020-02-22,2020-02-25
2161,4,Australia,0,2020-02-13 17:53:03,New South Wales,4,2020-02-13,2020-02-23
2245,4,Australia,0,2020-02-13 17:53:03,New South Wales,4,2020-02-13,2020-02-24
2338,4,Australia,0,2020-02-13 17:53:03,New South Wales,4,2020-02-13,2020-02-25


In [163]:
# Total of the Confirmed column in the 02-21 report file
day21 = pd.read_csv(
    os.path.join(DATA, '02-21-2020.csv'),
    index_col=None,
    header=0,
    parse_dates=['Last Update'],
)
day21['Confirmed'].sum()
# day21

76843

In [164]:
# Total of the Confirmed column in the 02-22 report file
day22 = pd.read_csv(
    os.path.join(DATA, '02-22-2020.csv'),
    index_col=None,
    header=0,
    parse_dates=['Last Update'],
)
day22['Confirmed'].sum()

78599

In [168]:
# Total of the Confirmed column in the 02-23 report file
day23 = pd.read_csv(
    os.path.join(DATA, '02-23-2020.csv'),
    index_col=None,
    header=0,
    parse_dates=['Last Update'],
)
day23['Confirmed'].sum()
# day23.sort_values('Country/Region')

78985

In [169]:
# Total of the Confirmed column in the 02-24 report file
day24 = pd.read_csv(
    os.path.join(DATA, '02-24-2020.csv'),
    index_col=None,
    header=0,
    parse_dates=['Last Update'],
)
day24['Confirmed'].sum()
# day24.sort_values('Country/Region')

79570

In [167]:
# Total of the Confirmed column in the 02-25 report file
day25 = pd.read_csv(
    os.path.join(DATA, '02-25-2020.csv'),
    index_col=None,
    header=0,
    parse_dates=['Last Update'],
)
day25['Confirmed'].sum()

80415

In [162]:
# day24['Last Update'] = day24['Last Update'].astype(str)
# day24[~day24['Last Update'].str.contains('2020-02-24', regex= True, na=False)].Confirmed.sum()

# day24.Confirmed.sum() - day23.Confirmed.sum()
# day25.Confirmed.sum() - day24.Confirmed.sum()

In [181]:
# Stack the 02-23 and 02-24 reports and order the rows by country, then province
df_merged = pd.concat([day23, day24]).sort_values(['Country/Region', 'Province/State'])

In [185]:
# Display the combined frame built from day23 and day24
df_merged

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
63,,Afghanistan,2020-02-24 23:33:02,1,0,0
47,From Diamond Princess,Australia,2020-02-22 17:03:05,7,0,0
47,From Diamond Princess,Australia,2020-02-22 17:03:05,7,0,0
51,New South Wales,Australia,2020-02-13 17:53:03,4,0,4
50,New South Wales,Australia,2020-02-13 17:53:03,4,0,4
49,Queensland,Australia,2020-02-21 05:43:02,5,0,1
49,Queensland,Australia,2020-02-21 05:43:02,5,0,1
55,South Australia,Australia,2020-02-17 08:13:09,2,0,2
55,South Australia,Australia,2020-02-17 08:13:09,2,0,2
52,Victoria,Australia,2020-02-13 17:53:03,4,0,4


######################################################################## 

In [85]:

# Calculate the number of people that are ACTUALLY infected on a given day:
#   currently infected = confirmed - (recovered + died)
#   ex: 5 = 10 - (4 + 1)

# Group once and reuse the per-date sums for every column derived below
daily_totals = df.groupby('date')[['confirmed', 'deaths', 'recovered']].sum()
resolved = daily_totals['deaths'] + daily_totals['recovered']

current_infected = pd.DataFrame({
    'currently_infected': daily_totals['confirmed'] - resolved,
})
# delta = currently_infected - confirmed, i.e. the negated resolved count
current_infected['delta'] = current_infected['currently_infected'] - daily_totals['confirmed']

# Remove columns left over from an earlier run of this cell; otherwise a
# re-run would merge them again and produce *_x / *_y duplicates.
daily_cases_df = daily_cases_df.drop(columns=['currently_infected', 'delta'], errors='ignore')
# reset_index() turns the 'date' index into a column so both sides join on a column
daily_cases_df = pd.merge(daily_cases_df, current_infected.reset_index(), how='outer', on='date')



In [86]:
#Create date of extraction folder
# The date is taken once so the folder and every file name agree, even
# when the cell happens to run across midnight.
today = datetime.now().date()
save_dir = './data/' + str(today)

print('Saving to data subdirectory...')
print('...', save_dir)

# makedirs also creates ./data itself when it does not exist yet
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

print('Saving...')
# All values are written as strings (astype(str)) in every output file
file_name = 'agg_data_{}.parquet.gzip'.format(today)
df.astype(str).to_parquet(os.path.join(save_dir, file_name), compression='gzip')
print('...', file_name)


csv_file_name = 'agg_data_{}.csv'.format(today)
df.astype(str).to_csv(os.path.join(save_dir, csv_file_name))
print('...', csv_file_name)


daily_cases_file_name = 'trend_{}.csv'.format(today)
daily_cases_df.astype(str).to_csv(os.path.join(save_dir, daily_cases_file_name))
print('...', daily_cases_file_name)

print('Done!')

Saving to data subdirectory...
... ./data/2020-02-18
Saving...
... agg_data_2020-02-18.parquet.gzip
... agg_data_2020-02-18.csv
... trend_2020-02-18.csv
Done!
