In [1]:
import pandas as pd
import numpy as np
import wget
import os, datetime
import shutil
import pycountry_convert as pc

In [2]:
# File names of the three global time-series CSVs; they differ only in the
# metric (confirmed / deaths / recovered) embedded in the name.
csv_confirmed, csv_deaths, csv_recovered = (
    "time_series_covid19_%s_global.csv" % metric
    for metric in ("confirmed", "deaths", "recovered")
)

In [3]:
# Download URLs of the three time-series files, in the order
# confirmed, deaths, recovered. All share one repository path prefix.
urls = [
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_%s_global.csv' % metric
    for metric in ('confirmed', 'deaths', 'recovered')
]

In [4]:
# Directory that receives the downloaded CSVs and the combined output file.
currDir = "../../DataStore/COVID-19-global"

isdir = os.path.isdir(currDir)

# Start from an empty directory so files from a previous run are not mixed
# with the fresh download.
if isdir:
    try:
        # No ignore_errors=True here: that flag suppresses the OSError, which
        # made the except branch below unreachable and hid failed deletions.
        shutil.rmtree(currDir)
    except OSError:
        print ("Deletition of the directory %s failed" % currDir)

try:
    # makedirs also creates missing parent directories; os.mkdir raises when
    # the parent of currDir does not exist yet.
    os.makedirs(currDir)
except OSError:
    print ("Creation of the directory %s failed" % currDir)
else:
    print ("Successfully created the directory %s " % currDir)

Successfully created the directory ../../DataStore/COVID-19-global 


In [5]:
# Fetch every remote CSV into the download directory; the value returned by
# the most recent call stays available in `filename`.
for url in urls:
    filename = wget.download(url, out=currDir)

In [6]:
# Load the three downloaded time-series files into DataFrames.
# The file names come from the csv_* constants defined at the top of the
# notebook instead of being spelled out a second time here.
df_confirmed = pd.read_csv(os.path.join(currDir, csv_confirmed))
df_deaths = pd.read_csv(os.path.join(currDir, csv_deaths))
df_recovered = pd.read_csv(os.path.join(currDir, csv_recovered))

In [7]:
# Display the column labels of the confirmed-cases frame
df_confirmed.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '5/13/20', '5/14/20', '5/15/20', '5/16/20', '5/17/20', '5/18/20',
       '5/19/20', '5/20/20', '5/21/20', '5/22/20'],
      dtype='object', length=126)

In [8]:
# Every column after the first four of the confirmed frame is treated as a date column
dates = df_confirmed.columns[4:]

# Identifier columns repeated on every row of the long-format frames
id_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']


def _to_long(wide_df, value_name):
    """Melt a wide frame so each date column becomes a row.

    The column labels end up in 'Date' and the cell values in `value_name`.
    """
    return wide_df.melt(id_vars=id_columns, value_vars=dates,
                        var_name='Date', value_name=value_name)


conf_df_long = _to_long(df_confirmed, 'Confirmed')
deaths_df_long = _to_long(df_deaths, 'Deaths')
recv_df_long = _to_long(df_recovered, 'Recovered')

# Drop all recovered rows whose Country/Region is Canada
recv_df_long = recv_df_long.loc[recv_df_long['Country/Region'].ne('Canada')]

# Report the (rows, columns) shape of each long-format frame
for long_df in (conf_df_long, deaths_df_long, recv_df_long):
    print(long_df.shape)

(32452, 6)
(32452, 6)
(30744, 6)


In [9]:
# Columns that must match for rows of the three long frames to be joined
merge_keys = ['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']

# Left-join deaths, then recovered, onto the confirmed rows; confirmed rows
# without a matching partner keep NaN in the joined column.
full_table = (
    conf_df_long
    .merge(deaths_df_long, how='left', on=merge_keys)
    .merge(recv_df_long, how='left', on=merge_keys)
)

full_table.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.0,65.0,1/22/20,0,0,0.0
1,,Albania,41.1533,20.1683,1/22/20,0,0,0.0
2,,Algeria,28.0339,1.6596,1/22/20,0,0,0.0
3,,Andorra,42.5063,1.5218,1/22/20,0,0,0.0
4,,Angola,-11.2027,17.8739,1/22/20,0,0,0.0


In [10]:
# Rename countries to the spellings required by the pycountry_convert library.
# Keys are the names as they appear in the data, values are the replacements.
country_renames = {
    "US": "USA",
    'Korea, South': 'South Korea',
    'Taiwan*': 'Taiwan',
    'Congo (Kinshasa)': 'Democratic Republic of the Congo',
    'Congo (Brazzaville)': 'Republic of the Congo',
    'Bahamas, The': 'Bahamas',
    'Gambia, The': 'Gambia',
    "Cote d'Ivoire": "Côte d'Ivoire",
    "Reunion": "Réunion",
}

# Apply the renames one at a time, overwriting only the rows that match exactly
for old_name, new_name in country_renames.items():
    full_table.loc[full_table['Country/Region'] == old_name, "Country/Region"] = new_name


In [11]:
# Print the (rows, columns) shape of the merged table
print(full_table.shape)

(32452, 8)


In [12]:
# Treat a missing Recovered value as 0, then store the column as integers
full_table['Recovered'] = full_table['Recovered'].fillna(0).astype('int')

In [13]:
# Continent code -> continent name shown in the table.
# 'na' is the fallback code used when no continent can be determined.
continents = dict(
    NA='North America',
    SA='South America',
    AS='Asia',
    OC='Australia',
    AF='Africa',
    EU='Europe',
    na='Others',
)

# Country/Region value of every row of the table, in row order
countries = np.asarray(full_table["Country/Region"])

# Function for getting the continent code of a country.
def country_to_continent_code(country):
    """Return the continent code for a country name, or 'na' if the lookup fails.

    The name is first converted to an alpha-2 country code and that code is
    then converted to a continent code, both via pycountry_convert (`pc`).

    Parameters
    ----------
    country : the country name to look up.

    Returns
    -------
    The continent code produced by pycountry_convert, or the string 'na'
    when either conversion raises an ordinary exception.
    """
    try:
        return pc.country_alpha2_to_continent_code(pc.country_name_to_country_alpha2(country))
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate; a bare except would
        # turn a kernel interrupt into a silent 'na' result.
        return 'na'

# Add a "Continent" column at position 2 holding the continent name of each row's country
full_table.insert(
    loc=2,
    column="Continent",
    value=[continents[country_to_continent_code(name)] for name in countries],
)

In [14]:
# Removing
# ========

# Drop rows whose Province/State contains the text 'Recovered' (the original
# note describes these as Canada's recovered values). Rows with a missing
# Province/State are kept.
has_recovered_label = full_table['Province/State'].str.contains('Recovered', na=False)
full_table = full_table[~has_recovered_label]

In [15]:
# Relabel the "Bonaire, Sint Eustatius and Saba" province as "Caribbean Netherlands"
is_bes_islands = full_table['Province/State'] == "Bonaire, Sint Eustatius and Saba"
full_table.loc[is_bes_islands, "Province/State"] = "Caribbean Netherlands"

# Display the table shape after the rename
full_table.shape

(32452, 9)

In [16]:
# Replacement values keyed by Province/State; applied to the 2/12/20 Confirmed counts below
feb_12_conf = dict(Hubei=34874)

In [17]:
# Helper for overwriting individual values of the global `full_table`
def change_val(date, ref_col, val_col, dtnry):
    """Overwrite `val_col` in `full_table` for selected rows of one date.

    For each key/value pair in `dtnry`, rows where 'Date' equals `date` and
    `ref_col` equals the key get `val_col` set to the value. The global
    `full_table` is modified in place; nothing is returned.
    """
    for ref_value, new_value in dtnry.items():
        on_date = full_table['Date'] == date
        matches_ref = full_table[ref_col] == ref_value
        full_table.loc[on_date & matches_ref, val_col] = new_value

In [18]:
# Apply the 2/12/20 replacement values to the Confirmed column, matching rows by Province/State
change_val(
    date='2/12/20',
    ref_col='Province/State',
    val_col='Confirmed',
    dtnry=feb_12_conf,
)

In [19]:

# Display the Hubei row(s) for 2/12/20 to confirm the replacement took effect
full_table.loc[
    full_table['Date'].eq('2/12/20') & full_table['Province/State'].eq('Hubei')
]

Unnamed: 0,Province/State,Country/Region,Continent,Lat,Long,Date,Confirmed,Deaths,Recovered
5648,Hubei,China,Asia,30.9756,112.2707,2/12/20,34874,1068,2686


In [20]:
# Replace missing values with empty strings. The former `regex=True` flag was
# dropped: it is not needed for a plain NaN -> '' substitution and only
# obscured the intent.
full_table = full_table.replace(np.nan, '')

# The date labels look like '1/22/20' (month/day/two-digit year). Passing the
# format explicitly avoids per-value inference and any day/month ambiguity.
full_table['Date'] = pd.to_datetime(full_table['Date'], format='%m/%d/%y')
full_table.head()

Unnamed: 0,Province/State,Country/Region,Continent,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,Asia,33.0,65.0,2020-01-22,0,0,0
1,,Albania,Europe,41.1533,20.1683,2020-01-22,0,0,0
2,,Algeria,Africa,28.0339,1.6596,2020-01-22,0,0,0
3,,Andorra,Europe,42.5063,1.5218,2020-01-22,0,0,0
4,,Angola,Africa,-11.2027,17.8739,2020-01-22,0,0,0


In [21]:
# Write the combined table into the download directory, without the index column
output_csv = "%s/covid_19_global_complete.csv" % currDir
full_table.to_csv(output_csv, index=False)

In [22]:
# Print the final column names, one per line
for column in full_table.columns.tolist():
    print(column)

Province/State
Country/Region
Continent
Lat
Long
Date
Confirmed
Deaths
Recovered
