In [5]:
import os
import pandas as pd
import numpy as np
import subprocess
from datetime import datetime
import json
import requests
from bs4 import BeautifulSoup

# Notebook display setting: let pandas render up to 500 rows of a DataFrame
# before it starts truncating the output.
pd.set_option('display.max_rows', 500)

# Data fetch and preparation

In [6]:
def store_relational_JH_data(data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
                             out_path='../data/processed/COVID_relational_confirmed.csv'):
    ''' Transforms the wide Johns Hopkins COVID time series into a relational data set

        The raw file has one row per (province, country) and one column per
        day. The result has one row per (date, state, country) with the
        value in a 'confirmed' column, and is written as a ';'-separated CSV.

        Parameters:
        ----------
        data_path : str
            location of the raw Johns Hopkins time series CSV
            (default: the confirmed-cases file of the local repository clone)
        out_path : str
            location the relational CSV is written to

        Returns:
        ----------
        None; the number of stored rows and the latest date are printed
    '''

    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                                        'Province/State':'state'}) \
                       .drop(['Lat','Long'],axis=1)

    # rows without a province get a placeholder so that every row carries a
    # complete (state, country) key before it is used as an index
    pd_data_base['state']=pd_data_base['state'].fillna('no')

    # wide -> long: the date columns become rows, (state, country) become columns
    # after the transpose and are then stacked back into the row index
    pd_relational_model=pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'confirmed'},
                                                  )

    # the column headers of the raw file are dates written as month/day/2-digit-year;
    # parsing with an explicit format avoids relying on format guessing
    pd_relational_model['date']=pd.to_datetime(pd_relational_model['date'],format='%m/%d/%y')

    pd_relational_model.to_csv(out_path,sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))
    print(' Latest date is: '+str(pd_relational_model['date'].max()))
# Run the transformation only when this code is executed as the main program
# (not when it is imported); the function reads the raw CSV and writes the
# relational CSV using its built-in paths.
if __name__ == '__main__':

    store_relational_JH_data()


 Number of rows stored: 63042
 Latest date is: 2020-09-14 00:00:00


# Web scraping:
# Confirmed : https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
# Recovered : https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv
# Deaths : https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv

In [7]:
# Get the data from the URL mentioned above
def getLatestData(info_type):
    """Scrape one Johns Hopkins global time series table from its GitHub page.

    Parameters
    ----------
    info_type : str
        Which series to fetch: "confirmed", "deaths" or "recovered".

    Returns
    -------
    (jh_data_list, header_list) : tuple of lists
        jh_data_list holds one list of cell texts (the <td> elements) per
        table row after the first; header_list holds the texts of the <th>
        elements of the first row. Both are empty if the table has no rows.

    Raises
    ------
    ValueError
        If info_type is not one of the supported series names.
    requests.HTTPError
        If the server answers with an error status.
    RuntimeError
        If the downloaded page contains no <table> element.
    """
    base_url = "https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/"
    file_names = {
        "confirmed": "time_series_covid19_confirmed_global.csv",
        "deaths": "time_series_covid19_deaths_global.csv",
        "recovered": "time_series_covid19_recovered_global.csv",
    }
    # Reject unknown series up front instead of failing later on an unbound variable
    if info_type not in file_names:
        raise ValueError("Unknown info_type " + repr(info_type)
                         + "; expected one of " + ", ".join(sorted(file_names)))

    response = requests.get(base_url + file_names[info_type], timeout=30)
    # Do not try to parse an error page as if it were the data table
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    html_table = soup.find('table')
    # NOTE(review): this relies on GitHub serving the CSV preview as a static HTML
    # table - confirm this still holds; otherwise the raw CSV should be read instead.
    if html_table is None:
        raise RuntimeError("No <table> element found on the page for " + repr(info_type))

    header_list = []
    jh_data_list = []
    for pos, row in enumerate(html_table.find_all('tr')):
        if pos == 0:
            # the first row carries the column names as <th> cells
            header_list = [each_col.get_text(strip=True) for each_col in row.find_all('th')]
        else:
            # every following row carries the values as <td> cells
            jh_data_list.append([each_col.get_text(strip=True) for each_col in row.find_all('td')])
    return jh_data_list, header_list
    

In [8]:
# Prepare the data for visualization and modelling
def prepareDataframe(jh_data_list,header_list):
    """Turn scraped table rows into one time series column per country.

    Parameters
    ----------
    jh_data_list : list of lists
        One list of cell texts per table row. Each row is expected to have
        one more cell than header_list (a leading cell that is labelled
        'index' here), followed by province, country, latitude, longitude
        and then one count per date.
    header_list : list of str
        Column names of the table without the leading cell. Must contain
        'Country/Region'; the date columns are expected in month/day/2-digit-year
        form. The list is not modified.

    Returns
    -------
    pandas.DataFrame
        A 'date' column (datetime) followed by one integer column per country,
        in order of first appearance, holding the counts summed over all rows
        of that country.
    """
    # Build the column names on a copy: inserting into header_list itself
    # would change the caller's list as a side effect
    column_names = ['index'] + list(header_list)
    jh_data_df = pd.DataFrame(jh_data_list)
    jh_data_df.columns = column_names

    # Rows without a country (e.g. empty table rows) cannot be attributed to a
    # series, so they are left out before the counts are converted to integers
    jh_data_df = jh_data_df[jh_data_df['Country/Region'].notna()]

    # The first five columns are metadata; everything after them is one column per date
    date_labels = jh_data_df.columns[5:]
    daily_counts = jh_data_df.iloc[:, 5:].astype(int)

    # Sum all rows (provinces) of a country in one pass; sort=False keeps the
    # countries in the order in which they first appear in the table
    per_country = daily_counts.groupby(jh_data_df['Country/Region'], sort=False).sum()

    # Dates become rows, countries become columns
    jh_data_transformed_df = per_country.T.reset_index(drop=True)
    jh_data_transformed_df.columns.name = None
    jh_data_transformed_df.insert(0, 'date', pd.to_datetime(date_labels, format='%m/%d/%y'))
    return jh_data_transformed_df

In [9]:
# Scrape the confirmed-case table and reshape it into one column per country.
jh_confirmed_list,header_list = getLatestData("confirmed")
jh_confirmed_df = prepareDataframe(jh_confirmed_list,header_list)
# Same two steps for the deaths table; header_list is rebound to the header
# returned by this second request.
jh_deaths_list,header_list = getLatestData("deaths")
jh_deaths_df = prepareDataframe(jh_deaths_list,header_list)
# Same two steps for the recovered table.
jh_recovered_list,header_list = getLatestData("recovered")
jh_recovered_df = prepareDataframe(jh_recovered_list,header_list)
# After this cell: jh_confirmed_df, jh_deaths_df and jh_recovered_df hold the
# prepared frames that are written to disk in the next cell.

In [10]:
# Save the three prepared frames as ';'-separated CSV files under
# ../data/processed/, without writing the row index.
jh_confirmed_df.to_csv('../data/processed/COVID_small_flat_confirmed_table.csv',sep=';',index=False)
jh_recovered_df.to_csv('../data/processed/COVID_small_flat_recovered_table.csv',sep=';',index=False)
jh_deaths_df.to_csv('../data/processed/COVID_small_flat_deaths_table.csv',sep=';',index=False)

In [11]:
def getPopulationData():
    """Scrape the population-by-country table from worldometers.info.

    Returns
    -------
    pandas.DataFrame
        The second and third cell of every table row after the first one, as
        text, in columns labelled 1 and 2. The notebook uses them as country
        name and population.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    RuntimeError
        If the downloaded page contains no <table> element.
    """
    response = requests.get("https://www.worldometers.info/world-population/population-by-country/",
                            timeout=30)
    # Do not try to parse an error page as if it were the population table
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    html_table = soup.find('table')
    if html_table is None:
        raise RuntimeError("No <table> element found on the worldometers population page")

    # The first row is the header row and is skipped; the remaining rows
    # carry their values in <td> cells
    population_rows = [
        [each_col.get_text(strip=True) for each_col in row.find_all('td')]
        for row in html_table.find_all('tr')[1:]
    ]
    population_df = pd.DataFrame(population_rows)
    # Keep only the cells at positions 1 and 2; the integer column labels are
    # preserved because callers select the columns by these labels
    return population_df.iloc[:, 1:3]

In [12]:
# Scrape the population table; column 1 is used as the country names and
# column 2 as the matching population values.
pop_data = getPopulationData()
countries, population = pop_data[1], pop_data[2]

# Lay the values out as a single-row table with one column per country and
# use the label 'US' for the 'United States' column.
pop_df = pd.DataFrame([population.tolist()], columns=countries).rename(columns={'United States': 'US'})

# Persist the table as a ';'-separated CSV without the row index.
pop_df.to_csv('../data/processed/world_population.csv',sep=';',index=False)