In [1]:
import os
import urllib3
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import datetime

In [6]:
def get_url_content(url_to_parse, timeout=30.0):
    """Fetch ``url_to_parse`` with an HTTP GET and return the urllib3 response.

    Parameters
    ----------
    url_to_parse : str
        URL to request.
    timeout : float, optional
        Seconds urllib3 may wait on the connection / on a read before giving
        up. Without an explicit timeout a stalled server would block the
        notebook indefinitely; with one, urllib3 raises instead.

    Returns
    -------
    The response object returned by ``PoolManager.urlopen``. Callers in this
    file read its ``status`` and ``data`` attributes.
    """
    url_handle = urllib3.PoolManager()
    url_content = url_handle.urlopen('GET', url_to_parse, timeout=timeout)

    return url_content

def get_table_html_data(html_data):
    """Return every ``<tr>`` element of the first ``<table>`` in ``html_data``.

    The markup is parsed with BeautifulSoup's built-in ``'html.parser'``.
    Only the first table in the document is considered. If the document has
    no ``<table>``, ``find`` yields ``None`` and the chained ``find_all``
    call fails with ``AttributeError``.
    """
    return BeautifulSoup(html_data, 'html.parser').find('table').find_all('tr')

def parse_number(num_string, num_type='int'):
    """Convert a human-formatted number string into an ``int`` or ``float``.

    Thousands separators (``,``) and surrounding whitespace are removed
    before conversion. Missing values -- ``None``, an empty or
    whitespace-only string, or the placeholder ``N/A`` (any letter case) --
    are reported as zero.

    Parameters
    ----------
    num_string : str or None
        Text to convert, e.g. ``'1,234'`` or ``'+56'``.
    num_type : str, optional
        ``'int'`` (default) converts with ``int``; any other value converts
        with ``float``.

    Returns
    -------
    int or float
        The parsed number, or a zero of the requested type for missing
        values.

    Raises
    ------
    ValueError
        If the cleaned text is not a valid number for the requested type.
    """
    converter = int if num_type == 'int' else float

    cleaned = '' if num_string is None else num_string.replace(',', '').strip()

    # Missing-value markers map to a zero of the *requested* type so a float
    # column never receives a stray int.
    if cleaned == '' or cleaned.upper() == 'N/A':
        return converter(0)

    return converter(cleaned)

def _parse_table_row(cells):
    """Convert the ``<td>`` cells of one table row into a list of values.

    The first cell is treated as the region name and normalised (``:`` and
    ``.`` removed, spaces replaced by ``_``). The last cell is parsed as a
    float and every cell in between as an int, both via ``parse_number``.
    """
    last_index = len(cells) - 1
    parsed = []

    for index, cell in enumerate(cells):
        text = cell.get_text().strip()

        if index == 0:
            parsed.append(text.replace(':', '').replace('.', '').replace(' ', '_'))
        elif index == last_index:
            parsed.append(parse_number(text, num_type='float'))
        else:
            parsed.append(parse_number(text))

    return parsed


def _snapshot_dir_name(moment):
    """Format ``moment`` as ``YYYY-MM-DD_HH-MM-SS`` for use as a directory name."""
    # strftime always emits the seconds field. Slicing str(moment)[:-7]
    # assumes a 7-character '.ffffff' suffix, which str() omits when
    # microsecond == 0, so the slice would then cut into the time itself.
    return moment.strftime('%Y-%m-%d_%H-%M-%S')


def main():
    """Scrape the worldometers coronavirus table and save it as a CSV file.

    Downloads the page, parses the rows of its first HTML table into a
    DataFrame with the columns listed in ``column_names``, adds a
    ``mortality_rate`` column (deaths as a percentage of total cases, rounded
    to two decimals) and writes the result to ``<timestamp>/data.csv``
    relative to the current working directory.

    If the HTTP status is not 200, a message is printed and nothing is
    written. Returns ``None`` in every case.
    """
    url_to_parse = 'https://www.worldometers.info/coronavirus/'
    url_content = get_url_content(url_to_parse)

    if url_content.status != 200:
        print('Content retrieval failed')
        return

    all_html_rows = get_table_html_data(url_content.data)

    column_names = ['region', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'total_recovered', 'active_cases', 'serious_cases', 'total_cases_per_million']

    all_data = []

    # The first row is the table header, so parsing starts at the second row.
    for html_row in all_html_rows[1:]:
        cells = html_row.find_all('td')

        # Rows without any <td> cell carry no data; skip them rather than
        # appending an empty record.
        if not cells:
            continue

        all_data.append(_parse_table_row(cells))

    df = pd.DataFrame(data=all_data, columns=column_names)
    df['mortality_rate'] = np.round(100 * df.total_deaths.values / df.total_cases.values, 2)

    data_dir = _snapshot_dir_name(datetime.datetime.now())
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)

    df.to_csv(os.path.join(data_dir, 'data.csv'), index=False)

In [7]:
# Run the scraper: fetches the page over the network and, on success, writes
# data.csv into a new timestamp-named directory under the current working directory.
main()