In [1]:
# Imports
import os, subprocess, json

# Environmental Variables
from dotenv import load_dotenv

import pandas as pd

# HTTP Client
import requests
# For parsing and sifting through HTML
from bs4 import BeautifulSoup



In [2]:
# Load environmental variables specified in .env
load_dotenv()

# Set max rows for Pandas
pd.set_option('display.max_rows', 500)

## CRISP-DM
![CRISP-DM](../reports/figures/CRISP_DM.png)

# Data Understanding

## Data Sources:
**Wordlwide:**
- [John Hopkings](https://github.com/CSSEGISandData/COVID-19.git) (GITHUB)
- [REST API services to retrieve Data](https://npgeo-corona-npgeo-de.hub.arcgis.com/)

**Nigeria:**
- [Nigeria Center for Disease Contrl (NCDC)](https://covid19.ncdc.gov.ng/)

**Germany:**
- [Robert Koch Institute](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html) (webscraping)

## GITHUB- Johns Hopkings

In [3]:
# Create directory for John Hopkins Dataset if it doesn't already exist
if(not os.path.exists("../data/raw/JH_dataset")):
    # Create directory
    os.mkdir("../data/raw/JH_dataset")

# Check if Dataset doesn't already exist in filesystem
if(not os.path.exists("../data/raw/JH_dataset/COVID-19")):
    # Command to clone dataset
    cmd=  "git clone https://github.com/CSSEGISandData/COVID-19.git"
    cmd_wd= "../data/raw/JH_dataset"

# Otherwise if Dataset repo has already been cloned, peform pull operation
else:
    cmd= "git pull"
    cmd_wd= "../data/raw/JH_dataset/COVID-19"


# Git process
git_proc= subprocess.Popen(
    cmd,
    cwd=cmd_wd, shell=True, 
    stdout= subprocess.PIPE, stderr= subprocess.STDOUT
)

# Communicate with child process and set timeout
# Timeout
# Give a maximum of 10 mins because cloning/pulling might take a while
proc_timeout= 600
try:
    git_proc_out= git_proc.communicate(timeout=proc_timeout)[0]
    print(str(git_proc_out))
except TimeoutError:
    print("Git operation on Johns Hopkins Dataset from GITHUB failed...\n")



b'From https://github.com/CSSEGISandData/COVID-19\n   e77ccd3a..dc7740d5  web-data   -> origin/web-data\nAlready up to date.\n'


In [4]:
# Data path
data_path= "../data/raw/JH_dataset/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
# Raw data
pd_raw= pd.read_csv(data_path)

In [5]:
pd_raw.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,8/5/20,8/6/20,8/7/20,8/8/20,8/9/20,8/10/20,8/11/20,8/12/20,8/13/20,8/14/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,36829,36896,37015,37054,37054,37162,37269,37345,37424,37431
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,5889,6016,6151,6275,6411,6536,6676,6817,6971,7117
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,33055,33626,34155,34693,35160,35712,36204,36699,37187,37664
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,939,944,955,955,955,963,963,977,981,989
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1395,1483,1538,1572,1672,1679,1735,1762,1815,1852


## REST API- [Smartable](smartable.ai)

In [6]:
# Create directory for Smartable.ai Dataset if it doesn't already exist
if(not os.path.exists("../data/raw/SMARTABLE")):
    # Create directory
    os.mkdir("../data/raw/SMARTABLE")

# Endpoint URL for Nigeria stats
api_end_url="https://api.smartable.ai/coronavirus/stats/NG"

# Setup Headers for API call
headers={
    "Cache-Control": "no-cache",
    "Subscription-Key": os.getenv("SMAI_PRIMARY_KEY")
}

# Make API call
res= requests.get(api_end_url, headers=headers)
# Load JSON response
res_json= json.loads(res.content)
# Save data to file
with open("../data/raw/SMARTABLE/NG_data.json", "w") as js_fp:
    json.dump(res_json, js_fp, indent=4)


# Index the data with dates as primary key
daily_stats= []
for a_day in res_json["stats"]["history"]:
    daily_stats.append([
        a_day["date"], a_day["confirmed"], 
        a_day["deaths"], a_day["recovered"]
    ])

# Create pandas frame from list
col_names= {
    0: "date", 1: "confirmed", 2: "deaths",
    3: "recovered"
}
# New frame
pd_frame= pd.DataFrame(daily_stats).rename(columns=col_names)


In [7]:
# Show frame
pd_frame.tail()


Unnamed: 0,date,confirmed,deaths,recovered
202,2020-08-11T00:00:00,47290,956,33609
203,2020-08-12T00:00:00,47703,961,33959
204,2020-08-13T00:00:00,48116,966,34309
205,2020-08-14T00:00:00,48445,973,35998
206,2020-08-15T00:00:00,48445,973,35998


In [8]:
# Write frame to .csv
pd_frame.to_csv(
    "../data/processed/SMARTABLE_daily_NG.csv", sep=";", index=False
)

## Web Scraping
- **Data Miner** Chrome extension is a nice tool that might come in handy


### Web Scraping- Robert Koch Institute

In [9]:
# Get Robert Koch Institute page
page= requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html")

In [10]:
# Parse HTML
soup=  BeautifulSoup(page.content, 'html.parser')

In [11]:
# Find table
html_table= soup.find('table')
# Find rows in table
table_rows= html_table.find_all('tr')

In [12]:
# Data List
data_list= []

# Loop through table rows
for idx,row in enumerate(table_rows):
    col_list=[col.get_text(strip=True) for col in row.find_all('td')]
    data_list.append(col_list)



In [13]:
# Display data in Pandas
scraped_data= pd.DataFrame(data_list)
# Remove empty rows
scraped_data= scraped_data.dropna()
# Rename column
scraped_data= scraped_data.rename(columns={0:"state", 1:'number', 2:'diff. to prev. day', 3:'last 7 days', 5:'deaths'})

In [14]:
# Show frame
scraped_data.tail()

Unnamed: 0,state,number,diff. to prev. day,last 7 days,4,deaths
14,Sachsen,5.719,14.0,75.0,18,225.0
15,Sachsen-Anhalt,2.114,15.0,55.0,25,64.0
16,Schles­wig-Holstein,3.768,24.0,187.0,65,158.0
17,Thüringen,3.492,10.0,67.0,31,186.0
18,Gesamt,222.828,1.415,6.914,83,9.231


### Web Scraping- Nigeria Centre for Disease Control (NCDC)


In [15]:
# Pull page on COVID-19
page= requests.get("https://covid19.ncdc.gov.ng/")

In [16]:
# Parse HTML
parsed_page= BeautifulSoup(page.content, 'html.parser')
# Pull Table
html_table= parsed_page.find('table')
# Pull table rows
table_rows= html_table.find_all('tr')

# Table Header
table_header= dict()
# Table data
table_data=[]

# Loop through table rows
for idx,row in enumerate(table_rows):
    # Table headers in first row
    if(idx==0):
        # Pull column headers
        col_headers= row.find_all('th')
        # Make a dictionary of column headers
        table_headers= { idx:col_header.get_text(strip=True) for idx,col_header in enumerate(col_headers) }
    
    # Table data
    # Get row columns
    row_cols= row.find_all('td')
    # Get data body into list
    row_data= [ col.get_text(strip=True) for col in row_cols ]
    # Append col to row list
    table_data.append(row_data)


In [17]:
# Make data into Pandas Frame
pd_table= pd.DataFrame(table_data)
# Remove empty rows
pd_table= pd_table.dropna()
# Insert column names
pd_table= pd_table.rename(columns=table_headers)

In [18]:
# Show frame
pd_table.tail()

Unnamed: 0,States Affected,No. of Cases (Lab Confirmed),No. of Cases (on admission),No. Discharged,No. of Deaths
33,Taraba,78,19,55,4
34,Zamfara,77,1,71,5
35,Cross River,73,23,42,8
36,Yobe,67,2,57,8
37,Kogi,5,0,3,2


In [22]:
# Drop column "No. of Cases (on admission)"
pd_table= pd_table.drop(["No. of Cases (on admission)"], axis=1)

KeyError: "['No. of Cases (on admission)'] not found in axis"

In [23]:
# Rename "No. of Cases (Lab Confirmed)"
pd_table= pd_table.rename(
    columns={"No. of Cases (Lab Confirmed)": "No. of Cases"}
)

In [24]:
pd_table.to_csv(
    "../data/processed/NCDC.csv", sep=";", 
)