In [9]:
# Imports
import os, subprocess, json

# Environmental Variables
from dotenv import load_dotenv

import pandas as pd

# HTTP Client
import requests
# For parsing and sifting through HTML
from bs4 import BeautifulSoup



In [3]:
# Read the variables declared in .env into the process environment
load_dotenv()

# Raise the pandas row limit so frames of up to 500 rows are displayed in full
pd.options.display.max_rows = 500

1745e47cb28d4822b0e33cfa2c1d3343


## CRISP-DM
![CRISP-DM](../reports/figures/CRISP_DM.png)

# Data Understanding

## Data Sources:
**Worldwide:**
- [Johns Hopkins](https://github.com/CSSEGISandData/COVID-19.git) (GitHub)
- [REST API services to retrieve Data](https://npgeo-corona-npgeo-de.hub.arcgis.com/)

**Nigeria:**
- [Nigeria Centre for Disease Control (NCDC)](https://covid19.ncdc.gov.ng/)

**Germany:**
- [Robert Koch Institute](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html) (webscraping)

## GitHub - Johns Hopkins

In [12]:
# Create the directory for the Johns Hopkins dataset if it doesn't already exist.
# makedirs(exist_ok=True) also creates missing parent directories and needs no
# separate existence check.
os.makedirs("../data/raw/JH_dataset", exist_ok=True)

# First run: the repository is not on disk yet, so clone it
if not os.path.exists("../data/raw/JH_dataset/COVID-19"):
    cmd = "git clone https://github.com/CSSEGISandData/COVID-19.git"
    cmd_wd = "../data/raw/JH_dataset"

# Otherwise the repository has already been cloned, so perform a pull operation
else:
    cmd = "git pull"
    cmd_wd = "../data/raw/JH_dataset/COVID-19"


# Git process.
# The command is passed as an argument list without an intermediate shell, so
# that kill() below acts on git itself. Neither command contains quoted
# arguments, so a plain whitespace split is sufficient.
# stderr is merged into stdout so progress and error text share one stream.
git_proc = subprocess.Popen(
    cmd.split(),
    cwd=cmd_wd,
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)

# Give a maximum of 10 mins because cloning might take a while
proc_timeout = 600
try:
    git_proc_out = git_proc.communicate(timeout=proc_timeout)[0]
except subprocess.TimeoutExpired:
    # communicate() reports a timeout with subprocess.TimeoutExpired (not the
    # built-in TimeoutError) and leaves the child running: stop it, then
    # collect whatever output it produced so far.
    git_proc.kill()
    git_proc_out = git_proc.communicate()[0]

# Decode the captured bytes so the log is readable instead of a b"..." repr
print(git_proc_out.decode(errors="replace"))

# A non-zero exit status covers both a git error and the kill after a timeout
if git_proc.returncode != 0:
    print("Git operation on John Hopkins Dataset from GITHUB failed...\n")



b"Cloning into 'COVID-19'...\nUpdating files:  94% (470/496)\rUpdating files:  95% (472/496)\rUpdating files:  96% (477/496)\rUpdating files:  97% (482/496)\rUpdating files:  98% (487/496)\rUpdating files:  99% (492/496)\rUpdating files: 100% (496/496)\rUpdating files: 100% (496/496), done.\n"


In [13]:
# Path to the global confirmed-cases time-series CSV of the Johns Hopkins dataset
data_path = "../data/raw/JH_dataset/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
# Read the raw CSV into a DataFrame
pd_raw = pd.read_csv(filepath_or_buffer=data_path)

## REST API - [Smartable](https://smartable.ai)

In [18]:
# Create directory for the Smartable.ai dataset if it doesn't already exist
# (exist_ok removes the need for a separate check; missing parents are created too)
os.makedirs("../data/raw/SMARTABLE", exist_ok=True)

# Endpoint URL for Nigeria stats
api_end_url = "https://api.smartable.ai/coronavirus/stats/NG"

# Set up headers for the API call; the subscription key is read from the
# environment rather than being written into the notebook
headers = {
    "Cache-Control": "no-cache",
    "Subscription-Key": os.getenv("SMAI_PRIMARY_KEY")
}

# Make the API call. requests applies no timeout unless one is passed, so set
# one to keep the cell from hanging on a stalled connection.
res = requests.get(api_end_url, headers=headers, timeout=30)
# Stop here on an HTTP error status (4xx/5xx) instead of failing later with a
# KeyError while reading an error payload
res.raise_for_status()
# Load JSON response
res_json = res.json()
# Save the raw response to file
with open("../data/raw/SMARTABLE/NG_data.json", "w") as js_fp:
    json.dump(res_json, js_fp, indent=4)


# Flatten the history into one row per entry: date, confirmed, deaths, recovered
daily_stats = [
    [a_day["date"], a_day["confirmed"], a_day["deaths"], a_day["recovered"]]
    for a_day in res_json["stats"]["history"]
]

# Positional column index -> column name
col_names = {
    0: "date", 1: "confirmed", 2: "deaths",
    3: "recovered"
}
# New frame
pd_frame = pd.DataFrame(daily_stats).rename(columns=col_names)
# Write frame to .csv
pd_frame.to_csv(
    "../data/raw/SMARTABLE/daily_NG.csv", sep=";", index=False
)
# Show the first rows; only the last expression of a cell is rendered
pd_frame.head()



## Web Scraping
- **Data Miner** for Chrome is a nice tool that might come in handy


### Web Scraping- Robert Koch Institute

In [5]:
# Get Robert Koch Institute page. requests applies no timeout unless one is
# passed, so set one to keep the cell from hanging on a stalled connection.
page = requests.get(
    "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
    timeout=30,
)
# Stop on an HTTP error status instead of parsing an error page as if it were data
page.raise_for_status()

In [6]:
# Build a parse tree from the response body with Python's built-in HTML parser
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [12]:
# Find the first table on the page
html_table = soup.find('table')
# find() returns None when nothing matches; report that explicitly instead of
# failing on the next line with an AttributeError on None
if html_table is None:
    raise ValueError("No <table> element found on the Robert Koch Institute page")
# Find rows in table
table_rows = html_table.find_all('tr')

In [24]:
# Data list: one inner list per table row, holding the stripped text of each
# <td> cell. A row without <td> cells yields an empty list.
data_list = [
    [col.get_text(strip=True) for col in row.find_all('td')]
    for row in table_rows
]



In [29]:
# Turn the scraped rows into a frame, drop every row that has a missing cell,
# and label the columns (column 4 keeps its positional name)
rki_column_labels = {0:"state", 1:'number', 2:'diff. to prev. day', 3:'last 7 days', 5:'deaths'}
scraped_data = (
    pd.DataFrame(data_list)
    .dropna()
    .rename(columns=rki_column_labels)
)
scraped_data

Unnamed: 0,state,number,diff. to prev. day,last 7 days,4,deaths
2,Baden-Württem­berg,35.99,48,185.0,17,1.837
3,Bayern,49.086,63,453.0,35,2.61
4,Berlin,8.586,49,205.0,55,216.0
5,Branden­burg,3.452,-31,22.0,9,168.0
6,Bremen,1.688,4,13.0,19,54.0
7,Hamburg,5.231,1,15.0,8,261.0
8,Hessen,11.104,42,202.0,32,514.0
9,Meck­lenburg-Vor­pommern,803.0,-2,0.0,0,20.0
10,Nieder­sachsen,13.791,37,121.0,15,641.0
11,Nord­rhein-West­falen,44.645,153,888.0,50,1.701


### Web Scraping- Nigeria Centre for Disease Control (NCDC)


In [30]:
# Pull page on COVID-19. requests applies no timeout unless one is passed, so
# set one to keep the cell from hanging on a stalled connection.
page = requests.get("https://covid19.ncdc.gov.ng/", timeout=30)
# Stop on an HTTP error status instead of parsing an error page as if it were data
page.raise_for_status()

In [52]:
# Parse HTML
parsed_page = BeautifulSoup(page.content, 'html.parser')
# Pull the first table on the page
html_table = parsed_page.find('table')
# find() returns None when nothing matches; report that explicitly instead of
# failing on the next line with an AttributeError on None
if html_table is None:
    raise ValueError("No <table> element found on the NCDC page")
# Pull table rows
table_rows = html_table.find_all('tr')

# Column position -> header text. Bound before the loop under the name the
# loop assigns, so it exists (empty) even when the table has no rows.
table_headers = dict()
# Table data: one list of cell texts per row
table_data = []

# Loop through table rows
for idx, row in enumerate(table_rows):
    # Column headers are taken from the <th> cells of the first row
    if idx == 0:
        table_headers = {
            col_idx: col_header.get_text(strip=True)
            for col_idx, col_header in enumerate(row.find_all('th'))
        }

    # Stripped text of every <td> cell in this row; a row without <td> cells
    # contributes an empty list
    table_data.append([col.get_text(strip=True) for col in row.find_all('td')])


In [55]:
# Build the frame from the scraped rows, drop every row that has a missing
# cell, then apply the scraped header texts as column names
pd_table = (
    pd.DataFrame(table_data)
    .dropna()
    .rename(columns=table_headers)
)
# Show Frame
pd_table

Unnamed: 0,States Affected,No. of Cases (Lab Confirmed),No. of Cases (on admission),No. Discharged,No. of Deaths
1,Lagos,11827,9993,1701,133
2,FCT,2365,1621,709,35
3,Oyo,1604,751,834,19
4,Edo,1562,580,925,57
5,Delta,1337,852,454,31
6,Kano,1302,220,1030,52
7,Rivers,1294,407,843,44
8,Ogun,1063,346,695,22
9,Kaduna,907,294,601,12
10,Katsina,646,182,441,23
