In [15]:
# Imports
import os, subprocess, json

# Environmental Variables
from dotenv import load_dotenv

import pandas as pd

# HTTP Client
import requests
# For parsing and sifting through HTML
from bs4 import BeautifulSoup



In [16]:
# Load environmental variables specified in .env
load_dotenv()

# Set max rows for Pandas
pd.set_option('display.max_rows', 500)

## CRISP-DM
![CRISP-DM](../reports/figures/CRISP_DM.png)

# Data Understanding

## Data Sources:
**Wordlwide:**
- [John Hopkings](https://github.com/CSSEGISandData/COVID-19.git) (GITHUB)
- [REST API services to retrieve Data](https://npgeo-corona-npgeo-de.hub.arcgis.com/)

**Nigeria:**
- [Nigeria Center for Disease Contrl (NCDC)](https://covid19.ncdc.gov.ng/)

**Germany:**
- [Robert Koch Institute](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html) (webscraping)

## GITHUB- John Hopkings

In [17]:
# Create directory for John Hopkins Dataset if it doesn't already exist
if(not os.path.exists("../data/raw/JH_dataset")):
    # Create directory
    os.mkdir("../data/raw/JH_dataset")

# Check if Dataset doesn't already exist in filesystem
if(not os.path.exists("../data/raw/JH_dataset/COVID-19")):
    # Command to clone dataset
    cmd=  "git clone https://github.com/CSSEGISandData/COVID-19.git"
    cmd_wd= "../data/raw/JH_dataset"

# Otherwise if Dataset repo has already been cloned, peform pull operation
else:
    cmd= "git pull"
    cmd_wd= "../data/raw/JH_dataset/COVID-19"


# Git process
git_proc= subprocess.Popen(
    cmd,
    cwd=cmd_wd, shell=True, 
    stdout= subprocess.PIPE, stderr= subprocess.STDOUT
)

# Communicate with child process and set timeout
# Timeout
# Give a maximum of 10 mins because cloning might take a while
proc_timeout= 600
try:
    git_proc_out= git_proc.communicate(timeout=proc_timeout)[0]
    print(str(git_proc_out))
except TimeoutError:
    print("Git operation on John Hopkins Dataset from GITHUB failed...\n")



b'From https://github.com/CSSEGISandData/COVID-19\n   061a439d..c45ac5ba  master     -> origin/master\n   f910d50b..6af92833  web-data   -> origin/web-data\nUpdating 061a439d..c45ac5ba\nFast-forward\n README.md                                          |  10 +-\n csse_covid_19_data/README.md                       |  14 +-\n csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv   |   1 +\n .../csse_covid_19_time_series/Errata.csv           |   2 +\n .../time_series_covid19_confirmed_global.csv       | 534 ++++++++++-----------\n .../time_series_covid19_deaths_global.csv          | 534 ++++++++++-----------\n 6 files changed, 550 insertions(+), 545 deletions(-)\n'


In [18]:
# Data path
data_path= "../data/raw/JH_dataset/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
# Raw data
pd_raw= pd.read_csv(data_path)

## REST API- [Smartable](smartable.ai)

In [19]:
# Create directory for Smartable.ai Dataset if it doesn't already exist
if(not os.path.exists("../data/raw/SMARTABLE")):
    # Create directory
    os.mkdir("../data/raw/SMARTABLE")

# Endpoint URL for Nigeria stats
api_end_url="https://api.smartable.ai/coronavirus/stats/NG"

# Setup Headers for API call
headers={
    "Cache-Control": "no-cache",
    "Subscription-Key": os.getenv("SMAI_PRIMARY_KEY")
}

# Make API call
res= requests.get(api_end_url, headers=headers)
# Load JSON response
res_json= json.loads(res.content)
# Save data to file
with open("../data/raw/SMARTABLE/NG_data.json", "w") as js_fp:
    json.dump(res_json, js_fp, indent=4)


# Index the data with dates as primary key
daily_stats= []
for a_day in res_json["stats"]["history"]:
    daily_stats.append([
        a_day["date"], a_day["confirmed"], 
        a_day["deaths"], a_day["recovered"]
    ])

# Create pandas frame from data
col_names= {
    0: "date", 1: "confirmed", 2: "deaths",
    3: "recovered"
}
# New frame
pd_frame= pd.DataFrame(daily_stats).rename(columns=col_names)
# Show frame
pd_frame
# Write frame to .csv
pd_frame.to_csv(
    "../data/processed/SMARTABLE_daily_NG.csv", sep=";", index=False
)



## Web Scraping
- **Data Miner** for Chrome is a nice tool that might come in handy


### Web Scraping- Robert Koch Institute

In [20]:
# Get Robert Koch Institute page
page= requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html")

In [21]:
# Parse HTML
soup=  BeautifulSoup(page.content, 'html.parser')

In [22]:
# Find table
html_table= soup.find('table')
# Find rows in table
table_rows= html_table.find_all('tr')

In [23]:
# Data List
data_list= []

# Loop through table rows
for idx,row in enumerate(table_rows):
    # print(idx)
    # print(row)
    # print("============")
    # Pull out columns from rows
    # for col in row.find_all('td'):
    #     # print(col)
    #     # Get text
    #     print(col.get_text(strip=True))
    col_list=[col.get_text(strip=True) for col in row.find_all('td')]
    data_list.append(col_list)



In [24]:
# Display data in Pandas
scraped_data= pd.DataFrame(data_list)
# Remove empty rows
scraped_data= scraped_data.dropna()
# Rename column
scraped_data= scraped_data.rename(columns={0:"state", 1:'number', 2:'diff. to prev. day', 3:'last 7 days', 5:'deaths'})
scraped_data

Unnamed: 0,state,number,diff. to prev. day,last 7 days,4,deaths
2,Baden-Württem­berg,36.264,37,228.0,21,1.839
3,Bayern,49.631,93,481.0,37,2.616
4,Berlin,8.725,26,123.0,33,221.0
5,Branden­burg,3.47,4,19.0,8,167.0
6,Bremen,1.701,4,13.0,19,55.0
7,Hamburg,5.232,-1,6.0,3,260.0
8,Hessen,11.324,41,217.0,35,514.0
9,Meck­lenburg-Vor­pommern,805.0,0,1.0,1,20.0
10,Nieder­sachsen,13.905,18,92.0,12,644.0
11,Nord­rhein-West­falen,45.71,322,1.026,57,1.711


### Web Scraping- Nigeria Centre for Disease Control (NCDC)


In [25]:
# Pull page on COVID-19
page= requests.get("https://covid19.ncdc.gov.ng/")

In [26]:
# Parse HTML
parsed_page= BeautifulSoup(page.content, 'html.parser')
# Pull Table
html_table= parsed_page.find('table')
# Pull table rows
table_rows= html_table.find_all('tr')

# Table Header
table_header= dict()
# Table data
table_data=[]

# Loop through table rows
for idx,row in enumerate(table_rows):
    # Table headers in first row
    if(idx==0):
        # Pull column headers
        col_headers= row.find_all('th')
        col_headers
        # Make a dictionary of column headers
        table_headers= { idx:col_header.get_text(strip=True) for idx,col_header in enumerate(col_headers) }
    
    # Table data
    # Get row columns
    row_cols= row.find_all('td')
    # Get data body into list
    row_data= [ col.get_text(strip=True) for col in row_cols ]
    # Append col to row list
    table_data.append(row_data)


In [27]:
# Make data into Pandas Frame
pd_table= pd.DataFrame(table_data)
# Remove empty rows
pd_table= pd_table.dropna()
# Insert column names
pd_table= pd_table.rename(columns=table_headers)
# Show Frame
pd_table

Unnamed: 0,States Affected,No. of Cases (Lab Confirmed),No. of Cases (on admission),No. Discharged,No. of Deaths
1,Lagos,13097,10973,1948,176
2,FCT,2761,1882,840,39
3,Oyo,1989,959,1011,19
4,Edo,1874,662,1150,62
5,Rivers,1480,469,964,47
6,Delta,1404,784,585,35
7,Kano,1331,243,1035,53
8,Ogun,1146,347,777,22
9,Kaduna,1087,331,744,12
10,Ondo,865,705,139,21
