In [1]:
import subprocess
from subprocess import Popen, PIPE

import os

import pandas as pd 

import requests #for webscraping
from bs4 import BeautifulSoup

import json #API call

pd.set_option('display.max_rows', 500)

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Business Understanding

We would like to track Corona VIRUS spread across countries and with personal local information

The general information is not so relevant for me
I would like to have a deep dive local development of the spread

## Goals

* We would like to understand the data quality

* Everything should be automated as much as possible:
    how many clicks do we need to execute the full pipeline
    

# Constraints:

* Each notebook should be left clean and ready for full execution

## Data Understanding

* RKI (webscraping): https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* John Hopkins (GITHUB) https://github.com/CSSEGISandData/COVID-19.git
* REST API services : https://npgeo-corona-npgeo-de.hub.arcgis.com/

## GITHUB csv data
git clone/pull  https://github.com/CSSEGISandData/COVID-19.git

In [27]:
git_pull = subprocess.Popen( "git pull" ,
                          cwd = os.path.dirname('C:/Users/Alicia_Pillay/ads_covid-19/data/raw/COVID-19'),
                          shell = True,
                          stdout = subprocess.PIPE,
                          stderr = subprocess.PIPE )
(out, error) = git_pull.communicate()

print("Error :" + str(error))
print("out : " + str(out))

Error :b'fatal: not a git repository (or any of the parent directories): .git\n'
out : b''


In [28]:
data_path = 'C:/Users/Alicia_Pillay/ads_covid-19/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(data_path)

## Webscraping:

In [29]:
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html")

In [30]:
soup = BeautifulSoup(page.content, 'html.parser')

In [31]:
html_table = soup.find('table')

In [7]:
all_rows = html_table.find_all('tr')

In [8]:
final_data_list = []

In [9]:
for pos,rows in enumerate(all_rows):
    col_list=[each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    final_data_list.append(col_list)

In [10]:
pd_daily_status=pd.DataFrame(final_data_list).dropna().rename(columns ={0:'State',
                                                                        1:'Cases',
                                                                        2:'Changes since yesterday',
                                                                        3:'Cases/100k residents',
                                                                        4:'Deaths'})

In [11]:
pd_daily_status.head()

Unnamed: 0,State,Cases,Changes since yesterday,Cases/100k residents,Deaths,5
2,Baden-Württem­berg,44.903,285,1.44,130,1.867
3,Bayern,61.561,403,2.415,185,2.645
4,Berlin,12.216,126,529.0,141,226.0
5,Branden­burg,4.003,19,66.0,26,169.0
6,Bremen,2.117,8,61.0,89,58.0


## REST API calls

In [12]:
data = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

In [13]:
json_object=json.loads(data.content)

In [14]:
type(json_object)

dict

In [15]:
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'serverGens', 'geometryType', 'spatialReference', 'fields', 'features'])

In [16]:
full_list=[]
for pos,each_dict in enumerate (json_object['features'][:]):
    full_list.append(each_dict['attributes'])

In [17]:
pd_full_list=pd.DataFrame(full_list)
pd_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death
0,1,1,Schleswig-Holstein,Land,2896712,15,4218,1599861600000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,145.613371,45737310000.0,2881496.0,161
1,2,2,Hamburg,Freie und Hansestadt,1841179,6,6770,1599861600000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,367.699175,2089396000.0,418800.2,267
2,3,3,Niedersachsen,Land,7982448,9,17783,1599861600000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,222.776271,129983600000.0,4008988.0,667
3,4,4,Bremen,Freie Hansestadt,682986,5,2117,1599861600000,4,4132268b-54de-4327-ac1e-760e915112f1,309.96243,1119157000.0,335717.7,58
4,5,5,Nordrhein-Westfalen,Land,17932651,10,61820,1599861600000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,344.734306,87829360000.0,2648673.0,1828
