# Covid-19 data download & processing
---
This Python Script downloads an up-to-date dataset for **Covid-19** and exports the data to the data folder.

The Data is from the **R**obert **K**och **I**nstitut downloaded over [ArcGis Hub](https://hub.arcgis.com/datasets/dd4580c810204019a7b8eb3e0b329dd6?page=15976).

*Script was created on Python: 3.7.6 64-bit Kernel*

In [1]:
import pandas as pd
import math

import io               # file operations
import json

import ssl              # secure client-server connection
import requests         # html-requests

In [2]:
# Uncomment next 2 lines to install jsonmerge
#import sys
#!{sys.executable} -m pip install jsonmerge
from jsonmerge import Merger

In [3]:
sourceURL = 'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_COVID19/FeatureServer/0/query?'
objectIdsQuery = 'where=1%3D1&returnIdsOnly=true&f=json'
dataSetQuery = 'where=ObjectId+BETWEEN+0+AND+0' # just an example gets created later dynamically
dataQuery = '&outSR=4326&outFields=IdBundesland,Bundesland,Landkreis,Altersgruppe,AnzahlFall,AnzahlTodesfall,ObjectId,Meldedatum,IdLandkreis,Datenstand,NeuerFall,NeuerTodesfall,Refdatum,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn&f=json'

## Requesting which Features (ObjectID´s) are available

In [4]:
objectIdsRequest = requests.get(sourceURL + objectIdsQuery)
objectIdsRequest.status_code

200

In [5]:
objectIds = json.loads(objectIdsRequest.text)

numOfObjectIds = len(objectIds['objectIds'])

objectIdStart = objectIds['objectIds'][0]
objectIdEnd = objectIds['objectIds'][numOfObjectIds - 1]
print(f'Range of ObjectIds: [{objectIdStart}, {objectIdEnd}]')

Range of ObjectIds: [28886606, 29068508]


## Requesting Features

In [6]:
dataRequest = requests.get(sourceURL + 'where=1%3D1' + dataQuery)
dataRequest.status_code

200

In [7]:
data = json.loads(dataRequest.text)
maxApiRequest = len(data['features'])

neededRequests = math.ceil(numOfObjectIds / maxApiRequest)

print(f'The download will require {neededRequests - 1} more requests due to the server limit of {maxApiRequest} features/request.')

The download will require 36 more requests due to the server limit of 5000 features/request.


In [8]:
# Json-Merger with custom rule
jsonMergeSchema = {"properties":{"features":{"mergeStrategy":"append"}}}
dataMerger = Merger(jsonMergeSchema)

In [9]:
i = 0
rangeLowerEnd = data['features'][maxApiRequest - 1]['attributes']['ObjectId'] + 1
rangeUpperEnd = rangeLowerEnd + maxApiRequest

while (i < neededRequests - 1): # neededRequests - 1 because of initial download
    dataSetQuery = f'where=ObjectId+BETWEEN+{rangeLowerEnd}+AND+{rangeUpperEnd}'
    temp_sourceURL = sourceURL + dataSetQuery + dataQuery
    print(i, f'Pulling ObjectIds: [{rangeLowerEnd}, {rangeUpperEnd}]')

    temp_dataRequest = requests.get(temp_sourceURL)
    if (temp_dataRequest.status_code > 200): # stop when a request isn´t working
        print(f'Error in request: {temp_dataRequest.status_code}')
        break
    temp_data = json.loads(temp_dataRequest.text)

    # append new data to already downloaded one
    data = dataMerger.merge(data, temp_data)

    temp_dataLength = len(data['features'])
    t_le = data['features'][0]['attributes']['ObjectId']
    t_ue = data['features'][temp_dataLength - 1]['attributes']['ObjectId']
    print(f'Total collected features: {temp_dataLength} From ObjectIds: [{t_le}, {t_ue}]')

    rangeLowerEnd = rangeUpperEnd + 1
    rangeUpperEnd += maxApiRequest + 1
    if (rangeUpperEnd > objectIdEnd):
        rangeUpperEnd = objectIdEnd
    i += 1

print('Done')

0 Pulling ObjectIds: [28891606, 28896606]
Total collected features: 10000 From ObjectIds: [28886606, 28896605]
1 Pulling ObjectIds: [28896607, 28901607]
Total collected features: 15000 From ObjectIds: [28886606, 28901606]
2 Pulling ObjectIds: [28901608, 28906608]
Total collected features: 20000 From ObjectIds: [28886606, 28906607]
3 Pulling ObjectIds: [28906609, 28911609]
Total collected features: 25000 From ObjectIds: [28886606, 28911608]
4 Pulling ObjectIds: [28911610, 28916610]
Total collected features: 30000 From ObjectIds: [28886606, 28916609]
5 Pulling ObjectIds: [28916611, 28921611]
Total collected features: 35000 From ObjectIds: [28886606, 28921610]
6 Pulling ObjectIds: [28921612, 28926612]
Total collected features: 40000 From ObjectIds: [28886606, 28926611]
7 Pulling ObjectIds: [28926613, 28931613]
Total collected features: 45000 From ObjectIds: [28886606, 28931612]
8 Pulling ObjectIds: [28931614, 28936614]
Total collected features: 50000 From ObjectIds: [28886606, 28936613]
9

In [10]:
print('Entries: ', len(data['features']))
print('Structure: ', data['features'][0])
print('Latest data: ', data['features'][0]['attributes']['Datenstand'])

Entries:  181868
Structure:  {'attributes': {'IdBundesland': 1, 'Bundesland': 'Schleswig-Holstein', 'Landkreis': 'SK Flensburg', 'Altersgruppe': 'A05-A14', 'AnzahlFall': 1, 'AnzahlTodesfall': 0, 'ObjectId': 28886606, 'Meldedatum': 1597449600000, 'IdLandkreis': '01001', 'Datenstand': '18.08.2020, 00:00 Uhr', 'NeuerFall': 0, 'NeuerTodesfall': -9, 'Refdatum': 1597449600000, 'NeuGenesen': -9, 'AnzahlGenesen': 0, 'IstErkrankungsbeginn': 0}}
Latest data:  18.08.2020, 00:00 Uhr


In [11]:
def display_n(df,n): 
    with pd.option_context('display.max_rows',n*2):
        display(df)

In [12]:
dfx = pd.DataFrame.from_dict(data['features'])
display_n(dfx, 2)

Unnamed: 0,attributes
0,"{'IdBundesland': 1, 'Bundesland': 'Schleswig-H..."
1,"{'IdBundesland': 1, 'Bundesland': 'Schleswig-H..."
...,...
181866,"{'IdBundesland': 16, 'Bundesland': 'Thüringen'..."
181867,"{'IdBundesland': 16, 'Bundesland': 'Thüringen'..."


In [13]:
# turning the collumn attributes in seperated collumns
for rowid in data['fields']:
    dfx[rowid['name']] = dfx.apply(lambda row: row.loc['attributes'][rowid['name']], axis=1)
dfx = dfx.drop(['attributes'], axis=1)
display_n(dfx, 2)

Unnamed: 0,IdBundesland,Bundesland,Landkreis,Altersgruppe,AnzahlFall,AnzahlTodesfall,ObjectId,Meldedatum,IdLandkreis,Datenstand,NeuerFall,NeuerTodesfall,Refdatum,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn
0,1,Schleswig-Holstein,SK Flensburg,A05-A14,1,0,28886606,1597449600000,01001,"18.08.2020, 00:00 Uhr",0,-9,1597449600000,-9,0,0
1,1,Schleswig-Holstein,SK Flensburg,A15-A34,1,0,28886607,1584144000000,01001,"18.08.2020, 00:00 Uhr",0,-9,1584316800000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181866,16,Thüringen,LK Altenburger Land,A80+,2,0,29068507,1590624000000,16077,"18.08.2020, 00:00 Uhr",0,-9,1590624000000,0,2,0
181867,16,Thüringen,LK Altenburger Land,A80+,1,0,29068508,1591660800000,16077,"18.08.2020, 00:00 Uhr",0,-9,1591660800000,0,1,0


In [14]:
data['features'][0]['attributes']['Landkreis']

'SK Flensburg'

In [21]:
frameByLK = dfx.groupby(['Landkreis'])['AnzahlFall', 'AnzahlTodesfall', 'AnzahlGenesen'].sum().reset_index().set_index('Landkreis')

In [22]:
frameByLK

Unnamed: 0_level_0,AnzahlFall,AnzahlTodesfall,AnzahlGenesen
Landkreis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LK Ahrweiler,256,1,230
LK Aichach-Friedberg,409,20,385
LK Alb-Donau-Kreis,715,27,656
LK Altenburger Land,79,4,75
LK Altenkirchen,197,11,177
...,...,...,...
SK Worms,241,8,222
SK Wuppertal,1291,86,1051
SK Würzburg,504,52,442
SK Zweibrücken,45,1,43


In [24]:
frameByLK.to_csv('frameByLK.csv', index=True, encoding='utf-8')