# Joining global COVID-19 statistics with survey data

Data source: https://github.com/owid/covid-19-data/tree/master/public/data/  
For detailed information about the data, see the README.md file.  
Data downloaded on 24/11/2020 at 09:37 (dataset last updated on 2020-11-23T14:22:31 at the time of download).

In [1]:
import sys
sys.path.insert(1, '/Users/jakoliendenhollander/capstone/capstone')

import pandas as pd
import warnings
import datetime
from datetime import timedelta

import tidy_functions.load_data

warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None) # To display all columns

### Read in data

In [2]:
# Load the OWID COVID-19 data set from the local copy of the CSV.
# NOTE(review): absolute path is specific to the author's machine.
covid_cases_csv = "/Users/jakoliendenhollander/capstone/capstone/data/Corona_stats/owid-covid-data.csv"
covid_cases = pd.read_csv(covid_cases_csv)

In [3]:
# Load the country-level, smoothed survey CSV files. Per the original note,
# the helper returns a dictionary of dataframes.
survey_dir = "/Users/jakoliendenhollander/capstone/capstone/data/CMU_Global_data/Full_Survey_Data/country/smooth/"
dfs_country = tidy_functions.load_data.load_survey_data(survey_dir, "country")

# Stack the individual dataframes into a single survey dataframe with a
# fresh 0..n-1 index.
survey_data = pd.concat(dfs_country, ignore_index=True)

Read in survey data completed.


In [4]:
# Peek at the most recent rows for one country as a quick sanity check.
is_germany = covid_cases["location"] == "Germany"
covid_cases.loc[is_germany].tail()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,tests_per_case,positive_rate,tests_units,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
13999,DEU,Europe,Germany,2020-11-19,855916.0,22609.0,18337.571,13370.0,251.0,198.286,10215.752,269.849,218.867,159.577,2.996,2.367,,,,,,,,,,,,,,,,,,,,83783945.0,237.016,46.6,21.453,15.957,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
14000,DEU,Europe,Germany,2020-11-20,879564.0,23648.0,18352.714,13630.0,260.0,204.286,10498.002,282.25,219.048,162.68,3.103,2.438,,,,,,,,,,,,,,,,,,,,83783945.0,237.016,46.6,21.453,15.957,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
14001,DEU,Europe,Germany,2020-11-21,902528.0,22964.0,18424.571,13884.0,254.0,215.143,10772.088,274.086,219.906,165.712,3.032,2.568,,,,,,,,,,,,,,,,,,,,83783945.0,237.016,46.6,21.453,15.957,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
14002,DEU,Europe,Germany,2020-11-22,918269.0,15741.0,18252.286,14022.0,138.0,219.571,10959.964,187.876,217.849,167.359,1.647,2.621,,,,,,,,,,,,,,,,,,,,83783945.0,237.016,46.6,21.453,15.957,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936
14003,DEU,Europe,Germany,2020-11-23,929133.0,10864.0,18258.0,14112.0,90.0,223.571,11089.631,129.667,217.918,168.433,1.074,2.668,,,,,,,,,,,,,,,,,,,,83783945.0,237.016,46.6,21.453,15.957,45229.245,,156.139,8.31,28.2,33.1,,8.0,81.33,0.936


## Selecting data

In [5]:
# Keep only the identifying columns plus the per-million case/death figures
# and the two age-related country attributes.
selected_columns = [
    "iso_code",
    "location",
    "date",
    "total_cases_per_million",
    "new_cases_smoothed_per_million",
    "total_deaths_per_million",
    "new_deaths_smoothed_per_million",
    "median_age",
    "aged_65_older",
]
df_cases = covid_cases[selected_columns]

### Countries

In [6]:
# Rename 'GID_0' to 'iso_code' in the survey data so it shares a key column
# with the cases data.
# rename() returns a new frame and silently skips a missing 'GID_0', so this
# cell can be re-run safely; the previous copy-then-drop(inplace=True) raised
# a KeyError on the second run because 'GID_0' was already gone.
# Note: 'iso_code' now keeps the position 'GID_0' had instead of being
# appended as the last column.
survey_data = survey_data.rename(columns={'GID_0': 'iso_code'})

# Compare the country codes of both data sets and report the codes that are
# present in only one of them.
survey_codes = set(survey_data["iso_code"])
cases_codes = set(df_cases["iso_code"])

unique_countries = survey_codes ^ cases_codes            # in exactly one data set
unique_countries_survey = survey_codes & unique_countries  # survey only
unique_countries_cases = cases_codes & unique_countries    # cases only

print('The following countries occur only in the survey data:')
print(unique_countries_survey)
print('The following countries occur only in the cases data:')
print(unique_countries_cases)

The following countries occur only in the survey data:
{'ALA', 'ASM', 'MAC'}
The following countries occur only in the cases data:
{nan, 'IMN', 'GRL', 'SWZ', 'VUT', 'BRB', 'COG', 'GRD', 'WLF', 'GNB', 'GUM', 'MHL', 'BDI', 'TJK', 'SOM', 'GNQ', 'GAB', 'SLE', 'CUB', 'BES', 'MCO', 'OWID_KOS', 'STP', 'LSO', 'PNG', 'OWID_WRL', 'TLS', 'ABW', 'LIE', 'TTO', 'KNA', 'BTN', 'SUR', 'VIR', 'FRO', 'MSR', 'BMU', 'SYC', 'TGO', 'LBR', 'GMB', 'LVA', 'BRN', 'GUY', 'MNP', 'AIA', 'MKD', 'GEO', 'PYF', 'MDV', 'MNE', 'MWI', 'NER', 'GIB', 'MUS', 'FLK', 'SXM', 'JEY', 'CYM', 'TCA', 'ERI', 'TCD', 'COM', 'IRN', 'CHN', 'FJI', 'ISL', 'ZMB', 'CUW', 'MLT', 'NCL', 'EST', 'GGY', 'LCA', 'JAM', 'SMR', 'LTU', 'CPV', 'SLB', 'VGB', 'MNG', 'SYR', 'RWA', 'VAT', 'NAM', 'USA', 'VCT', 'DJI'}


In [7]:
# Keep only countries that occur in both data sets: drop the survey-only
# countries from the survey data and the cases-only countries from the
# cases data.
survey_only_rows = survey_data['iso_code'].isin(unique_countries_survey)
cases_only_rows = df_cases['iso_code'].isin(unique_countries_cases)
df_survey = survey_data.loc[~survey_only_rows]
df_covid_cases = df_cases.loc[~cases_only_rows]

# Sanity check: the two sets of country codes should now be identical,
# i.e. the printed difference should be empty.
remaining_mismatch = set(df_survey["iso_code"]) ^ set(df_covid_cases["iso_code"])
print('Difference:', remaining_mismatch)

Difference: set()


### Dates

In [55]:
# Work on an independent (deep) copy so the columns added below are not
# written into a filtered selection of the original cases frame.
df_covid19 = df_covid_cases.copy(deep=True)

In [56]:
# Parse the 'date' column into datetimes; the original column is kept
# alongside the parsed one.
df_covid19['date_obj'] = pd.to_datetime(df_covid19['date'])

In [57]:
# Keep only rows dated strictly after 20 April 2020.
# NOTE(review): the reason for this cut-off is not documented here — confirm.
first_excluded_day = pd.to_datetime("2020-04-20")
df_covid19 = df_covid19.loc[df_covid19['date_obj'] > first_excluded_day]

In [58]:
# Manually overwrite the cumulative cases per million for Venezuela on
# 2020-06-18.
# NOTE(review): the source of the value 110.775 is not documented; presumably
# it corrects a faulty or missing entry in the downloaded data — confirm.
is_venezuela = df_covid19['location'] == 'Venezuela'
is_june_18 = df_covid19['date'] == '2020-06-18'
df_covid19.loc[is_venezuela & is_june_18, "total_cases_per_million"] = 110.775

In [64]:
# Add the cumulative cases per million from two weeks ahead to the data frame.
#
# For every row, look up 'total_cases_per_million' of the same country
# (iso_code) exactly 14 days after the row's date. Rows for which that date is
# not available for the country get a real missing value (NaN), so the new
# column is numeric rather than a mix of floats and the string 'NaN'.
#
# This is a single keyed lookup instead of a row-by-row loop that re-filtered
# the whole frame (and rebuilt the result list) on every iteration.
# iso_code is assumed to be non-null here (rows without a code were removed
# together with the countries that only occur in the cases data).

# Series of cases per million keyed by (country, day). Duplicated keys would
# make the lookup ambiguous, so only the first row per key is used.
cases_by_country_day = (
    df_covid19
    .drop_duplicates(subset=['iso_code', 'date_obj'])
    .set_index(['iso_code', 'date_obj'])['total_cases_per_million']
)

# One lookup key per row: same country, date shifted 14 days forward.
keys_in_14days = pd.MultiIndex.from_arrays(
    [df_covid19['iso_code'], df_covid19['date_obj'] + pd.Timedelta(days=14)]
)

# reindex() returns NaN for keys that do not exist; to_numpy() drops the
# lookup index so the values are assigned positionally, row by row.
df_covid19['next_14days'] = cases_by_country_day.reindex(keys_in_14days).to_numpy()

In [65]:
# Add the cumulative cases per million from one week earlier to the data frame.
#
# For every row, look up 'total_cases_per_million' of the same country
# (iso_code) exactly 7 days before the row's date. Rows for which that date is
# not available for the country (including each country's first week in the
# frame) get a real missing value (NaN), so the new column is numeric rather
# than a mix of floats and the string 'NaN'.
#
# A separate "at least 7 days after the country's first date" check is not
# needed: if the date 7 days earlier exists for the country, that condition
# holds automatically. The keyed lookup also replaces a row-by-row loop that
# re-filtered the whole frame on every iteration.
# iso_code is assumed to be non-null here (rows without a code were removed
# together with the countries that only occur in the cases data).

# Series of cases per million keyed by (country, day). Duplicated keys would
# make the lookup ambiguous, so only the first row per key is used.
cases_by_country_day = (
    df_covid19
    .drop_duplicates(subset=['iso_code', 'date_obj'])
    .set_index(['iso_code', 'date_obj'])['total_cases_per_million']
)

# One lookup key per row: same country, date shifted 7 days back.
keys_last_week = pd.MultiIndex.from_arrays(
    [df_covid19['iso_code'], df_covid19['date_obj'] - pd.Timedelta(days=7)]
)

# reindex() returns NaN for keys that do not exist; to_numpy() drops the
# lookup index so the values are assigned positionally, row by row.
df_covid19['previous_7days'] = cases_by_country_day.reindex(keys_last_week).to_numpy()

In [66]:
# Keep only the covid rows whose date also occurs in the survey data; this
# removes the days before and after the survey period.
on_survey_date = df_covid19['date'].isin(df_survey['date'])
df_covid = df_covid19.loc[on_survey_date]

In [67]:
# Write the prepared covid statistics to a CSV file, without the index column.
# NOTE(review): absolute path is specific to the author's machine.
output_csv = r'/Users/jakoliendenhollander/capstone/capstone/data/corona_stats_14days.csv'
df_covid.to_csv(output_csv, index=False)