# Joining global COVID-19 statistics with survey data

Data source: https://github.com/owid/covid-19-data/tree/master/public/data/  
For detailed information about the data, see the README.md file in that repository.  
Data downloaded on 2020-11-24 at 09:37 (the dataset's last update at download time was 2020-11-23T14:22:31).

In [None]:
import pandas as pd
import warnings
import functions.functions_data

# Install a blanket "ignore" filter so warnings do not clutter the cell output.
warnings.simplefilter("ignore")
# Lift the column display limit so wide dataframes are shown with all columns.
pd.options.display.max_columns = None

### Read in data

In [None]:
# Load the COVID-19 statistics from the local CSV file.
covid_cases = pd.read_csv(filepath_or_buffer="data/Corona_stats/owid-covid-data.csv")

In [None]:
# Read the country-level survey CSVs into a dictionary of dataframes using the
# project helper.
dfs_country = functions.functions_data.get_data(
    "data/CMU_Global_data/Full_Survey_Data/country/smooth/",
    "country",
)

# Stack the individual dataframes from the dictionary into one dataframe
# covering all countries, with a fresh consecutive index.
survey_data = pd.concat(objs=dfs_country, ignore_index=True)

In [None]:
# Show the last rows for Germany as a quick look at the raw data.
covid_cases.loc[covid_cases["location"].eq("Germany")].tail()

## Selecting data

In [None]:
# Keep only the join keys (iso_code, date) and the indicators of interest
# from the covid cases data.
df_cases = covid_cases.loc[
    :,
    [
        "iso_code",
        "date",
        "total_cases_per_million",
        "new_cases_smoothed_per_million",
        "total_deaths_per_million",
        "new_deaths_smoothed_per_million",
        "median_age",
        "aged_65_older",
    ],
]

### Countries

In [None]:
# Rename 'GID_0' to 'iso_code' in the survey data so both datasets share the
# same key column. The guard keeps this cell safe to re-run: after the first
# execution 'GID_0' no longer exists and an unconditional drop would raise a
# KeyError. pop() removes the column and the assignment stores it under the
# new name, which mirrors the original copy-then-drop (in place).
if "GID_0" in survey_data.columns:
    survey_data["iso_code"] = survey_data.pop("GID_0")

# Check differences in included countries between covid cases and survey data
survey_codes = set(survey_data["iso_code"])
cases_codes = set(df_cases["iso_code"])
# Codes present in exactly one dataset, split by the dataset they come from.
unique_countries_survey = survey_codes - cases_codes
unique_countries_cases = cases_codes - survey_codes
# Union of both one-sided differences, i.e. the symmetric difference.
unique_countries = unique_countries_survey | unique_countries_cases
print('The following countries occur only in the survey data:')
print(unique_countries_survey)
print('The following countries occur only in the cases data:')
print(unique_countries_cases)

In [None]:
# Restrict both datasets to the countries they have in common by dropping the
# rows whose iso_code occurs in only one of them.
df_survey = survey_data.loc[~survey_data["iso_code"].isin(unique_countries_survey)]
df_covid_cases = df_cases.loc[~df_cases["iso_code"].isin(unique_countries_cases)]

# Sanity check: the printed set is expected to be empty if the filtering worked.
print('Difference:', set(df_survey["iso_code"]) ^ set(df_covid_cases["iso_code"]))

### Dates

In [None]:
# Keep only the covid rows whose date also occurs somewhere in the survey data
# (assumes both 'date' columns use the same representation - TODO confirm).
survey_dates = df_survey["date"]
df_covid = df_covid_cases.loc[df_covid_cases["date"].isin(survey_dates)]

## Join datasets on iso code and date

In [None]:
# Combine survey and covid rows that share the same country code and date.
# No `how` is given, so pandas' default join type applies, as with pd.merge.
df_combined = df_survey.merge(df_covid, on=["iso_code", "date"])

In [None]:
# Preview the first five rows of the joined dataset.
df_combined.head(5)