# EDA covid cases

In [None]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import functions.functions_data
import functions.functions_corona_stats
import functions.functions_HDI
import functions.functions_mask_wearing

# Notebook-wide settings.
warnings.filterwarnings("ignore")  # silences every warning for the whole session
pd.options.display.max_columns = None  # show all columns when a frame is displayed

# Plot theme and the colour palettes defined for this notebook.
sns.set_style("whitegrid")
mono_palette = ["steelblue"]
gender_palette = ["indianred", "steelblue"]
heatmap_palette = sns.diverging_palette(240, 10, n=9)

### Read in data

In [None]:
# Load the covid statistics from the local OWID CSV file.
covid_cases = pd.read_csv(
    filepath_or_buffer="data/Corona_stats/owid-covid-data.csv",
)

In [None]:
# Load the smoothed country-level survey CSVs; per the original author's note
# the helper returns them as a dictionary of dataframes.
dfs_country = functions.functions_data.get_data(
    "data/CMU_Global_data/Full_Survey_Data/country/smooth/",
    "country",
)

# Stack the individual dataframes into one frame with a fresh integer index.
survey_data = pd.concat(
    dfs_country,
    ignore_index=True,
)

In [None]:
# Combine the survey data with the covid case statistics.
df_merged = functions.functions_corona_stats.merge_corona_stats(
    survey_data,
    covid_cases,
)

In [None]:
# Build the two lookup dictionaries (HDI value and HDI level) from the
# spreadsheet in the data folder.
dict_hdi, dict_hdi_levels = functions.functions_HDI.get_hdi(
    "data",
    "hdro_statistical_data_tables_1_15_d1_d5.xlsx",
)

# Add the HDI and HDI-level columns to the merged dataframe.
df_merged_hdi = functions.functions_HDI.create_hdi_columns(
    df_merged,
    dict_hdi,
    dict_hdi_levels,
)

### Data cleaning

In [None]:
# Cleaning pipeline: each step takes the previous step's result and binds a new
# name, so the intermediate frames stay available for inspection.

# Step 1 - convert the date column from object to datetime and add a month
# column (as described by the original author; the helper is defined in
# functions/functions_data - TODO confirm exact column names there).
df_month = functions.functions_data.insert_month(df_merged_hdi)

# Step 2 - handle NaNs in the "wear mask all the time" column.
# NOTE(review): whether rows are dropped or values imputed is decided inside
# the helper and is not visible from this notebook.
df_complete = functions.functions_mask_wearing.deal_with_NaNs_masks(df_month)

# Step 3 - remove the 'other' and 'overall' gender entries and the 'overall'
# age bucket (per the original author's note). The result `df` is the cleaned
# frame used by the cells below.
df = functions.functions_mask_wearing.delete_overall(df_complete)

In [None]:
# Keep only the case/death rates and the two age indicators, then display
# summary statistics for those columns.
df_stats = df.loc[
    :,
    [
        "total_cases_per_million",
        "new_cases_smoothed_per_million",
        "total_deaths_per_million",
        "new_deaths_smoothed_per_million",
        "median_age",
        "aged_65_older",
    ],
]
df_stats.describe()

Some values of the smoothed new cases and new deaths per million are below zero, which is not a valid count, so every row with a negative value in either column is excluded.

In [None]:
# Keep rows whose smoothed new-case rate is non-negative, then apply the same
# filter to the smoothed new-death rate. Rows where either value is NaN fail
# the comparison and are dropped as well.
df1 = df.loc[df["new_cases_smoothed_per_million"].ge(0)]
df2 = df1.loc[df1["new_deaths_smoothed_per_million"].ge(0)]

### Plotting

#### COVID-like-illness
As the number of COVID cases and deaths goes up, the percentage of individuals with COVID-like illness decreases, which is counterintuitive. So, COVID-like illness may not be a helpful variable.

In [None]:
# 2x2 grid of scatter plots: share with COVID-like illness (x) against each of
# the four case/death rates (y).

# Set the palette BEFORE creating the figure: matplotlib axes capture their
# colour cycle when they are created, so a palette set afterwards would not be
# picked up by these axes.
sns.set_palette(mono_palette)
f, axes = plt.subplots(2, 2, figsize=(14, 7))

# (y column, panel title) in row-major order, matching axes.flat.
panels = [
    ("total_cases_per_million", "Total cases"),
    ("new_cases_smoothed_per_million", "New cases"),
    ("total_deaths_per_million", "Total deaths"),
    ("new_deaths_smoothed_per_million", "New deaths"),
]
for ax, (y_col, title) in zip(axes.flat, panels):
    sns.scatterplot(x="smoothed_pct_cli_weighted", y=y_col, data=df2, ax=ax)
    # Title the axes directly instead of issuing a second, data-less
    # scatterplot call just to get the Axes object back.
    ax.set_title(title)

plt.tight_layout()

#### Wear mask all the time

As the number of COVID cases and deaths increases, the percentage of people wearing a mask all the time increases.

In [None]:
# 2x2 grid of scatter plots: share wearing a mask all the time (x) against each
# of the four case/death rates (y).

# Set the palette BEFORE creating the figure: matplotlib axes capture their
# colour cycle when they are created, so a palette set afterwards would not be
# picked up by these axes.
sns.set_palette(mono_palette)
f, axes = plt.subplots(2, 2, figsize=(14, 7))

# (y column, panel title) in row-major order, matching axes.flat.
panels = [
    ("total_cases_per_million", "Total cases"),
    ("new_cases_smoothed_per_million", "New cases"),
    ("total_deaths_per_million", "Total deaths"),
    ("new_deaths_smoothed_per_million", "New deaths"),
]
for ax, (y_col, title) in zip(axes.flat, panels):
    sns.scatterplot(x="smoothed_pct_wear_mask_all_time_weighted", y=y_col, data=df2, ax=ax)
    # Title the axes directly instead of issuing a second, data-less
    # scatterplot call just to get the Axes object back.
    ax.set_title(title)

plt.tight_layout()

#### Social distancing attended public event

As the number of COVID cases and deaths increases, the percentage of people attending a public event decreases.

In [None]:
# 2x2 grid of scatter plots: share that attended a public event (x) against
# each of the four case/death rates (y).

# Set the palette BEFORE creating the figure: matplotlib axes capture their
# colour cycle when they are created, so a palette set afterwards would not be
# picked up by these axes.
sns.set_palette(mono_palette)
f, axes = plt.subplots(2, 2, figsize=(14, 7))

# (y column, panel title) in row-major order, matching axes.flat.
panels = [
    ("total_cases_per_million", "Total cases"),
    ("new_cases_smoothed_per_million", "New cases"),
    ("total_deaths_per_million", "Total deaths"),
    ("new_deaths_smoothed_per_million", "New deaths"),
]
for ax, (y_col, title) in zip(axes.flat, panels):
    sns.scatterplot(x="smoothed_pct_attended_public_event_weighted", y=y_col, data=df2, ax=ax)
    # Title the axes directly instead of issuing a second, data-less
    # scatterplot call just to get the Axes object back.
    ax.set_title(title)

plt.tight_layout()

### Plots for the presentation

In [None]:
# Plotting settings for the presentation figures.

# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so use the new name when this
# matplotlib offers it and fall back to the legacy one otherwise.
talk_style = "seaborn-v0_8-talk" if "seaborn-v0_8-talk" in plt.style.available else "seaborn-talk"
plt.style.use(talk_style)

# Two-colour palette used on the slides.
slides_palette = ["#00287f", "#00c5ff"]
sns.set_palette(slides_palette)

# Fixed hue orders for the two country comparisons plotted below.
order1 = ["Germany", "Netherlands"]
order2 = ["Germany", "Czech Republic"]

#### Mask wearing

In [None]:
# Create a data frame with the Netherlands and Germany only, with the
# mask-wearing percentage rescaled from 0-100 to 0-1.
# The rescaled column is produced with .assign on the filtered frame rather
# than by assigning into the slice afterwards, which is the pattern that
# triggers pandas' SettingWithCopyWarning.
is_nl_or_deu = df2["country_agg"].str.contains("Netherlands|Germany")
df_nl_deu = df2[is_nl_or_deu].assign(
    smoothed_pct_wear_mask_all_time_weighted=lambda d: d["smoothed_pct_wear_mask_all_time_weighted"] / 100
)

In [None]:
# Plot total cases per million (left) and wearing a mask all the time (right)
# over time for Germany and the Netherlands.
f, axes = plt.subplots(1, 2, figsize=(15, 5))

g1 = sns.lineplot(
    data=df_nl_deu, x="date", y="total_cases_per_million",
    hue="country_agg", hue_order=order1, ax=axes[0],
)
g1.set(
    title='COVID-19 total cases per million',
    xlabel='date',
    ylabel='total cases per million',
)
g1.set_ylim(0, 35000)
# Build the legend once: each legend() call replaces the previous legend, so a
# title set on an earlier legend would be discarded by a later call.
g1.legend(loc='upper left', title='')

g2 = sns.lineplot(
    data=df_nl_deu, x="date", y="smoothed_pct_wear_mask_all_time_weighted",
    hue="country_agg", hue_order=order1, legend=False, ax=axes[1],
)
g2.set(
    title='Wearing a mask all the time',
    xlabel='date',
    # The plotted value is the percentage divided by 100, i.e. a proportion
    # between 0 and 1, not a percentile.
    ylabel='proportion wearing a mask all the time',
)
g2.set_ylim(0, 1);

#### Social distancing

In [None]:
# Create a data frame with the Czech Republic and Germany only (selected by
# ISO code), with the public-event percentage rescaled from 0-100 to 0-1.
# The rescaled column is produced with .assign on the filtered frame rather
# than by assigning into the slice afterwards, which is the pattern that
# triggers pandas' SettingWithCopyWarning.
is_cze_or_deu = df2["iso_code"].str.contains("CZE|DEU")
df_cze_deu = df2[is_cze_or_deu].assign(
    smoothed_pct_attended_public_event_weighted=lambda d: d["smoothed_pct_attended_public_event_weighted"] / 100
)

In [None]:
# Plot total cases per million (left) and attending public events (right) over
# time for Germany and the Czech Republic.
f, axes = plt.subplots(1, 2, figsize=(15, 5))

g1 = sns.lineplot(
    data=df_cze_deu, x="date", y="total_cases_per_million",
    hue="country_agg", hue_order=order2, ax=axes[0],
)
g1.set(
    title='COVID-19 total cases per million',
    ylabel='total cases per million',
)
g1.set_ylim(0, 35000)
# Build the legend once: each legend() call replaces the previous legend, so a
# title set on an earlier legend would be discarded by a later call.
g1.legend(loc='upper left', title='')

g2 = sns.lineplot(
    data=df_cze_deu, x="date", y="smoothed_pct_attended_public_event_weighted",
    hue="country_agg", hue_order=order2, legend=False, ax=axes[1],
)
g2.set(
    title='Attending public events',
    # The plotted value is the percentage divided by 100, i.e. a proportion
    # between 0 and 1, not a percentile.
    ylabel='proportion attending public events',
)
g2.set_ylim(0, 1);