# Full EDA mask wearing

The purpose of this EDA is to identify factors that have an impact on mask wearing worldwide. It is hypothesized that the number of cases with covid-like-illness would correlate with mask wearing.

## Data preparation

### Import libraries

In [None]:
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import functions
import functions_mask_wearing
import get_group

# Suppress all warnings so they do not clutter the cell output.
warnings.filterwarnings("ignore")

# Display every column of a dataframe instead of truncating wide frames.
pd.set_option("display.max_columns", None)

# Shared plot styling: a white grid theme plus the palettes reused by the figures below.
sns.set_style("whitegrid")
mono_palette = ["steelblue"]
gender_palette = ["indianred", "steelblue"]
heatmap_palette = sns.diverging_palette(240, 10, n=9)

### Read in data

In [None]:
# Read the smoothed country-level survey csv files. According to the original
# note, get_data returns a dictionary of dataframes.
dfs_country = functions.get_data(
    "data/CMU_Global_data/Full_Survey_Data/country/smooth/",
    "country",
)

# Stack the individual dataframes into one countries dataframe with a fresh index.
countries = pd.concat(objs=dfs_country, ignore_index=True)

In [None]:
# Number of distinct (non-missing) values in the country_agg column.
countries["country_agg"].nunique(dropna=True)

### Get HDI

In [None]:
# Build two lookups from the Excel workbook: one for the HDI values and one
# for the HDI levels.
dict_hdi, dict_hdi_levels = functions.get_hdi(
    "data",
    "hdro_statistical_data_tables_1_15_d1_d5.xlsx",
)

# Add the HDI and HDI-level columns to the countries dataframe.
countries = functions.create_hdi_columns(countries, dict_hdi, dict_hdi_levels)

### Data cleaning and selection

In [None]:
# Plain assignment: df_countries is a second name for the same dataframe object
# as countries (no copy is made), so in-place changes through either name are
# visible through both.
df_countries = countries

In [None]:
# Show the names of all columns that contain both "mask" and "weighted".
print('Columns including the words mask and weighted:')
print(
    df_countries.columns[
        df_countries.columns.str.contains("mask")
        & df_countries.columns.str.contains("weighted")
    ]
)

In [None]:
# Cleaning pipeline for the mask-wearing EDA. Each step feeds its result into
# the next one, so the order of the calls matters.
# NOTE(review): the step descriptions below come from the original notes; the
# helpers live in project modules that are not visible here - confirm there.

# Step 1: deal with NaN's in the "wearing mask all the time" column.
df_complete = functions_mask_wearing.deal_with_NaNs_masks(df_countries)

# Step 2: change the date from object into a datetime object and create a month column.
df_month = functions.insert_month(df_complete)

# Step 3: select the columns that are relevant for the EDA on mask wearing.
df_masks = functions_mask_wearing.create_df_masks(df_month)

# Step 4: delete 'other' and 'overall' in the gender column and 'overall' in the age bucket.
df_masks_delete_overall = functions_mask_wearing.delete_overall(df_masks)

# Step 5: normalize the data; df_masks_normalized is the frame used for the correlations below.
df_masks_normalized = functions_mask_wearing.normalize_mask_data(df_masks_delete_overall)

## Exploring correlations

In [None]:
# Create dummy variables for age bucket and gender. drop_first=True drops the
# first level of each variable, leaving k-1 dummy columns for k levels.
dummies = pd.get_dummies(df_masks_normalized[['age_bucket','gender']],drop_first=True)
df_heatmap = df_masks_normalized.join(dummies)

# Delete the columns that are not used in the world heatmap.
df_heatmap_world = df_heatmap.drop(['date','country_id','country','hdi_level','age_bucket','gender'], axis=1)

# Prepare the continents heatmap and add a 'country_agg' column to be able to
# group by country. assign() returns a new dataframe, so df_heatmap itself is
# not modified (a plain `df_heatmap_continents = df_heatmap` would only create
# an alias and the new column would silently appear in df_heatmap as well).
df_heatmap_continents = df_heatmap.assign(country_agg=df_heatmap["country"])

### World

Interesting correlations with mask wearing:

* wear mask none time and hdi
* wear mask half time and gender

In [None]:
# Pairwise correlations of all remaining columns. The boolean mask covers the
# upper triangle including the diagonal, so each pair is drawn only once.
corr_world = df_heatmap_world.corr()
mask_world = np.triu(np.ones(corr_world.shape, dtype=bool))
sns.heatmap(data=corr_world, mask=mask_world, cmap=heatmap_palette);

### Asia

Interesting correlations with mask wearing:

* mask wearing all/none time and hdi
* wear mask most/half/some time and gender
* wear mask some/none time and covid like illness

In [None]:
# Select the Asian data via the get_group helper and concatenate its result
# into a single dataframe with a fresh index.
asia = get_group.group_asia(df_heatmap_continents)
df_asia = pd.concat(objs=asia, ignore_index=True)

# Drop the columns that are not used in the heatmap.
df_heatmap_asia = df_asia.drop(
    columns=['date','country_id','country','hdi_level','age_bucket','gender','country_agg']
)

# Correlation matrix with the upper triangle (and diagonal) masked out.
corr_asia = df_heatmap_asia.corr()
mask_asia = np.triu(np.ones(corr_asia.shape, dtype=bool))
sns.heatmap(data=corr_asia, mask=mask_asia, cmap=heatmap_palette);

### Oceania

Interesting correlations with mask wearing:

* wear mask all/most/half/some time and month
* wear mask all time and hdi

In [None]:
# Select the Oceanian data via the get_group helper; unlike the other
# continents, its result is used directly without concatenation.
df_oceania = get_group.group_oceania(df_heatmap_continents)

# Drop the columns that are not used in the heatmap.
df_heatmap_oceania = df_oceania.drop(
    columns=['date','country_id','country','hdi_level','age_bucket','gender','country_agg']
)

# Correlation matrix with the upper triangle (and diagonal) masked out.
corr_oceania = df_heatmap_oceania.corr()
mask_oceania = np.triu(np.ones(corr_oceania.shape, dtype=bool))
sns.heatmap(data=corr_oceania, mask=mask_oceania, cmap=heatmap_palette);

### Europe

Interesting correlations with mask wearing:

* wear mask all/none time and hdi
* wear mask most time and month
* wear mask half/some time and gender

In [None]:
# Select the European data via the get_group helper and concatenate its result
# into a single dataframe with a fresh index.
europe = get_group.group_europe(df_heatmap_continents)
df_europe = pd.concat(objs=europe, ignore_index=True)

# Drop the columns that are not used in the heatmap.
df_heatmap_europe = df_europe.drop(
    columns=['date','country_id','country','hdi_level','age_bucket','gender','country_agg']
)

# Correlation matrix with the upper triangle (and diagonal) masked out.
corr_europe = df_heatmap_europe.corr()
mask_europe = np.triu(np.ones(corr_europe.shape, dtype=bool))
sns.heatmap(data=corr_europe, mask=mask_europe, cmap=heatmap_palette);

### Africa

Interesting correlations with mask wearing:

* wear mask most/half time and month
* wear mask most/half/some time and gender

In [None]:
# Select the African data via the get_group helper and concatenate its result
# into a single dataframe with a fresh index.
africa = get_group.group_africa(df_heatmap_continents)
df_africa = pd.concat(objs=africa, ignore_index=True)

# Drop the columns that are not used in the heatmap.
df_heatmap_africa = df_africa.drop(
    columns=['date','country_id','country','hdi_level','age_bucket','gender','country_agg']
)

# Correlation matrix with the upper triangle (and diagonal) masked out.
corr_africa = df_heatmap_africa.corr()
mask_africa = np.triu(np.ones(corr_africa.shape, dtype=bool))
sns.heatmap(data=corr_africa, mask=mask_africa, cmap=heatmap_palette);

### America

Interesting correlations with mask wearing:

* wear mask all time and month
* wear mask most time and gender

In [None]:
# Select the data for the Americas via the get_group helper and concatenate its
# result into a single dataframe with a fresh index.
america = get_group.group_americas(df_heatmap_continents)
df_america = pd.concat(objs=america, ignore_index=True)

# Drop the columns that are not used in the heatmap.
df_heatmap_america = df_america.drop(
    columns=['date','country_id','country','hdi_level','age_bucket','gender','country_agg']
)

# Correlation matrix with the upper triangle (and diagonal) masked out.
corr_america = df_heatmap_america.corr()
mask_america = np.triu(np.ones(corr_america.shape, dtype=bool))
sns.heatmap(data=corr_america, mask=mask_america, cmap=heatmap_palette);

## Conclusion
* There does not seem to be a clear interaction between mask wearing and the prevalence of individuals with covid-like-illness.
* Mask wearing interacts with hdi, month and gender, which we will focus on in the next section.

## Visualizing correlations

### Mask wearing and HDI

The **scatterplots** below show:

1) in Asia, the higher the HDI, the more people wear a mask all the time and vice versa.

2) in Europe, the higher the HDI, the fewer people wear a mask all the time and vice versa.

3) in Oceania, the difference in HDI between Australia and New Zealand is not large, so one cannot measure an effect of HDI there.

4) worldwide, more people wear a mask none of the time in countries with an HDI below 0.55 or above 0.9.

In [None]:
# 2x3 grid of scatterplots: HDI (x) against the share of people wearing a mask
# all / none of the time (y). All panels share both axes.
f, axes = plt.subplots(2, 3, figsize=(15,10), sharex=True, sharey=True)
sns.set_palette(mono_palette)

# One entry per panel, in row-major order: (data, y column, title).
hdi_panels = [
    (df_heatmap_asia, "pct_wear_mask_all_time", 'Asia: all time'),
    (df_heatmap_europe, "pct_wear_mask_all_time", 'Europe: all time'),
    (df_heatmap_oceania, "pct_wear_mask_all_time", 'Oceania: all time'),
    (df_heatmap_asia, "pct_wear_mask_none_time", 'Asia: none time'),
    (df_heatmap_europe, "pct_wear_mask_none_time", 'Europe: none time'),
    (df_heatmap_world, "pct_wear_mask_none_time", 'World: none time'),
]

# axes.flat walks the grid row by row, matching the order of hdi_panels.
for panel_ax, (panel_data, panel_y, panel_title) in zip(axes.flat, hdi_panels):
    sns.scatterplot(x="hdi", y=panel_y, data=panel_data, ax=panel_ax)
    # Set the title directly on the Axes instead of issuing a second, empty
    # scatterplot call just to get the Axes object back.
    panel_ax.set_title(panel_title)

plt.tight_layout()

### Mask wearing and month

The **scatterplots** below show that:

1) In America, the number of people wearing a mask all the time slightly increased over the months.

2) In Oceania, mask wearing highly fluctuated throughout the months.

3) In Africa, there was a peak of mask wearing most of the time and half the time during their winter.

4) In Europe, the number of people wearing masks most of the time strongly increased over the months.

In [None]:
# 2x4 grid of scatterplots: month (x) against the share of people in a
# mask-wearing category (y). All panels share both axes.
f, axes = plt.subplots(2, 4, figsize=(14,7), sharex=True, sharey=True)
sns.set_palette(mono_palette)

# One entry per panel, in row-major order: (data, y column, title).
month_panels = [
    (df_heatmap_america, "pct_wear_mask_all_time", 'America: all time'),
    (df_heatmap_oceania, "pct_wear_mask_all_time", 'Oceania: all time'),
    (df_heatmap_africa, "pct_wear_mask_most_time", 'Africa: most time'),
    (df_heatmap_europe, "pct_wear_mask_most_time", 'Europe: most time'),
    (df_heatmap_oceania, "pct_wear_mask_most_time", 'Oceania: most time'),
    (df_heatmap_africa, "pct_wear_mask_half_time", 'Africa: half time'),
    (df_heatmap_oceania, "pct_wear_mask_half_time", 'Oceania: half time'),
    (df_heatmap_oceania, "pct_wear_mask_some_time", 'Oceania: some time'),
]

# axes.flat walks the grid row by row, matching the order of month_panels.
for panel_ax, (panel_data, panel_y, panel_title) in zip(axes.flat, month_panels):
    sns.scatterplot(x="month", y=panel_y, data=panel_data, ax=panel_ax)
    # Set the title directly on the Axes instead of issuing a second, empty
    # scatterplot call just to get the Axes object back.
    panel_ax.set_title(panel_title)

plt.tight_layout()

### Mask wearing and gender

The **boxplots** below show that:

1) in Africa, America and Asia more men than women wear a mask most of the time.

2) in Africa, Asia, Europe and in countries world-wide more men than women wear a mask half of the time.

3) in Africa, Asia and Europe more men than women wear a mask some of the time.

In [None]:
# 2x5 grid of boxplots: the gender_male dummy (x) against the share of people
# in a mask-wearing category (y). All panels share both axes.
f, axes = plt.subplots(2, 5, figsize=(14,7), sharex=True, sharey=True)
sns.set_palette(gender_palette)

# One entry per panel, in row-major order: (data, y column, title).
gender_panels = [
    (df_heatmap_africa, "pct_wear_mask_most_time", 'Africa: most time'),
    (df_heatmap_america, "pct_wear_mask_most_time", 'America: most time'),
    (df_heatmap_asia, "pct_wear_mask_most_time", 'Asia: most time'),
    (df_heatmap_africa, "pct_wear_mask_half_time", 'Africa: half time'),
    (df_heatmap_asia, "pct_wear_mask_half_time", 'Asia: half time'),
    (df_heatmap_europe, "pct_wear_mask_half_time", 'Europe: half time'),
    (df_heatmap_world, "pct_wear_mask_half_time", 'World: half time'),
    (df_heatmap_africa, "pct_wear_mask_some_time", 'Africa: some time'),
    (df_heatmap_asia, "pct_wear_mask_some_time", 'Asia: some time'),
    (df_heatmap_europe, "pct_wear_mask_some_time", 'Europe: some time'),
]

# axes.flat walks the grid row by row, matching the order of gender_panels.
for panel_ax, (panel_data, panel_y, panel_title) in zip(axes.flat, gender_panels):
    sns.boxplot(x="gender_male", y=panel_y, data=panel_data, ax=panel_ax)
    # Set the title directly on the Axes; the previous version issued an empty
    # scatterplot call on each boxplot panel just to get the Axes object back.
    panel_ax.set_title(panel_title)

plt.tight_layout()

### Mask wearing and covid-like-illness

The **scatterplots** below show that as the number of people with covid-like-illness in Asia increases, the number of people wearing a mask some of the time or none of the time in Asia decreases. This suggests that people there increase their mask wearing as covid-like-illness becomes more common.

In [None]:
# Two side-by-side scatterplots for Asia: share of people with covid-like
# illness (x) against the share wearing a mask some / none of the time (y).
f, axes = plt.subplots(1,2, figsize=(10,5), sharex=True, sharey=True)
sns.set_palette(mono_palette)

# One entry per panel, left to right: (y column, title).
cli_panels = [
    ("pct_wear_mask_some_time", 'Asia: some time'),
    ("pct_wear_mask_none_time", 'Asia: none time'),
]

for panel_ax, (panel_y, panel_title) in zip(axes.flat, cli_panels):
    sns.scatterplot(x="pct_covid_like_illness", y=panel_y, data=df_heatmap_asia, ax=panel_ax)
    # Set the title directly on the Axes instead of issuing a second, empty
    # scatterplot call just to get the Axes object back.
    panel_ax.set_title(panel_title)

plt.tight_layout()

## Conclusion

The impact of HDI and month on mask wearing differs among countries, but gender has the same effect on mask wearing in all countries. These three variables should be included in our model.