# Joining mask wearing requirements with survey data

Data source: https://masks4all.co/what-countries-require-masks-in-public/ 

Data downloaded on 3/12/2020 at 10:57.

In [None]:
import sys
sys.path.insert(1, '/Users/jakoliendenhollander/capstone/capstone')

import pandas as pd
import warnings
import functions.functions_data

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Passing None lifts the column cap, so wide dataframes are displayed in full.
pd.set_option("display.max_columns", None)

## Read in data

In [None]:
# Load the downloaded mask-requirement csv file into a dataframe.
mask_wearing_requirements = pd.read_csv(
    filepath_or_buffer="/Users/jakoliendenhollander/capstone/capstone/data/data-nbhtq.csv"
)

In [None]:
# Load the country-level survey csv files; per the original note, get_data
# hands them back as a dictionary of dataframes.
dfs_country = functions.functions_data.get_data(
    "/Users/jakoliendenhollander/capstone/capstone/data/CMU_Global_data/Full_Survey_Data/country/smooth/",
    "country",
)

# Stack every dataframe of that dictionary into one country-level dataframe,
# discarding the dictionary keys and renumbering the rows from 0.
survey_data = pd.concat(objs=dfs_country, ignore_index=True)

## Data cleaning

In [None]:
# Remove the GDP and population columns, then switch the remaining headers to
# short snake_case names. The result is rebound rather than mutated in place.
mask_wearing_requirements = (
    mask_wearing_requirements
    .drop(columns=["GDP (2018 in Millions)", "Population"])
    .rename(
        columns={
            "Country": "country",
            "Masks Required? (At Least In Businesses)": "required",
            "Type of Requirement?": "requirement_type",
            "Date of Full Country Requirement": "mask_requirement_date",
            "Recommend To Wear Masks?": "mask_recommended",
        }
    )
)

### Extract country name

In [None]:
# Extract the country name that sits between square brackets in the first column.
# The capture starts right after the first "[" and stops at the next "]" (or at
# the end of the text when there is none), which is what the former
# split('[', 1)[1].split(']')[0] expression produced.
# Values without any "[" keep their original text instead of raising IndexError,
# so a malformed row no longer aborts the whole cell.
raw_country = mask_wearing_requirements["country"]
bracketed_name = raw_country.str.extract(r"\[([^\]]*)", expand=False)
mask_wearing_requirements["country"] = bracketed_name.fillna(raw_country)

### Create dummy variables for required column

In [None]:
# Fold the "Yes (Public Transport)" observations into "Full Country".
# The result is assigned back to the column: replace(..., inplace=True) called
# on a column selection acts on a temporary object under pandas copy-on-write
# and would leave the dataframe unchanged.
mask_wearing_requirements["required"] = mask_wearing_requirements["required"].replace(
    "Yes (Public Transport)", "Full Country"
)

# Make dummy variables out of the required column (one column per category)
dummies = pd.get_dummies(mask_wearing_requirements["required"])
mask_wearing_requirements = mask_wearing_requirements.join(dummies)

# Rename dummy variables to descriptive snake_case names
mask_wearing_requirements = mask_wearing_requirements.rename(
    columns={
        "Full Country": "mask_required_full_country",
        "No": "mask_not_required",
        "No, But Recommends Masks": "mask_not_required_recommended",
        "No, But Universal Mask Usage": "mask_not_required_universal",
        "Parts of Country": "mask_required_part_country",
    }
)

### Create variables for requirement_type column

In [None]:
# Lower-case the requirement_type text so it can be compared with the
# lower-case phrases listed in the cells below.
mask_wearing_requirements = mask_wearing_requirements.assign(
    requirement_type=lambda frame: frame["requirement_type"].str.lower()
)

In [None]:
# Requirement descriptions (lower case) that count as "masks everywhere in public".
everywhere = [
    "everywhere in public",
    "everywhere in public where social distancing isn't possible",
    "public transport + everywhere in public with more than 10 people",
    "public transport + select states: everywhere",
    "all crowded places + universal mask usage",
    "all indoor public places + outdoor within 20 meters of others",
    "public transport + everywhere in public where social distancing isn't possible",
    "public transport, markets, supermarkets & crowded places",
    "everywhere in public (major cities)",
]

# Dummy variable: 1 where requirement_type is exactly one of the phrases above,
# 0 otherwise (missing values included).
mask_wearing_requirements["mask_everywhere_in_public"] = (
    mask_wearing_requirements["requirement_type"].isin(everywhere).astype("int64")
)

In [None]:
# Requirement descriptions (lower case) that count as "masks in indoor public places".
indoors = [
    "all indoor public places",
    "public transport & stores",
    "all indoor public places with multiple people",
    "public transport + shops",
    "all commercial establishments",
    "public transport & shopping",
    "supermarkets, banks & some indoor spaces",
    "public roads & business employees",
    "public transport + shopping",
    "public transport + markets + most public places",
    "public transportation, medical facilities, shops, and malls",
    "certain public places",
    "public transit, shops, and supermarkets",
    "all indoor public places + outdoor within 20 meters of others",
    "public transit, cinemas, churches, theaters, banks, and restaurants",
    "public transport, markets, supermarkets & crowded places",
]

# Dummy variable: 1 where requirement_type is exactly one of the phrases above,
# 0 otherwise (missing values included).
mask_wearing_requirements["mask_public_indoors"] = (
    mask_wearing_requirements["requirement_type"].isin(indoors).astype("int64")
)

In [None]:
# Create public transport variable.
# Requirement descriptions (lower case) that include public transport.
# "public transport + shopping" and "public transport + markets + most public
# places" are two separate entries: a missing comma used to fuse them into one
# string that could never match, so those rows were wrongly coded as 0.
transport = [
    "public transport",
    "public transport & stores",
    "public transport + shops",
    "public transport & shopping",
    "public roads & business employees",
    "public transport + shopping",
    "public transport + markets + most public places",
    "public transportation, medical facilities, shops, and malls",
    "public transport + everywhere in public with more than 10 people",
    "public transit, shops, and supermarkets",
    "public transport + select states: everywhere",
    "public transit, cinemas, churches, theaters, banks, and restaurants",
    "public transport + everywhere in public where social distancing isn't possible",
    "public buses & at airports",
    "public transport, markets, supermarkets & crowded places",
]

# Dummy variable: 1 where requirement_type is exactly one of the phrases above,
# 0 otherwise (missing values included).
mask_wearing_requirements["mask_public_transport"] = (
    mask_wearing_requirements["requirement_type"].isin(transport).astype("int64")
)

In [None]:
# Discard the required and requirement_type columns; the trimmed frame is
# rebound to the same name instead of being modified in place.
mask_wearing_requirements = mask_wearing_requirements.drop(
    columns=["required", "requirement_type"]
)

In [None]:
# Preview the first five rows of the requirements table.
mask_wearing_requirements.head(n=5)

## Selecting data

In [None]:
# Compare which countries appear in the survey data and in the mask requirements.
survey_countries = set(survey_data["country_agg"])
mask_countries = set(mask_wearing_requirements["country"])

# Countries present in exactly one of the two data sets ...
unique_countries = survey_countries ^ mask_countries
# ... split by the data set they come from.
unique_countries_survey = survey_countries - mask_countries
unique_countries_masks = mask_countries - survey_countries

print('The following countries occur only in the survey data:')
print(unique_countries_survey)
print('The following countries occur only in the requirements data:')
print(unique_countries_masks)

In [None]:
# Change country names in the mask wearing set so they match the survey data set.
# The result is assigned back to the column: replace(..., inplace=True) called
# on a column selection acts on a temporary object under pandas copy-on-write
# and would leave the dataframe unchanged.
mask_wearing_requirements["country"] = mask_wearing_requirements["country"].replace(
    {
        "Antigua and Barbuda": "Antigua",
        "Myanmar (formerly Burma)": "Myanmar",
        "Czechia (Czech Republic)": "Czech Republic",
        "Palestine State": "Palestine",
    }
)

# Re-check the differences in included countries between the mask requirements
# and the survey data, now that the names have been harmonised.
unique_countries2 = set(survey_data["country_agg"]).symmetric_difference(set(mask_wearing_requirements["country"]))
unique_countries_masks2 = set(mask_wearing_requirements["country"]).intersection(unique_countries2)

In [None]:
# Delete rows of countries that only occur in one data set.
# The shared set is computed from the current country columns. The survey side
# used to be filtered with a set built before the country names were
# harmonised, which still dropped countries that match after renaming.
shared_countries = set(survey_data["country_agg"]) & set(mask_wearing_requirements["country"])
df_survey = survey_data[survey_data["country_agg"].isin(shared_countries)]
df_masks = mask_wearing_requirements[mask_wearing_requirements["country"].isin(shared_countries)]

# Check whether it worked (an empty set means both frames cover the same countries)
print('Difference:',set(df_survey["country_agg"]).symmetric_difference(set(df_masks["country"])))

In [None]:
# Give the mask table the same key column name as the survey data (country_agg).
df_masks = df_masks.rename(columns={"country": "country_agg"})

## Join datasets on country

In [None]:
# Join the survey rows with the mask requirements that share the same
# country_agg value (pandas' default inner join).
df_combined = df_survey.merge(df_masks, on=["country_agg"])

In [None]:
# Preview the first five rows of the joined dataframe.
df_combined.head(n=5)