# Processing pipeline

In [31]:
import sys
sys.path.insert(1, '/Users/jakoliendenhollander/capstone/capstone')
import warnings

import pandas as pd
import numpy as np

import tidy_functions.load_data
import tidy_functions.clean_data
import tidy_functions.merge_data
import tidy_functions.feature_engineering

warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None) # To display all columns

## Read in data

In [2]:
# Locations of the three raw inputs read in this cell.
survey_dir = "/Users/jakoliendenhollander/capstone/capstone/data/CMU_Global_data/Full_Survey_Data/country/smooth/"
covid_csv = "/Users/jakoliendenhollander/capstone/capstone/data/Corona_stats/owid-covid-data.csv"
mask_req_csv = "/Users/jakoliendenhollander/capstone/capstone/data/data-nbhtq.csv"

# Survey data: the loader hands back a dictionary of dataframes, which is then
# stacked into a single dataframe with a freshly numbered index.
dfs_country = tidy_functions.load_data.load_survey_data(survey_dir, "country")
survey_data = pd.concat(dfs_country, ignore_index=True)

# Corona stats
covid_cases = pd.read_csv(covid_csv)
print('Read in covid data completed.')

# Mask wearing requirements
mask_wearing_requirements = pd.read_csv(mask_req_csv)
print('Read in mask wearing requirements data completed.')

Read in survey data completed.
Read in covid data completed.
Read in mask wearing requirements data completed.


## Cleaning data

In [3]:
cleaning = tidy_functions.clean_data

# Survey data: run both cleaning steps in order, each on the previous result.
for survey_step in (cleaning.delete_other_gender, cleaning.deal_with_NaNs_masks):
    survey_data = survey_step(survey_data)

# Corona stats
covid_cases = cleaning.deal_with_NaNs_corona_stats(covid_cases)

# Mask wearing requirements: six steps applied in a fixed order; every step
# receives the dataframe produced by the step before it.
mask_req_steps = (
    cleaning.prepare_mask_req,
    cleaning.dummies_mask_req,
    cleaning.dummies_public_mask_req,
    cleaning.dummies_indoors_mask_req,
    cleaning.dummies_transport_mask_req,
    cleaning.data_types_mask_req,
)
for mask_req_step in mask_req_steps:
    mask_wearing_requirements = mask_req_step(mask_wearing_requirements)

# HDI: load the spreadsheet once, then derive both lookup dictionaries from it.
hdi_dir = "/Users/jakoliendenhollander/capstone/capstone/data/"
hdi_file = "hdro_statistical_data_tables_1_15_d1_d5.xlsx"
hdi_data = cleaning.rename_hdi_countries(hdi_dir, hdi_file)
dict_hdi = cleaning.create_hdi_dict(hdi_data)
dict_hdi_levels = cleaning.create_hdi_levels_dict(hdi_data)

NaNs before update: 152923
NaNs after update: 0
Updated NaNs in wear_mask_all_time.
NaNs removed.
Step 1 of cleaning requirements completed.
Step 2 of cleaning requirements completed.
Step 3 of cleaning requirements completed.
Step 4 of cleaning requirements completed.
Step 5 of cleaning requirements completed.
Step 6 of cleaning requirements completed.
Creating dictionaries for hdi completed.
Creating dictionaries for hdi-levels completed.


## Merging data

In [4]:
merging = tidy_functions.merge_data

# Each step builds on the previous result: the survey data is combined with the
# corona stats, then with the mask wearing requirements, and finally the HDI
# columns are created from the two HDI dictionaries.
covid_merge = merging.merge_corona_stats(survey_data, covid_cases)
requirements_merge = merging.merge_mask_req(covid_merge, mask_wearing_requirements)
hdi_merge = merging.create_hdi_columns(requirements_merge, dict_hdi, dict_hdi_levels)

Merging corona stats completed.
Merging mask wearing requirements completed.
Creating hdi list completed.
Creating hdi-level list completed.


## Feature engineering

In [5]:
features = tidy_functions.feature_engineering

# First insert the month column, then add the requirement-by-date feature on
# top of that result.
date_fixed = features.insert_month(hdi_merge)
requirement_date = features.add_requirement_by_date(date_fixed)

Month column created.
Feature engineering completed.


In [6]:
# Work on an independent (deep) copy so the pipeline result stays untouched.
# NOTE(review): assumes requirement_date is a pandas object — confirm.
df = requirement_date.copy(deep=True)

In [34]:
# Columns kept for the next stage, in display order.
selected_columns = [
    'date',
    'smoothed_pct_worked_outside_home_weighted',
    'smoothed_pct_grocery_outside_home_weighted',
    'smoothed_pct_ate_outside_home_weighted',
    'smoothed_pct_attended_public_event_weighted',
    'smoothed_pct_used_public_transit_weighted',
    'smoothed_pct_direct_contact_with_non_hh_weighted',
    'smoothed_pct_wear_mask_all_time_weighted',
    'smoothed_pct_wear_mask_most_time_weighted',
    'total_cases_per_million',
    'median_age',
    'hdi',
    'cur_mask_recommended',
    'cur_mask_not_required',
    'cur_mask_not_required_recommended',
    'cur_mask_not_required_universal',
    'cur_mask_required_part_country',
    'cur_mask_everywhere_in_public',
    'cur_mask_public_indoors',
    'cur_mask_public_transport',
]
df_select = df[selected_columns]

In [36]:
# Count the missing values in each selected column.
df_select.isna().sum(axis=0)

date                                                0
smoothed_pct_worked_outside_home_weighted           0
smoothed_pct_grocery_outside_home_weighted          0
smoothed_pct_ate_outside_home_weighted              0
smoothed_pct_attended_public_event_weighted         0
smoothed_pct_used_public_transit_weighted           0
smoothed_pct_direct_contact_with_non_hh_weighted    0
smoothed_pct_wear_mask_all_time_weighted            0
smoothed_pct_wear_mask_most_time_weighted           0
total_cases_per_million                             0
median_age                                          0
hdi                                                 0
cur_mask_recommended                                0
cur_mask_not_required                               0
cur_mask_not_required_recommended                   0
cur_mask_not_required_universal                     0
cur_mask_required_part_country                      0
cur_mask_everywhere_in_public                       0
cur_mask_public_indoors     