In [1]:
import pandas as pd
import sqlalchemy
import psycopg2
import sql_functions as sf
import python_functions_sp as pfsp

import warnings
# Silence pandas/NumPy deprecation chatter (FutureWarning) in the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): this call has no category argument, so it ignores *every* warning
# class. That makes the FutureWarning-specific filter above redundant and can hide
# genuine problems (e.g. dtype or chained-assignment warnings) — consider removing it.
warnings.filterwarnings("ignore")

# Import

In [2]:
# Name of the database schema. In the visible cells it is only interpolated into
# the (currently disabled) SQL query that loads "hepa_filtered_final".
schema = 'capstone_health_education'

In [3]:
# Disabled alternative load path: fetch the "hepa_filtered_final" table through the
# project helper `sf.get_dataframe` (presumably returns a DataFrame — confirm)
# instead of restoring the frame from IPython's %store.
# HEPA_data_filtered = sf.get_dataframe(f'SELECT * FROM {schema}."hepa_filtered_final"')

In [4]:
# Restore `HEPA_data_filtered` from IPython's %store database. The variable must
# have been saved beforehand with `%store HEPA_data_filtered` (not shown here),
# otherwise the following cells fail with a NameError on a fresh kernel.
%store -r HEPA_data_filtered

In [5]:
# Preview the restored raw frame (pandas truncates the display to head and tail).
HEPA_data_filtered

Unnamed: 0,Measure code,YES_NO,COUNTRY_REGION,YEAR,VALUE
0,HEPA_1,YES,AUT,2015,1.0
1,HEPA_1,YES,AUT,2018,1.0
2,HEPA_1,YES,AUT,2021,1.0
3,HEPA_1,YES,BEL,2015,1.0
4,HEPA_1,YES,BEL,2018,1.0
...,...,...,...,...,...
1927,HEPA_23,YES,SVN,2018,1.0
1928,HEPA_23,YES,SVN,2021,1.0
1929,HEPA_23,NO,SWE,2015,0.0
1930,HEPA_23,YES,SWE,2018,1.0


In [6]:
# Derive the working frame from a copy of the raw data so the raw frame stays
# untouched: rename 'COUNTRY_REGION' to 'country', then lower-case every header.
# The YES_NO column is kept (as 'yes_no'); it is still used further down.
HEPA_filtered = (
    HEPA_data_filtered
    .copy()
    .rename(columns={'COUNTRY_REGION': 'country'})
    .rename(columns=str.lower)
)



In [7]:
# Inspect column dtypes and non-null counts; a non-null count below the number of
# entries marks a column that contains missing values.
HEPA_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1932 entries, 0 to 1931
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   measure code  1932 non-null   object 
 1   yes_no        1932 non-null   object 
 2   country       1932 non-null   object 
 3   year          1932 non-null   int64  
 4   value         1886 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 90.6+ KB


In [8]:
# Check for duplicates: `duplicated()` flags rows that repeat an earlier row across
# all columns, and `value_counts()` tallies the flags (only `False` = no duplicates).
HEPA_filtered.duplicated().value_counts()

False    1932
Name: count, dtype: int64

In [9]:
# List every row that has a missing value in at least one column.
print(HEPA_filtered.loc[HEPA_filtered.isna().any(axis=1)])

     measure code yes_no country  year  value
35         HEPA_1    DNP     GBR  2021    NaN
36         HEPA_1    DNP     GRC  2015    NaN
119        HEPA_2    DNP     GBR  2021    NaN
120        HEPA_2    DNP     GRC  2015    NaN
203        HEPA_3    DNP     GBR  2021    NaN
204        HEPA_3    DNP     GRC  2015    NaN
287        HEPA_4    DNP     GBR  2021    NaN
288        HEPA_4    DNP     GRC  2015    NaN
371        HEPA_5    DNP     GBR  2021    NaN
372        HEPA_5    DNP     GRC  2015    NaN
455        HEPA_6    DNP     GBR  2021    NaN
456        HEPA_6    DNP     GRC  2015    NaN
539        HEPA_7    DNP     GBR  2021    NaN
540        HEPA_7    DNP     GRC  2015    NaN
623        HEPA_8    DNP     GBR  2021    NaN
624        HEPA_8    DNP     GRC  2015    NaN
707        HEPA_9    DNP     GBR  2021    NaN
708        HEPA_9    DNP     GRC  2015    NaN
791       HEPA_10    DNP     GBR  2021    NaN
792       HEPA_10    DNP     GRC  2015    NaN
875       HEPA_11    DNP     GBR  

In [10]:
# Discard every row that still has a missing value in any column.
HEPA_filtered = HEPA_filtered.dropna(axis='index')

In [11]:
# Sanity check: after dropping incomplete rows this should print an empty frame.
print(HEPA_filtered.loc[HEPA_filtered.isna().any(axis=1)])

Empty DataFrame
Columns: [measure code, yes_no, country, year, value]
Index: []


In [12]:
# Convert the 'value' column to 64-bit integers; all other columns keep their dtype.
HEPA_filtered = HEPA_filtered.astype({'value': 'int64'})
HEPA_filtered

Unnamed: 0,measure code,yes_no,country,year,value
0,HEPA_1,YES,AUT,2015,1
1,HEPA_1,YES,AUT,2018,1
2,HEPA_1,YES,AUT,2021,1
3,HEPA_1,YES,BEL,2015,1
4,HEPA_1,YES,BEL,2018,1
...,...,...,...,...,...
1927,HEPA_23,YES,SVN,2018,1
1928,HEPA_23,YES,SVN,2021,1
1929,HEPA_23,NO,SWE,2015,0
1930,HEPA_23,YES,SWE,2018,1


In [13]:
# The project helper is called with a list of frames, so wrap the single frame.
# The list element is the same object as `HEPA_filtered` (no copy is made).
dataframes = [HEPA_filtered]

# NOTE(review): `columns_lower_snake_case_2` is defined outside this notebook. Later
# cells access 'measure_code', so it apparently renames the columns of each listed
# frame in place to lower snake_case — confirm against the helper's source.
pfsp.columns_lower_snake_case_2(dataframes)

Unnamed: 0,measure_code,yes_no,country,year,value
0,HEPA_1,YES,AUT,2015,1
1,HEPA_1,YES,AUT,2018,1
2,HEPA_1,YES,AUT,2021,1
3,HEPA_1,YES,BEL,2015,1
4,HEPA_1,YES,BEL,2018,1
...,...,...,...,...,...
1927,HEPA_23,YES,SVN,2018,1
1928,HEPA_23,YES,SVN,2021,1
1929,HEPA_23,NO,SWE,2015,0
1930,HEPA_23,YES,SWE,2018,1


In [14]:
# Map each HEPA measure code to its human-readable description. The codes are not
# overwritten: the descriptions go into a new 'measure_label' column next to them.
measure_dict = {
    "HEPA_1": "Recommendations on physical activity",
    "HEPA_2": "Levels of physical activity in adults",
    "HEPA_3": "Levels of physical activity in children and adolescents",
    "HEPA_4": "Coordination mechanism on HEPA promotion",
    "HEPA_5": "Funding for HEPA promotion",
    "HEPA_6": "Sports for All",
    "HEPA_7": "Sports Club for Health",
    "HEPA_8": "Access for socially disadvantaged groups",
    "HEPA_9": "Special target groups",
    "HEPA_10": "Surveillance of physical activity in health",
    "HEPA_11": "Counselling on physical activity",
    "HEPA_12": "Training of health professionals",
    "HEPA_13": "Physical education in schools",
    "HEPA_14": "Physical activity promotion in schools",
    "HEPA_15": "Training of physical education teachers",
    "HEPA_16": "Active travel to school",
    "HEPA_17": "Level of cycling and walking",
    "HEPA_18": "Infrastructures for leisure-time physical activity",
    "HEPA_19": "Active travel to work",
    "HEPA_20": "Physical activity at the workplace",
    "HEPA_21": "Community interventions for older adults",
    "HEPA_22": "Evaluation of HEPA policies",
    "HEPA_23": "Awareness campaign on physical activity"}

# `.replace` leaves any code that is missing from the dict unchanged, so an unmapped
# code would show up verbatim in 'measure_label' rather than as NaN.
HEPA_filtered['measure_label'] = HEPA_filtered['measure_code'].replace(measure_dict)

HEPA_filtered

Unnamed: 0,measure_code,yes_no,country,year,value,measure_label
0,HEPA_1,YES,AUT,2015,1,Recommendations on physical activity
1,HEPA_1,YES,AUT,2018,1,Recommendations on physical activity
2,HEPA_1,YES,AUT,2021,1,Recommendations on physical activity
3,HEPA_1,YES,BEL,2015,1,Recommendations on physical activity
4,HEPA_1,YES,BEL,2018,1,Recommendations on physical activity
...,...,...,...,...,...,...
1927,HEPA_23,YES,SVN,2018,1,Awareness campaign on physical activity
1928,HEPA_23,YES,SVN,2021,1,Awareness campaign on physical activity
1929,HEPA_23,NO,SWE,2015,0,Awareness campaign on physical activity
1930,HEPA_23,YES,SWE,2018,1,Awareness campaign on physical activity


Change the order of the columns:

In [15]:
# Current column order; 'measure_label' was appended last and sits at the end.
HEPA_filtered.columns

Index(['measure_code', 'yes_no', 'country', 'year', 'value', 'measure_label'], dtype='object')

In [16]:
# Desired layout: the descriptive label directly follows the code it explains.
columns = [
    'measure_code', 'measure_label',
    'yes_no', 'country', 'year', 'value',
]

# Selecting with the full list of labels returns the columns in exactly this order.
HEPA_filtered = HEPA_filtered.loc[:, columns]

In [17]:
# Confirm the new column order ('measure_label' now right after 'measure_code').
HEPA_filtered.columns

Index(['measure_code', 'measure_label', 'yes_no', 'country', 'year', 'value'], dtype='object')

In [18]:
# push to cloud (disabled)
# NOTE(review): the disabled call below passes the raw `HEPA_data_filtered`, not the
# cleaned `HEPA_filtered` built in this notebook — decide which frame is intended
# before re-enabling it.
# sf.push_to_cloud(HEPA_data_filtered, 'hepa_filtered_final')

In [19]:
hepa_filtered_final = HEPA_data_filtered

%store hepa_filtered_final

Stored 'hepa_filtered_final' (DataFrame)
