In [None]:
import pandas as pd
import sqlalchemy
import psycopg2
import sql_functions as sf

import warnings
# Silence FutureWarning noise only. A blanket warnings.filterwarnings("ignore")
# would hide every warning category, including ones that point at real data or
# connection problems, and would make this targeted filter redundant.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Name of the database schema that holds the project tables.
# It is interpolated into the SQL query that loads the source table.
schema = 'capstone_health_education'

In [None]:
# Read every column and row of the "GHED_data_or_filtered" table from the
# configured schema and keep the result returned by sf.get_dataframe.
ghed_query = f'SELECT * FROM {schema}."GHED_data_or_filtered"'
GHED_data_or_filtered = sf.get_dataframe(ghed_query)

In [None]:
# Preview the loaded table
GHED_data_or_filtered

In [None]:
# Check the data type and non-null count of each column to spot empty cells
GHED_data_or_filtered.info()

In [None]:
# Check for duplicates: count the rows flagged as repeats of an earlier row (True)
# versus the rows that are not (False)
GHED_data_or_filtered.duplicated().value_counts()

In [None]:
# Delete the rows containing empty cells.
# dropna() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back; otherwise the aggregations further down still
# run on the rows with missing values. Re-running this cell is harmless because
# dropping rows from an already clean frame changes nothing.
GHED_data_or_filtered = GHED_data_or_filtered.dropna(axis=0)

# Show the cleaned table
GHED_data_or_filtered

In [None]:
# List the distinct values of the 'income' column (the income categories)
GHED_data_or_filtered['income'].unique()

In [None]:
# List the distinct values of the 'region' column
GHED_data_or_filtered['region'].unique()

African Region  
Region of the Americas  
Eastern Mediterranean Region  
European Region  
South-East Asia Region  
Western Pacific Region

che_gdp: Current Health Expenditure (CHE) as % of Gross Domestic Product (GDP)  
che_pc_usd: Current Health Expenditure (CHE) per Capita in US$

In [None]:
# Aggregate per region and year: add up 'che_gdp' and 'che_pc_usd' over all rows
# of each (region, year) pair, then turn the group keys back into ordinary
# columns instead of an index.
# NOTE(review): che_gdp is a percentage and che_pc_usd a per-capita value, so a
# sum over the rows of a region may not be the intended statistic — confirm
# whether a mean was meant.
measure_columns = ['che_gdp', 'che_pc_usd']
GHED_groupby_region = (
    GHED_data_or_filtered
    .groupby(['region', 'year'])[measure_columns]
    .sum()
    .reset_index()
)

# Show the aggregated table
GHED_groupby_region


In [None]:
# Calculate the average of all years (2000-2022) per region.
# Only the two expenditure measures are averaged: calling .mean() on every
# column would also average 'year' and add a meaningless column to the table
# that is pushed to the cloud afterwards.
GHED_groupby_region_average = (
    GHED_groupby_region
    .groupby('region')[['che_gdp', 'che_pc_usd']]
    .mean()
    .reset_index()  # make 'region' a column instead of the index
)

# To use only some of the years, filter before aggregating, e.g.:
# filtered_data = GHED_data_or_filtered[GHED_data_or_filtered['year'].isin([2021, 2022])]

# Display the result
GHED_groupby_region_average[['region', 'che_gdp', 'che_pc_usd']]


In [None]:
# Push to cloud: hand each aggregated DataFrame to sf.push_to_cloud together
# with its dict key, which is passed as the table name.
# NOTE(review): how sf.push_to_cloud treats a table that already exists
# (replace / append / fail) is not visible here — confirm before re-running.
dataframes = {'GHED_groupby_region': GHED_groupby_region,
    'GHED_groupby_region_average': GHED_groupby_region_average}

for table_name, df in dataframes.items():
    sf.push_to_cloud(df, table_name)