In [1]:
import pandas as pd
import sqlalchemy
import psycopg2
import sql_functions as sf

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

In [2]:
# define the schema in the database
schema = 'capstone_health_education'

In [3]:
# import the table as dataframe
GHED_data_or_filtered = sf.get_dataframe(f'SELECT * FROM {schema}."GHED_data_or_filtered"')

In [4]:
GHED_data_or_filtered

Unnamed: 0,country,code,region,income,year,che_gdp,che_pc_usd
0,Algeria,DZA,AFR,Lower-middle,2000,3.49,62.12
1,Algeria,DZA,AFR,Lower-middle,2001,3.84,67.34
2,Algeria,DZA,AFR,Lower-middle,2002,3.73,66.95
3,Algeria,DZA,AFR,Lower-middle,2003,3.60,76.24
4,Algeria,DZA,AFR,Lower-middle,2004,3.54,93.02
...,...,...,...,...,...,...,...
4239,Viet Nam,VNM,WPR,Lower-middle,2017,4.99,149.31
4240,Viet Nam,VNM,WPR,Lower-middle,2018,5.03,164.24
4241,Viet Nam,VNM,WPR,Lower-middle,2019,4.97,173.65
4242,Viet Nam,VNM,WPR,Lower-middle,2020,4.30,154.24


In [5]:
# check for data types and empty cells
GHED_data_or_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4244 entries, 0 to 4243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     4244 non-null   object 
 1   code        4244 non-null   object 
 2   region      4244 non-null   object 
 3   income      4244 non-null   object 
 4   year        4244 non-null   int64  
 5   che_gdp     4154 non-null   float64
 6   che_pc_usd  4153 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 232.2+ KB


In [6]:
#check for duplicates
GHED_data_or_filtered.duplicated().value_counts()

False    4244
dtype: int64

In [7]:
# delete the rows containing empty cells
GHED_data_or_filtered.dropna(axis=0)

Unnamed: 0,country,code,region,income,year,che_gdp,che_pc_usd
0,Algeria,DZA,AFR,Lower-middle,2000,3.49,62.12
1,Algeria,DZA,AFR,Lower-middle,2001,3.84,67.34
2,Algeria,DZA,AFR,Lower-middle,2002,3.73,66.95
3,Algeria,DZA,AFR,Lower-middle,2003,3.60,76.24
4,Algeria,DZA,AFR,Lower-middle,2004,3.54,93.02
...,...,...,...,...,...,...,...
4239,Viet Nam,VNM,WPR,Lower-middle,2017,4.99,149.31
4240,Viet Nam,VNM,WPR,Lower-middle,2018,5.03,164.24
4241,Viet Nam,VNM,WPR,Lower-middle,2019,4.97,173.65
4242,Viet Nam,VNM,WPR,Lower-middle,2020,4.30,154.24


In [8]:
# visualize the income categories
GHED_data_or_filtered['income'].unique()

array(['Lower-middle', 'Upper-middle', 'Low', 'High'], dtype=object)

In [9]:
# visualize the regions
GHED_data_or_filtered['region'].unique()

array(['AFR', 'AMR', 'EMR', 'EUR', 'SEAR', 'WPR'], dtype=object)

African Region  
Region of the Americas  
Eastern Mediterranean Region  
European Region  
South-East Asia Region  
Western Pacific Region

che_gdp: Current Health Expenditure (CHE) as % of Gross Domestic Product (GDP)  
che_pc_usd: Current Health Expenditure (CHE) per Capita in US$

In [10]:
# Group by 'region' and 'year', then sum 'che_gdp' and 'che_pc_usd'
GHED_groupby_region = GHED_data_or_filtered.groupby(['region', 'year'])[['che_gdp', 'che_pc_usd']].sum()

# Reset index to make 'region' and 'year' columns instead of index
GHED_groupby_region = GHED_groupby_region.reset_index()

# Display the result
GHED_groupby_region


Unnamed: 0,region,year,che_gdp,che_pc_usd
0,AFR,2000,196.69,2098.87
1,AFR,2001,206.60,2108.47
2,AFR,2002,213.01,2184.45
3,AFR,2003,221.14,2793.80
4,AFR,2004,229.66,3309.58
...,...,...,...,...
130,WPR,2018,190.80,29789.69
131,WPR,2019,198.40,30335.69
132,WPR,2020,207.48,31494.94
133,WPR,2021,221.12,36082.72


In [11]:
# calculate the average of all years (2000-20022) per region
GHED_groupby_region_average = GHED_groupby_region.groupby('region').mean()


# if I want some of the years:
#filtered_data = GHED_data_or_filtered[GHED_data_or_filtered['year'].isin([2021, 2022])]


# Reset index to make 'region' a column instead of index
GHED_groupby_region_average = GHED_groupby_region_average.reset_index()

# Display the result
GHED_groupby_region_average[['region', 'che_gdp', 'che_pc_usd']]


Unnamed: 0,region,che_gdp,che_pc_usd
0,AFR,238.319545,4647.940909
1,AMR,216.993913,27652.521304
2,EMR,90.308182,8321.764091
3,EUR,379.53913,109922.588261
4,SEAR,41.958182,1233.095
5,WPR,183.381739,21472.487391


In [12]:
# push to cloud
dataframes = {'GHED_groupby_region': GHED_groupby_region,
    'GHED_groupby_region_average': GHED_groupby_region_average}

for table_name, df in dataframes.items():
    sf.push_to_cloud(df, table_name)

The GHED_groupby_region table was imported successfully.
The GHED_groupby_region_average table was imported successfully.
