In [1]:
import pandas as pd
import sqlalchemy
import psycopg2
import sql_functions as sf

import warnings
# Hide FutureWarning messages in the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): this second filter ignores warnings of every category, so the
# FutureWarning-only filter above is redundant and genuine problems may go unnoticed.
warnings.filterwarnings("ignore")

In [2]:
# Name of the database schema. In the visible part of this notebook it is only
# interpolated into the commented-out SQL query of the next cell.
schema = 'capstone_health_education'

In [3]:
# # import the table as dataframe
# GHED_data_or_filtered = sf.get_dataframe(f'SELECT * FROM {schema}."GHED_data_or_filtered"')

In [4]:
# Restore the DataFrame from IPython's persistent store (storemagic).
# Requires that it was saved beforehand with `%store GHED_data_or_filtered`.
%store -r GHED_data_or_filtered

In [5]:
# Preview the restored frame (pandas shortens the display of long frames)
GHED_data_or_filtered

Unnamed: 0,country,code,region,income,year,che_gdp,che_pc_usd
0,Algeria,DZA,AFR,Lower-middle,2000,3.49,62.12
1,Algeria,DZA,AFR,Lower-middle,2001,3.84,67.34
2,Algeria,DZA,AFR,Lower-middle,2002,3.73,66.95
3,Algeria,DZA,AFR,Lower-middle,2003,3.60,76.24
4,Algeria,DZA,AFR,Lower-middle,2004,3.54,93.02
...,...,...,...,...,...,...,...
4239,Viet Nam,VNM,WPR,Lower-middle,2017,4.99,149.31
4240,Viet Nam,VNM,WPR,Lower-middle,2018,5.03,164.24
4241,Viet Nam,VNM,WPR,Lower-middle,2019,4.97,173.65
4242,Viet Nam,VNM,WPR,Lower-middle,2020,4.30,154.24


In [6]:
# Check the data type and the non-null count of every column;
# a non-null count below the number of entries means the column has empty cells.
GHED_data_or_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4244 entries, 0 to 4243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     4244 non-null   object 
 1   code        4244 non-null   object 
 2   region      4244 non-null   object 
 3   income      4244 non-null   object 
 4   year        4244 non-null   int32  
 5   che_gdp     4154 non-null   float64
 6   che_pc_usd  4153 non-null   float64
dtypes: float64(2), int32(1), object(4)
memory usage: 215.6+ KB


In [7]:
# Check for duplicates: `duplicated()` flags rows that repeat an earlier row in
# all columns; `value_counts()` then counts the flagged (True) and unique (False) rows.
GHED_data_or_filtered.duplicated().value_counts()

False    4244
Name: count, dtype: int64

In [8]:
# Delete the rows containing empty cells.
# `dropna` returns a new DataFrame and leaves the original untouched, so the
# result must be assigned back; otherwise the incomplete rows silently remain
# in the frame and flow into every later step.
# The unfiltered frame can be brought back at any time with `%store -r`.
GHED_data_or_filtered = GHED_data_or_filtered.dropna(axis=0)

# Display the cleaned frame
GHED_data_or_filtered

Unnamed: 0,country,code,region,income,year,che_gdp,che_pc_usd
0,Algeria,DZA,AFR,Lower-middle,2000,3.49,62.12
1,Algeria,DZA,AFR,Lower-middle,2001,3.84,67.34
2,Algeria,DZA,AFR,Lower-middle,2002,3.73,66.95
3,Algeria,DZA,AFR,Lower-middle,2003,3.60,76.24
4,Algeria,DZA,AFR,Lower-middle,2004,3.54,93.02
...,...,...,...,...,...,...,...
4239,Viet Nam,VNM,WPR,Lower-middle,2017,4.99,149.31
4240,Viet Nam,VNM,WPR,Lower-middle,2018,5.03,164.24
4241,Viet Nam,VNM,WPR,Lower-middle,2019,4.97,173.65
4242,Viet Nam,VNM,WPR,Lower-middle,2020,4.30,154.24


In [9]:
# List the distinct income categories
GHED_data_or_filtered['income'].unique()

array(['Lower-middle', 'Upper-middle', 'Low', 'High'], dtype=object)

In [10]:
# List the distinct region codes
GHED_data_or_filtered['region'].unique()

array(['AFR', 'AMR', 'EMR', 'EUR', 'SEAR', 'WPR'], dtype=object)

AFR: African Region  
AMR: Region of the Americas  
EMR: Eastern Mediterranean Region  
EUR: European Region  
SEAR: South-East Asia Region  
WPR: Western Pacific Region

che_gdp: Current Health Expenditure (CHE) as % of Gross Domestic Product (GDP)  
che_pc_usd: Current Health Expenditure (CHE) per Capita in US$

In [11]:
# Average 'che_gdp' and 'che_pc_usd' for every combination of region and year
# (rounded to 2 decimals), then turn the 'region'/'year' group keys back into
# ordinary columns.
GHED_groupby_region_per_year = (
    GHED_data_or_filtered
    .groupby(['region', 'year'])[['che_gdp', 'che_pc_usd']]
    .mean()
    .round(2)
    .reset_index()
)

# Display the result
GHED_groupby_region_per_year


Unnamed: 0,region,year,che_gdp,che_pc_usd
0,AFR,2000,4.37,46.64
1,AFR,2001,4.59,46.85
2,AFR,2002,4.73,48.54
3,AFR,2003,4.91,62.08
4,AFR,2004,5.10,73.55
...,...,...,...,...
130,WPR,2018,7.07,1103.32
131,WPR,2019,7.35,1123.54
132,WPR,2020,7.68,1166.48
133,WPR,2021,8.19,1336.40


In [12]:
# Average the per-year regional means over all available years, per region.
# Note: the input holds one (already rounded) mean per region and year, so
# every year carries the same weight in this average.
#
# To restrict the calculation to some of the years, filter first, e.g.:
# filtered_data = GHED_data_or_filtered[GHED_data_or_filtered['year'].isin([2021, 2022])]
GHED_groupby_region_all_years = (
    GHED_groupby_region_per_year
    .groupby('region')[['che_gdp', 'che_pc_usd']]  # the 'year' column is left out
    .mean()
    .reset_index()  # make 'region' a column instead of the index
    .round(2)
)

# Display the result
GHED_groupby_region_all_years


Unnamed: 0,region,che_gdp,che_pc_usd
0,AFR,5.2,101.14
1,AMR,6.83,1074.27
2,EMR,4.87,454.18
3,EUR,7.5,2227.41
4,SEAR,4.25,123.96
5,WPR,7.09,861.74


In [13]:
# # push to cloud
# dataframes = {'GHED_groupby_region_per_year': GHED_groupby_region_per_year,
#     'GHED_groupby_region_all_years': GHED_groupby_region_all_years}

# for table_name, df in dataframes.items():
#     sf.push_to_cloud(df, table_name)

---
---

In [14]:
import python_functions_sp as pfsp

## Splitting the countries into continents
- North America
- South America
- Europe (split into four parts for further analysis)
- Africa
- Asia
- Pacific
- Middle East

Creating a copy with a meaningful name:

In [15]:
# Work on an independent copy (`copy()` is deep by default), so the changes
# made to GHED_continents below do not alter GHED_data_or_filtered.
GHED_continents = GHED_data_or_filtered.copy()

In [16]:
# Remove the 'region' column. `errors='ignore'` keeps the cell re-runnable:
# without it, executing the cell a second time raises a KeyError because the
# column is already gone.
GHED_continents = GHED_continents.drop(columns='region', errors='ignore')

In [17]:
# Check that the 'region' column is no longer part of the frame
GHED_continents

Unnamed: 0,country,code,income,year,che_gdp,che_pc_usd
0,Algeria,DZA,Lower-middle,2000,3.49,62.12
1,Algeria,DZA,Lower-middle,2001,3.84,67.34
2,Algeria,DZA,Lower-middle,2002,3.73,66.95
3,Algeria,DZA,Lower-middle,2003,3.60,76.24
4,Algeria,DZA,Lower-middle,2004,3.54,93.02
...,...,...,...,...,...,...
4239,Viet Nam,VNM,Lower-middle,2017,4.99,149.31
4240,Viet Nam,VNM,Lower-middle,2018,5.03,164.24
4241,Viet Nam,VNM,Lower-middle,2019,4.97,173.65
4242,Viet Nam,VNM,Lower-middle,2020,4.30,154.24


Assigning a continent to each country in a newly created column:

In [18]:
# Run every country name through `pfsp.assign_continent` and append the
# returned labels as the 'continent' column.
GHED_continents = GHED_continents.assign(
    continent=GHED_continents['country'].apply(pfsp.assign_continent)
)

Assigning one of the four parts of Europe to the European countries in another newly created column:

In [19]:
# Run every country name through `pfsp.assign_europe_region` and append the
# returned labels as the 'continent_region' column.
GHED_continents = GHED_continents.assign(
    continent_region=GHED_continents['country'].apply(pfsp.assign_europe_region)
)

In [20]:
# Preview 10 random rows; the fixed seed makes the preview reproducible
# across runs (an unseeded sample shows different rows every time).
GHED_continents.sample(10, random_state=42)

Unnamed: 0,country,code,income,year,che_gdp,che_pc_usd,continent,continent_region
519,Kenya,KEN,Lower-middle,2013,4.93,67.89,Africa,Africa
2686,Hungary,HUN,High,2017,6.74,985.0,Europe,East Europe
3420,Uzbekistan,UZB,Lower-middle,2014,4.43,118.0,Asia,Asia
3507,Indonesia,IDN,Lower-middle,2013,2.87,103.38,Pacific,Pacific
1447,Guyana,GUY,Upper-middle,2015,4.02,228.02,South America,South America
3688,Brunei Darussalam,BRN,High,2018,2.41,753.81,Middle East,Middle East
1745,United States of America,USA,High,2005,14.58,6404.19,North America,North America
3895,Micronesia (Federated States of),FSM,Lower-middle,2005,17.73,399.61,Pacific,Pacific
388,Ethiopia,ETH,Low,2014,4.03,21.9,Africa,Africa
1927,Iraq,IRQ,Upper-middle,2010,3.24,143.53,Asia,Asia


Changing the order of the columns:

In [21]:
# Desired column order: identifiers first, then the grouping columns,
# then the year and the two expenditure measures.
columns = [
    'code', 'country', 'continent', 'continent_region',
    'income', 'year', 'che_gdp', 'che_pc_usd',
]
GHED_continents = GHED_continents.loc[:, columns]

Checking the result:

In [22]:
# Preview 10 random rows; the fixed seed makes the preview reproducible
# across runs (an unseeded sample shows different rows every time).
GHED_continents.sample(10, random_state=42)

Unnamed: 0,code,country,continent,continent_region,income,year,che_gdp,che_pc_usd
1222,CHL,Chile,South America,South America,High,2011,6.8,994.67
11,DZA,Algeria,Africa,Africa,Lower-middle,2011,5.27,288.31
1831,BHR,Bahrain,Middle East,Middle East,High,2002,3.68,471.58
110,BDI,Burundi,Africa,Africa,Low,2000,4.22,5.83
122,BDI,Burundi,Africa,Africa,Low,2012,8.64,20.57
3696,KHM,Cambodia,Pacific,Pacific,Lower-middle,2004,7.08,29.03
2828,LVA,Latvia,Europe,North Europe,High,2002,5.76,237.0
3947,NRU,Nauru,Other,Other,High,2013,9.85,818.12
3301,TJK,Tajikistan,Asia,Asia,Lower-middle,2006,5.03,20.0
3550,MMR,Myanmar,Pacific,Pacific,Lower-middle,2012,2.41,37.0


Check if some countries couldn't be assigned to a continent:

In [23]:
# Distinct countries whose 'continent' is labelled 'Other'
GHED_continents.loc[GHED_continents['continent'].eq('Other'), 'country'].unique()

array(['Maldives', 'Cook Islands', 'Kiribati', 'Marshall Islands',
       'Nauru', 'Niue', 'Palau', 'Samoa', 'Solomon Islands', 'Tonga',
       'Tuvalu', 'Vanuatu'], dtype=object)

In [24]:
# Distinct countries whose 'continent_region' is labelled 'Other'
GHED_continents.loc[GHED_continents['continent_region'].eq('Other'), 'country'].unique()

array(['Maldives', 'Cook Islands', 'Kiribati', 'Marshall Islands',
       'Nauru', 'Niue', 'Palau', 'Samoa', 'Solomon Islands', 'Tonga',
       'Tuvalu', 'Vanuatu'], dtype=object)

---

Check if the newly created columns match:

In [25]:
# NOTE(review): `check_values` comes from python_functions_sp and is not shown here;
# presumably it compares 'continent' with 'continent_region' — confirm.
pfsp.check_values(GHED_continents)

All values match.


---

Writing the table into the database:

In [26]:
# sf.push_to_cloud(GHED_continents, 'ghed_continents')

In [27]:
# Bind the frame to a lowercase name; this is the same object, not a copy.
ghed_continents = GHED_continents

# Save it in IPython's persistent store so that it can be restored later
# with `%store -r ghed_continents`.
%store ghed_continents

Stored 'ghed_continents' (DataFrame)
