In [3]:
import pandas as pd
import sqlalchemy
import psycopg2
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sql_functions_sp as sf
import python_functions_sp as pf

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

### Import data from Postgres ###

In [4]:
schema = 'capstone_health_education'
table = 'HFA_data_p1'

sql_query = f'SELECT * FROM {schema}."{table}";'
HFA_data_p1 = sf.get_dataframe(sql_query)

In [5]:
schema = 'capstone_health_education'
table = 'HFA_data_p2'

sql_query2 = f'SELECT * FROM {schema}."{table}";'
HFA_data_p2 = sf.get_dataframe(sql_query2)

In [6]:
schema = 'capstone_health_education'
table = 'HFA_data_p3'

sql_query3 = f'SELECT * FROM {schema}."{table}";'
HFA_data_p3 = sf.get_dataframe(sql_query3)

In [7]:
dataframes = [HFA_data_p1, HFA_data_p2, HFA_data_p3]

pf.columns_lower_snake_case_2(dataframes)

Unnamed: 0,measure_code,sex,yes_no,country_region,1959,1960,1961,1962,1963,1964,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2025
0,HFA_535,ALL,,ALB,,,,,,,...,,,,,,,,,,
1,HFA_535,ALL,,AND,,,,,,,...,8034.0,8044.0,8097.0,8290.0,8482.0,7601.0,,,,
2,HFA_535,ALL,,ARM,,,,,,,...,406552.0,393540.0,399734.0,397633.0,406393.0,435470.0,394470.0,474039.0,,
3,HFA_535,ALL,,AUT,,,,,,,...,2244340.0,,,,,,,,,
4,HFA_535,ALL,,AZE,,,,,,,...,633572.0,629058.0,636481.0,640812.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6074,HFA_640,FEMALE,,UZB,,,,,,,...,137.0,139.0,126.0,150.0,155.0,145.0,,,,
6075,HFA_640,FEMALE,,CARINFONET,,,,,,,...,346.0,332.0,295.0,310.0,329.0,310.0,,,,
6076,HFA_640,FEMALE,,CIS,,,,,,,...,,,,,,,,,,
6077,HFA_640,FEMALE,,EU_BEFORE_MAY2004,,,,,,,...,210.0,181.0,,,,,,,,


In [58]:
HFA_data_p1.columns

Index(['measure_code', 'sex', 'country_region', '1949', '1950', '1951', '1952',
       '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970',
       '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
       '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
       '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2025', '2030',
       '2035', '2040', '2045', '2050'],
      dtype='object')

In [59]:
HFA_data_p2.columns

Index(['measure_code', 'sex', 'place_residence', 'country_region', '1960',
       '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969',
       '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978',
       '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
       '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
       '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'],
      dtype='object')

In [10]:
HFA_data_p3.shape

(6079, 69)

In [11]:
frames = [HFA_data_p1, HFA_data_p2, HFA_data_p3]
HFA_df = pd.concat(frames)

In [12]:
HFA_df.shape

(36986, 85)

In [13]:
HFA_df.head()

Unnamed: 0,measure_code,sex,country_region,1949,1950,1951,1952,1953,1954,1955,...,2021,2022,2025,2030,2035,2040,2045,2050,place_residence,yes_no
0,HFA_1,ALL,ALB,,,,,,,,...,2811666.0,2842321.0,,,,,,,,
1,HFA_1,ALL,AND,,,,,,,,...,79034.0,79824.0,,,,,,,,
2,HFA_1,ALL,ARM,,,,,,,,...,2962309.0,2780469.0,,,,,,,,
3,HFA_1,ALL,AUT,,,,,,,,...,8951520.0,8939617.0,,,,,,,,
4,HFA_1,ALL,AZE,,,,,,,,...,10312992.0,10358074.0,,,,,,,,


In [61]:
HFA_df['place_residence'].info()

<class 'pandas.core.series.Series'>
Int64Index: 36986 entries, 0 to 6078
Series name: place_residence
Non-Null Count  Dtype 
--------------  ----- 
378 non-null    object
dtypes: object(1)
memory usage: 577.9+ KB


#### Reducing the dataset #####

In [70]:
columns_to_keep = ['measure_code', 'sex', 'country_region', '2000', '2001','2002',	'2003',	'2004',	'2005',	'2006',	'2007',	'2008',	'2009',	'2010', '2011', '2012',	'2013',	'2014',	'2015',	'2016',	'2017',	'2018',	'2019',	'2020',	'2021',	'2022']

hfa_df = HFA_df[columns_to_keep]

In [71]:
hfa_df

Unnamed: 0,measure_code,sex,country_region,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,HFA_1,ALL,ALB,3114000.0,3069275.0,3084150.0,3102764.0,3127263.0,3011490.0,2992551.0,...,2896655.0,2889676.0,2889173.0,2903700.0,2870324.0,2862427.0,2862427.0,2862427.0,2811666.0,2842321.0
1,HFA_1,ALL,AND,65907.0,66087.0,66744.0,69739.5,74597.5,77712.0,79885.0,...,76172.0,72785.0,77481.0,77872.0,79236.0,80242.0,81011.0,81011.0,79034.0,79824.0
2,HFA_1,ALL,ARM,3226899.0,3215312.0,3212878.0,3211267.0,3214030.0,3217534.0,3230086.0,...,3021979.0,3006154.0,3004588.0,3026048.0,2979442.0,2969001.0,2962482.0,2961473.0,2962309.0,2780469.0
3,HFA_1,ALL,AUT,8011566.0,8042293.0,8082121.0,8118245.0,8169441.0,8225278.0,8267948.0,...,8477230.0,8543932.0,8629519.0,8739806.0,8795073.0,8837707.0,8877637.0,8916845.0,8951520.0,8939617.0
4,HFA_1,ALL,AZE,8048600.0,8111200.0,8171950.0,8234100.0,8306500.0,8391850.0,8609450.0,...,9416800.0,9629779.0,9649300.0,9757790.5,9854050.0,9854050.0,9854050.0,9854050.0,10312992.0,10358074.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6074,HFA_640,FEMALE,UZB,174.0,175.0,174.0,164.0,163.0,156.0,137.0,...,136.0,137.0,139.0,126.0,150.0,155.0,145.0,,,
6075,HFA_640,FEMALE,CARINFONET,481.0,452.0,461.0,402.0,399.0,411.0,435.0,...,331.0,346.0,332.0,295.0,310.0,329.0,310.0,,,
6076,HFA_640,FEMALE,CIS,1181.0,1085.0,1070.0,999.0,881.0,928.0,937.0,...,650.0,,,,,,,,,
6077,HFA_640,FEMALE,EU_BEFORE_MAY2004,,,204.0,221.0,215.0,179.0,223.0,...,203.0,210.0,181.0,,,,,,,


In [72]:
list_of_objects = [ 'HFA_1',
'HFA_2',
'HFA_3',
'HFA_10',
'HFA_11',
'HFA_12',
'HFA_13',
'HFA_14',
'HFA_15',
'HFA_26',
'HFA_27',
'HFA_28',
'HFA_29',
'HFA_30',
'HFA_32',
'HFA_33',
'HFA_35',
'HFA_36',
'HFA_37',
'HFA_38',
'HFA_39',
'HFA_40',
'HFA_41',
'HFA_43',
'HFA_44',
'HFA_45',
'HFA_357',
'HFA_391',
'HFA_617',
'HFA_618',
'HFA_636',
'HFA_634',
'HFA_635' ]

hfa_eduHealth[list_of_objects] = hfa_eduHealth[list_of_objects].apply(pd.to_numeric, errors='coerce')

#### Reducing measure codes ####

In [73]:
measure_codes_to_keep = ['HFA_1',
'HFA_2',
'HFA_3',
'HFA_10',
'HFA_11',
'HFA_12',
'HFA_13',
'HFA_14',
'HFA_15',
'HFA_26',
'HFA_27',
'HFA_28',
'HFA_29',
'HFA_30',
'HFA_32',
'HFA_33',
'HFA_35',
'HFA_36',
'HFA_37',
'HFA_38',
'HFA_39',
'HFA_40',
'HFA_41',
'HFA_43',
'HFA_44',
'HFA_45',
'HFA_357',
'HFA_391',
'HFA_617',
'HFA_618',
'HFA_636',
'HFA_634',
'HFA_635'
]


hfa_filtered = hfa_df[hfa_df['measure_code'].isin(measure_codes_to_keep)]

In [74]:
hfa_filtered.shape

(2024, 26)

In [75]:
hfa_filtered.sample(15)

Unnamed: 0,measure_code,sex,country_region,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1489,HFA_29,ALL,EU_AFTER_MAY2004,12.4,12.94,13.34,13.43,12.93,11.76,9.72,...,10.83,9.62,8.35,7.11,5.77,5.12,4.7,5.5,,
1626,HFA_32,ALL,FIN,26480.0,25850.0,25620.0,28800.0,35380.0,40230.0,42840.0,...,49510.0,49420.0,47180.0,45530.0,44840.0,48220.0,49940.0,50080.0,53660.0,
5793,HFA_636,ALL,BIH,65.49486,,,,,65.62045,,...,,,66.93045,67.17299,,,,,,
1524,HFA_30,ALL,LUX,52.1,51.9,52.5,51.2,51.3,52.3,52.7,...,55.9,56.6,56.0,55.2,56.0,56.5,57.3,56.7,58.7,58.95
1426,HFA_28,ALL,EU_BEFORE_MAY2004,3240496.0,3240516.0,3240826.0,3241176.0,3240886.0,3240906.0,3241206.0,...,3239081.0,3239122.0,3239171.0,3239357.0,3239357.0,3240109.0,,,,
1323,HFA_27,ALL,HUN,109.76,109.51,109.2,108.88,108.64,108.43,108.26,...,106.34,106.3,105.93,105.57,105.21,105.08,,,,
4781,HFA_618,ALL,SRB,,,,,,,0.16,...,3.5,1.22,0.79,0.83,1.03,1.43,,,,
1726,HFA_33,ALL,UKR,658.35,807.8,911.91,1087.79,1416.6,1894.47,2391.32,...,4187.74,3104.64,2124.66,2187.73,2638.33,3096.56,3661.46,3751.74,4835.57,
2010,HFA_38,FEMALE,NLD,,,,,,,,...,,,,,,,,,,
2439,HFA_45,FEMALE,LTU,77.45,77.5,77.48,77.78,77.8,77.49,77.15,...,79.63,80.17,79.81,80.2,80.56,80.82,81.3,80.16,78.89,


In [76]:
hfa_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2024 entries, 0 to 5845
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   measure_code    2024 non-null   object 
 1   sex             2024 non-null   object 
 2   country_region  2024 non-null   object 
 3   2000            1615 non-null   float64
 4   2001            1393 non-null   float64
 5   2002            1374 non-null   float64
 6   2003            1370 non-null   float64
 7   2004            1391 non-null   float64
 8   2005            1669 non-null   float64
 9   2006            1390 non-null   float64
 10  2007            1439 non-null   float64
 11  2008            1424 non-null   float64
 12  2009            1470 non-null   float64
 13  2010            1715 non-null   float64
 14  2011            1538 non-null   float64
 15  2012            1463 non-null   float64
 16  2013            1445 non-null   float64
 17  2014            1487 non-null   f

### Naming the measure codes ###

In [77]:
schema = 'capstone_health_education'
table = 'HFA_metadata_sh4'

# sql_query = f'SELECT * FROM {schema}."{table}";'
sql_query = f'SELECT "Measure labels", "Unnamed: 1" FROM {schema}."{table}" WHERE "Measure labels" LIKE \'HFA%%\';'
measure_names = sf.get_dataframe(sql_query)

In [78]:
measure_names

Unnamed: 0,Measure labels,Unnamed: 1
0,HFA_1,"Mid-year population, by sex"
1,HFA_2,Mid-year male population
2,HFA_3,Mid-year female population
3,HFA_10,"% of population aged 0–14 years, by sex"
4,HFA_11,"% of population aged 0–14 years, males"
...,...,...
616,HFA_639,"Youth unemployment rate, % of labor force ages..."
617,HFA_640,"Number of maternal deaths, clinical data"
618,HFA,European Health for All database
619,HFA,European Health for All database (WHO)


In [79]:
measure_names.rename(columns={'Measure labels': 'measure_code', 'Unnamed: 1': 'measure_label'}, inplace=True)

In [80]:
hfa_filtered = pd.merge(hfa_filtered, measure_names, on='measure_code', how='left')

In [81]:
hfa_filtered

Unnamed: 0,measure_code,sex,country_region,2000,2001,2002,2003,2004,2005,2006,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,measure_label
0,HFA_1,ALL,ALB,3.114000e+06,3069275.0,3084150.0,3102764.0,3127263.0,3.011490e+06,2992551.0,...,2889676.0,2.889173e+06,2.903700e+06,2870324.0,2862427.0,2862427.0,2862427.0,2811666.0,2842321.0,"Mid-year population, by sex"
1,HFA_1,ALL,AND,6.590700e+04,66087.0,66744.0,69739.5,74597.5,7.771200e+04,79885.0,...,72785.0,7.748100e+04,7.787200e+04,79236.0,80242.0,81011.0,81011.0,79034.0,79824.0,"Mid-year population, by sex"
2,HFA_1,ALL,ARM,3.226899e+06,3215312.0,3212878.0,3211267.0,3214030.0,3.217534e+06,3230086.0,...,3006154.0,3.004588e+06,3.026048e+06,2979442.0,2969001.0,2962482.0,2961473.0,2962309.0,2780469.0,"Mid-year population, by sex"
3,HFA_1,ALL,AUT,8.011566e+06,8042293.0,8082121.0,8118245.0,8169441.0,8.225278e+06,8267948.0,...,8543932.0,8.629519e+06,8.739806e+06,8795073.0,8837707.0,8877637.0,8916845.0,8951520.0,8939617.0,"Mid-year population, by sex"
4,HFA_1,ALL,AZE,8.048600e+06,8111200.0,8171950.0,8234100.0,8306500.0,8.391850e+06,8609450.0,...,9629779.0,9.649300e+06,9.757790e+06,9854050.0,9854050.0,9854050.0,9854050.0,10312992.0,10358074.0,"Mid-year population, by sex"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,HFA_636,ALL,EU_MEMBERS,6.853580e+01,,,,,6.958801e+01,,...,,7.130116e+01,7.156985e+01,,,,,,,Healthy life expectancy (HALE) at birth
2020,HFA_636,ALL,NORDIC,6.949645e+01,,,,,7.051491e+01,,...,,7.211325e+01,7.226660e+01,,,,,,,Healthy life expectancy (HALE) at birth
2021,HFA_636,ALL,SEEHN,6.421061e+01,,,,,6.512492e+01,,...,,6.721519e+01,6.760334e+01,,,,,,,Healthy life expectancy (HALE) at birth
2022,HFA_636,ALL,SMALL,6.878041e+01,,,,,6.964781e+01,,...,,7.169441e+01,7.180518e+01,,,,,,,Healthy life expectancy (HALE) at birth


#### Giving full names to countries and regions ####

In [82]:
schema = 'capstone_health_education'
table = 'HFA_metadata_sh5'

sql_query = f'SELECT * FROM {schema}."{table}";'
country_names = sf.get_dataframe(sql_query)

In [83]:
country_names.drop(['ISO 2', 'ISO 3', 'WHO code', 'Short name'], axis=1, inplace=True)

In [84]:
country_names.rename(columns={'Code': 'country_region', 'Full name': 'name'}, inplace=True)

In [85]:
schema = 'capstone_health_education'
table = 'HFA_metadata_sh6'

sql_query = f'SELECT * FROM {schema}."{table}";'
region_names = sf.get_dataframe(sql_query)

In [86]:
region_names.drop(['Full name'], axis=1, inplace=True)

In [87]:
region_names.rename(columns={'Code': 'country_region', 'Short name': 'name'}, inplace=True)

In [88]:
cat = pd.concat([country_names, region_names], ignore_index=True)

In [89]:
hfa_filtered = pd.merge(hfa_filtered, cat, on='country_region', how='left').copy()

In [90]:
hfa_filtered

Unnamed: 0,measure_code,sex,country_region,2000,2001,2002,2003,2004,2005,2006,...,2015,2016,2017,2018,2019,2020,2021,2022,measure_label,name
0,HFA_1,ALL,ALB,3.114000e+06,3069275.0,3084150.0,3102764.0,3127263.0,3.011490e+06,2992551.0,...,2.889173e+06,2.903700e+06,2870324.0,2862427.0,2862427.0,2862427.0,2811666.0,2842321.0,"Mid-year population, by sex",Albania
1,HFA_1,ALL,AND,6.590700e+04,66087.0,66744.0,69739.5,74597.5,7.771200e+04,79885.0,...,7.748100e+04,7.787200e+04,79236.0,80242.0,81011.0,81011.0,79034.0,79824.0,"Mid-year population, by sex",Andorra
2,HFA_1,ALL,ARM,3.226899e+06,3215312.0,3212878.0,3211267.0,3214030.0,3.217534e+06,3230086.0,...,3.004588e+06,3.026048e+06,2979442.0,2969001.0,2962482.0,2961473.0,2962309.0,2780469.0,"Mid-year population, by sex",Armenia
3,HFA_1,ALL,AUT,8.011566e+06,8042293.0,8082121.0,8118245.0,8169441.0,8.225278e+06,8267948.0,...,8.629519e+06,8.739806e+06,8795073.0,8837707.0,8877637.0,8916845.0,8951520.0,8939617.0,"Mid-year population, by sex",Austria
4,HFA_1,ALL,AZE,8.048600e+06,8111200.0,8171950.0,8234100.0,8306500.0,8.391850e+06,8609450.0,...,9.649300e+06,9.757790e+06,9854050.0,9854050.0,9854050.0,9854050.0,10312992.0,10358074.0,"Mid-year population, by sex",Azerbaijan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,HFA_636,ALL,EU_MEMBERS,6.853580e+01,,,,,6.958801e+01,,...,7.130116e+01,7.156985e+01,,,,,,,Healthy life expectancy (HALE) at birth,Members of the European Union
2020,HFA_636,ALL,NORDIC,6.949645e+01,,,,,7.051491e+01,,...,7.211325e+01,7.226660e+01,,,,,,,Healthy life expectancy (HALE) at birth,Nordic countries
2021,HFA_636,ALL,SEEHN,6.421061e+01,,,,,6.512492e+01,,...,6.721519e+01,6.760334e+01,,,,,,,Healthy life expectancy (HALE) at birth,South-eastern Europe Health Network members (S...
2022,HFA_636,ALL,SMALL,6.878041e+01,,,,,6.964781e+01,,...,7.169441e+01,7.180518e+01,,,,,,,Healthy life expectancy (HALE) at birth,Small countries


### Melt and pivot the data frame ###

In [91]:
hfa_melted = pd.melt(hfa_filtered,
                      id_vars=['sex', 'country_region', 'name', 'measure_code', 'measure_label'],
                      value_vars=['2000','2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'],
                      var_name='years',
                      value_name='value'
                      )

In [92]:
hfa_melted

Unnamed: 0,sex,country_region,name,measure_code,measure_label,years,value
0,ALL,ALB,Albania,HFA_1,"Mid-year population, by sex",2000,3114000.0
1,ALL,AND,Andorra,HFA_1,"Mid-year population, by sex",2000,65907.0
2,ALL,ARM,Armenia,HFA_1,"Mid-year population, by sex",2000,3226899.0
3,ALL,AUT,Austria,HFA_1,"Mid-year population, by sex",2000,8011566.0
4,ALL,AZE,Azerbaijan,HFA_1,"Mid-year population, by sex",2000,8048600.0
...,...,...,...,...,...,...,...
46547,ALL,EU_MEMBERS,Members of the European Union,HFA_636,Healthy life expectancy (HALE) at birth,2022,
46548,ALL,NORDIC,Nordic countries,HFA_636,Healthy life expectancy (HALE) at birth,2022,
46549,ALL,SEEHN,South-eastern Europe Health Network members (S...,HFA_636,Healthy life expectancy (HALE) at birth,2022,
46550,ALL,SMALL,Small countries,HFA_636,Healthy life expectancy (HALE) at birth,2022,


In [99]:
hfa_melted_pivoted = pd.pivot(hfa_melted,
                       columns='measure_code',
                       index=['name', 'years', 'sex'],
                       values='value'
                       )

In [100]:
hfa_eduHealth = hfa_melted_pivoted

In [101]:
hfa_eduHealth

Unnamed: 0_level_0,Unnamed: 1_level_0,measure_code,HFA_1,HFA_10,HFA_11,HFA_12,HFA_13,HFA_14,HFA_15,HFA_2,HFA_26,HFA_27,...,HFA_40,HFA_41,HFA_43,HFA_44,HFA_45,HFA_617,HFA_618,HFA_634,HFA_635,HFA_636
name,years,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Albania,2000,ALL,3114000.0,32.67,,,6.11,,,,41.7,108.31,...,,,75.10,,,,3.49,,,64.93581
Albania,2000,FEMALE,,,,31.15,,,6.89,,,,...,,,,,78.24,,,67.20689,,
Albania,2000,MALE,,,34.11,,,5.37,,1592000.0,,,...,,,,72.22,,,,,62.91314,
Albania,2001,ALL,3069275.0,29.29,,,7.54,,,,,106.76,...,,68.3,77.18,,,,5.52,,,
Albania,2001,FEMALE,,,,28.43,,,8.01,,,,...,,,,,80.45,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Western Balkans,2021,FEMALE,,,,,,,,,,,...,,,,,,,,,,
Western Balkans,2021,MALE,,,,,,,,7691590.0,,,...,,,,,,,,,,
Western Balkans,2022,ALL,16017893.0,,,,,,,,,,...,,,,,,,,,,
Western Balkans,2022,FEMALE,,,,,,,,,,,...,,,,,,,,,,


In [102]:
hfa_eduHealth.reset_index(inplace=True)

In [103]:
hfa_eduHealth

measure_code,name,years,sex,HFA_1,HFA_10,HFA_11,HFA_12,HFA_13,HFA_14,HFA_15,...,HFA_40,HFA_41,HFA_43,HFA_44,HFA_45,HFA_617,HFA_618,HFA_634,HFA_635,HFA_636
0,Albania,2000,ALL,3114000.0,32.67,,,6.11,,,...,,,75.10,,,,3.49,,,64.93581
1,Albania,2000,FEMALE,,,,31.15,,,6.89,...,,,,,78.24,,,67.20689,,
2,Albania,2000,MALE,,,34.11,,,5.37,,...,,,,72.22,,,,,62.91314,
3,Albania,2001,ALL,3069275.0,29.29,,,7.54,,,...,,68.3,77.18,,,,5.52,,,
4,Albania,2001,FEMALE,,,,28.43,,,8.01,...,,,,,80.45,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4411,Western Balkans,2021,FEMALE,,,,,,,,...,,,,,,,,,,
4412,Western Balkans,2021,MALE,,,,,,,,...,,,,,,,,,,
4413,Western Balkans,2022,ALL,16017893.0,,,,,,,...,,,,,,,,,,
4414,Western Balkans,2022,FEMALE,,,,,,,,...,,,,,,,,,,


In [104]:
hfa_eduHealth.columns.name=None

In [105]:
hfa_eduHealth

Unnamed: 0,name,years,sex,HFA_1,HFA_10,HFA_11,HFA_12,HFA_13,HFA_14,HFA_15,...,HFA_40,HFA_41,HFA_43,HFA_44,HFA_45,HFA_617,HFA_618,HFA_634,HFA_635,HFA_636
0,Albania,2000,ALL,3114000.0,32.67,,,6.11,,,...,,,75.10,,,,3.49,,,64.93581
1,Albania,2000,FEMALE,,,,31.15,,,6.89,...,,,,,78.24,,,67.20689,,
2,Albania,2000,MALE,,,34.11,,,5.37,,...,,,,72.22,,,,,62.91314,
3,Albania,2001,ALL,3069275.0,29.29,,,7.54,,,...,,68.3,77.18,,,,5.52,,,
4,Albania,2001,FEMALE,,,,28.43,,,8.01,...,,,,,80.45,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4411,Western Balkans,2021,FEMALE,,,,,,,,...,,,,,,,,,,
4412,Western Balkans,2021,MALE,,,,,,,,...,,,,,,,,,,
4413,Western Balkans,2022,ALL,16017893.0,,,,,,,...,,,,,,,,,,
4414,Western Balkans,2022,FEMALE,,,,,,,,...,,,,,,,,,,


In [106]:
hfa_eduHealth['sex']

0          ALL
1       FEMALE
2         MALE
3          ALL
4       FEMALE
         ...  
4411    FEMALE
4412      MALE
4413       ALL
4414    FEMALE
4415      MALE
Name: sex, Length: 4416, dtype: object

In [107]:
hfa_eduHealth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4416 entries, 0 to 4415
Data columns (total 36 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     4416 non-null   object 
 1   years    4416 non-null   object 
 2   sex      4416 non-null   object 
 3   HFA_1    1471 non-null   float64
 4   HFA_10   1389 non-null   float64
 5   HFA_11   1389 non-null   float64
 6   HFA_12   1140 non-null   float64
 7   HFA_13   1385 non-null   float64
 8   HFA_14   1385 non-null   float64
 9   HFA_15   1135 non-null   float64
 10  HFA_2    1459 non-null   float64
 11  HFA_26   310 non-null    float64
 12  HFA_27   1180 non-null   float64
 13  HFA_28   1180 non-null   float64
 14  HFA_29   1229 non-null   float64
 15  HFA_3    1459 non-null   float64
 16  HFA_30   1389 non-null   float64
 17  HFA_32   1321 non-null   float64
 18  HFA_33   1393 non-null   float64
 19  HFA_35   960 non-null    float64
 20  HFA_357  1135 non-null   float64
 21  HFA_36   151 n

#### categorizing continents and regions ####

In [108]:
hfa_eduHealth.rename(columns={'name': 'country'}, inplace=True)
hfa_eduHealth

Unnamed: 0,country,years,sex,HFA_1,HFA_10,HFA_11,HFA_12,HFA_13,HFA_14,HFA_15,...,HFA_40,HFA_41,HFA_43,HFA_44,HFA_45,HFA_617,HFA_618,HFA_634,HFA_635,HFA_636
0,Albania,2000,ALL,3114000.0,32.67,,,6.11,,,...,,,75.10,,,,3.49,,,64.93581
1,Albania,2000,FEMALE,,,,31.15,,,6.89,...,,,,,78.24,,,67.20689,,
2,Albania,2000,MALE,,,34.11,,,5.37,,...,,,,72.22,,,,,62.91314,
3,Albania,2001,ALL,3069275.0,29.29,,,7.54,,,...,,68.3,77.18,,,,5.52,,,
4,Albania,2001,FEMALE,,,,28.43,,,8.01,...,,,,,80.45,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4411,Western Balkans,2021,FEMALE,,,,,,,,...,,,,,,,,,,
4412,Western Balkans,2021,MALE,,,,,,,,...,,,,,,,,,,
4413,Western Balkans,2022,ALL,16017893.0,,,,,,,...,,,,,,,,,,
4414,Western Balkans,2022,FEMALE,,,,,,,,...,,,,,,,,,,


In [110]:
import pandas as pd
import numpy as np
import python_functions_sp as pfsp
import warnings
warnings.filterwarnings("ignore")
import sqlalchemy
import psycopg2
import sql_functions_sp as sfsp

In [111]:
hfa_eduHealth['continent'] = hfa_eduHealth['country'].apply(pfsp.assign_continent)

In [112]:
hfa_eduHealth['continent_region'] = hfa_eduHealth['country'].apply(pfsp.assign_europe_region)

In [113]:
hfa_eduHealth[hfa_eduHealth['continent'] == 'Other']['country'].unique()

array(['Central Asian Republics Information Network members (CARINFONET)',
       'Commonwealth of Independent States',
       'Members of the EU after May 2004 (EU13)',
       'Members of the EU before Feb 2020',
       'Members of the EU before May 2004 (EU15)',
       'Members of the European Union', 'Nordic countries',
       'Small countries',
       'South-eastern Europe Health Network members (SEEHN)',
       'WHO European Region', 'Western Balkans'], dtype=object)

In [115]:
hfa_eduHealth[hfa_eduHealth['continent_region'] == 'Other']['country'].unique()

array(['Central Asian Republics Information Network members (CARINFONET)',
       'Commonwealth of Independent States',
       'Members of the EU after May 2004 (EU13)',
       'Members of the EU before Feb 2020',
       'Members of the EU before May 2004 (EU15)',
       'Members of the European Union', 'Nordic countries',
       'Small countries',
       'South-eastern Europe Health Network members (SEEHN)',
       'WHO European Region', 'Western Balkans'], dtype=object)

In [117]:
pfsp.check_values(hfa_eduHealth)

All values match.


In [118]:
sfsp.push_to_cloud(hfa_eduHealth, 'hfa_eduHealth')

The hfa_eduHealth table was imported successfully.


In [120]:
hfa_eduHealth['HFA_36']

0         NaN
1         NaN
2         NaN
3       98.71
4         NaN
        ...  
4411      NaN
4412      NaN
4413      NaN
4414      NaN
4415      NaN
Name: HFA_36, Length: 4416, dtype: float64