In [1]:
# Import our dependencies
import numpy as np
import pandas as pd
import polars as pl

In [2]:
#  Import and read the input csv
PharmDp10k_df = pd.read_csv('../../Raw_Data/WHO_Pharmacists_per10k-data.csv')
PharmDp10k_df.head(5)

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,HWF_0014,"Pharmacists (per 10,000)",numeric,AFR,Africa,Country,GNB,Guinea-Bissau,Year,2023,...,,,,,,0.01,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
1,HWF_0014,"Pharmacists (per 10,000)",numeric,AFR,Africa,Country,NER,Niger,Year,2023,...,,,,,,0.02,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
2,HWF_0014,"Pharmacists (per 10,000)",numeric,AFR,Africa,Country,CAF,Central African Republic,Year,2023,...,,,,,,0.03,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
3,HWF_0014,"Pharmacists (per 10,000)",numeric,AFR,Africa,Country,GMB,Gambia,Year,2023,...,,,,,,0.04,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
4,HWF_0014,"Pharmacists (per 10,000)",numeric,AFR,Africa,Country,TCD,Chad,Year,2023,...,,,,,,0.08,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z


In [3]:
# Drop the population estimate column
PharmDp10k_df = PharmDp10k_df.drop(['IndicatorCode','ValueType','ParentLocationCode','ParentLocation','Location type','SpatialDimValueCode','Period type'], axis=1)
PharmDp10k_df = PharmDp10k_df.drop(['FactValueUoM','FactValueNumericLowPrefix','FactValueNumericLow','FactValueNumericHighPrefix','FactValueNumericHigh'], axis=1)
PharmDp10k_df = PharmDp10k_df.drop(['FactValueTranslationID','FactComments','Language','DateModified','IsLatestYear','Dim1 type','Dim1','Dim2','Dim3',], axis=1)
PharmDp10k_df = PharmDp10k_df.drop(['Dim1ValueCode','Dim2ValueCode','Dim3ValueCode','Dim2 type','Dim3 type','DataSource','FactValueNumericPrefix'], axis=1)
PharmDp10k_df = PharmDp10k_df.drop(['FactValueNumeric','DataSourceDimValueCode','Indicator'], axis=1)
PharmDp10k_df.rename(columns ={'Location':'Country', 'Period':'Year', 'Value':'PharmD per 10k'}, inplace = True)
PharmDp10k_df

Unnamed: 0,Country,Year,PharmD per 10k
0,Guinea-Bissau,2023,0.01
1,Niger,2023,0.02
2,Central African Republic,2023,0.03
3,Gambia,2023,0.04
4,Chad,2023,0.08
...,...,...,...
2871,Kazakhstan,1990,8.50
2872,Iceland,1990,8.75
2873,Spain,1990,9.36
2874,France,1990,9.61


In [4]:
# Open the list of country name corrections
corrections_df = pd.read_csv('../../Clean_Data/master_country_list/country_name_corrections.csv')

# Convert the corrections dataframe to a dictionary.
correction_dict = dict(zip(corrections_df['wrong'], corrections_df['correct']))

In [5]:
# Apply the correction dictionary to fix the known errors
PharmDp10k_df['Country'] = PharmDp10k_df['Country'].replace(correction_dict)

In [6]:
# Open the master list of countries
countries_df = pd.read_csv('../../Clean_Data/master_country_list/country_profile_urls.csv')
countries_df = countries_df.drop(['profile_url'], axis=1)

In [7]:
country_list = countries_df['country'].tolist()
#country_list

In [8]:
# Create a list of MDp10k countries
PharmDp10k_countries = PharmDp10k_df['Country'].tolist()
#NMp10k_countries

In [9]:
# clean the FS data based on the SS country list
no_match = []

for country in PharmDp10k_countries:
    if country in country_list:
        continue
    else:
        no_match.append(country)

In [10]:
PharmDp10k_clean_df = PharmDp10k_df.drop(PharmDp10k_df[PharmDp10k_df['Country'].isin(no_match)].index.tolist())
PharmDp10k_clean_df

Unnamed: 0,Country,Year,PharmD per 10k
0,Guinea-Bissau,2023,0.01
1,Niger,2023,0.02
2,Central African Republic,2023,0.03
3,Gambia,2023,0.04
4,Chad,2023,0.08
...,...,...,...
2871,Kazakhstan,1990,8.50
2872,Iceland,1990,8.75
2873,Spain,1990,9.36
2874,France,1990,9.61


In [11]:
PharmDp10k_sorted_df = PharmDp10k_clean_df.sort_values(['Country','Year'], axis=0)

In [12]:
PharmDp10k_sorted_df.tail(40)

Unnamed: 0,Country,Year,PharmD per 10k
1840,Vietnam,2006,1.98
1760,Vietnam,2007,2.16
1662,Vietnam,2008,2.32
1550,Vietnam,2009,2.5
1442,Vietnam,2010,2.69
1247,Vietnam,2012,4.55
1039,Vietnam,2014,3.4
938,Vietnam,2015,3.4
829,Vietnam,2016,3.37
2505,Yemen,1997,0.33


In [13]:
# Create a complete dataframe with all countries and all the years.

# Define variables to collect the details.
#data_countries = MDp10k_sorted_df['Country'].unique()
data_countries = country_list
years = list(range(1990, 2024))

# Create a helper DataFrame.
helper_df = pd.DataFrame({'Country':np.repeat(data_countries, len(years)), 'Year':np.tile(years, len(data_countries))})

# Merge the helper DataFrame with the original data to ensure the data range is complete.
complete_df = pd.merge(helper_df, PharmDp10k_sorted_df, on=['Country','Year'], how='left')

In [14]:
# Convert to wide format
wide_df = complete_df.pivot(index='Year', columns='Country', values='PharmD per 10k')

# Flatten the hierarchical columns and create 'Country_Series' style column names
wide_df.columns = [f'{col}_PharmD per 10k' for col in wide_df.columns]

# Reset the index, so 'Year' becomes a column again
wide_df.reset_index(inplace=True)

# Convert columns to numeric, coerce non-numeric values to NaN
for col in wide_df.columns[0:]:
    wide_df[col] = pd.to_numeric(wide_df[col], errors='coerce')

# Convert to Polars DataFrame
wide_pl_df = pl.from_pandas(wide_df)
    
wide_pl_df.head(5)

Year,Albania_PharmD per 10k,Algeria_PharmD per 10k,Andorra_PharmD per 10k,Angola_PharmD per 10k,Antigua and Barbuda_PharmD per 10k,Argentina_PharmD per 10k,Armenia_PharmD per 10k,Aruba_PharmD per 10k,Australia_PharmD per 10k,Austria_PharmD per 10k,Azerbaijan_PharmD per 10k,Bahamas_PharmD per 10k,Bahrain_PharmD per 10k,Bangladesh_PharmD per 10k,Barbados_PharmD per 10k,Belarus_PharmD per 10k,Belgium_PharmD per 10k,Belize_PharmD per 10k,Benin_PharmD per 10k,Bermuda_PharmD per 10k,Bhutan_PharmD per 10k,Bolivia_PharmD per 10k,Bosnia and Herzegovina_PharmD per 10k,Botswana_PharmD per 10k,Brazil_PharmD per 10k,British Virgin Islands_PharmD per 10k,Brunei_PharmD per 10k,Bulgaria_PharmD per 10k,Burkina Faso_PharmD per 10k,Burundi_PharmD per 10k,Cabo Verde_PharmD per 10k,Cambodia_PharmD per 10k,Cameroon_PharmD per 10k,Canada_PharmD per 10k,Central African Republic_PharmD per 10k,Chad_PharmD per 10k,…,Serbia_PharmD per 10k,Seychelles_PharmD per 10k,Sierra Leone_PharmD per 10k,Singapore_PharmD per 10k,Slovakia_PharmD per 10k,Slovenia_PharmD per 10k,Solomon Islands_PharmD per 10k,South Africa_PharmD per 10k,South Korea_PharmD per 10k,Spain_PharmD per 10k,Sri Lanka_PharmD per 10k,Sudan_PharmD per 10k,Suriname_PharmD per 10k,Sweden_PharmD per 10k,Switzerland_PharmD per 10k,Syria_PharmD per 10k,Taiwan (China)_PharmD per 10k,Tajikistan_PharmD per 10k,Tanzania_PharmD per 10k,Thailand_PharmD per 10k,Togo_PharmD per 10k,Trinidad and Tobago_PharmD per 10k,Tunisia_PharmD per 10k,Turkmenistan_PharmD per 10k,Türkiye_PharmD per 10k,Uganda_PharmD per 10k,Ukraine_PharmD per 10k,United Kingdom_PharmD per 10k,United States_PharmD per 10k,Uruguay_PharmD per 10k,Uzbekistan_PharmD per 10k,Vanuatu_PharmD per 10k,Venezuela_PharmD per 10k,Vietnam_PharmD per 10k,Yemen_PharmD per 10k,Zambia_PharmD per 10k,Zimbabwe_PharmD per 10k
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1990,3.65,,,,,,2.12,,5.26,4.54,6.25,,,,,3.2,8.2,,,,,,1.83,,,,,4.95,,,,,,6.8,,,…,,,,,,3.26,,,,9.36,,,,5.94,5.31,,,1.16,,,,,1.49,,2.82,,,,,,,,,,,,0.34
1991,3.75,,,,,,2.16,,6.8,4.62,6.09,,,,,3.38,8.43,,,,,,1.82,,,,,3.72,,,,,,6.63,,,…,,,,,,,,,,9.6,,,,6.13,5.44,,,1.25,,0.78,,,1.53,,2.81,,,,,,,,,,,,
1992,3.81,,,,,3.12,2.05,,5.76,4.78,6.18,,,,,3.47,8.6,,,,,,,,,,,3.12,,,,,,6.8,,,…,,,,,,,,,,9.82,,,,6.25,5.55,,,1.13,,,,,1.53,,2.86,,,,,,,,,,,,
1993,3.89,,,,,,1.65,,5.75,4.9,5.93,,,,,3.46,8.84,,,,,,,,,,,2.81,,,,,,6.91,,,…,,,,,,3.59,,,,9.99,0.33,,,6.43,5.73,,,0.4,,0.82,,,1.49,,3.0,,,,,,,,,,,,
1994,3.98,,,,,,1.46,,5.94,5.05,5.9,,,,,3.47,9.08,,,,,,,,,,,2.47,,,,,,6.97,,,…,,,,,,4.92,,,,10.12,,,,6.59,5.91,,,0.34,,,,,1.57,,3.06,,,,,,,,,,,,


In [15]:
# Handle the missing data

wide_pl_df = wide_pl_df.interpolate()
wide_pl_df = wide_pl_df.fill_null(strategy='backward')
wide_pl_df = wide_pl_df.fill_null(strategy='forward')
wide_pl_df = wide_pl_df.fill_null(strategy='zero')

In [16]:
wide_pl_df.head(15)

Year,Albania_PharmD per 10k,Algeria_PharmD per 10k,Andorra_PharmD per 10k,Angola_PharmD per 10k,Antigua and Barbuda_PharmD per 10k,Argentina_PharmD per 10k,Armenia_PharmD per 10k,Aruba_PharmD per 10k,Australia_PharmD per 10k,Austria_PharmD per 10k,Azerbaijan_PharmD per 10k,Bahamas_PharmD per 10k,Bahrain_PharmD per 10k,Bangladesh_PharmD per 10k,Barbados_PharmD per 10k,Belarus_PharmD per 10k,Belgium_PharmD per 10k,Belize_PharmD per 10k,Benin_PharmD per 10k,Bermuda_PharmD per 10k,Bhutan_PharmD per 10k,Bolivia_PharmD per 10k,Bosnia and Herzegovina_PharmD per 10k,Botswana_PharmD per 10k,Brazil_PharmD per 10k,British Virgin Islands_PharmD per 10k,Brunei_PharmD per 10k,Bulgaria_PharmD per 10k,Burkina Faso_PharmD per 10k,Burundi_PharmD per 10k,Cabo Verde_PharmD per 10k,Cambodia_PharmD per 10k,Cameroon_PharmD per 10k,Canada_PharmD per 10k,Central African Republic_PharmD per 10k,Chad_PharmD per 10k,…,Serbia_PharmD per 10k,Seychelles_PharmD per 10k,Sierra Leone_PharmD per 10k,Singapore_PharmD per 10k,Slovakia_PharmD per 10k,Slovenia_PharmD per 10k,Solomon Islands_PharmD per 10k,South Africa_PharmD per 10k,South Korea_PharmD per 10k,Spain_PharmD per 10k,Sri Lanka_PharmD per 10k,Sudan_PharmD per 10k,Suriname_PharmD per 10k,Sweden_PharmD per 10k,Switzerland_PharmD per 10k,Syria_PharmD per 10k,Taiwan (China)_PharmD per 10k,Tajikistan_PharmD per 10k,Tanzania_PharmD per 10k,Thailand_PharmD per 10k,Togo_PharmD per 10k,Trinidad and Tobago_PharmD per 10k,Tunisia_PharmD per 10k,Turkmenistan_PharmD per 10k,Türkiye_PharmD per 10k,Uganda_PharmD per 10k,Ukraine_PharmD per 10k,United Kingdom_PharmD per 10k,United States_PharmD per 10k,Uruguay_PharmD per 10k,Uzbekistan_PharmD per 10k,Vanuatu_PharmD per 10k,Venezuela_PharmD per 10k,Vietnam_PharmD per 10k,Yemen_PharmD per 10k,Zambia_PharmD per 10k,Zimbabwe_PharmD per 10k
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1990.0,3.65,1.71,8.29,0.02,1.84,3.12,2.12,0.0,5.26,4.54,6.25,3.85,1.9,0.55,6.45,3.2,8.2,3.58,0.01,0.0,0.11,2.96,1.83,0.78,3.37,0.0,0.88,4.95,0.17,0.1,0.06,0.3,0.02,6.8,0.04,0.03,…,2.47,0.57,0.16,2.63,3.66,3.26,0.65,2.27,10.83,9.36,0.33,0.44,0.57,5.94,5.31,4.7,0.0,1.16,0.03,0.78,0.09,4.12,1.49,3.21,2.82,0.08,0.29,5.87,8.12,0.65,0.31,0.09,0.0,0.77,0.33,0.23,0.34
1991.0,3.75,1.71,8.29,0.02,1.84,3.12,2.16,0.0,6.8,4.62,6.09,3.85,1.9,0.55,6.45,3.38,8.43,3.58,0.01,0.0,0.11,2.96,1.82,0.78,3.37,0.0,0.88,3.72,0.17,0.1,0.06,0.3,0.02,6.63,0.04,0.03,…,2.47,0.57,0.16,2.63,3.66,3.37,0.65,2.27,10.83,9.6,0.33,0.44,0.57,6.13,5.44,4.7,0.0,1.25,0.03,0.78,0.09,4.12,1.53,3.21,2.81,0.08,0.29,5.87,8.12,0.65,0.31,0.09,0.0,0.77,0.33,0.23,0.364
1992.0,3.81,1.71,8.29,0.02,1.84,3.12,2.05,0.0,5.76,4.78,6.18,3.85,1.9,0.55,6.45,3.47,8.6,3.58,0.01,0.0,0.11,2.96,1.6975,0.78,3.37,0.0,0.88,3.12,0.17,0.1,0.06,0.3,0.02,6.8,0.04,0.03,…,2.47,0.57,0.16,2.63,3.66,3.48,0.65,2.27,10.83,9.82,0.33,0.44,0.57,6.25,5.55,4.7,0.0,1.13,0.03,0.8,0.09,4.12,1.53,3.21,2.86,0.08,0.29,5.87,8.12,0.65,0.31,0.09,0.0,0.77,0.33,0.23,0.388
1993.0,3.89,1.71,8.29,0.02,1.84,3.301667,1.65,0.0,5.75,4.9,5.93,3.85,1.9,0.55,6.45,3.46,8.84,3.58,0.01,0.0,0.11,2.96,1.575,0.78,3.37,0.0,0.88,2.81,0.17,0.1,0.06,0.3,0.02,6.91,0.04,0.03,…,2.47,0.57,0.16,2.63,3.66,3.59,0.65,2.27,10.83,9.99,0.33,0.44,0.57,6.43,5.73,4.7,0.0,0.4,0.03,0.82,0.09,4.12,1.49,3.21,3.0,0.08,0.29,5.87,8.12,0.65,0.31,0.09,0.0,0.77,0.33,0.23,0.412
1994.0,3.98,1.71,8.29,0.02,1.84,3.483333,1.46,0.0,5.94,5.05,5.9,3.85,1.9,0.55,6.45,3.47,9.08,3.58,0.01,0.0,0.11,2.96,1.4525,0.78,3.37,0.0,0.88,2.47,0.17,0.1,0.06,0.3,0.02,6.97,0.04,0.03,…,2.47,0.57,0.16,2.63,3.66,4.92,0.65,2.27,10.83,10.12,0.340909,0.44,0.57,6.59,5.91,4.7,0.0,0.34,0.03,0.905,0.09,4.12,1.57,3.21,3.06,0.08,0.29,5.87,8.12,0.65,0.31,0.09,0.0,0.77,0.33,0.23,0.436
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2000.0,3.39,1.71,9.74,0.221429,1.84,4.442857,0.34,0.0,8.0,5.66,2.9,3.85,1.9,0.55,6.95,3.2,10.46,3.58,0.01,0.0,0.11,2.96,0.91,0.78,2.62,0.0,0.88,2.26,0.17,0.1,0.06,0.45,0.02,7.65,0.04,0.03,…,2.47,0.57,0.16,2.7,3.66,3.76,0.725,2.27,10.83,7.11,0.406364,0.44,0.57,7.3,6.19,5.33,0.0,1.08,0.03,1.025,0.09,4.17,2.0,3.21,3.35,0.08,0.29,5.87,7.56,0.65,0.27,0.09,0.0,0.77,0.368571,0.23,0.476667
2001.0,3.45,1.71,9.71,0.288571,1.84,4.559286,0.31,0.0,6.51,5.7,2.6,3.85,1.93,0.55,7.05,3.02,10.63,3.58,0.01,0.0,0.11,2.96,0.84,0.78,3.79,0.0,0.88,2.84,0.17,0.1,0.06,0.445,0.02,7.57,0.04,0.03,…,2.47,0.57,0.16,2.77,4.25,3.84,0.8,2.27,11.03,7.77,0.417273,0.44,0.57,7.39,6.27,5.545,0.0,1.06,0.03,1.08,0.09,3.82,2.02,3.04,3.34,0.08,0.27,5.87,7.87,0.65,0.27,0.09,0.0,0.77,0.381429,0.23,0.48
2002.0,3.48,1.71,10.08,0.355714,1.84,4.675714,0.35,0.0,6.58,5.82,2.73,3.85,1.78,0.55,7.6225,2.9,10.83,3.58,0.01,0.0,0.11,2.523333,0.89,0.78,4.1925,0.0,0.88,3.42,0.17,0.1,0.06,0.44,0.02,7.81,0.04,0.03,…,2.47,0.57,0.16,2.93,4.55,3.91,0.875,2.27,11.24,7.77,0.428182,0.44,0.57,7.42,6.32,5.76,0.0,1.04,0.03,1.14,0.09,3.76,2.06,2.6,3.32,0.08,0.25,5.87,7.64,0.65,0.3,0.09,0.0,3.06,0.394286,0.23,0.483333
2003.0,3.53,1.753333,9.78,0.422857,1.84,4.792143,0.31,0.0,7.92,5.99,2.19,3.85,1.71,0.548333,8.195,2.96,10.98,3.58,0.01,0.0,0.11,2.086667,0.87,0.78,4.595,0.0,1.02,4.0,0.17,0.1,0.06,0.435,0.02,7.87,0.04,0.03,…,2.47,0.57,0.16,3.09,4.72,4.06,0.95,2.27,11.45,7.58,0.439091,0.44,0.57,7.47,6.23,6.04,0.0,1.02,0.034,1.135,0.09,3.92,2.395,2.55,3.3,0.08,0.26,6.0,7.42,0.65,0.3,0.09,0.0,2.66,0.407143,0.23,0.486667


In [18]:
# Reformat the dataframe with the filled values...
# Convert the DataFrame back to pandas!!!
wide_df = wide_pl_df.to_pandas() 

# Now we can reshape it with pandas functionality
complete_df = wide_df.melt(id_vars='Year', value_vars=wide_df.columns[1:], var_name='Country', value_name='PharmD per 10k')

# Removing "_MD per 10k" from country names
complete_df['Country'] = complete_df['Country'].str.replace('_PharmD per 10k', '')

complete_df['Year'] = complete_df['Year'].astype(int)

# Creating 'Country_Year' column
complete_df['Country_Year'] = complete_df['Country'].astype(str) + '_' + complete_df['Year'].astype(str)

complete_df = complete_df.loc[:, ['Country_Year', 'Country', 'Year', 'PharmD per 10k']]
#complete_df['Country_Year'] = complete_df['Country_Year'].astype(str)


In [19]:
complete_df

Unnamed: 0,Country_Year,Country,Year,PharmD per 10k
0,Albania_1990,Albania,1990,3.6500
1,Albania_1991,Albania,1991,3.7500
2,Albania_1992,Albania,1992,3.8100
3,Albania_1993,Albania,1993,3.8900
4,Albania_1994,Albania,1994,3.9800
...,...,...,...,...
6489,Zimbabwe_2019,Zimbabwe,2019,0.9755
6490,Zimbabwe_2020,Zimbabwe,2020,1.0210
6491,Zimbabwe_2021,Zimbabwe,2021,1.1005
6492,Zimbabwe_2022,Zimbabwe,2022,1.1800


In [20]:
complete_df.to_csv('../../Clean_Data/Clean_CSV_Files/PharmD_per_10k.csv', index=False, encoding="utf-8")