In [1]:
# Import our dependencies
import numpy as np
import pandas as pd
import polars as pl

In [2]:
#  Import and read the input csv
MDp10k_df = pd.read_csv('../../Raw_Data/WHO_Medical_Doctors_per10k-data.csv')
MDp10k_df.head(5)

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,HWF_0001,"Medical doctors (per 10,000)",numeric,AFR,Africa,Country,NER,Niger,Year,2023,...,,,,,,0.38,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
1,HWF_0001,"Medical doctors (per 10,000)",numeric,WPR,Western Pacific,Country,PNG,Papua New Guinea,Year,2023,...,,,,,,0.61,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
2,HWF_0001,"Medical doctors (per 10,000)",numeric,AFR,Africa,Country,CAF,Central African Republic,Year,2023,...,,,,,,0.74,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
3,HWF_0001,"Medical doctors (per 10,000)",numeric,AFR,Africa,Country,TCD,Chad,Year,2023,...,,,,,,0.85,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z
4,HWF_0001,"Medical doctors (per 10,000)",numeric,AFR,Africa,Country,GMB,Gambia,Year,2023,...,,,,,,0.9,,"NHWA data portal, December 2024 update https:/...",EN,2025-01-14T05:00:00.000Z


In [3]:
# Drop the population estimate column
MDp10k_df = MDp10k_df.drop(['IndicatorCode','ValueType','ParentLocationCode','ParentLocation','Location type','SpatialDimValueCode','Period type'], axis=1)
MDp10k_df = MDp10k_df.drop(['FactValueUoM','FactValueNumericLowPrefix','FactValueNumericLow','FactValueNumericHighPrefix','FactValueNumericHigh'], axis=1)
MDp10k_df = MDp10k_df.drop(['FactValueTranslationID','FactComments','Language','DateModified','IsLatestYear','Dim1 type','Dim1','Dim2','Dim3'], axis=1)
MDp10k_df = MDp10k_df.drop(['Dim1ValueCode','Dim2ValueCode','Dim3ValueCode','Dim2 type','Dim3 type','DataSource','FactValueNumericPrefix'], axis=1)
MDp10k_df = MDp10k_df.drop(['FactValueNumeric','DataSourceDimValueCode','Indicator'], axis=1)
MDp10k_df.rename(columns ={'Location':'Country', 'Period':'Year', 'Value':'MD per 10k'}, inplace = True)
MDp10k_df

Unnamed: 0,Country,Year,MD per 10k
0,Niger,2023,0.38
1,Papua New Guinea,2023,0.61
2,Central African Republic,2023,0.74
3,Chad,2023,0.85
4,Gambia,2023,0.90
...,...,...,...
3411,Republic of Korea,1990,8.12
3412,Oman,1990,8.17
3413,Türkiye,1990,9.04
3414,Ecuador,1990,9.34


In [4]:
# Open the list of country name corrections
corrections_df = pd.read_csv('../../Clean_Data/master_country_list/country_name_corrections.csv')

# Convert the corrections dataframe to a dictionary.
correction_dict = dict(zip(corrections_df['wrong'], corrections_df['correct']))

In [5]:
# Apply the correction dictionary to fix the known errors
MDp10k_df['Country'] = MDp10k_df['Country'].replace(correction_dict)

In [6]:
# Open the master list of countries
countries_df = pd.read_csv('../../Clean_Data/master_country_list/country_profile_urls.csv')
countries_df = countries_df.drop(['profile_url'], axis=1)

In [7]:
country_list = countries_df['country'].tolist()
#country_list

In [8]:
# Create a list of MDp10k countries
MDp10k_countries = MDp10k_df['Country'].tolist()
#MDp10k_countries

In [9]:
# clean the FS data based on the SS country list
no_match = []

for country in MDp10k_countries:
    if country in country_list:
        continue
    else:
        no_match.append(country)

In [10]:
MDp10k_clean_df = MDp10k_df.drop(MDp10k_df[MDp10k_df['Country'].isin(no_match)].index.tolist())
MDp10k_clean_df

Unnamed: 0,Country,Year,MD per 10k
0,Niger,2023,0.38
1,Papua New Guinea,2023,0.61
2,Central African Republic,2023,0.74
3,Chad,2023,0.85
4,Gambia,2023,0.90
...,...,...,...
3411,South Korea,1990,8.12
3412,Oman,1990,8.17
3413,Türkiye,1990,9.04
3414,Ecuador,1990,9.34


In [11]:
MDp10k_sorted_df = MDp10k_clean_df.sort_values(['Country','Year'], axis=0)

In [12]:
MDp10k_sorted_df.tail(40)

Unnamed: 0,Country,Year,MD per 10k
2376,Yemen,2004,3.05
1895,Yemen,2008,2.75
1772,Yemen,2009,2.75
1157,Yemen,2014,2.7
331,Yemen,2020,1.76
204,Yemen,2021,1.74
80,Yemen,2022,1.65
5,Yemen,2023,0.98
2311,Zambia,2004,1.32
2188,Zambia,2005,0.55


In [13]:
# Create a complete dataframe with all countries and all the years.

# Define variables to collect the details.
#data_countries = MDp10k_sorted_df['Country'].unique()
data_countries = country_list
years = list(range(1990, 2024))

# Create a helper DataFrame.
helper_df = pd.DataFrame({'Country':np.repeat(data_countries, len(years)), 'Year':np.tile(years, len(data_countries))})

# Merge the helper DataFrame with the original data to ensure the data range is complete.
complete_df = pd.merge(helper_df, MDp10k_sorted_df, on=['Country','Year'], how='left')

In [14]:
# Convert to wide format
wide_df = complete_df.pivot(index='Year', columns='Country', values='MD per 10k')

# Flatten the hierarchical columns and create 'Country_Series' style column names
wide_df.columns = [f'{col}_MD per 10k' for col in wide_df.columns]

# Reset the index, so 'Year' becomes a column again
wide_df.reset_index(inplace=True)

# Convert columns to numeric, coerce non-numeric values to NaN
for col in wide_df.columns[0:]:
    wide_df[col] = pd.to_numeric(wide_df[col], errors='coerce')

# Convert to Polars DataFrame
wide_pl_df = pl.from_pandas(wide_df)
    
wide_pl_df.head(5)

Year,Albania_MD per 10k,Algeria_MD per 10k,Andorra_MD per 10k,Angola_MD per 10k,Antigua and Barbuda_MD per 10k,Argentina_MD per 10k,Armenia_MD per 10k,Aruba_MD per 10k,Australia_MD per 10k,Austria_MD per 10k,Azerbaijan_MD per 10k,Bahamas_MD per 10k,Bahrain_MD per 10k,Bangladesh_MD per 10k,Barbados_MD per 10k,Belarus_MD per 10k,Belgium_MD per 10k,Belize_MD per 10k,Benin_MD per 10k,Bermuda_MD per 10k,Bhutan_MD per 10k,Bolivia_MD per 10k,Bosnia and Herzegovina_MD per 10k,Botswana_MD per 10k,Brazil_MD per 10k,British Virgin Islands_MD per 10k,Brunei_MD per 10k,Bulgaria_MD per 10k,Burkina Faso_MD per 10k,Burundi_MD per 10k,Cabo Verde_MD per 10k,Cambodia_MD per 10k,Cameroon_MD per 10k,Canada_MD per 10k,Central African Republic_MD per 10k,Chad_MD per 10k,…,Serbia_MD per 10k,Seychelles_MD per 10k,Sierra Leone_MD per 10k,Singapore_MD per 10k,Slovakia_MD per 10k,Slovenia_MD per 10k,Solomon Islands_MD per 10k,South Africa_MD per 10k,South Korea_MD per 10k,Spain_MD per 10k,Sri Lanka_MD per 10k,Sudan_MD per 10k,Suriname_MD per 10k,Sweden_MD per 10k,Switzerland_MD per 10k,Syria_MD per 10k,Taiwan (China)_MD per 10k,Tajikistan_MD per 10k,Tanzania_MD per 10k,Thailand_MD per 10k,Togo_MD per 10k,Trinidad and Tobago_MD per 10k,Tunisia_MD per 10k,Turkmenistan_MD per 10k,Türkiye_MD per 10k,Uganda_MD per 10k,Ukraine_MD per 10k,United Kingdom_MD per 10k,United States_MD per 10k,Uruguay_MD per 10k,Uzbekistan_MD per 10k,Vanuatu_MD per 10k,Venezuela_MD per 10k,Vietnam_MD per 10k,Yemen_MD per 10k,Zambia_MD per 10k,Zimbabwe_MD per 10k
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1990,13.78,,,,,,,,21.6,30.08,38.02,,,,,,32.67,,,,,,15.81,,10.85,,,32.3,,,,,,,,,…,,,,,,20.57,,,8.12,20.37,,,,32.49,29.84,,,25.05,,,,,5.31,,9.04,,,16.14,,,33.8,,,,,,1.3
1991,14.6,,,,,,,,23.25,30.88,38.46,,,,,,33.27,,,,,,15.76,,,,,30.77,,,,,,,,,…,,,,,,20.78,,,8.76,41.35,1.48,,,33.61,30.3,,,25.55,,2.3,,,5.29,,9.56,,,16.27,,,33.68,,,,,,
1992,16.04,,,,,26.36,,,23.63,32.13,38.05,,,,,,33.91,,,,,,,,11.4,,,31.59,,,,,,,,,…,,,,,,20.89,,,9.49,41.26,1.75,,,34.68,30.12,,,23.14,,,,,5.87,,9.83,,,16.4,,,33.57,,,,,,
1993,13.77,,,,,,,,23.87,32.95,38.42,,,,,,34.44,,,,,,,,11.68,,,33.61,,,,,,,,,…,,,,,,20.29,,,10.13,41.75,2.18,,,35.66,30.61,,,21.19,,2.38,,,5.94,,10.35,,,16.77,,,33.26,,,,,,
1994,12.94,,,,,,,,24.25,34.07,37.88,,,,,29.87,35.08,,,,,,,,11.97,,,33.38,,,,,,,,,…,,,,,,21.91,,,10.58,42.21,1.94,,,36.56,31.15,,,21.7,,,,,5.93,,10.97,,,16.91,,,32.96,,,,,,


In [15]:
# Handle the missing data

wide_pl_df = wide_pl_df.interpolate()
wide_pl_df = wide_pl_df.fill_null(strategy='backward')
wide_pl_df = wide_pl_df.fill_null(strategy='forward')
wide_pl_df = wide_pl_df.fill_null(strategy='zero')

In [16]:
wide_pl_df.head(15)

Year,Albania_MD per 10k,Algeria_MD per 10k,Andorra_MD per 10k,Angola_MD per 10k,Antigua and Barbuda_MD per 10k,Argentina_MD per 10k,Armenia_MD per 10k,Aruba_MD per 10k,Australia_MD per 10k,Austria_MD per 10k,Azerbaijan_MD per 10k,Bahamas_MD per 10k,Bahrain_MD per 10k,Bangladesh_MD per 10k,Barbados_MD per 10k,Belarus_MD per 10k,Belgium_MD per 10k,Belize_MD per 10k,Benin_MD per 10k,Bermuda_MD per 10k,Bhutan_MD per 10k,Bolivia_MD per 10k,Bosnia and Herzegovina_MD per 10k,Botswana_MD per 10k,Brazil_MD per 10k,British Virgin Islands_MD per 10k,Brunei_MD per 10k,Bulgaria_MD per 10k,Burkina Faso_MD per 10k,Burundi_MD per 10k,Cabo Verde_MD per 10k,Cambodia_MD per 10k,Cameroon_MD per 10k,Canada_MD per 10k,Central African Republic_MD per 10k,Chad_MD per 10k,…,Serbia_MD per 10k,Seychelles_MD per 10k,Sierra Leone_MD per 10k,Singapore_MD per 10k,Slovakia_MD per 10k,Slovenia_MD per 10k,Solomon Islands_MD per 10k,South Africa_MD per 10k,South Korea_MD per 10k,Spain_MD per 10k,Sri Lanka_MD per 10k,Sudan_MD per 10k,Suriname_MD per 10k,Sweden_MD per 10k,Switzerland_MD per 10k,Syria_MD per 10k,Taiwan (China)_MD per 10k,Tajikistan_MD per 10k,Tanzania_MD per 10k,Thailand_MD per 10k,Togo_MD per 10k,Trinidad and Tobago_MD per 10k,Tunisia_MD per 10k,Turkmenistan_MD per 10k,Türkiye_MD per 10k,Uganda_MD per 10k,Ukraine_MD per 10k,United Kingdom_MD per 10k,United States_MD per 10k,Uruguay_MD per 10k,Uzbekistan_MD per 10k,Vanuatu_MD per 10k,Venezuela_MD per 10k,Vietnam_MD per 10k,Yemen_MD per 10k,Zambia_MD per 10k,Zimbabwe_MD per 10k
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1990.0,13.78,11.14,22.38,0.6,1.63,26.36,26.52,0.0,21.6,30.08,38.02,9.89,10.13,2.38,13.66,29.87,32.67,10.42,0.38,0.0,1.81,11.8,15.81,2.64,10.85,0.0,10.29,32.3,0.52,0.27,1.36,1.15,1.88,24.11,0.79,0.25,…,27.07,13.72,0.31,13.43,32.38,20.57,1.25,7.11,8.12,20.37,1.48,2.47,4.01,32.49,29.84,12.76,0.0,25.05,0.23,2.3,0.43,9.13,5.31,27.08,9.04,0.81,70.24,16.14,24.11,37.77,33.8,1.15,19.22,5.43,2.13,1.32,1.3
1991.0,14.6,11.14,22.38,0.6,1.63,26.36,26.52,0.0,23.25,30.88,38.46,9.89,10.13,2.38,13.66,29.87,33.27,10.42,0.38,0.0,1.81,11.8,15.76,2.64,11.125,0.0,10.29,30.77,0.52,0.27,1.36,1.15,1.88,24.11,0.79,0.25,…,27.07,13.72,0.31,13.43,32.38,20.78,1.25,7.11,8.76,41.35,1.48,2.47,4.01,33.61,30.3,12.76,0.0,25.55,0.23,2.3,0.43,9.13,5.29,27.08,9.56,0.81,70.24,16.27,24.11,37.77,33.68,1.15,19.22,5.43,2.13,1.32,1.338
1992.0,16.04,11.14,22.38,0.6,1.63,26.36,26.52,0.0,23.63,32.13,38.05,9.89,10.13,2.38,13.66,29.87,33.91,10.42,0.38,0.0,1.81,11.8,15.466667,2.64,11.4,0.0,10.29,31.59,0.52,0.27,1.36,1.15,1.88,24.11,0.79,0.25,…,27.07,13.72,0.31,13.43,32.38,20.89,1.25,7.11,9.49,41.26,1.75,2.47,4.01,34.68,30.12,12.76,0.0,23.14,0.23,2.34,0.43,9.13,5.87,27.08,9.83,0.81,70.24,16.4,24.11,37.77,33.57,1.15,19.22,5.43,2.13,1.32,1.376
1993.0,13.77,11.14,22.38,0.6,1.63,26.951667,26.52,0.0,23.87,32.95,38.42,9.89,10.13,2.38,13.66,29.87,34.44,10.42,0.38,0.0,1.81,11.8,15.173333,2.64,11.68,0.0,10.29,33.61,0.52,0.27,1.36,1.15,1.88,24.11,0.79,0.25,…,27.07,13.72,0.31,13.43,32.38,20.29,1.25,7.11,10.13,41.75,2.18,2.47,4.01,35.66,30.61,12.76,0.0,21.19,0.23,2.38,0.43,9.13,5.94,27.08,10.35,0.81,70.24,16.77,24.11,37.77,33.26,1.15,19.22,5.43,2.13,1.32,1.414
1994.0,12.94,11.14,22.38,0.6,1.63,27.543333,26.52,0.0,24.25,34.07,37.88,9.89,10.13,2.38,13.66,29.87,35.08,10.42,0.38,0.0,1.81,11.8,14.88,2.64,11.97,0.0,10.29,33.38,0.52,0.27,1.36,1.15,1.88,24.11,0.79,0.25,…,27.07,13.72,0.31,13.43,32.38,21.91,1.25,7.11,10.58,42.21,1.94,2.47,4.01,36.56,31.15,12.76,0.0,21.7,0.23,2.39,0.43,9.13,5.93,27.08,10.97,0.81,70.24,16.91,24.11,37.77,32.96,1.15,19.22,5.43,2.13,1.32,1.452
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2000.0,13.66,11.14,25.57,0.612857,3.63,32.527143,26.52,0.0,24.76,38.76,35.52,13.202,10.13,2.38,13.93,32.68,28.29,10.42,0.38,0.0,1.81,11.8,13.12,2.98,14.199167,0.0,10.29,34.41,0.52,0.27,1.36,1.64,1.88,24.11,0.79,0.28,…,27.07,13.72,0.31,13.82,32.38,21.54,1.26,7.11,13.02,31.01,4.13,2.47,4.01,30.26,35.1,13.47,0.0,21.48,0.23,2.945,0.43,9.66,7.62,27.08,13.03,0.81,70.24,19.87,25.96,37.77,29.61,1.282857,19.22,5.43,2.524286,1.32,1.26
2001.0,13.03,11.14,26.56,0.617143,5.63,33.835714,25.9,0.0,25.51,39.89,35.25,14.858,10.6,2.38,16.01,32.52,28.49,10.118889,0.38,0.0,1.81,11.8,13.03,2.98,14.608333,0.0,11.025,34.29,0.52,0.27,1.36,1.73625,1.88,24.11,0.79,0.2975,…,27.07,13.72,0.31,14.39,32.23,21.74,1.27,7.11,13.97,31.06,4.28,2.47,4.9525,31.21,35.14,13.91,0.0,20.95,0.23,2.98,0.43,9.31,7.87,27.02,13.56,0.81,70.27,20.36,26.2475,37.77,29.11,1.327143,19.22,5.43,2.655714,1.32,1.3675
2002.0,11.67,11.14,30.53,0.621429,7.63,35.144286,25.45,0.0,25.46,40.6,35.41,16.514,10.53,2.49,16.5575,32.75,28.59,9.817778,0.38,0.0,1.81,10.378333,13.0,3.07,15.0175,0.0,11.76,35.22,0.52,0.27,1.36,1.8325,1.88,24.11,0.79,0.315,…,27.07,13.72,0.31,14.54,32.01,22.29,1.28,7.11,15.0,31.05,4.593333,2.47,5.895,32.12,35.61,14.35,0.0,20.08,0.23,2.95,0.43,7.91,7.99,26.57,13.71,0.81,70.14,21.07,26.535,37.77,28.49,1.371429,19.05875,5.71,2.787143,1.32,1.475
2003.0,11.86,10.846667,35.11,0.625714,9.63,36.452857,24.39,0.0,26.15,41.44,35.34,18.17,10.06,2.6,17.105,33.03,28.57,9.516667,0.38,0.0,1.81,8.956667,13.3,2.96,15.426667,0.0,12.44,36.04,0.52,0.27,1.36,1.92875,1.88,24.11,0.79,0.3325,…,27.07,13.72,0.31,15.21,31.48,22.51,1.29,7.11,15.79,32.0,4.906667,2.47,6.8375,32.83,36.58,13.68,0.0,19.06,0.238,2.92,0.43,7.75,8.13,25.51,13.93,0.81,70.05,21.94,26.8225,38.135,27.86,1.415714,18.8975,5.923333,2.918571,1.32,1.5825


In [17]:
# Reformat the dataframe with the filled values...
# Convert the DataFrame back to pandas!!!
wide_df = wide_pl_df.to_pandas() 

# Now we can reshape it with pandas functionality
complete_df = wide_df.melt(id_vars='Year', value_vars=wide_df.columns[1:], var_name='Country', value_name='MD per 10k')

# Removing "_MD per 10k" from country names
complete_df['Country'] = complete_df['Country'].str.replace('_MD per 10k', '')

complete_df['Year'] = complete_df['Year'].astype(int)

# Creating 'Country_Year' column
complete_df['Country_Year'] = complete_df['Country'] + '_' + complete_df['Year'].astype(str)

complete_df = complete_df.loc[:, ['Country_Year', 'Country', 'Year', 'MD per 10k']]
#complete_df['Country_Year'] = complete_df['Country_Year'].astype("string")


In [18]:
complete_df

Unnamed: 0,Country_Year,Country,Year,MD per 10k
0,Albania_1990,Albania,1990,13.78
1,Albania_1991,Albania,1991,14.60
2,Albania_1992,Albania,1992,16.04
3,Albania_1993,Albania,1993,13.77
4,Albania_1994,Albania,1994,12.94
...,...,...,...,...
6489,Zimbabwe_2019,Zimbabwe,2019,1.23
6490,Zimbabwe_2020,Zimbabwe,2020,1.28
6491,Zimbabwe_2021,Zimbabwe,2021,1.36
6492,Zimbabwe_2022,Zimbabwe,2022,1.27


In [19]:
complete_df.to_csv('../../Clean_Data/Clean_CSV_Files/MD_per_10k.csv', index=False, encoding="utf-8")