In [1]:
##importing relevant packages
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

##NOTE(review): with no category or module given, this hides every warning raised after this cell,
##including pandas deprecation/dtype warnings that could point at problems in the cleaning steps
warnings.filterwarnings("ignore")

In [2]:
##loading the raw WDI regions dataset directly from the project's GitHub repository
raw_csv_url = "https://raw.githubusercontent.com/Ali-Duni/project_coding/refs/heads/main/data/raw/WDI_regions_raw.csv"
data_raw = pd.read_csv(filepath_or_buffer=raw_csv_url)

In [3]:
##peek at the first five rows to see which columns and values the raw file contains
data_raw.head(n=5)

Unnamed: 0,Time,Time Code,Country Name,Country Code,"Mortality rate, under-5 (per 1,000 live births) [SH.DYN.MORT]","Immunization, measles (% of children ages 12-23 months) [SH.IMM.MEAS]"
0,1960,YR1960,East Asia & Pacific,EAS,..,..
1,1960,YR1960,Europe & Central Asia,ECS,..,..
2,1960,YR1960,Latin America & Caribbean,LCN,..,..
3,1960,YR1960,Middle East & North Africa,MEA,..,..
4,1960,YR1960,South Asia,SAS,..,..


In [4]:
##the two indicator columns have needlessly long names, and "Country Name" holds regions in this file
##names_columns maps each original column name to a short snake_case replacement
names_columns = {
    "Mortality rate, under-5 (per 1,000 live births) [SH.DYN.MORT]": "child_mortality_rate",
    "Immunization, measles (% of children ages 12-23 months) [SH.IMM.MEAS]": "immunization_rate",
    "Country Name": "region"
}
##reassign the result instead of using inplace=True, so the frame displayed by the earlier cell
##is not silently modified and the rename stays an explicit, chainable step
data_raw = data_raw.rename(columns=names_columns)

In [5]:
##drop the columns that are not needed for the analysis (Time Code and Country Code, among others)
##the columns to keep are stored in a list because the same list is reused when dropping missing values
vars_to_keep = [
    "child_mortality_rate",
    "immunization_rate",
    "region",
    "Time",
]
data_raw = data_raw.filter(items=vars_to_keep, axis="columns")

In [6]:
##before removing missing values, count how many pandas currently recognises in the mortality column
mortality_is_missing = data_raw["child_mortality_rate"].isna()
mortality_is_missing.sum()

5

In [9]:
##the count above understates the gaps: many observations use ".." in place of NaN
##turn those markers into real missing values (reassigning rather than mutating in place)
data_raw = data_raw.replace("..", np.nan)
##because of the ".." markers the two indicators were read as text, so convert them to numbers;
##errors="coerce" turns anything that still cannot be parsed into NaN, which dropna removes afterwards
numeric_vars = ["child_mortality_rate", "immunization_rate"]
data_raw[numeric_vars] = data_raw[numeric_vars].apply(pd.to_numeric, errors="coerce")

In [10]:
##keep only the rows that have a value in every column listed in vars_to_keep, then show the result
data_raw = data_raw.dropna(axis="index", how="any", subset=vars_to_keep)
data_raw

Unnamed: 0,child_mortality_rate,immunization_rate,region,Time
240,56.8,89.4425347060659,East Asia & Pacific,1990
242,54.8,75.9339324103451,Latin America & Caribbean,1990
243,65.4,83.1685040029308,Middle East & North Africa,1990
244,129.4,55.9756600132649,South Asia,1990
245,180.5,57.2696648667665,Sub-Saharan Africa,1990
...,...,...,...,...
459,23.1,88.301957974945,Middle East & North Africa,2017
460,44.8,86.0673400859076,South Asia,2017
461,75.5,69.553651704968,Sub-Saharan Africa,2017
462,6.5,91.7336873033563,North America,2017


In [11]:
##this is our cleaned data frame, ready to use for analysis. let us save it
##create the output folder first so the save does not fail when data/clean does not exist yet
clean_csv_path = Path("data/clean/WDI_regions_cleaned.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
##index expects a boolean; the string "True" only worked because any non-empty string is truthy
data_raw.to_csv(clean_csv_path, index=True)