# Data Description :
| datasets | Description |
|----------|----------|
|[lower_respiratory_diseases](#section1)<br> | data contain number of  deaths from Lower respiratory infections
|[gdp](#section2)<br>| death rates from pneumonia vs gdp per capita
|[death_under_5](#section3)<br> | pneumonia rates of death in children under 5
|[death_rates](#section4)<br> | pneumonia rates of death for all age
|[mortality](#section5)<br> | pneumonia rates of death for all ages separately (group)
|[risk_factor_aged_70](#section6)<br> | pneumonia risk factors for aged 70
|[risk_factor_aged_5](#section7)<br>  | pneumonia risk factors for under aged 5
|[careseeking](#section8)<br> | Percentage of children under 5 with symptoms of pneumonia taken for care to a health provider
|[breastfeeding](#section9)<br> | children who had taken breastfeeding
|[vaccine](#section10)<br> | Indicator:Pneumococcal conjugate vaccines (PCV3) immunization coverage among 1-year-olds (%)

# [Merge similar data together](#section11)<br>

> ## [death datasets](#section12)<br>
> ## [risk datasets](#section13)<br>
> ## [save anthor data](#section14)<br>

----------------------------

# Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

In [2]:
#pd.set_option("display.max_rows",None)

In [3]:
lower_respiratory_diseases=pd.read_csv("pneumonia-and-lower-respiratory-diseases-deaths.csv")

In [4]:
lower_respiratory_diseases

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Number)
0,Afghanistan,AFG,1990,20224,932,488,538,1559
1,Afghanistan,AFG,1991,20879,941,529,580,1576
2,Afghanistan,AFG,1992,23585,956,604,664,1595
3,Afghanistan,AFG,1993,27116,979,665,728,1628
4,Afghanistan,AFG,1994,29271,1003,695,754,1668
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,5070,3069,1808,207,2820
6836,Zimbabwe,ZWE,2016,5023,3093,1843,223,2841
6837,Zimbabwe,ZWE,2017,4865,3127,1870,234,2866
6838,Zimbabwe,ZWE,2018,4675,3158,1893,238,2895


In [5]:
lower_respiratory_diseases.Entity.value_counts().reset_index()

Unnamed: 0,index,Entity
0,Afghanistan,30
1,Northern Ireland,30
2,Norway,30
3,OECD Countries,30
4,Oman,30
...,...,...
223,Guam,30
224,Guatemala,30
225,Guinea,30
226,Guinea-Bissau,30


In [6]:
lower_respiratory_diseases.Entity.unique()

array(['Afghanistan', 'African Region (WHO)', 'Albania', 'Algeria',
       'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda',
       'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Cape Verde', 'Central African Republic', 'Chad',
       'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czechia', 'Democratic Republic of Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'East Asia & Pacific (WB)',
       'Eastern Mediterranean Region (WHO)', 'Ecuador', 'Egypt',
       'El Salvador', 'England', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Eswatini', 'Ethiopia', 'Eu

In [7]:
gdp=pd.read_csv("lower-respiratory-infections-vs-gdp-per-capita.csv")

In [8]:
gdp

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate),"GDP per capita, PPP (current international $)",Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,1001.982170,,12412311.0,
2,Afghanistan,AFG,1991,889.451273,,13299016.0,
3,Afghanistan,AFG,1992,815.054137,,14485543.0,
4,Afghanistan,AFG,1993,816.899612,,15816601.0,
...,...,...,...,...,...,...,...
56805,Zimbabwe,ZWE,1987,,,9527202.0,
56806,Zimbabwe,ZWE,1988,,,9849129.0,
56807,Zimbabwe,ZWE,1989,,,10153852.0,
56808,Zimbabwe,ZWE,2021,,,15092171.0,


---------------

In [9]:
death_under_5=pd.read_csv("pneumonia-death-rates-in-children-under-5.csv")

In [10]:
death_under_5.head()

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)
0,Afghanistan,AFG,1990,1001.98217
1,Afghanistan,AFG,1991,889.451273
2,Afghanistan,AFG,1992,815.054137
3,Afghanistan,AFG,1993,816.899612
4,Afghanistan,AFG,1994,821.638318


In [11]:
death_under_5[(death_under_5.Entity=="Afghanistan") & (death_under_5.Year==2003)]

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)
13,Afghanistan,AFG,2003,620.508818


In [12]:
death_rates=pd.read_csv("pneumonia-death-rates-age-standardized.csv")

In [13]:
death_rates

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Age-standardized (Rate)
0,Afghanistan,AFG,1990,145.199604
1,Afghanistan,AFG,1991,131.671918
2,Afghanistan,AFG,1992,123.663077
3,Afghanistan,AFG,1993,126.485749
4,Afghanistan,AFG,1994,129.937755
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,162.378516
6836,Zimbabwe,ZWE,2016,160.508947
6837,Zimbabwe,ZWE,2017,158.196077
6838,Zimbabwe,ZWE,2018,155.665066


In [14]:
mortality=pd.read_csv("pneumonia-mortality-by-age.csv")

In [15]:
mortality

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Rate)
0,Afghanistan,AFG,1990,1001.982170,18.091487,10.243855,69.694963,475.963464
1,Afghanistan,AFG,1991,889.451273,18.334832,9.819060,69.290253,472.273332
2,Afghanistan,AFG,1992,815.054137,18.936761,9.406091,69.025223,469.205942
3,Afghanistan,AFG,1993,816.899612,19.193463,9.329637,69.680051,469.807891
4,Afghanistan,AFG,1994,821.638318,18.728907,9.421587,70.765961,472.369829
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,239.605213,5.677288,26.039638,276.471664,1083.641273
6836,Zimbabwe,ZWE,2016,236.684332,6.039127,26.025914,272.404564,1062.825282
6837,Zimbabwe,ZWE,2017,229.249003,6.226371,25.873663,268.497989,1046.263433
6838,Zimbabwe,ZWE,2018,220.930293,6.262915,25.681544,263.914859,1027.296891


In [16]:
risk_factor_aged_70=pd.read_csv("risk-factor-aged-70.csv")

In [17]:
risk_factor_aged_70

Unnamed: 0,Entity,Code,Year,Deaths - Cause: Lower respiratory infections - Risk: No access to handwashing facility - Sex: Both - Age: 70+ years (Number),Deaths - Cause: Lower respiratory infections - Risk: Secondhand smoke - Sex: Both - Age: 70+ years (Number),Deaths - Cause: Lower respiratory infections - Risk: Particulate matter pollution - Sex: Both - Age: 70+ years (Number),Deaths - Cause: Lower respiratory infections - Risk: Smoking - Sex: Both - Age: 70+ years (Number)
0,Afghanistan,AFG,1990,250,136,895,102
1,Afghanistan,AFG,1991,254,138,905,104
2,Afghanistan,AFG,1992,257,140,916,106
3,Afghanistan,AFG,1993,263,143,936,108
4,Afghanistan,AFG,1994,271,147,959,112
...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,599,134,1105,446
6836,Zimbabwe,ZWE,2016,601,134,1088,450
6837,Zimbabwe,ZWE,2017,605,135,1069,455
6838,Zimbabwe,ZWE,2018,608,136,1054,461


In [18]:
risk_factor_aged_5=pd.read_csv("risk-factors-aged-under5.csv")

In [19]:
risk_factor_aged_5

Unnamed: 0,Entity,Code,Year,Deaths - Cause: Lower respiratory infections - Risk: Child stunting - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Child wasting - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Low birth weight - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: No access to handwashing facility - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Secondhand smoke - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Child underweight - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Household air pollution from solid fuels - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Non-exclusive breastfeeding - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Short gestation - Sex: Both - Age: Under 5 (Number)
0,Afghanistan,AFG,1990,4251,11902,3145,3248,1883,3070,10871,1408,2208
1,Afghanistan,AFG,1991,4326,12121,3473,3363,1962,3168,11229,1519,2444
2,Afghanistan,AFG,1992,4826,13583,4103,3804,2235,3579,12696,1788,2892
3,Afghanistan,AFG,1993,5610,15720,4647,4391,2587,4193,14604,2050,3279
4,Afghanistan,AFG,1994,6162,17053,4935,4763,2806,4624,15778,2172,3487
...,...,...,...,...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,493,2395,927,1074,386,275,1606,426,867
6836,Zimbabwe,ZWE,2016,489,2388,891,1061,381,271,1561,413,834
6837,Zimbabwe,ZWE,2017,468,2310,863,1024,367,258,1483,401,809
6838,Zimbabwe,ZWE,2018,445,2216,832,982,353,244,1395,389,782


In [20]:
careseeking=pd.read_csv("pneumonia-careseeking.csv")

In [21]:
careseeking

Unnamed: 0,Entity,Code,Year,Percentage of children under 5 with symptoms of pneumonia taken for care to a health provider
0,Afghanistan,AFG,2011,60.5
1,Afghanistan,AFG,2015,61.5
2,Albania,ALB,2000,83.0
3,Albania,ALB,2005,45.0
4,Albania,ALB,2009,69.6
...,...,...,...,...
416,Zimbabwe,ZWE,2006,25.0
417,Zimbabwe,ZWE,2009,42.6
418,Zimbabwe,ZWE,2011,48.0
419,Zimbabwe,ZWE,2014,58.6


In [22]:
breastfeeding=pd.read_csv("exclusive-breastfeeding.csv")

In [23]:
breastfeeding

Unnamed: 0,Entity,Code,Year,Exclusive breastfeeding percent
0,Afghanistan,AFG,2015,43.13
1,Albania,ALB,2000,6.27
2,Albania,ALB,2005,3.39
3,Albania,ALB,2008,37.14
4,Albania,ALB,2017,36.54
...,...,...,...,...
522,Zimbabwe,ZWE,2005,21.69
523,Zimbabwe,ZWE,2009,25.94
524,Zimbabwe,ZWE,2010,31.30
525,Zimbabwe,ZWE,2014,40.29


In [24]:
vaccine=pd.read_csv("share-of-one-year-olds-who-received-the-final-dose-of-pneumococcal-vaccine.csv")

In [25]:
vaccine

Unnamed: 0,Entity,Code,Year,Indicator:Pneumococcal conjugate vaccines (PCV3) immunization coverage among 1-year-olds (%)
0,Afghanistan,AFG,2014,49
1,Afghanistan,AFG,2015,65
2,Afghanistan,AFG,2016,62
3,Afghanistan,AFG,2017,66
4,Afghanistan,AFG,2018,69
...,...,...,...,...
1841,Zimbabwe,ZWE,2016,90
1842,Zimbabwe,ZWE,2017,89
1843,Zimbabwe,ZWE,2018,89
1844,Zimbabwe,ZWE,2019,90


-----------------

#### we have [ lower_respiratory_diseases , gdp , death_under_5 , death_rates  , mortality , risk_factor_aged_70, risk_factor_aged_5 , careseeking , breastfeeding , vaccine]

In [26]:
#all data 
data=[ lower_respiratory_diseases,gdp,death_under_5,death_rates,mortality,risk_factor_aged_70,risk_factor_aged_5,careseeking ,breastfeeding,vaccine]

------------------

<a id=section1></a>

##### Clean lower_respiratory_diseases 

In [27]:
lower_respiratory_diseases

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Number)
0,Afghanistan,AFG,1990,20224,932,488,538,1559
1,Afghanistan,AFG,1991,20879,941,529,580,1576
2,Afghanistan,AFG,1992,23585,956,604,664,1595
3,Afghanistan,AFG,1993,27116,979,665,728,1628
4,Afghanistan,AFG,1994,29271,1003,695,754,1668
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,5070,3069,1808,207,2820
6836,Zimbabwe,ZWE,2016,5023,3093,1843,223,2841
6837,Zimbabwe,ZWE,2017,4865,3127,1870,234,2866
6838,Zimbabwe,ZWE,2018,4675,3158,1893,238,2895


In [28]:
lower_respiratory_diseases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 8 columns):
 #   Column                                                                         Non-Null Count  Dtype 
---  ------                                                                         --------------  ----- 
 0   Entity                                                                         6840 non-null   object
 1   Code                                                                           6150 non-null   object
 2   Year                                                                           6840 non-null   int64 
 3   Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number)      6840 non-null   int64 
 4   Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Number)  6840 non-null   int64 
 5   Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Number)  6840 non-null   int64 
 6   Deaths - Lower respiratory infect

In [29]:
lower_respiratory_diseases.isna().sum()

Entity                                                                             0
Code                                                                             690
Year                                                                               0
Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number)          0
Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Number)      0
Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Number)      0
Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Number)       0
Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Number)        0
dtype: int64

In [30]:
lower_respiratory_diseases[lower_respiratory_diseases["Code"].isna()].sample(50)

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Number),Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Number)
1665,Eastern Mediterranean Region (WHO),,2005,145893,17047,12173,7846,29257
1955,Europe & Central Asia (WB),,1995,67640,37603,24853,4041,154713
1797,England,,2017,94,2124,494,15,30846
1965,Europe & Central Asia (WB),,2005,30833,44165,33276,2443,151694
4397,OECD Countries,,2007,9599,34763,10627,880,296914
2102,G20,,1992,1004008,156589,78685,43205,479772
2104,G20,,1994,895503,162996,83828,40297,508717
5144,Scotland,,2004,9,255,62,2,2662
1653,Eastern Mediterranean Region (WHO),,1993,196356,13312,9250,7668,22402
6656,World Bank High Income,,2016,1731,40679,9380,393,356222


I want to estimate code from  first character from each word in country in __Entity__ or the first three character of the country

In [31]:
#test
x="African Region (WHO)"

In [32]:
x.split("(")[0] # to take name country without (who) or like that

'African Region '

In [33]:
x.split("(")[0].split()

['African', 'Region']

In [34]:
#test
first_char=x.split("(")[0].split()[0][0]

In [35]:
second_char=x.split("(")[0].split()[1][0]
second_char

'R'

In [36]:
def test(x):
    list_country=x.split("(")[0].split() # list contain first word in country
    code=[]
    for char in list_country:
        code.append(char[0])
    return "".join(code)# to convert list ro string

In [37]:
print(test("African Region (WHO)"))

AR


In [38]:
def est_code(x):
    list_word=x.split("(")
    if len(list_word[0].split())==1: # means name of country is one word
        return x[:3]
    else:
        list_country=x.split("(")[0].split() # list contain first word in country
        code=[]
        for char in list_country:
            code.append(char[0])
        return "".join(code)# to convert list ro string

In [39]:
print(est_code("European Region (WHO)"))

ER


In [40]:
print(est_code("Scotland"))

Sco


In [41]:
#apply last function to estimate missing value
lower_respiratory_diseases["Code"]=lower_respiratory_diseases["Code"].fillna(lower_respiratory_diseases["Entity"].apply(est_code))

In [42]:
lower_respiratory_diseases.isna().sum()

Entity                                                                           0
Code                                                                             0
Year                                                                             0
Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number)        0
Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Number)    0
Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Number)    0
Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Number)     0
Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Number)      0
dtype: int64

In [43]:
lower_respiratory_diseases.duplicated().sum()

0

---------------

<a id=section2></a>

##### Clean gdp 

In [44]:
gdp

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate),"GDP per capita, PPP (current international $)",Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,1001.982170,,12412311.0,
2,Afghanistan,AFG,1991,889.451273,,13299016.0,
3,Afghanistan,AFG,1992,815.054137,,14485543.0,
4,Afghanistan,AFG,1993,816.899612,,15816601.0,
...,...,...,...,...,...,...,...
56805,Zimbabwe,ZWE,1987,,,9527202.0,
56806,Zimbabwe,ZWE,1988,,,9849129.0,
56807,Zimbabwe,ZWE,1989,,,10153852.0,
56808,Zimbabwe,ZWE,2021,,,15092171.0,


In [45]:
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56810 entries, 0 to 56809
Data columns (total 7 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   Entity                                                                   56810 non-null  object 
 1   Code                                                                     54091 non-null  object 
 2   Year                                                                     56810 non-null  int64  
 3   Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)  6840 non-null   float64
 4   GDP per capita, PPP (current international $)                            6235 non-null   float64
 5   Population (historical estimates)                                        55656 non-null  float64
 6   Continent                                                              

In [46]:
round(gdp.isna().sum()/gdp.shape[0],2) #percentage of missing values

Entity                                                                     0.00
Code                                                                       0.05
Year                                                                       0.00
Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)    0.88
GDP per capita, PPP (current international $)                              0.89
Population (historical estimates)                                          0.02
Continent                                                                  0.99
dtype: float64

In [47]:
gdp[gdp["Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)"].isna()]

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate),"GDP per capita, PPP (current international $)",Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
31,Afghanistan,AFG,2020,,2078.648682,38928341.0,
32,Afghanistan,AFG,-10000,,,14737.0,
33,Afghanistan,AFG,-9000,,,20405.0,
34,Afghanistan,AFG,-8000,,,28253.0,
...,...,...,...,...,...,...,...
56805,Zimbabwe,ZWE,1987,,,9527202.0,
56806,Zimbabwe,ZWE,1988,,,9849129.0,
56807,Zimbabwe,ZWE,1989,,,10153852.0,
56808,Zimbabwe,ZWE,2021,,,15092171.0,


- there are missing understand in Year are found years in negative 

In [48]:
# drop Continent column and missing values in [Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate) , GDP per capita, PPP (current international $)]
gdp.drop("Continent",axis=1,inplace=True)
gdp.dropna(subset=["Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)","GDP per capita, PPP (current international $)"],inplace=True)

In [49]:
gdp

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate),"GDP per capita, PPP (current international $)",Population (historical estimates)
13,Afghanistan,AFG,2002,602.566687,877.014282,22600774.0
14,Afghanistan,AFG,2003,620.508818,927.857544,23680871.0
15,Afghanistan,AFG,2004,596.086242,925.441406,24726689.0
16,Afghanistan,AFG,2005,552.850175,1023.051758,25654274.0
17,Afghanistan,AFG,2006,520.044321,1077.761597,26433058.0
...,...,...,...,...,...,...
56575,Zimbabwe,ZWE,2015,239.605213,2679.507568,13814642.0
56576,Zimbabwe,ZWE,2016,236.684332,2806.468994,14030338.0
56577,Zimbabwe,ZWE,2017,229.249003,3795.642334,14236599.0
56578,Zimbabwe,ZWE,2018,220.930293,4017.221680,14438812.0


----------------

<a id=section3></a>

##### Clean death_under_5 

In [50]:
death_under_5

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)
0,Afghanistan,AFG,1990,1001.982170
1,Afghanistan,AFG,1991,889.451273
2,Afghanistan,AFG,1992,815.054137
3,Afghanistan,AFG,1993,816.899612
4,Afghanistan,AFG,1994,821.638318
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,239.605213
6836,Zimbabwe,ZWE,2016,236.684332
6837,Zimbabwe,ZWE,2017,229.249003
6838,Zimbabwe,ZWE,2018,220.930293


In [51]:
#rename column
death_under_5.rename(columns={"Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)":"rate_death_under_5"},inplace=True)

In [52]:
death_under_5

Unnamed: 0,Entity,Code,Year,rate_death_under_5
0,Afghanistan,AFG,1990,1001.982170
1,Afghanistan,AFG,1991,889.451273
2,Afghanistan,AFG,1992,815.054137
3,Afghanistan,AFG,1993,816.899612
4,Afghanistan,AFG,1994,821.638318
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,239.605213
6836,Zimbabwe,ZWE,2016,236.684332
6837,Zimbabwe,ZWE,2017,229.249003
6838,Zimbabwe,ZWE,2018,220.930293


In [53]:
death_under_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Entity              6840 non-null   object 
 1   Code                6150 non-null   object 
 2   Year                6840 non-null   int64  
 3   rate_death_under_5  6840 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 213.9+ KB


In [54]:
death_under_5.isna().sum()

Entity                  0
Code                  690
Year                    0
rate_death_under_5      0
dtype: int64

In [55]:
death_under_5[death_under_5["Code"].isna()].sample(20)

Unnamed: 0,Entity,Code,Year,rate_death_under_5
1629,East Asia & Pacific (WB),,1999,184.190925
5658,Sub-Saharan Africa (WB),,2008,354.746177
4217,North America (WB),,2007,3.800316
6706,World Bank Lower Middle Income,,2006,251.04599
5460,South Asia (WB),,1990,441.272396
1977,Europe & Central Asia (WB),,2017,30.211239
4319,Northern Ireland,,2019,2.132979
5140,Scotland,,2000,4.20674
1994,European Region (WHO),,2004,64.183699
4205,North America (WB),,1995,5.119911


we use __est_code__ function to impute missing value like in  lower_respiratory_diseases data

In [56]:
#apply last function to estimate missing value
death_under_5["Code"]=death_under_5["Code"].fillna(death_under_5["Entity"].apply(est_code))

In [57]:
death_under_5.isna().sum()

Entity                0
Code                  0
Year                  0
rate_death_under_5    0
dtype: int64

In [58]:
death_under_5.duplicated().sum()

0

--------------------

<a id=section4></a>

##### Clean death_rates 

In [59]:
death_rates

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Age-standardized (Rate)
0,Afghanistan,AFG,1990,145.199604
1,Afghanistan,AFG,1991,131.671918
2,Afghanistan,AFG,1992,123.663077
3,Afghanistan,AFG,1993,126.485749
4,Afghanistan,AFG,1994,129.937755
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,162.378516
6836,Zimbabwe,ZWE,2016,160.508947
6837,Zimbabwe,ZWE,2017,158.196077
6838,Zimbabwe,ZWE,2018,155.665066


In [60]:
#rename column
death_rates.rename(columns={"Deaths - Lower respiratory infections - Sex: Both - Age: Age-standardized (Rate)":"rate_death_each_all_age"},inplace=True)


In [61]:
death_rates

Unnamed: 0,Entity,Code,Year,rate_death_each_all_age
0,Afghanistan,AFG,1990,145.199604
1,Afghanistan,AFG,1991,131.671918
2,Afghanistan,AFG,1992,123.663077
3,Afghanistan,AFG,1993,126.485749
4,Afghanistan,AFG,1994,129.937755
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,162.378516
6836,Zimbabwe,ZWE,2016,160.508947
6837,Zimbabwe,ZWE,2017,158.196077
6838,Zimbabwe,ZWE,2018,155.665066


In [62]:
death_rates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Entity                   6840 non-null   object 
 1   Code                     6150 non-null   object 
 2   Year                     6840 non-null   int64  
 3   rate_death_each_all_age  6840 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 213.9+ KB


In [63]:
death_rates.isna().sum()

Entity                       0
Code                       690
Year                         0
rate_death_each_all_age      0
dtype: int64

In [64]:
death_rates[death_rates["Code"].isna()].sample(10)

Unnamed: 0,Entity,Code,Year,rate_death_each_all_age
3694,Middle East & North Africa (WB),,1994,43.435444
1626,East Asia & Pacific (WB),,1996,53.449437
6705,World Bank Lower Middle Income,,2005,64.911675
5475,South Asia (WB),,2005,62.824904
4801,Region of the Americas (WHO),,1991,40.915984
1990,European Region (WHO),,2000,30.030991
6667,World Bank Low Income,,1997,145.640963
1788,England,,2008,28.076904
3128,Latin America & Caribbean (WB),,1998,47.50869
6682,World Bank Low Income,,2012,105.230811


we use __est_code__ function to impute missing value like in  lower_respiratory_diseases data

In [65]:
#apply last function to estimate missing value
death_rates["Code"]=death_rates["Code"].fillna(death_rates["Entity"].apply(est_code))

In [66]:
death_rates.loc[6720]["Code"]

'WBUMI'

In [67]:
death_rates.isna().sum()

Entity                     0
Code                       0
Year                       0
rate_death_each_all_age    0
dtype: int64

------------------------

<a id=section5></a>

##### Clean mortality 

In [68]:
mortality

Unnamed: 0,Entity,Code,Year,Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Rate),Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Rate)
0,Afghanistan,AFG,1990,1001.982170,18.091487,10.243855,69.694963,475.963464
1,Afghanistan,AFG,1991,889.451273,18.334832,9.819060,69.290253,472.273332
2,Afghanistan,AFG,1992,815.054137,18.936761,9.406091,69.025223,469.205942
3,Afghanistan,AFG,1993,816.899612,19.193463,9.329637,69.680051,469.807891
4,Afghanistan,AFG,1994,821.638318,18.728907,9.421587,70.765961,472.369829
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,239.605213,5.677288,26.039638,276.471664,1083.641273
6836,Zimbabwe,ZWE,2016,236.684332,6.039127,26.025914,272.404564,1062.825282
6837,Zimbabwe,ZWE,2017,229.249003,6.226371,25.873663,268.497989,1046.263433
6838,Zimbabwe,ZWE,2018,220.930293,6.262915,25.681544,263.914859,1027.296891


In [69]:
#rename column
mortality.rename(columns={"Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Rate)":"rate_death_under_5",
                          "Deaths - Lower respiratory infections - Sex: Both - Age: 5-14 years (Rate)":"rate_death_under_5-14_years",
                          "Deaths - Lower respiratory infections - Sex: Both - Age: 15-49 years (Rate)":"rate_death_aged_15-49_years",
                          "Deaths - Lower respiratory infections - Sex: Both - Age: 50-69 years (Rate)":"rate_death_aged_50-69_years",
                          "Deaths - Lower respiratory infections - Sex: Both - Age: 70+ years (Rate)":"rate_death_aged_70+_years"},
                     inplace=True)


In [70]:
mortality

Unnamed: 0,Entity,Code,Year,rate_death_under_5,rate_death_under_5-14_years,rate_death_aged_15-49_years,rate_death_aged_50-69_years,rate_death_aged_70+_years
0,Afghanistan,AFG,1990,1001.982170,18.091487,10.243855,69.694963,475.963464
1,Afghanistan,AFG,1991,889.451273,18.334832,9.819060,69.290253,472.273332
2,Afghanistan,AFG,1992,815.054137,18.936761,9.406091,69.025223,469.205942
3,Afghanistan,AFG,1993,816.899612,19.193463,9.329637,69.680051,469.807891
4,Afghanistan,AFG,1994,821.638318,18.728907,9.421587,70.765961,472.369829
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,239.605213,5.677288,26.039638,276.471664,1083.641273
6836,Zimbabwe,ZWE,2016,236.684332,6.039127,26.025914,272.404564,1062.825282
6837,Zimbabwe,ZWE,2017,229.249003,6.226371,25.873663,268.497989,1046.263433
6838,Zimbabwe,ZWE,2018,220.930293,6.262915,25.681544,263.914859,1027.296891


In [71]:
mortality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Entity                       6840 non-null   object 
 1   Code                         6150 non-null   object 
 2   Year                         6840 non-null   int64  
 3   rate_death_under_5           6840 non-null   float64
 4   rate_death_under_5-14_years  6840 non-null   float64
 5   rate_death_aged_15-49_years  6840 non-null   float64
 6   rate_death_aged_50-69_years  6840 non-null   float64
 7   rate_death_aged_70+_years    6840 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 427.6+ KB


In [72]:
mortality.isna().sum()

Entity                           0
Code                           690
Year                             0
rate_death_under_5               0
rate_death_under_5-14_years      0
rate_death_aged_15-49_years      0
rate_death_aged_50-69_years      0
rate_death_aged_70+_years        0
dtype: int64

In [73]:
mortality[mortality["Code"].isna()].sample(10)

Unnamed: 0,Entity,Code,Year,rate_death_under_5,rate_death_under_5-14_years,rate_death_aged_15-49_years,rate_death_aged_50-69_years,rate_death_aged_70+_years
4208,North America (WB),,1998,4.691904,0.37958,2.208836,17.296316,275.156588
55,African Region (WHO),,2015,263.034434,6.587076,11.641442,111.161146,734.549369
6731,World Bank Upper Middle Income,,2001,113.431649,3.209998,4.520086,23.254576,213.134937
6587,Western Pacific Region (WHO),,2007,78.182747,2.051396,1.895779,13.782272,240.103269
3145,Latin America & Caribbean (WB),,2015,46.858082,2.115998,5.235886,37.335302,427.120023
3719,Middle East & North Africa (WB),,2019,31.152779,2.54749,3.410625,24.937239,210.766977
4405,OECD Countries,,2015,7.818117,0.452814,1.732742,13.701591,244.485961
3697,Middle East & North Africa (WB),,1997,143.782016,5.972655,4.156905,31.460233,215.109941
4295,Northern Ireland,,1995,6.796677,0.748092,3.023732,36.435807,937.849218
5481,South Asia (WB),,2011,194.516002,5.550045,3.950438,44.201465,287.527966


we use __est_code__ function to impute missing value like in  lower_respiratory_diseases data

In [74]:
#apply last function to estimate missing value
mortality["Code"]=mortality["Code"].fillna(mortality["Entity"].apply(est_code))

In [75]:
mortality.loc[5151]["Code"]

'Sco'

In [76]:
mortality.isna().sum()

Entity                         0
Code                           0
Year                           0
rate_death_under_5             0
rate_death_under_5-14_years    0
rate_death_aged_15-49_years    0
rate_death_aged_50-69_years    0
rate_death_aged_70+_years      0
dtype: int64

In [77]:
mortality.duplicated().sum()

0

---------------------

<a id=section6></a>

##### Clean risk_factor_aged_70 

In [78]:
risk_factor_aged_70

Unnamed: 0,Entity,Code,Year,Deaths - Cause: Lower respiratory infections - Risk: No access to handwashing facility - Sex: Both - Age: 70+ years (Number),Deaths - Cause: Lower respiratory infections - Risk: Secondhand smoke - Sex: Both - Age: 70+ years (Number),Deaths - Cause: Lower respiratory infections - Risk: Particulate matter pollution - Sex: Both - Age: 70+ years (Number),Deaths - Cause: Lower respiratory infections - Risk: Smoking - Sex: Both - Age: 70+ years (Number)
0,Afghanistan,AFG,1990,250,136,895,102
1,Afghanistan,AFG,1991,254,138,905,104
2,Afghanistan,AFG,1992,257,140,916,106
3,Afghanistan,AFG,1993,263,143,936,108
4,Afghanistan,AFG,1994,271,147,959,112
...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,599,134,1105,446
6836,Zimbabwe,ZWE,2016,601,134,1088,450
6837,Zimbabwe,ZWE,2017,605,135,1069,455
6838,Zimbabwe,ZWE,2018,608,136,1054,461


In [79]:
#rename column
risk_factor_aged_70.rename(columns={"Deaths - Cause: Lower respiratory infections - Risk: No access to handwashing facility - Sex: Both - Age: 70+ years (Number)":"Deaths_Cause_No_access_to_handwashing_facility_aged_70_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Secondhand smoke - Sex: Both - Age: 70+ years (Number)":"Deaths_Cause_Secondhand_smoke_aged_70_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Particulate matter pollution - Sex: Both - Age: 70+ years (Number)":"Deaths_Cause_Particulate_matter_pollution_aged_70_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Smoking - Sex: Both - Age: 70+ years (Number)":"Deaths_Cause_Smoking_aged_70_years"},
                     inplace=True)


In [80]:
risk_factor_aged_70

Unnamed: 0,Entity,Code,Year,Deaths_Cause_No_access_to_handwashing_facility_aged_70_years,Deaths_Cause_Secondhand_smoke_aged_70_years,Deaths_Cause_Particulate_matter_pollution_aged_70_years,Deaths_Cause_Smoking_aged_70_years
0,Afghanistan,AFG,1990,250,136,895,102
1,Afghanistan,AFG,1991,254,138,905,104
2,Afghanistan,AFG,1992,257,140,916,106
3,Afghanistan,AFG,1993,263,143,936,108
4,Afghanistan,AFG,1994,271,147,959,112
...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,599,134,1105,446
6836,Zimbabwe,ZWE,2016,601,134,1088,450
6837,Zimbabwe,ZWE,2017,605,135,1069,455
6838,Zimbabwe,ZWE,2018,608,136,1054,461


In [81]:
risk_factor_aged_70.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 7 columns):
 #   Column                                                        Non-Null Count  Dtype 
---  ------                                                        --------------  ----- 
 0   Entity                                                        6840 non-null   object
 1   Code                                                          6150 non-null   object
 2   Year                                                          6840 non-null   int64 
 3   Deaths_Cause_No_access_to_handwashing_facility_aged_70_years  6840 non-null   int64 
 4   Deaths_Cause_Secondhand_smoke_aged_70_years                   6840 non-null   int64 
 5   Deaths_Cause_Particulate_matter_pollution_aged_70_years       6840 non-null   int64 
 6   Deaths_Cause_Smoking_aged_70_years                            6840 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 374.2+ KB


In [82]:
risk_factor_aged_70.isna().sum()

Entity                                                            0
Code                                                            690
Year                                                              0
Deaths_Cause_No_access_to_handwashing_facility_aged_70_years      0
Deaths_Cause_Secondhand_smoke_aged_70_years                       0
Deaths_Cause_Particulate_matter_pollution_aged_70_years           0
Deaths_Cause_Smoking_aged_70_years                                0
dtype: int64

In [83]:
risk_factor_aged_70[risk_factor_aged_70["Code"].isna()].sample(10)

Unnamed: 0,Entity,Code,Year,Deaths_Cause_No_access_to_handwashing_facility_aged_70_years,Deaths_Cause_Secondhand_smoke_aged_70_years,Deaths_Cause_Particulate_matter_pollution_aged_70_years,Deaths_Cause_Smoking_aged_70_years
5485,South Asia (WB),,2015,23043,13742,80270,25962
6630,World Bank High Income,,1990,3734,14992,20453,56211
4396,OECD Countries,,2006,4582,14116,19427,53362
4815,Region of the Americas (WHO),,2005,6573,7952,16202,23734
3717,Middle East & North Africa (WB),,2017,1487,2858,6014,4203
3699,Middle East & North Africa (WB),,1999,1143,1775,3875,2726
1777,England,,1997,412,2237,3232,9074
5159,Scotland,,2019,29,118,56,514
6738,World Bank Upper Middle Income,,2008,15914,24638,56938,47422
1962,Europe & Central Asia (WB),,2002,2300,8437,13472,29031


we use __est_code__ function to impute missing value like in  lower_respiratory_diseases data

In [84]:
#apply last function to estimate missing value
risk_factor_aged_70["Code"]=risk_factor_aged_70["Code"].fillna(risk_factor_aged_70["Entity"].apply(est_code))

In [85]:
risk_factor_aged_70.loc[5151]["Code"]

'Sco'

In [86]:
risk_factor_aged_70.isna().sum()

Entity                                                          0
Code                                                            0
Year                                                            0
Deaths_Cause_No_access_to_handwashing_facility_aged_70_years    0
Deaths_Cause_Secondhand_smoke_aged_70_years                     0
Deaths_Cause_Particulate_matter_pollution_aged_70_years         0
Deaths_Cause_Smoking_aged_70_years                              0
dtype: int64

In [87]:
risk_factor_aged_70.duplicated().sum()

0

----------------

<a id=section7></a>

##### Clean risk_factor_aged_5

In [88]:
risk_factor_aged_5

Unnamed: 0,Entity,Code,Year,Deaths - Cause: Lower respiratory infections - Risk: Child stunting - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Child wasting - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Low birth weight - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: No access to handwashing facility - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Secondhand smoke - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Child underweight - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Household air pollution from solid fuels - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Non-exclusive breastfeeding - Sex: Both - Age: Under 5 (Number),Deaths - Cause: Lower respiratory infections - Risk: Short gestation - Sex: Both - Age: Under 5 (Number)
0,Afghanistan,AFG,1990,4251,11902,3145,3248,1883,3070,10871,1408,2208
1,Afghanistan,AFG,1991,4326,12121,3473,3363,1962,3168,11229,1519,2444
2,Afghanistan,AFG,1992,4826,13583,4103,3804,2235,3579,12696,1788,2892
3,Afghanistan,AFG,1993,5610,15720,4647,4391,2587,4193,14604,2050,3279
4,Afghanistan,AFG,1994,6162,17053,4935,4763,2806,4624,15778,2172,3487
...,...,...,...,...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,493,2395,927,1074,386,275,1606,426,867
6836,Zimbabwe,ZWE,2016,489,2388,891,1061,381,271,1561,413,834
6837,Zimbabwe,ZWE,2017,468,2310,863,1024,367,258,1483,401,809
6838,Zimbabwe,ZWE,2018,445,2216,832,982,353,244,1395,389,782


In [89]:
#rename column
risk_factor_aged_5.rename(columns={"Deaths - Cause: Lower respiratory infections - Risk: Child stunting - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_Child_stunting_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Child wasting - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_Child_wasting_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Low birth weight - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_Low_birth_weight_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: No access to handwashing facility - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_No_access_to_handwashing_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Secondhand smoke - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_Secondhand_smoke_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Child underweight - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_Child_underweight_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Household air pollution from solid fuels - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Non-exclusive breastfeeding - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_on_exclusive_breastfeeding_Under_5_years",
                          "Deaths - Cause: Lower respiratory infections - Risk: Short gestation - Sex: Both - Age: Under 5 (Number)":"Deaths_Cause_Short_gestation_Under_5_years",},
                     inplace=True)


In [90]:
risk_factor_aged_5

Unnamed: 0,Entity,Code,Year,Deaths_Cause_Child_stunting_Under_5_years,Deaths_Cause_Child_wasting_Under_5_years,Deaths_Cause_Low_birth_weight_Under_5_years,Deaths_Cause_No_access_to_handwashing_Under_5_years,Deaths_Cause_Secondhand_smoke_Under_5_years,Deaths_Cause_Child_underweight_Under_5_years,Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years,Deaths_Cause_on_exclusive_breastfeeding_Under_5_years,Deaths_Cause_Short_gestation_Under_5_years
0,Afghanistan,AFG,1990,4251,11902,3145,3248,1883,3070,10871,1408,2208
1,Afghanistan,AFG,1991,4326,12121,3473,3363,1962,3168,11229,1519,2444
2,Afghanistan,AFG,1992,4826,13583,4103,3804,2235,3579,12696,1788,2892
3,Afghanistan,AFG,1993,5610,15720,4647,4391,2587,4193,14604,2050,3279
4,Afghanistan,AFG,1994,6162,17053,4935,4763,2806,4624,15778,2172,3487
...,...,...,...,...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,493,2395,927,1074,386,275,1606,426,867
6836,Zimbabwe,ZWE,2016,489,2388,891,1061,381,271,1561,413,834
6837,Zimbabwe,ZWE,2017,468,2310,863,1024,367,258,1483,401,809
6838,Zimbabwe,ZWE,2018,445,2216,832,982,353,244,1395,389,782


In [91]:
risk_factor_aged_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 12 columns):
 #   Column                                                               Non-Null Count  Dtype 
---  ------                                                               --------------  ----- 
 0   Entity                                                               6840 non-null   object
 1   Code                                                                 6150 non-null   object
 2   Year                                                                 6840 non-null   int64 
 3   Deaths_Cause_Child_stunting_Under_5_years                            6840 non-null   int64 
 4   Deaths_Cause_Child_wasting_Under_5_years                             6840 non-null   int64 
 5   Deaths_Cause_Low_birth_weight_Under_5_years                          6840 non-null   int64 
 6   Deaths_Cause_No_access_to_handwashing_Under_5_years                  6840 non-null   int64 
 7   Deaths_Cause_Se

In [92]:
risk_factor_aged_5.isna().sum()

Entity                                                                   0
Code                                                                   690
Year                                                                     0
Deaths_Cause_Child_stunting_Under_5_years                                0
Deaths_Cause_Child_wasting_Under_5_years                                 0
Deaths_Cause_Low_birth_weight_Under_5_years                              0
Deaths_Cause_No_access_to_handwashing_Under_5_years                      0
Deaths_Cause_Secondhand_smoke_Under_5_years                              0
Deaths_Cause_Child_underweight_Under_5_years                             0
Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years      0
Deaths_Cause_on_exclusive_breastfeeding_Under_5_years                    0
Deaths_Cause_Short_gestation_Under_5_years                               0
dtype: int64

In [93]:
risk_factor_aged_5[risk_factor_aged_5["Code"].isna()].sample(10)

Unnamed: 0,Entity,Code,Year,Deaths_Cause_Child_stunting_Under_5_years,Deaths_Cause_Child_wasting_Under_5_years,Deaths_Cause_Low_birth_weight_Under_5_years,Deaths_Cause_No_access_to_handwashing_Under_5_years,Deaths_Cause_Secondhand_smoke_Under_5_years,Deaths_Cause_Child_underweight_Under_5_years,Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years,Deaths_Cause_on_exclusive_breastfeeding_Under_5_years,Deaths_Cause_Short_gestation_Under_5_years
1628,East Asia & Pacific (WB),,1998,42733,185247,36795,29884,57027,31980,103666,32852,23925
6703,World Bank Lower Middle Income,,2003,137739,485095,210469,143763,75227,133121,338410,66755,140923
1668,Eastern Mediterranean Region (WHO),,2008,19756,75987,29474,17994,14287,17501,44142,10402,21285
4223,North America (WB),,2013,16,206,59,6,81,13,0,68,53
5645,Sub-Saharan Africa (WB),,1995,108266,368624,89047,136931,23922,93409,321332,47644,70311
4204,North America (WB),,1994,32,378,125,13,220,23,0,140,113
1984,European Region (WHO),,1994,7436,39025,7958,5192,12708,4141,6271,7785,5760
6640,World Bank High Income,,2000,152,1474,344,79,561,99,30,322,272
1973,Europe & Central Asia (WB),,2013,1569,11187,2506,1039,3117,848,810,2168,1924
5577,South-East Asia Region (WHO),,2017,19716,90773,63016,23737,18704,20604,43624,12820,39572


we use __est_code__ function to impute missing value like in  lower_respiratory_diseases data

In [94]:
#apply last function to estimate missing value
risk_factor_aged_5["Code"]=risk_factor_aged_5["Code"].fillna(risk_factor_aged_5["Entity"].apply(est_code))

In [95]:
risk_factor_aged_5.loc[5151]["Code"]

'Sco'

In [96]:
risk_factor_aged_5.isna().sum()

Entity                                                                 0
Code                                                                   0
Year                                                                   0
Deaths_Cause_Child_stunting_Under_5_years                              0
Deaths_Cause_Child_wasting_Under_5_years                               0
Deaths_Cause_Low_birth_weight_Under_5_years                            0
Deaths_Cause_No_access_to_handwashing_Under_5_years                    0
Deaths_Cause_Secondhand_smoke_Under_5_years                            0
Deaths_Cause_Child_underweight_Under_5_years                           0
Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years    0
Deaths_Cause_on_exclusive_breastfeeding_Under_5_years                  0
Deaths_Cause_Short_gestation_Under_5_years                             0
dtype: int64

In [97]:
risk_factor_aged_5.duplicated().sum()

0

---------------

<a id=section8></a>

##### Clean careseeking

In [98]:
careseeking

Unnamed: 0,Entity,Code,Year,Percentage of children under 5 with symptoms of pneumonia taken for care to a health provider
0,Afghanistan,AFG,2011,60.5
1,Afghanistan,AFG,2015,61.5
2,Albania,ALB,2000,83.0
3,Albania,ALB,2005,45.0
4,Albania,ALB,2009,69.6
...,...,...,...,...
416,Zimbabwe,ZWE,2006,25.0
417,Zimbabwe,ZWE,2009,42.6
418,Zimbabwe,ZWE,2011,48.0
419,Zimbabwe,ZWE,2014,58.6


In [99]:
careseeking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 4 columns):
 #   Column                                                                                         Non-Null Count  Dtype  
---  ------                                                                                         --------------  -----  
 0   Entity                                                                                         421 non-null    object 
 1   Code                                                                                           421 non-null    object 
 2   Year                                                                                           421 non-null    int64  
 3   Percentage of children under 5 with symptoms of pneumonia taken for care to a health provider  421 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 13.3+ KB


In [100]:
careseeking.isna().sum()

Entity                                                                                           0
Code                                                                                             0
Year                                                                                             0
Percentage of children under 5 with symptoms of pneumonia taken for care to a health provider    0
dtype: int64

In [101]:
careseeking.duplicated().sum()

0

----------------------------

<a id=section9></a>

##### Clean breastfeeding

In [102]:
breastfeeding

Unnamed: 0,Entity,Code,Year,Exclusive breastfeeding percent
0,Afghanistan,AFG,2015,43.13
1,Albania,ALB,2000,6.27
2,Albania,ALB,2005,3.39
3,Albania,ALB,2008,37.14
4,Albania,ALB,2017,36.54
...,...,...,...,...
522,Zimbabwe,ZWE,2005,21.69
523,Zimbabwe,ZWE,2009,25.94
524,Zimbabwe,ZWE,2010,31.30
525,Zimbabwe,ZWE,2014,40.29


In [103]:
breastfeeding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 4 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Entity                           527 non-null    object 
 1   Code                             527 non-null    object 
 2   Year                             527 non-null    int64  
 3   Exclusive breastfeeding percent  527 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 16.6+ KB


In [104]:
breastfeeding.isna().sum()

Entity                             0
Code                               0
Year                               0
Exclusive breastfeeding percent    0
dtype: int64

In [105]:
breastfeeding.duplicated().sum()

0

-------------------------------------

<a id=section10></a>

##### Clean vaccine

In [106]:
vaccine

Unnamed: 0,Entity,Code,Year,Indicator:Pneumococcal conjugate vaccines (PCV3) immunization coverage among 1-year-olds (%)
0,Afghanistan,AFG,2014,49
1,Afghanistan,AFG,2015,65
2,Afghanistan,AFG,2016,62
3,Afghanistan,AFG,2017,66
4,Afghanistan,AFG,2018,69
...,...,...,...,...
1841,Zimbabwe,ZWE,2016,90
1842,Zimbabwe,ZWE,2017,89
1843,Zimbabwe,ZWE,2018,89
1844,Zimbabwe,ZWE,2019,90


In [107]:
#rename column
vaccine.rename(columns={"Indicator:Pneumococcal conjugate vaccines (PCV3) immunization coverage among 1-year-olds (%)":"vaccines(PCV3)_among_1_year_olds"},
               inplace=True)


In [108]:
vaccine

Unnamed: 0,Entity,Code,Year,vaccines(PCV3)_among_1_year_olds
0,Afghanistan,AFG,2014,49
1,Afghanistan,AFG,2015,65
2,Afghanistan,AFG,2016,62
3,Afghanistan,AFG,2017,66
4,Afghanistan,AFG,2018,69
...,...,...,...,...
1841,Zimbabwe,ZWE,2016,90
1842,Zimbabwe,ZWE,2017,89
1843,Zimbabwe,ZWE,2018,89
1844,Zimbabwe,ZWE,2019,90


In [109]:
vaccine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1846 entries, 0 to 1845
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Entity                            1846 non-null   object
 1   Code                              1274 non-null   object
 2   Year                              1846 non-null   int64 
 3   vaccines(PCV3)_among_1_year_olds  1846 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 57.8+ KB


In [110]:
vaccine.isna().sum()

Entity                                0
Code                                572
Year                                  0
vaccines(PCV3)_among_1_year_olds      0
dtype: int64

In [111]:
vaccine[vaccine["Code"].isna()].sample(10)

Unnamed: 0,Entity,Code,Year,vaccines(PCV3)_among_1_year_olds
340,Central Asia and Southern Asia,,2015,18
1137,Northern Africa,,2012,9
1756,Western Asia,,2009,38
1524,South-east Asia,,2019,23
291,Caucasus and Central Asia (MDG),,2015,25
1778,"Western Asia (exc. Armenia, Azerbaijan, Cyprus...",,2018,65
442,Developing regions (MDG),,2015,34
332,Central Asia,,2020,70
38,Americas,,2012,77
532,Eastern Mediterranean,,2012,12


we use __est_code__ function to impute missing value like in  lower_respiratory_diseases data

In [112]:
#apply last function to estimate missing value
vaccine["Code"]=vaccine["Code"].fillna(vaccine["Entity"].apply(est_code))

In [113]:
vaccine.loc[1553]["Code"]

'SA'

In [114]:
vaccine.isna().sum()

Entity                              0
Code                                0
Year                                0
vaccines(PCV3)_among_1_year_olds    0
dtype: int64

In [115]:
vaccine.duplicated().sum()

0

-----------------------------------------

<a id=section11></a>
# Merge similar data together

__I will mergre all similar data together to reduce number of datasets and understaning more .__

In [116]:
death_under_5

Unnamed: 0,Entity,Code,Year,rate_death_under_5
0,Afghanistan,AFG,1990,1001.982170
1,Afghanistan,AFG,1991,889.451273
2,Afghanistan,AFG,1992,815.054137
3,Afghanistan,AFG,1993,816.899612
4,Afghanistan,AFG,1994,821.638318
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,239.605213
6836,Zimbabwe,ZWE,2016,236.684332
6837,Zimbabwe,ZWE,2017,229.249003
6838,Zimbabwe,ZWE,2018,220.930293


In [117]:
death_rates

Unnamed: 0,Entity,Code,Year,rate_death_each_all_age
0,Afghanistan,AFG,1990,145.199604
1,Afghanistan,AFG,1991,131.671918
2,Afghanistan,AFG,1992,123.663077
3,Afghanistan,AFG,1993,126.485749
4,Afghanistan,AFG,1994,129.937755
...,...,...,...,...
6835,Zimbabwe,ZWE,2015,162.378516
6836,Zimbabwe,ZWE,2016,160.508947
6837,Zimbabwe,ZWE,2017,158.196077
6838,Zimbabwe,ZWE,2018,155.665066


In [118]:
mortality

Unnamed: 0,Entity,Code,Year,rate_death_under_5,rate_death_under_5-14_years,rate_death_aged_15-49_years,rate_death_aged_50-69_years,rate_death_aged_70+_years
0,Afghanistan,AFG,1990,1001.982170,18.091487,10.243855,69.694963,475.963464
1,Afghanistan,AFG,1991,889.451273,18.334832,9.819060,69.290253,472.273332
2,Afghanistan,AFG,1992,815.054137,18.936761,9.406091,69.025223,469.205942
3,Afghanistan,AFG,1993,816.899612,19.193463,9.329637,69.680051,469.807891
4,Afghanistan,AFG,1994,821.638318,18.728907,9.421587,70.765961,472.369829
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,239.605213,5.677288,26.039638,276.471664,1083.641273
6836,Zimbabwe,ZWE,2016,236.684332,6.039127,26.025914,272.404564,1062.825282
6837,Zimbabwe,ZWE,2017,229.249003,6.226371,25.873663,268.497989,1046.263433
6838,Zimbabwe,ZWE,2018,220.930293,6.262915,25.681544,263.914859,1027.296891


- i note that containt of death_under_5 dataset in mortality datasets, sothat i not need death_under_5 dataset.

- to megre any data we need column that commen between datasets called __key__

In [119]:
#create key rename to id
def key(x,y):
    return x+ "_" + str(y)


In [120]:
#test apply function
copy=mortality.copy()
copy.insert(loc=3,column="id",value=copy.apply(lambda x :key(x["Code"],x["Year"]),axis=1))

In [121]:
copy

Unnamed: 0,Entity,Code,Year,id,rate_death_under_5,rate_death_under_5-14_years,rate_death_aged_15-49_years,rate_death_aged_50-69_years,rate_death_aged_70+_years
0,Afghanistan,AFG,1990,AFG_1990,1001.982170,18.091487,10.243855,69.694963,475.963464
1,Afghanistan,AFG,1991,AFG_1991,889.451273,18.334832,9.819060,69.290253,472.273332
2,Afghanistan,AFG,1992,AFG_1992,815.054137,18.936761,9.406091,69.025223,469.205942
3,Afghanistan,AFG,1993,AFG_1993,816.899612,19.193463,9.329637,69.680051,469.807891
4,Afghanistan,AFG,1994,AFG_1994,821.638318,18.728907,9.421587,70.765961,472.369829
...,...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,ZWE_2015,239.605213,5.677288,26.039638,276.471664,1083.641273
6836,Zimbabwe,ZWE,2016,ZWE_2016,236.684332,6.039127,26.025914,272.404564,1062.825282
6837,Zimbabwe,ZWE,2017,ZWE_2017,229.249003,6.226371,25.873663,268.497989,1046.263433
6838,Zimbabwe,ZWE,2018,ZWE_2018,220.930293,6.262915,25.681544,263.914859,1027.296891


In [122]:
# apply in all data sets
for data_i in data:
    data_i.insert(loc=3,column="id",value=data_i.apply(lambda x :key(x["Code"],x["Year"]),axis=1))

In [123]:
#test
mortality

Unnamed: 0,Entity,Code,Year,id,rate_death_under_5,rate_death_under_5-14_years,rate_death_aged_15-49_years,rate_death_aged_50-69_years,rate_death_aged_70+_years
0,Afghanistan,AFG,1990,AFG_1990,1001.982170,18.091487,10.243855,69.694963,475.963464
1,Afghanistan,AFG,1991,AFG_1991,889.451273,18.334832,9.819060,69.290253,472.273332
2,Afghanistan,AFG,1992,AFG_1992,815.054137,18.936761,9.406091,69.025223,469.205942
3,Afghanistan,AFG,1993,AFG_1993,816.899612,19.193463,9.329637,69.680051,469.807891
4,Afghanistan,AFG,1994,AFG_1994,821.638318,18.728907,9.421587,70.765961,472.369829
...,...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,ZWE_2015,239.605213,5.677288,26.039638,276.471664,1083.641273
6836,Zimbabwe,ZWE,2016,ZWE_2016,236.684332,6.039127,26.025914,272.404564,1062.825282
6837,Zimbabwe,ZWE,2017,ZWE_2017,229.249003,6.226371,25.873663,268.497989,1046.263433
6838,Zimbabwe,ZWE,2018,ZWE_2018,220.930293,6.262915,25.681544,263.914859,1027.296891


In [124]:
death_rates

Unnamed: 0,Entity,Code,Year,id,rate_death_each_all_age
0,Afghanistan,AFG,1990,AFG_1990,145.199604
1,Afghanistan,AFG,1991,AFG_1991,131.671918
2,Afghanistan,AFG,1992,AFG_1992,123.663077
3,Afghanistan,AFG,1993,AFG_1993,126.485749
4,Afghanistan,AFG,1994,AFG_1994,129.937755
...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,ZWE_2015,162.378516
6836,Zimbabwe,ZWE,2016,ZWE_2016,160.508947
6837,Zimbabwe,ZWE,2017,ZWE_2017,158.196077
6838,Zimbabwe,ZWE,2018,ZWE_2018,155.665066


Now we can merge 


<a id=section12></a>

## death datasets
containt of rate_death of all age

In [125]:
death=pd.merge(mortality,death_rates,on="id")
death

Unnamed: 0,Entity_x,Code_x,Year_x,id,rate_death_under_5,rate_death_under_5-14_years,rate_death_aged_15-49_years,rate_death_aged_50-69_years,rate_death_aged_70+_years,Entity_y,Code_y,Year_y,rate_death_each_all_age
0,Afghanistan,AFG,1990,AFG_1990,1001.982170,18.091487,10.243855,69.694963,475.963464,Afghanistan,AFG,1990,145.199604
1,Afghanistan,AFG,1991,AFG_1991,889.451273,18.334832,9.819060,69.290253,472.273332,Afghanistan,AFG,1991,131.671918
2,Afghanistan,AFG,1992,AFG_1992,815.054137,18.936761,9.406091,69.025223,469.205942,Afghanistan,AFG,1992,123.663077
3,Afghanistan,AFG,1993,AFG_1993,816.899612,19.193463,9.329637,69.680051,469.807891,Afghanistan,AFG,1993,126.485749
4,Afghanistan,AFG,1994,AFG_1994,821.638318,18.728907,9.421587,70.765961,472.369829,Afghanistan,AFG,1994,129.937755
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6895,Zimbabwe,ZWE,2015,ZWE_2015,239.605213,5.677288,26.039638,276.471664,1083.641273,Zimbabwe,ZWE,2015,162.378516
6896,Zimbabwe,ZWE,2016,ZWE_2016,236.684332,6.039127,26.025914,272.404564,1062.825282,Zimbabwe,ZWE,2016,160.508947
6897,Zimbabwe,ZWE,2017,ZWE_2017,229.249003,6.226371,25.873663,268.497989,1046.263433,Zimbabwe,ZWE,2017,158.196077
6898,Zimbabwe,ZWE,2018,ZWE_2018,220.930293,6.262915,25.681544,263.914859,1027.296891,Zimbabwe,ZWE,2018,155.665066


In [126]:
death.drop(["Entity_y","Code_y","Year_y"],axis=1,inplace=True)
death.rename(columns={"Entity_x":"Entity",
                       "Code_x":"Code",
                       "Year_x":"Year"},inplace=True)

In [127]:
death

Unnamed: 0,Entity,Code,Year,id,rate_death_under_5,rate_death_under_5-14_years,rate_death_aged_15-49_years,rate_death_aged_50-69_years,rate_death_aged_70+_years,rate_death_each_all_age
0,Afghanistan,AFG,1990,AFG_1990,1001.982170,18.091487,10.243855,69.694963,475.963464,145.199604
1,Afghanistan,AFG,1991,AFG_1991,889.451273,18.334832,9.819060,69.290253,472.273332,131.671918
2,Afghanistan,AFG,1992,AFG_1992,815.054137,18.936761,9.406091,69.025223,469.205942,123.663077
3,Afghanistan,AFG,1993,AFG_1993,816.899612,19.193463,9.329637,69.680051,469.807891,126.485749
4,Afghanistan,AFG,1994,AFG_1994,821.638318,18.728907,9.421587,70.765961,472.369829,129.937755
...,...,...,...,...,...,...,...,...,...,...
6895,Zimbabwe,ZWE,2015,ZWE_2015,239.605213,5.677288,26.039638,276.471664,1083.641273,162.378516
6896,Zimbabwe,ZWE,2016,ZWE_2016,236.684332,6.039127,26.025914,272.404564,1062.825282,160.508947
6897,Zimbabwe,ZWE,2017,ZWE_2017,229.249003,6.226371,25.873663,268.497989,1046.263433,158.196077
6898,Zimbabwe,ZWE,2018,ZWE_2018,220.930293,6.262915,25.681544,263.914859,1027.296891,155.665066


In [128]:
#check nan
death.isna().sum()

Entity                         0
Code                           0
Year                           0
id                             0
rate_death_under_5             0
rate_death_under_5-14_years    0
rate_death_aged_15-49_years    0
rate_death_aged_50-69_years    0
rate_death_aged_70+_years      0
rate_death_each_all_age        0
dtype: int64

In [129]:
#check duplicated
death.duplicated().sum()

0

In [130]:
#save data
death.to_csv("death.csv",index=False)

--------------------

<a id=section13></a>

## risk datasets
containt of risk factor for aged 5 and 70+

In [131]:
risk_factor_aged_5

Unnamed: 0,Entity,Code,Year,id,Deaths_Cause_Child_stunting_Under_5_years,Deaths_Cause_Child_wasting_Under_5_years,Deaths_Cause_Low_birth_weight_Under_5_years,Deaths_Cause_No_access_to_handwashing_Under_5_years,Deaths_Cause_Secondhand_smoke_Under_5_years,Deaths_Cause_Child_underweight_Under_5_years,Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years,Deaths_Cause_on_exclusive_breastfeeding_Under_5_years,Deaths_Cause_Short_gestation_Under_5_years
0,Afghanistan,AFG,1990,AFG_1990,4251,11902,3145,3248,1883,3070,10871,1408,2208
1,Afghanistan,AFG,1991,AFG_1991,4326,12121,3473,3363,1962,3168,11229,1519,2444
2,Afghanistan,AFG,1992,AFG_1992,4826,13583,4103,3804,2235,3579,12696,1788,2892
3,Afghanistan,AFG,1993,AFG_1993,5610,15720,4647,4391,2587,4193,14604,2050,3279
4,Afghanistan,AFG,1994,AFG_1994,6162,17053,4935,4763,2806,4624,15778,2172,3487
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,ZWE_2015,493,2395,927,1074,386,275,1606,426,867
6836,Zimbabwe,ZWE,2016,ZWE_2016,489,2388,891,1061,381,271,1561,413,834
6837,Zimbabwe,ZWE,2017,ZWE_2017,468,2310,863,1024,367,258,1483,401,809
6838,Zimbabwe,ZWE,2018,ZWE_2018,445,2216,832,982,353,244,1395,389,782


In [132]:
risk_factor_aged_70

Unnamed: 0,Entity,Code,Year,id,Deaths_Cause_No_access_to_handwashing_facility_aged_70_years,Deaths_Cause_Secondhand_smoke_aged_70_years,Deaths_Cause_Particulate_matter_pollution_aged_70_years,Deaths_Cause_Smoking_aged_70_years
0,Afghanistan,AFG,1990,AFG_1990,250,136,895,102
1,Afghanistan,AFG,1991,AFG_1991,254,138,905,104
2,Afghanistan,AFG,1992,AFG_1992,257,140,916,106
3,Afghanistan,AFG,1993,AFG_1993,263,143,936,108
4,Afghanistan,AFG,1994,AFG_1994,271,147,959,112
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,ZWE_2015,599,134,1105,446
6836,Zimbabwe,ZWE,2016,ZWE_2016,601,134,1088,450
6837,Zimbabwe,ZWE,2017,ZWE_2017,605,135,1069,455
6838,Zimbabwe,ZWE,2018,ZWE_2018,608,136,1054,461


In [133]:
risk=pd.merge(risk_factor_aged_5,risk_factor_aged_70,on="id")
risk

Unnamed: 0,Entity_x,Code_x,Year_x,id,Deaths_Cause_Child_stunting_Under_5_years,Deaths_Cause_Child_wasting_Under_5_years,Deaths_Cause_Low_birth_weight_Under_5_years,Deaths_Cause_No_access_to_handwashing_Under_5_years,Deaths_Cause_Secondhand_smoke_Under_5_years,Deaths_Cause_Child_underweight_Under_5_years,Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years,Deaths_Cause_on_exclusive_breastfeeding_Under_5_years,Deaths_Cause_Short_gestation_Under_5_years,Entity_y,Code_y,Year_y,Deaths_Cause_No_access_to_handwashing_facility_aged_70_years,Deaths_Cause_Secondhand_smoke_aged_70_years,Deaths_Cause_Particulate_matter_pollution_aged_70_years,Deaths_Cause_Smoking_aged_70_years
0,Afghanistan,AFG,1990,AFG_1990,4251,11902,3145,3248,1883,3070,10871,1408,2208,Afghanistan,AFG,1990,250,136,895,102
1,Afghanistan,AFG,1991,AFG_1991,4326,12121,3473,3363,1962,3168,11229,1519,2444,Afghanistan,AFG,1991,254,138,905,104
2,Afghanistan,AFG,1992,AFG_1992,4826,13583,4103,3804,2235,3579,12696,1788,2892,Afghanistan,AFG,1992,257,140,916,106
3,Afghanistan,AFG,1993,AFG_1993,5610,15720,4647,4391,2587,4193,14604,2050,3279,Afghanistan,AFG,1993,263,143,936,108
4,Afghanistan,AFG,1994,AFG_1994,6162,17053,4935,4763,2806,4624,15778,2172,3487,Afghanistan,AFG,1994,271,147,959,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6895,Zimbabwe,ZWE,2015,ZWE_2015,493,2395,927,1074,386,275,1606,426,867,Zimbabwe,ZWE,2015,599,134,1105,446
6896,Zimbabwe,ZWE,2016,ZWE_2016,489,2388,891,1061,381,271,1561,413,834,Zimbabwe,ZWE,2016,601,134,1088,450
6897,Zimbabwe,ZWE,2017,ZWE_2017,468,2310,863,1024,367,258,1483,401,809,Zimbabwe,ZWE,2017,605,135,1069,455
6898,Zimbabwe,ZWE,2018,ZWE_2018,445,2216,832,982,353,244,1395,389,782,Zimbabwe,ZWE,2018,608,136,1054,461


In [134]:
risk.drop(["Entity_y","Code_y","Year_y"],axis=1,inplace=True)
risk.rename(columns={"Entity_x":"Entity",
                       "Code_x":"Code",
                       "Year_x":"Year"},inplace=True)

In [135]:
risk

Unnamed: 0,Entity,Code,Year,id,Deaths_Cause_Child_stunting_Under_5_years,Deaths_Cause_Child_wasting_Under_5_years,Deaths_Cause_Low_birth_weight_Under_5_years,Deaths_Cause_No_access_to_handwashing_Under_5_years,Deaths_Cause_Secondhand_smoke_Under_5_years,Deaths_Cause_Child_underweight_Under_5_years,Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years,Deaths_Cause_on_exclusive_breastfeeding_Under_5_years,Deaths_Cause_Short_gestation_Under_5_years,Deaths_Cause_No_access_to_handwashing_facility_aged_70_years,Deaths_Cause_Secondhand_smoke_aged_70_years,Deaths_Cause_Particulate_matter_pollution_aged_70_years,Deaths_Cause_Smoking_aged_70_years
0,Afghanistan,AFG,1990,AFG_1990,4251,11902,3145,3248,1883,3070,10871,1408,2208,250,136,895,102
1,Afghanistan,AFG,1991,AFG_1991,4326,12121,3473,3363,1962,3168,11229,1519,2444,254,138,905,104
2,Afghanistan,AFG,1992,AFG_1992,4826,13583,4103,3804,2235,3579,12696,1788,2892,257,140,916,106
3,Afghanistan,AFG,1993,AFG_1993,5610,15720,4647,4391,2587,4193,14604,2050,3279,263,143,936,108
4,Afghanistan,AFG,1994,AFG_1994,6162,17053,4935,4763,2806,4624,15778,2172,3487,271,147,959,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6895,Zimbabwe,ZWE,2015,ZWE_2015,493,2395,927,1074,386,275,1606,426,867,599,134,1105,446
6896,Zimbabwe,ZWE,2016,ZWE_2016,489,2388,891,1061,381,271,1561,413,834,601,134,1088,450
6897,Zimbabwe,ZWE,2017,ZWE_2017,468,2310,863,1024,367,258,1483,401,809,605,135,1069,455
6898,Zimbabwe,ZWE,2018,ZWE_2018,445,2216,832,982,353,244,1395,389,782,608,136,1054,461


In [136]:
#check nan
risk.isna().sum()

Entity                                                                 0
Code                                                                   0
Year                                                                   0
id                                                                     0
Deaths_Cause_Child_stunting_Under_5_years                              0
Deaths_Cause_Child_wasting_Under_5_years                               0
Deaths_Cause_Low_birth_weight_Under_5_years                            0
Deaths_Cause_No_access_to_handwashing_Under_5_years                    0
Deaths_Cause_Secondhand_smoke_Under_5_years                            0
Deaths_Cause_Child_underweight_Under_5_years                           0
Deaths_Cause_Household_air_pollution_from_solid_fuels_Under_5_years    0
Deaths_Cause_on_exclusive_breastfeeding_Under_5_years                  0
Deaths_Cause_Short_gestation_Under_5_years                             0
Deaths_Cause_No_access_to_handwashing_facility_aged

In [137]:
#check duplicated
risk.duplicated().sum()

0

In [138]:
#save data
risk.to_csv("risk.csv",index=False)

------------------------

<a id=section14></a>

## save anthor data

In [139]:
lower_respiratory_diseases.to_csv("lower_respiratory_diseases.csv",index=False)

In [140]:
gdp.to_csv("gdp.csv",index=False)

In [141]:
careseeking.to_csv("careseeking.csv",index=False)

In [142]:
breastfeeding.to_csv("breastfeeding.csv",index=False)

In [143]:
vaccine.to_csv("vaccine.csv",index=False)