# Read the data (three different formats)

In [1]:
import pandas as pd
import numpy as np
import csv

### 1. Data with the same format

In [2]:
# Folder that holds the raw CSV files; every path below is built by string
# concatenation onto this prefix.
data_health = '../data/health'

# Files that share the first layout. Each name keeps its leading slash
# because it is appended directly to data_health and is also used as the
# key in `output`.
data_list = [
    '/Adolescent birth rate (births per 1,000 women ages 15-19).csv',
    '/Life expectancy at birth.csv',
    '/Maternal mortality ratio (deaths per 100,000 live births).csv',
    '/Mortality rate, infant (per 1,000 live births).csv',
    '/Mortality rate, under-five (per 1,000 live births).csv',
    '/Proportion of births attended by skilled health personnel (%).csv',
    '/Share of seats in parliament (% held by women).csv',
    '/Women with account at financial institution or with mobile money-service provider (% of female population ages 15 and older).csv',
]

# Number of rows cut from the bottom of each file so that only country rows
# remain; positions line up one-to-one with data_list.
country = [
    20,  # Adolescent birth rate
    19,  # Life expectancy at birth
    19,  # Maternal mortality ratio
    18,  # Mortality rate, infant
    18,  # Mortality rate, under-five
    18,  # Proportion of births attended by skilled health personnel
    24,  # Share of seats in parliament
    20,  # Women with account at financial institution ...
]

# Cleaned DataFrames, filled in by the cells below.
output = {}

In [3]:
# Clean every file in data_list the same way, then keep the result both in
# `output` (keyed by the file name) and as a CSV under cleaned_data/.
for n_trailing_rows, file_name in zip(country, data_list):
    # skiprows=1: the column header is taken from the second line of the file.
    raw = pd.read_csv(data_health + file_name, skiprows=1)

    # Columns whose label contains "unnamed" (any case) carry no indicator
    # data and are removed together with the rank column.
    is_unnamed = raw.columns.str.contains('unnamed', case=False)

    cleaned = (
        raw.loc[:, ~is_unnamed]
        .drop(columns=['HDI Rank (2018)'])
        # Keep only the country data: cut the last n_trailing_rows rows.
        .drop(index=raw.tail(n_trailing_rows).index)
        # '..' is the placeholder for a missing value.
        .replace('..', np.nan)
        .set_index('Country')
        # Strict conversion. The previous code ran astype('float64') and
        # discarded the result (which already required every column to be
        # numeric) and then used errors='ignore', deprecated in pandas 2.2.
        # Converting strictly gives the same values and still fails loudly
        # on unexpected text.
        .apply(pd.to_numeric)
    )

    output[file_name] = cleaned
    # file_name starts with '/', so strip it to avoid a '//' in the path.
    cleaned.to_csv(data_health + '/cleaned_data/' + file_name.lstrip('/'))

### 2. Data with a different format

In [4]:
# Key (and file name) of the table produced by the first-format loop that
# this cell extends.
share_key = '/Share of seats in parliament (% held by women).csv'

seats_raw = pd.read_csv(data_health + '/seats_held_by_women_in_parliaments.csv')

# Positional layout of this file: column 2 is kept (it is used as the
# "Country Name" index below) together with columns 41 onward; columns 0, 1
# and 3-40 are dropped.
unused_columns = seats_raw.columns[[0, 1] + list(range(3, 41))]

df = (
    seats_raw.drop(columns=unused_columns)
    # The last five rows are not kept.
    .drop(index=seats_raw.tail(5).index)
    .set_index('Country Name')
    # Keep only the first four characters of each column label (the
    # displayed result shows four-digit years).
    .rename(columns=lambda label: label[:4])
    # '..' is the placeholder for a missing value.
    .replace('..', np.nan)
    # Strict conversion. The previous code ran astype('float64') and
    # discarded the result, then used errors='ignore' (deprecated in
    # pandas 2.2); this yields the same values and fails loudly on text.
    .apply(pd.to_numeric)
)

# Merge 'seats_held_by_women_in_parliaments.csv' and
# 'Share of seats in parliament (% held by women).csv': stack the rows of
# both tables and average the rows that share a country label, so a
# country/year present in both files ends up as the mean of the two values.
# NOTE(review): countries are matched on the exact label, so spellings that
# differ between the files (e.g. "Yemen" vs "Yemen, Rep.") stay as separate
# rows, and the year columns are not re-sorted chronologically.
# NOTE(review): this cell reads and then overwrites output[share_key], so
# it is not idempotent - re-run the first-format loop before re-running it.
stacked = pd.concat([output[share_key], df], sort=False)
output[share_key] = stacked.groupby(stacked.index).mean()
output[share_key].to_csv(data_health + '/cleaned_data' + share_key)

### 3. Happiness data

In [5]:
# One report file per year; the country and score column names differ
# between years, so they are listed in the same order as `year`.
year = list(range(2015, 2020))
country_str = ['Country', 'Country', 'Country', 'Country or region', 'Country or region']
score_str = ['Happiness Score', 'Happiness Score', 'Happiness.Score', 'Score', 'Score']

# Country-name spellings that are normalised before the years are aligned.
# The 2015 file gets its own fix; every other year uses the default table.
name_fixes_by_year = {
    2015: {'Somaliland region': 'Somaliland Region'},
}
default_name_fixes = {
    'Hong Kong S.A.R., China': 'Hong Kong',
    'Taiwan Province of China': 'Taiwan',
    'Trinidad & Tobago': 'Trinidad and Tobago',
}

# Collect one score Series per year and align them once at the end rather
# than growing a frame with pd.concat inside the loop.
yearly_scores = []
for y, c, s in zip(year, country_str, score_str):
    report = pd.read_csv(data_health + '/World_Happiness_Report' + str(y) + '.csv')
    scores = report.set_index(c)[s]
    fixes = name_fixes_by_year.get(y, default_name_fixes)
    yearly_scores.append(scores.rename(index=fixes))

# Align on country name (sorted union of all years); a country missing from
# a year gets NaN in that column. keys=year labels each column by its year.
happiness = pd.concat(yearly_scores, axis=1, sort=True, keys=year)

output['world_happiness'] = happiness
happiness.to_csv(data_health + '/cleaned_data/world_happiness.csv')

# Show the data stored in the dictionary

In [6]:
# Display the cleaned adolescent-birth-rate table (indexed by country).
output['/Adolescent birth rate (births per 1,000 women ages 15-19).csv']

Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Afghanistan,161.2,163.9,165.2,146.4,133.1,126.6,120.2,113.7,107.3,100.8,90.2,79.6,69.0
Albania,16.3,18.7,19.3,18.8,18.5,18.9,19.4,19.8,20.3,20.7,20.4,20.0,19.6
Algeria,33.1,24.3,14.9,9.7,9.7,10.1,10.4,10.8,11.2,11.6,11.1,10.6,10.1
Angola,222.0,213.0,202.5,192.8,181.2,178.5,175.7,172.9,170.2,167.4,161.8,156.2,150.5
Antigua and Barbuda,83.3,71.8,65.4,59.7,51.2,50.3,49.5,48.7,47.8,47.0,45.6,44.2,42.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dominica,,,,,,,,,,,,,
Saint Kitts and Nevis,,,,,,,,,,,,,
Liechtenstein,,,,,,,,,,,,,
Marshall Islands,,,,,,,,,,,,,


In [7]:
# Display the cleaned life-expectancy table (indexed by country).
output['/Life expectancy at birth.csv']

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,50.3,51.0,51.6,52.3,52.8,53.4,53.9,54.4,54.9,55.4,...,60.5,61.0,61.6,62.1,62.5,63.0,63.4,63.8,64.1,64.5
Albania,71.8,71.8,71.8,71.9,72.0,72.2,72.5,72.8,73.2,73.6,...,76.2,76.6,76.9,77.3,77.6,77.8,78.0,78.2,78.3,78.5
Algeria,66.9,67.3,67.6,67.9,68.2,68.5,68.9,69.3,69.7,70.2,...,74.6,74.9,75.2,75.4,75.7,75.9,76.1,76.3,76.5,76.7
Andorra,76.5,76.7,76.9,77.0,77.2,77.4,77.6,77.9,78.2,78.5,...,80.7,80.8,80.9,81.1,81.2,81.3,81.4,81.5,81.7,81.8
Angola,45.3,45.3,45.2,45.2,45.2,45.2,45.4,45.5,45.8,46.1,...,54.3,55.4,56.3,57.2,58.1,58.8,59.4,59.9,60.4,60.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela (Bolivarian Republic of),70.7,70.8,71.0,71.1,71.2,71.3,71.5,71.6,71.8,71.9,...,73.1,73.1,73.1,73.0,72.9,72.8,72.6,72.4,72.2,72.1
Viet Nam,70.6,70.8,71.1,71.4,71.7,71.9,72.2,72.4,72.6,72.8,...,74.7,74.8,74.9,75.0,75.0,75.1,75.1,75.2,75.2,75.3
Yemen,57.3,57.7,58.0,58.3,58.6,58.8,59.1,59.4,59.8,60.2,...,65.3,65.5,65.8,65.9,66.0,66.1,66.1,66.1,66.1,66.1
Zambia,49.2,48.1,47.0,45.9,45.0,44.2,43.7,43.5,43.4,43.6,...,54.1,55.7,57.1,58.5,59.7,60.8,61.7,62.5,63.0,63.5


In [8]:
# Display the cleaned maternal-mortality table (indexed by country).
output['/Maternal mortality ratio (deaths per 100,000 live births).csv']

Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,1340.0,1270.0,1100.0,821.0,584.0,536.0,496.0,459.0,425.0,396.0
Albania,71.0,53.0,43.0,30.0,30.0,30.0,30.0,29.0,29.0,29.0
Algeria,216.0,192.0,170.0,148.0,147.0,147.0,145.0,144.0,141.0,140.0
Angola,1160.0,1150.0,924.0,705.0,561.0,546.0,526.0,509.0,493.0,477.0
Argentina,72.0,63.0,60.0,58.0,58.0,56.0,55.0,54.0,54.0,52.0
...,...,...,...,...,...,...,...,...,...,...
Saint Kitts and Nevis,,,,,,,,,,
Liechtenstein,,,,,,,,,,
Marshall Islands,,,,,,,,,,
Palau,,,,,,,,,,


In [9]:
# Display the cleaned infant-mortality table (indexed by country).
output['/Mortality rate, infant (per 1,000 live births).csv']

Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,118.8,102.2,90.2,78.4,65.6,63.2,60.9,58.7,56.7,54.9,53.2,51.5
Albania,34.8,28.5,21.8,15.4,10.7,10.1,9.5,9.0,8.6,8.3,8.1,7.8
Algeria,41.8,37.2,33.9,28.8,23.5,22.9,22.4,22.1,21.8,21.4,21.0,20.6
Andorra,7.5,5.8,4.7,4.1,3.8,3.7,3.7,3.6,3.5,3.4,3.3,3.2
Angola,132.0,131.6,122.6,101.8,76.4,71.9,67.8,64.1,60.9,58.2,55.8,53.8
...,...,...,...,...,...,...,...,...,...,...,...,...
Yemen,88.5,79.5,68.9,54.6,43.9,43.3,43.2,43.2,43.2,43.2,43.2,43.2
Zambia,110.5,107.1,96.6,66.8,53.4,51.5,49.6,48.3,46.3,44.2,43.0,41.5
Zimbabwe,50.8,58.3,58.4,59.0,55.1,52.2,48.4,45.4,42.3,40.2,38.2,36.5
"Hong Kong, China (SAR)",,,,,,,,,,,,


In [10]:
# Display the cleaned under-five-mortality table (indexed by country).
output['/Mortality rate, under-five (per 1,000 live births).csv']

Unnamed: 0_level_0,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,175.1,147.8,128.8,110.1,90.0,86.2,82.6,79.2,76.0,73.1,70.4,67.9
Albania,40.1,32.4,24.5,17.2,12.0,11.3,10.7,10.1,9.7,9.4,9.1,8.8
Algeria,49.6,43.7,39.7,33.5,27.4,26.7,26.1,25.7,25.3,24.9,24.5,24.0
Andorra,8.5,6.3,5.0,4.3,3.9,3.9,3.8,3.7,3.6,3.5,3.4,3.3
Angola,223.5,222.7,206.3,167.8,121.3,113.2,105.8,99.2,93.6,88.9,84.6,81.1
...,...,...,...,...,...,...,...,...,...,...,...,...
Yemen,126.0,111.9,95.2,72.7,56.4,55.5,55.4,55.4,55.4,55.4,55.4,55.4
Zambia,185.0,181.8,165.1,111.6,82.4,78.6,74.9,71.6,68.0,64.9,62.4,60.0
Zimbabwe,77.5,98.0,101.8,99.6,87.5,81.2,73.8,66.8,61.1,57.0,53.3,50.3
"Hong Kong, China (SAR)",,,,,,,,,,,,


In [11]:
# Display the cleaned skilled-birth-attendance table (indexed by country).
output['/Proportion of births attended by skilled health personnel (%).csv']

Unnamed: 0_level_0,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,,,34.3,38.6,39.9,,45.2,50.5,,53.4,58.8
Albania,99.1,99.8,,,,,,,,,99.8
Algeria,,,,,,96.6,,,,,
Andorra,,,,,,,,100.0,100.0,100.0,
Angola,,,,,,,,,46.6,,
...,...,...,...,...,...,...,...,...,...,...,...
Luxembourg,,,,,,,,,,,
Netherlands,,,,,,,,,,,
Papua New Guinea,,,,,,,,,,,
Sweden,,,,,,,,,,,


In [12]:
# Display the share-of-seats table; at this point it is the merged version
# written back by the second-format cell, not the output of the first loop.
output['/Share of seats in parliament (% held by women).csv']

Unnamed: 0,1995,1997,2000,2005,2010,2011,2012,2013,2014,2015,...,2001,2002,2003,2004,2006,2007,2008,2009,2019,2020
Afghanistan,,,,26.604618,27.655422,27.655422,27.655422,27.655422,27.655422,27.555422,...,,,,,27.309237,27.685950,27.685950,27.309237,27.868852,27.016129
Albania,,,5.180645,7.121429,16.414286,15.707143,15.707143,17.878571,20.000000,20.707143,...,5.714286,5.714286,5.714286,6.428571,7.142857,7.142857,7.142857,16.428571,29.508197,29.508197
Algeria,3.2,3.178947,3.710526,5.734833,7.356041,7.584576,28.700866,28.700866,28.650866,28.650866,...,3.421053,6.169666,6.169666,6.169666,6.169666,7.712082,7.712082,7.712082,25.757576,25.757576
American Samoa,,,,,,,,,,,...,,,,,,,,,,
Andorra,7.1,7.121429,7.121429,28.585714,35.707143,50.000000,50.000000,50.000000,50.000000,39.292857,...,14.285714,14.285714,14.285714,14.285714,28.571429,28.571429,25.000000,35.714286,46.428571,46.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
World,,11.692171,13.904440,16.474545,19.157038,19.827767,20.755204,21.777447,22.188297,22.849299,...,14.090701,15.197446,15.179207,15.897001,16.969664,17.869202,18.442755,19.025087,24.636604,25.173055
Yemen,,,0.700000,0.700000,0.700000,0.700000,0.700000,0.700000,0.700000,0.500000,...,,,,,,,,,,
"Yemen, Rep.",,,0.664452,0.332226,0.332226,0.332226,0.332226,0.332226,0.332226,0.000000,...,0.668896,0.668896,0.332226,0.332226,0.332226,0.332226,0.332226,0.332226,0.332226,0.332226
Zambia,9.7,9.688710,10.113291,12.679114,14.006369,11.482484,11.482484,11.482484,10.779747,12.679114,...,10.126582,12.025316,12.025316,12.025316,14.649682,15.189873,15.189873,14.012739,17.964072,16.766467


In [13]:
# Display the cleaned women-with-financial-account table (indexed by country).
output['/Women with account at financial institution or with mobile money-service provider (% of female population ages 15 and older).csv']

Unnamed: 0_level_0,2011,2014,2017
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,2.6,3.8,7.2
Albania,22.7,33.6,38.1
Algeria,20.4,40.1,29.3
Angola,38.9,22.3,22.3
Argentina,31.8,50.9,50.8
...,...,...,...
Timor-Leste,,,
Tonga,,,
Saint Vincent and the Grenadines,,,
Vanuatu,,,


In [14]:
# Display the happiness scores: one row per country, one column per year.
output['world_happiness']

Unnamed: 0,2015,2016,2017,2018,2019
Afghanistan,3.575,3.360,3.794,3.632,3.203
Albania,4.959,4.655,4.644,4.586,4.719
Algeria,5.605,6.355,5.872,5.295,5.211
Angola,4.033,3.866,3.795,3.795,
Argentina,6.574,6.650,6.599,6.388,6.086
...,...,...,...,...,...
Venezuela,6.810,6.084,5.250,4.806,4.707
Vietnam,5.360,5.061,5.074,5.103,5.175
Yemen,4.077,3.724,3.593,3.355,3.380
Zambia,5.129,4.795,4.514,4.377,4.107


# Attributes of the output dictionary

In [17]:
# Number of tables held in the dictionary: one per data_list file plus
# 'world_happiness'.
print(len(output))

9


In [16]:
# List the keys: the data_list file names (with leading slash) and
# 'world_happiness'.
output.keys()

dict_keys(['/Adolescent birth rate (births per 1,000 women ages 15-19).csv', '/Life expectancy at birth.csv', '/Maternal mortality ratio (deaths per 100,000 live births).csv', '/Mortality rate, infant (per 1,000 live births).csv', '/Mortality rate, under-five (per 1,000 live births).csv', '/Proportion of births attended by skilled health personnel (%).csv', '/Share of seats in parliament (% held by women).csv', '/Women with account at financial institution or with mobile money-service provider (% of female population ages 15 and older).csv', 'world_happiness'])