# Read the data (Three different format)

In [1]:
import pandas as pd
import numpy as np
import csv

### 1. Data with same foramt

In [2]:
data_health = '../data/health'
data_list = ['/Adolescent birth rate (births per 1,000 women ages 15-19).csv',
            '/Life expectancy at birth.csv',
            '/Maternal mortality ratio (deaths per 100,000 live births).csv',
            '/Mortality rate, infant (per 1,000 live births).csv',
            '/Mortality rate, under-five (per 1,000 live births).csv',
            '/Proportion of births attended by skilled health personnel (%).csv',
            '/Share of seats in parliament (% held by women).csv',
            '/Women with account at financial institution or with mobile money-service provider (% of female population ages 15 and older).csv'
            ]
# data_list = ['/Adolescent birth rate (births per 1,000 women ages 15-19).csv']
# country count in each list
country = [20, 19, 19, 18, 18, 18, 24, 20]
output = dict()

In [3]:
for c, d in zip(country, data_list):
    df = pd.read_csv(data_health + d,skiprows=1)

    df.drop(columns=['HDI Rank (2018)'], inplace = True)
    df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

    # Keep only the country data
    df.drop(df.tail(c).index,inplace=True)
    df.replace('..', np.nan,inplace=True)
    
    df.set_index("Country", inplace = True)
    df.astype('float64').dtypes
    df = df.apply(pd.to_numeric, errors='ignore')

    min_year = int(min(df.columns))
    max_year = int(max(df.columns))

    df = df.reindex(columns=map(str, range(min_year, max_year)))
    df = df.interpolate(method='linear', limit_direction='both', axis=1)
    
    output[d] = df
    df.to_csv(data_health +'/cleaned_data/cleaned_'+ d[1:])

### 2. Data with other format

In [4]:
df = pd.read_csv(data_health + '/seats_held_by_women_in_parliaments.csv')

cols = [0, 1] + list(range(3,41))
df.drop(df.columns[cols],axis=1,inplace=True)
df.drop(df.tail(5).index,inplace=True)
df.set_index("Country Name", inplace = True)
df.rename(columns=lambda s: s[:4], inplace = True)
df.replace('..', np.nan,inplace=True)
df.astype('float64').dtypes
df = df.apply(pd.to_numeric, errors='ignore')

# Merge 'seats_held_by_women_in_parliaments.csv' and 'Share of seats in parliament (% held by women).csv'
merge = pd.concat([output['/Share of seats in parliament (% held by women).csv'], df], sort=False)
output['/Share of seats in parliament (% held by women).csv'] = merge.groupby(merge.index).mean()
output['/Share of seats in parliament (% held by women).csv'].to_csv(data_health +'/cleaned_data/cleaned_Share of seats in parliament (% held by women).csv')



### 3. Happiness data

In [5]:
year = list(range(2015,2020))
country_str = ['Country', 'Country', 'Country', 'Country or region', 'Country or region']
score_str = ['Happiness Score', 'Happiness Score','Happiness.Score','Score','Score']

for y, c, s in zip(year, country_str, score_str):
    df = pd.read_csv(data_health + '/World_Happiness_Report'+str(y)+'.csv')
    if y == 2015:
        tmp = pd.DataFrame(data = df[s])
        tmp.set_index(df[c], inplace=True)
        tmp = tmp.rename(index={'Somaliland region': 'Somaliland Region'})
    else:
        tmp2 = pd.DataFrame(data = df[s])
        tmp2.set_index(df[c], inplace=True)
        tmp2 = tmp2.rename(index={'Hong Kong S.A.R., China': 'Hong Kong','Taiwan Province of China':'Taiwan','Trinidad & Tobago':'Trinidad and Tobago'})
        tmp = pd.concat([tmp, tmp2], axis = 1, sort=True)

tmp.columns = year
output['world_happiness'] = tmp
tmp.to_csv(data_health + '/cleaned_data/cleaned_world_happiness.csv')

# Show the data with dictionary

In [6]:
output['/Adolescent birth rate (births per 1,000 women ages 15-19).csv']

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,161.2,161.74,162.28,162.82,163.36,163.9,164.16,164.42,164.68,164.94,...,138.42,135.76,133.1,126.6,120.2,113.7,107.3,100.8,90.2,79.6
Albania,16.3,16.78,17.26,17.74,18.22,18.7,18.82,18.94,19.06,19.18,...,18.62,18.56,18.5,18.9,19.4,19.8,20.3,20.7,20.4,20.0
Algeria,33.1,31.34,29.58,27.82,26.06,24.3,22.42,20.54,18.66,16.78,...,9.70,9.70,9.7,10.1,10.4,10.8,11.2,11.6,11.1,10.6
Angola,222.0,220.20,218.40,216.60,214.80,213.0,210.90,208.80,206.70,204.60,...,185.84,183.52,181.2,178.5,175.7,172.9,170.2,167.4,161.8,156.2
Antigua and Barbuda,83.3,81.00,78.70,76.40,74.10,71.8,70.52,69.24,67.96,66.68,...,54.60,52.90,51.2,50.3,49.5,48.7,47.8,47.0,45.6,44.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dominica,,,,,,,,,,,...,,,,,,,,,,
Saint Kitts and Nevis,,,,,,,,,,,...,,,,,,,,,,
Liechtenstein,,,,,,,,,,,...,,,,,,,,,,
Marshall Islands,,,,,,,,,,,...,,,,,,,,,,


In [7]:
output['/Life expectancy at birth.csv']

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,50.3,51.0,51.6,52.3,52.8,53.4,53.9,54.4,54.9,55.4,...,59.9,60.5,61.0,61.6,62.1,62.5,63.0,63.4,63.8,64.1
Albania,71.8,71.8,71.8,71.9,72.0,72.2,72.5,72.8,73.2,73.6,...,75.9,76.2,76.6,76.9,77.3,77.6,77.8,78.0,78.2,78.3
Algeria,66.9,67.3,67.6,67.9,68.2,68.5,68.9,69.3,69.7,70.2,...,74.3,74.6,74.9,75.2,75.4,75.7,75.9,76.1,76.3,76.5
Andorra,76.5,76.7,76.9,77.0,77.2,77.4,77.6,77.9,78.2,78.5,...,80.6,80.7,80.8,80.9,81.1,81.2,81.3,81.4,81.5,81.7
Angola,45.3,45.3,45.2,45.2,45.2,45.2,45.4,45.5,45.8,46.1,...,53.2,54.3,55.4,56.3,57.2,58.1,58.8,59.4,59.9,60.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela (Bolivarian Republic of),70.7,70.8,71.0,71.1,71.2,71.3,71.5,71.6,71.8,71.9,...,73.1,73.1,73.1,73.1,73.0,72.9,72.8,72.6,72.4,72.2
Viet Nam,70.6,70.8,71.1,71.4,71.7,71.9,72.2,72.4,72.6,72.8,...,74.6,74.7,74.8,74.9,75.0,75.0,75.1,75.1,75.2,75.2
Yemen,57.3,57.7,58.0,58.3,58.6,58.8,59.1,59.4,59.8,60.2,...,64.9,65.3,65.5,65.8,65.9,66.0,66.1,66.1,66.1,66.1
Zambia,49.2,48.1,47.0,45.9,45.0,44.2,43.7,43.5,43.4,43.6,...,52.6,54.1,55.7,57.1,58.5,59.7,60.8,61.7,62.5,63.0


In [8]:
output['/Maternal mortality ratio (deaths per 100,000 live births).csv']

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1340.0,1326.0,1312.0,1298.0,1284.0,1270.0,1236.0,1202.0,1168.0,1134.0,...,821.0,773.6,726.2,678.8,631.4,584.0,536.0,496.0,459.0,425.0
Albania,71.0,67.4,63.8,60.2,56.6,53.0,51.0,49.0,47.0,45.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,29.0,29.0
Algeria,216.0,211.2,206.4,201.6,196.8,192.0,187.6,183.2,178.8,174.4,...,148.0,147.8,147.6,147.4,147.2,147.0,147.0,145.0,144.0,141.0
Angola,1160.0,1158.0,1156.0,1154.0,1152.0,1150.0,1104.8,1059.6,1014.4,969.2,...,705.0,676.2,647.4,618.6,589.8,561.0,546.0,526.0,509.0,493.0
Argentina,72.0,70.2,68.4,66.6,64.8,63.0,62.4,61.8,61.2,60.6,...,58.0,58.0,58.0,58.0,58.0,58.0,56.0,55.0,54.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Saint Kitts and Nevis,,,,,,,,,,,...,,,,,,,,,,
Liechtenstein,,,,,,,,,,,...,,,,,,,,,,
Marshall Islands,,,,,,,,,,,...,,,,,,,,,,
Palau,,,,,,,,,,,...,,,,,,,,,,


In [9]:
output['/Mortality rate, infant (per 1,000 live births).csv']

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,118.8,115.48,112.16,108.84,105.52,102.2,99.80,97.40,95.00,92.60,...,73.28,70.72,68.16,65.6,63.2,60.9,58.7,56.7,54.9,53.2
Albania,34.8,33.54,32.28,31.02,29.76,28.5,27.16,25.82,24.48,23.14,...,13.52,12.58,11.64,10.7,10.1,9.5,9.0,8.6,8.3,8.1
Algeria,41.8,40.88,39.96,39.04,38.12,37.2,36.54,35.88,35.22,34.56,...,26.68,25.62,24.56,23.5,22.9,22.4,22.1,21.8,21.4,21.0
Andorra,7.5,7.16,6.82,6.48,6.14,5.8,5.58,5.36,5.14,4.92,...,3.98,3.92,3.86,3.8,3.7,3.7,3.6,3.5,3.4,3.3
Angola,132.0,131.92,131.84,131.76,131.68,131.6,129.80,128.00,126.20,124.40,...,91.64,86.56,81.48,76.4,71.9,67.8,64.1,60.9,58.2,55.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yemen,88.5,86.70,84.90,83.10,81.30,79.5,77.38,75.26,73.14,71.02,...,50.32,48.18,46.04,43.9,43.3,43.2,43.2,43.2,43.2,43.2
Zambia,110.5,109.82,109.14,108.46,107.78,107.1,105.00,102.90,100.80,98.70,...,61.44,58.76,56.08,53.4,51.5,49.6,48.3,46.3,44.2,43.0
Zimbabwe,50.8,52.30,53.80,55.30,56.80,58.3,58.32,58.34,58.36,58.38,...,57.44,56.66,55.88,55.1,52.2,48.4,45.4,42.3,40.2,38.2
"Hong Kong, China (SAR)",,,,,,,,,,,...,,,,,,,,,,


In [10]:
output['/Mortality rate, under-five (per 1,000 live births).csv']

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,175.1,169.64,164.18,158.72,153.26,147.8,144.00,140.20,136.40,132.60,...,102.06,98.04,94.02,90.0,86.2,82.6,79.2,76.0,73.1,70.4
Albania,40.1,38.56,37.02,35.48,33.94,32.4,30.82,29.24,27.66,26.08,...,15.12,14.08,13.04,12.0,11.3,10.7,10.1,9.7,9.4,9.1
Algeria,49.6,48.42,47.24,46.06,44.88,43.7,42.90,42.10,41.30,40.50,...,31.06,29.84,28.62,27.4,26.7,26.1,25.7,25.3,24.9,24.5
Andorra,8.5,8.06,7.62,7.18,6.74,6.3,6.04,5.78,5.52,5.26,...,4.14,4.06,3.98,3.9,3.9,3.8,3.7,3.6,3.5,3.4
Angola,223.5,223.34,223.18,223.02,222.86,222.7,219.42,216.14,212.86,209.58,...,149.20,139.90,130.60,121.3,113.2,105.8,99.2,93.6,88.9,84.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yemen,126.0,123.18,120.36,117.54,114.72,111.9,108.56,105.22,101.88,98.54,...,66.18,62.92,59.66,56.4,55.5,55.4,55.4,55.4,55.4,55.4
Zambia,185.0,184.36,183.72,183.08,182.44,181.8,178.46,175.12,171.78,168.44,...,99.92,94.08,88.24,82.4,78.6,74.9,71.6,68.0,64.9,62.4
Zimbabwe,77.5,81.60,85.70,89.80,93.90,98.0,98.76,99.52,100.28,101.04,...,94.76,92.34,89.92,87.5,81.2,73.8,66.8,61.1,57.0,53.3
"Hong Kong, China (SAR)",,,,,,,,,,,...,,,,,,,,,,


In [11]:
output['/Proportion of births attended by skilled health personnel (%).csv']

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Afghanistan,34.3,34.30,34.30,34.30,34.30,34.3,34.3,34.3,34.3,34.3,34.3,38.6,39.9,42.55,45.2,50.5,51.95,53.4
Albania,99.1,99.24,99.38,99.52,99.66,99.8,99.8,99.8,99.8,99.8,99.8,99.8,99.8,99.80,99.8,99.8,99.80,99.8
Algeria,96.6,96.60,96.60,96.60,96.60,96.6,96.6,96.6,96.6,96.6,96.6,96.6,96.6,96.60,96.6,96.6,96.60,96.6
Andorra,100.0,100.00,100.00,100.00,100.00,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.00,100.0,100.0,100.00,100.0
Angola,46.6,46.60,46.60,46.60,46.60,46.6,46.6,46.6,46.6,46.6,46.6,46.6,46.6,46.60,46.6,46.6,46.60,46.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Luxembourg,,,,,,,,,,,,,,,,,,
Netherlands,,,,,,,,,,,,,,,,,,
Papua New Guinea,,,,,,,,,,,,,,,,,,
Sweden,,,,,,,,,,,,,,,,,,


In [12]:
output['/Share of seats in parliament (% held by women).csv']

Unnamed: 0,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
Afghanistan,25.9,25.9,25.900000,25.900000,25.900000,25.900000,25.900000,25.900000,25.900000,25.900000,...,27.655422,27.655422,27.655422,27.655422,27.555422,27.555422,27.555422,,27.868852,27.016129
Albania,5.2,5.2,5.200000,5.200000,5.180645,5.180645,5.647143,5.837143,6.027143,6.574286,...,15.707143,15.707143,17.878571,20.000000,20.707143,22.878571,27.878571,27.857143,29.508197,29.508197
Algeria,3.2,3.2,3.178947,3.312281,3.445614,3.710526,3.840526,5.344833,5.474833,5.604833,...,7.584576,28.700866,28.700866,28.650866,28.650866,28.700866,23.528788,25.757576,25.757576,25.757576
American Samoa,,,,,,,,,,,...,,,,,,,,,,
Andorra,7.1,7.1,7.121429,7.121429,7.121429,7.121429,12.842857,14.992857,17.142857,19.292857,...,50.000000,50.000000,50.000000,50.000000,39.292857,32.121429,32.121429,32.142857,46.428571,46.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
World,,,11.692171,13.070822,13.520105,13.904440,14.090701,15.197446,15.179207,15.897001,...,19.827767,20.755204,21.777447,22.188297,22.849299,23.091367,23.590337,24.097878,24.636604,25.173055
Yemen,0.7,0.7,0.700000,0.700000,0.700000,0.700000,0.700000,0.700000,0.700000,0.700000,...,0.700000,0.700000,0.700000,0.700000,0.500000,0.500000,0.500000,,,
"Yemen, Rep.",,,,0.664452,0.664452,0.664452,0.668896,0.668896,0.332226,0.332226,...,0.332226,0.332226,0.332226,0.332226,0.000000,0.000000,0.000000,0.000000,0.332226,0.332226
Zambia,9.7,9.7,9.688710,10.044872,10.046624,10.113291,10.373291,11.582658,11.842658,12.102658,...,11.482484,11.482484,11.482484,10.779747,12.679114,17.982036,17.982036,17.964072,17.964072,16.766467


In [13]:
output['/Women with account at financial institution or with mobile money-service provider (% of female population ages 15 and older).csv']

Unnamed: 0_level_0,2011,2012,2013,2014,2015,2016
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,2.6,3.000000,3.400000,3.8,3.8,3.8
Albania,22.7,26.333333,29.966667,33.6,33.6,33.6
Algeria,20.4,26.966667,33.533333,40.1,40.1,40.1
Angola,38.9,33.366667,27.833333,22.3,22.3,22.3
Argentina,31.8,38.166667,44.533333,50.9,50.9,50.9
...,...,...,...,...,...,...
Timor-Leste,,,,,,
Tonga,,,,,,
Saint Vincent and the Grenadines,,,,,,
Vanuatu,,,,,,


In [14]:
output['world_happiness']

Unnamed: 0,2015,2016,2017,2018,2019
Afghanistan,3.575,3.360,3.794,3.632,3.203
Albania,4.959,4.655,4.644,4.586,4.719
Algeria,5.605,6.355,5.872,5.295,5.211
Angola,4.033,3.866,3.795,3.795,
Argentina,6.574,6.650,6.599,6.388,6.086
...,...,...,...,...,...
Venezuela,6.810,6.084,5.250,4.806,4.707
Vietnam,5.360,5.061,5.074,5.103,5.175
Yemen,4.077,3.724,3.593,3.355,3.380
Zambia,5.129,4.795,4.514,4.377,4.107


# Attribute of output dictionary

In [15]:
print(len(output))

9


In [16]:
output.keys()

dict_keys(['/Adolescent birth rate (births per 1,000 women ages 15-19).csv', '/Life expectancy at birth.csv', '/Maternal mortality ratio (deaths per 100,000 live births).csv', '/Mortality rate, infant (per 1,000 live births).csv', '/Mortality rate, under-five (per 1,000 live births).csv', '/Proportion of births attended by skilled health personnel (%).csv', '/Share of seats in parliament (% held by women).csv', '/Women with account at financial institution or with mobile money-service provider (% of female population ages 15 and older).csv', 'world_happiness'])