# Project 1

### Importing Data

In [383]:
import pandas as pd

In [384]:
df = pd.read_excel("2018_Statistical_Annex_Table_1.xlsx",index_col = False)

### Initial Interpretations

In [385]:
#we use the head function to get a quick sense of the structure of the data
df.head()

Unnamed: 0,Table 1. Human Development Index and its components,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,,,,SDG 3,,SDG 4.3,,SDG 4.6,,SDG 8.5,,,,
1,,,Human Development Index (HDI),,Life expectancy at birth,,Expected years of schooling,,Mean years of schooling,,Gross national income (GNI) per capita,,GNI per capita rank minus HDI rank,,HDI rank
2,HDI rank,Country,Value,,(years),,(years),,(years),,(2011 PPP $),,,,
3,,,2017,,2017,,2017,a,2017,a,2017,,2017,,2016
4,,VERY HIGH HUMAN DEVELOPMENT,,,,,,,,,,,,,


## Cleanup

We see that the data is tabulated with some custom nested headers, the field names are spread across rows 3 and 4. In order to make the data more compatible the pandas dataframe structure we will remove the extraneous headings and have the data entries properly aligned. 

In [386]:
data = df[5:]

In [387]:
data.head()

Unnamed: 0,Table 1. Human Development Index and its components,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
5,1,Norway,0.952522,,82.328,,17.8521,,12.5668,,68012.5,,5,,1
6,2,Switzerland,0.943998,,83.473,,16.2088,,13.408,,57625.1,,8,,2
7,3,Australia,0.938631,,83.068,,22.9213,b,12.855,,43560.1,,18,,3
8,4,Ireland,0.93841,,81.643,,19.6137,b,12.5263,c,53754.2,,8,,4
9,5,Germany,0.936043,,81.178,,16.956,,14.082,,46135.8,,13,,4


In [388]:
headers = df.iloc[1].values.tolist()

In [389]:
headers

[nan,
 nan,
 'Human Development Index (HDI) ',
 nan,
 'Life expectancy at birth',
 nan,
 'Expected years of schooling ',
 nan,
 'Mean years of schooling',
 nan,
 'Gross national income (GNI) per capita',
 nan,
 'GNI per capita rank minus HDI rank',
 nan,
 'HDI rank']

In [390]:
headers[0] = "HDI Rank"
headers[1] = "Country"

In [391]:
data.columns = headers

In [392]:
data.head()

Unnamed: 0,HDI Rank,Country,Human Development Index (HDI),nan,Life expectancy at birth,nan.1,Expected years of schooling,nan.2,Mean years of schooling,nan.3,Gross national income (GNI) per capita,nan.4,GNI per capita rank minus HDI rank,nan.5,HDI rank
5,1,Norway,0.952522,,82.328,,17.8521,,12.5668,,68012.5,,5,,1
6,2,Switzerland,0.943998,,83.473,,16.2088,,13.408,,57625.1,,8,,2
7,3,Australia,0.938631,,83.068,,22.9213,b,12.855,,43560.1,,18,,3
8,4,Ireland,0.93841,,81.643,,19.6137,b,12.5263,c,53754.2,,8,,4
9,5,Germany,0.936043,,81.178,,16.956,,14.082,,46135.8,,13,,4


In [393]:
data = data.loc[:, data.columns.notnull()]
data.head()

Unnamed: 0,HDI Rank,Country,Human Development Index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita,GNI per capita rank minus HDI rank,HDI rank
5,1,Norway,0.952522,82.328,17.8521,12.5668,68012.5,5,1
6,2,Switzerland,0.943998,83.473,16.2088,13.408,57625.1,8,2
7,3,Australia,0.938631,83.068,22.9213,12.855,43560.1,18,3
8,4,Ireland,0.93841,81.643,19.6137,12.5263,53754.2,8,4
9,5,Germany,0.936043,81.178,16.956,14.082,46135.8,13,4


Our dataset looks clean! However we can still count the null values.

In [394]:
data.isnull().sum().sum()

424

In [395]:
data.tail(69)

Unnamed: 0,HDI Rank,Country,Human Development Index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita,GNI per capita rank minus HDI rank,HDI rank
196,189,Niger,0.353931,60.422,5.36851,1.95133,905.868,-2,188
197,,OTHER COUNTRIES OR TERRITORIES,,,,,,,
198,..,Korea (Democratic People's Rep. of),..,71.887,12.0002,..,..,..,..
199,..,Monaco,..,..,..,..,..,..,..
200,..,Nauru,..,..,10.3143,..,18573,..,..
201,..,San Marino,..,..,15.1112,..,..,..,..
202,..,Somalia,..,56.714,..,..,..,..,..
203,..,Tuvalu,..,..,..,..,5887.72,..,..
204,,,,,,,,,
205,,Human development groups,,,,,,,


It appears our data is still full of null values. The end of our dataset is the issue. We can remove these problematic rows by keeping only rows with numeric values for 'HDI rank'. 

In [396]:
data = data[pd.to_numeric(data['HDI rank'], errors='coerce').notnull()]
data.isnull().sum().sum() # count the null values again

0

In [397]:
data.head(69)

Unnamed: 0,HDI Rank,Country,Human Development Index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita,GNI per capita rank minus HDI rank,HDI rank
5,1,Norway,0.952522,82.328,17.8521,12.5668,68012.5,5,1
6,2,Switzerland,0.943998,83.473,16.2088,13.408,57625.1,8,2
7,3,Australia,0.938631,83.068,22.9213,12.855,43560.1,18,3
8,4,Ireland,0.93841,81.643,19.6137,12.5263,53754.2,8,4
9,5,Germany,0.936043,81.178,16.956,14.082,46135.8,13,4
10,6,Iceland,0.934879,82.912,19.3479,12.3638,45810.2,13,6
11,7,"Hong Kong, China (SAR)",0.932583,84.097,16.3257,12.0381,58419.7,2,8
12,7,Sweden,0.932805,82.625,17.6346,12.4261,47765.7,9,7
13,9,Singapore,0.932042,83.218,16.2,11.4747,82503.1,-6,8
14,10,Netherlands,0.930639,82.005,18.0448,12.19,47899.8,5,10


In [398]:
data.to_csv("out.csv")

## Expanding

Now we can add some data from another table to our dataset. We can start by cleaning this data. We'll follow many of the same steps as before. 

In [399]:
df = pd.read_excel("2018_Statistical_Annex_Table_5.xlsx",index_col = False)
df.head(20)

Unnamed: 0,Table 5. Gender Inequality Index,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,,,,,,,,,,,,,,,,,,,
1,,,,,,,SDG3.1,,SDG3.7,,SDG5.5,,SDG4.6,,,,,,
2,,,Gender Inequality Index,,,,Maternal mortality ratio,,Adolescent birth rate,,Share of seats in parliament,,Population with at least some secondary education,,,,Labour force participation rate,,
3,,,Value,,Rank,,"(deaths per 100,000 live births)",,"(births per 1,000 women ages 15–19)",,(% held by women),,(% ages 25 and older),,,,(% ages 15 and older),,
4,HDI rank,Country,,,,,,,,,,,Female,,Male,,Female,,Male
5,,,2017,,2017,,2015,,2015-2020,b,2017,,2010–2017,c,2010–2017,c,2017,,2017
6,,VERY HIGH HUMAN DEVELOPMENT,,,,,,,,,,,,,,,,,
7,1,Norway,0.0481388,,5,,5,,5.592,,41.4201,,96.2987,,95.113,,60.8,,67.6
8,2,Switzerland,0.0393197,,1,,5,,2.989,,29.2683,,96.3714,,97.1626,,62.9,,74.1
9,3,Australia,0.109091,,23,,6,,12.902,,32.7434,,90.0339,,89.9195,,59.2,,70.5


In [400]:
data2 = df[7:]
data2.head()

Unnamed: 0,Table 5. Gender Inequality Index,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
7,1,Norway,0.0481388,,5,,5,,5.592,,41.4201,,96.2987,,95.113,,60.8,,67.6
8,2,Switzerland,0.0393197,,1,,5,,2.989,,29.2683,,96.3714,,97.1626,,62.9,,74.1
9,3,Australia,0.109091,,23,,6,,12.902,,32.7434,,90.0339,,89.9195,,59.2,,70.5
10,4,Ireland,0.109382,,23,,8,,9.662,,24.3119,,90.238,,86.308,,53.0,,67.3
11,5,Germany,0.0720575,,14,,6,,6.535,,31.491,,96.2394,,96.831,,55.0,,66.2


In [401]:
headers = df.iloc[2].values.tolist()
headers

[nan,
 nan,
 'Gender Inequality Index',
 nan,
 nan,
 nan,
 'Maternal mortality ratio',
 nan,
 'Adolescent birth rate',
 nan,
 'Share of seats in parliament',
 nan,
 'Population with at least some secondary education',
 nan,
 nan,
 nan,
 'Labour force participation rate ',
 nan,
 nan]

In [402]:
headers[0] = 'HDI rank'
headers[1] = 'Country'

In [403]:
data2.columns = headers
data2.head()

Unnamed: 0,HDI rank,Country,Gender Inequality Index,nan,nan.1,nan.2,Maternal mortality ratio,nan.3,Adolescent birth rate,nan.4,Share of seats in parliament,nan.5,Population with at least some secondary education,nan.6,nan.7,nan.8,Labour force participation rate,nan.9,nan.10
7,1,Norway,0.0481388,,5,,5,,5.592,,41.4201,,96.2987,,95.113,,60.8,,67.6
8,2,Switzerland,0.0393197,,1,,5,,2.989,,29.2683,,96.3714,,97.1626,,62.9,,74.1
9,3,Australia,0.109091,,23,,6,,12.902,,32.7434,,90.0339,,89.9195,,59.2,,70.5
10,4,Ireland,0.109382,,23,,8,,9.662,,24.3119,,90.238,,86.308,,53.0,,67.3
11,5,Germany,0.0720575,,14,,6,,6.535,,31.491,,96.2394,,96.831,,55.0,,66.2


Now we decide which columns we want to use. 

In [407]:
useful_columns = [headers[0], headers[1], headers[2], headers[6], headers[8], headers[10]]
useful_columns

['HDI rank',
 'Country',
 'Gender Inequality Index',
 'Maternal mortality ratio',
 'Adolescent birth rate',
 'Share of seats in parliament']

We drop the other columns and some rows off the end of our table. 

In [405]:
data2 = data2.loc[:, useful_columns]
data2 = data2[:-59]
data2.tail()

Unnamed: 0,HDI rank,Country,Gender Inequality Index,Maternal mortality ratio,Adolescent birth rate,Share of seats in parliament
194,185,Burundi,0.470664,712,26.782,37.8049
195,186,Chad,0.708285,856,161.09,12.766
196,187,South Sudan,..,789,62.04,26.5589
197,188,Central African Republic,0.673356,882,103.802,8.57143
198,189,Niger,0.648882,553,191.984,16.9591



Finally we can merge our tables on HDI rank and Country


In [406]:
data = pd.merge(data, data2, on=['HDI rank','Country'])
data.head()

Unnamed: 0,HDI Rank,Country,Human Development Index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita,GNI per capita rank minus HDI rank,HDI rank,Gender Inequality Index,Maternal mortality ratio,Adolescent birth rate,Share of seats in parliament
0,1,Norway,0.952522,82.328,17.8521,12.5668,68012.5,5,1,0.0481388,5,5.592,41.4201
1,2,Switzerland,0.943998,83.473,16.2088,13.408,57625.1,8,2,0.0393197,5,2.989,29.2683
2,3,Australia,0.938631,83.068,22.9213,12.855,43560.1,18,3,0.109091,6,12.902,32.7434
3,4,Ireland,0.93841,81.643,19.6137,12.5263,53754.2,8,4,0.109382,8,9.662,24.3119
4,6,Iceland,0.934879,82.912,19.3479,12.3638,45810.2,13,6,0.0618811,3,6.805,38.0952
5,7,Sweden,0.932805,82.625,17.6346,12.4261,47765.7,9,7,0.0435777,4,5.239,43.553
6,10,Netherlands,0.930639,82.005,18.0448,12.19,47899.8,5,10,0.0436475,7,3.993,35.5556
7,12,Canada,0.925952,82.541,16.4374,13.2819,43433.2,10,12,0.0921445,7,9.402,30.1149
8,14,United Kingdom,0.921549,81.717,17.4429,12.8887,39116.3,13,14,0.116226,9,12.498,28.5223
9,15,Finland,0.919653,81.496,17.6397,12.4396,41002.3,10,15,0.057871,3,6.769,42.0
