In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Display configuration:
#   pandas - show every column and render floats with six decimals
#   numpy  - six-decimal precision, no scientific notation, 7 edge items when summarizing
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:.6f}".format)
np.set_printoptions(precision=6, suppress=True, edgeitems=7)

In [3]:
# Load the raw CSV from the InputData folder (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="./InputData/raw_data.csv")

In [4]:
# Display the freshly loaded frame to inspect its columns and a sample of rows
df

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.000000,263.000000,62,0.010000,71.279624,65.000000,1154,19.100000,83,6.000000,8.160000,65.000000,0.100000,584.259210,33736494.000000,17.200000,17.300000,0.479000,10.100000
1,Afghanistan,2014,Developing,59.900000,271.000000,64,0.010000,73.523582,62.000000,492,18.600000,86,58.000000,8.180000,62.000000,0.100000,612.696514,327582.000000,17.500000,17.500000,0.476000,10.000000
2,Afghanistan,2013,Developing,59.900000,268.000000,66,0.010000,73.219243,64.000000,430,18.100000,89,62.000000,8.130000,64.000000,0.100000,631.744976,31731688.000000,17.700000,17.700000,0.470000,9.900000
3,Afghanistan,2012,Developing,59.500000,272.000000,69,0.010000,78.184215,67.000000,2787,17.600000,93,67.000000,8.520000,67.000000,0.100000,669.959000,3696958.000000,17.900000,18.000000,0.463000,9.800000
4,Afghanistan,2011,Developing,59.200000,275.000000,71,0.010000,7.097109,68.000000,3013,17.200000,97,68.000000,7.870000,68.000000,0.100000,63.537231,2978599.000000,18.200000,18.200000,0.454000,9.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.300000,723.000000,27,4.360000,0.000000,68.000000,31,27.100000,42,67.000000,7.130000,65.000000,33.600000,454.366654,12777511.000000,9.400000,9.400000,0.407000,9.200000
2934,Zimbabwe,2003,Developing,44.500000,715.000000,26,4.060000,0.000000,7.000000,998,26.700000,41,7.000000,6.520000,68.000000,36.700000,453.351155,12633897.000000,9.800000,9.900000,0.418000,9.500000
2935,Zimbabwe,2002,Developing,44.800000,73.000000,25,4.430000,0.000000,73.000000,304,26.300000,40,73.000000,6.530000,71.000000,39.800000,57.348340,125525.000000,1.200000,1.300000,0.427000,10.000000
2936,Zimbabwe,2001,Developing,45.300000,686.000000,25,1.720000,0.000000,76.000000,529,25.900000,39,76.000000,6.160000,75.000000,42.100000,548.587312,12366165.000000,1.600000,1.700000,0.427000,9.800000


In [5]:
# Normalize the headers: lower-case, trim surrounding whitespace, then map every
# space to an underscore. Each space becomes exactly one underscore, so a run of
# two spaces in a raw header ends up as "__".
df.columns = (
    df.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_", regex=False)
)

In [6]:
# Rename the columns whose standardized names still contain "-", "/" or a doubled
# underscore. errors="raise" turns a missing or misspelled source name into a
# KeyError instead of a silently skipped rename (the default behavior of rename).
# Note: re-running this cell on an already-renamed frame will therefore raise.
df = df.rename(
    columns={
        "under-five_deaths": "under_five_deaths",
        "hiv/aids": "hiv_aids",
        "thinness__1-19_years": "thinness_1_19",
        "thinness_5-9_years": "thinness_5_9",
    },
    errors="raise",
)

In [7]:
# Remove the mortality / death-count columns considered possibly "confounded"
df = df.drop(columns=["adult_mortality", "infant_deaths", "under_five_deaths"])

In [8]:
# Count the missing entries in each column
df.isna().sum()

country                              0
year                                 0
status                               0
life_expectancy                     10
alcohol                            194
percentage_expenditure               0
hepatitis_b                        553
measles                              0
bmi                                 34
polio                               19
total_expenditure                  226
diphtheria                          19
hiv_aids                             0
gdp                                448
population                         652
thinness_1_19                       34
thinness_5_9                        34
income_composition_of_resources    167
schooling                          163
dtype: int64

In [9]:
# Keep only complete rows: discard any row with at least one missing value
df = df.dropna(axis=0, how="any")

In [10]:
# Check for unusual values: summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns, to eyeball implausible minima and maxima
df.describe()

Unnamed: 0,year,life_expectancy,alcohol,percentage_expenditure,hepatitis_b,measles,bmi,polio,total_expenditure,diphtheria,hiv_aids,gdp,population,thinness_1_19,thinness_5_9,income_composition_of_resources,schooling
count,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0,1649.0
mean,2007.840509,69.302304,4.533196,698.973558,79.217708,2224.494239,38.128623,83.564585,5.955925,84.155246,1.983869,5566.031887,14653625.889485,4.850637,4.907762,0.631551,12.119891
std,4.087711,8.796834,4.029189,1759.229336,25.604664,10085.802019,19.754249,22.450557,2.299385,21.579193,6.03236,11475.900117,70460393.403056,4.599228,4.653757,0.183089,2.795388
min,2000.0,44.0,0.01,0.0,2.0,0.0,2.0,3.0,0.74,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,4.2
25%,2005.0,64.4,0.81,37.438577,74.0,0.0,19.5,81.0,4.41,82.0,0.1,462.14965,191897.0,1.6,1.7,0.509,10.3
50%,2008.0,71.7,3.79,145.102253,89.0,15.0,43.7,93.0,5.84,92.0,0.1,1592.572182,1419631.0,3.0,3.2,0.673,12.3
75%,2011.0,75.0,7.34,509.389994,96.0,373.0,55.8,97.0,7.47,97.0,0.7,4718.51291,7658972.0,7.1,7.1,0.751,14.0
max,2015.0,89.0,17.87,18961.3486,99.0,131441.0,77.1,99.0,14.39,99.0,50.6,119172.7418,1293859294.0,27.2,28.2,0.936,20.7


In [11]:
# Replace the text "status" column with a 0/1 indicator named "developing"
# (1 when status is exactly "Developing", 0 otherwise)
df = (
    df
    .assign(developing=df["status"].eq("Developing").astype(int))
    .drop(columns="status")
)

In [12]:
# Preview the first rows to confirm "status" was replaced by the 0/1 "developing" column
df.head()

Unnamed: 0,country,year,life_expectancy,alcohol,percentage_expenditure,hepatitis_b,measles,bmi,polio,total_expenditure,diphtheria,hiv_aids,gdp,population,thinness_1_19,thinness_5_9,income_composition_of_resources,schooling,developing
0,Afghanistan,2015,65.0,0.01,71.279624,65.0,1154,19.1,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,1
1,Afghanistan,2014,59.9,0.01,73.523582,62.0,492,18.6,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,1
2,Afghanistan,2013,59.9,0.01,73.219243,64.0,430,18.1,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,1
3,Afghanistan,2012,59.5,0.01,78.184215,67.0,2787,17.6,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,1
4,Afghanistan,2011,59.2,0.01,7.097109,68.0,3013,17.2,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,1


- Grouping variables: country, year

In [13]:
# Save modified data (without the index column).
# Create the output folder first: to_csv does not create missing directories and
# would fail on a checkout where ./OutputData is absent.
output_path = Path("./OutputData/training_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)