In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
file_path = "data/Life Expectancy Data.csv"
df_life_exp = pd.read_csv(file_path)

In [3]:
df_life_exp.info()
df_life_exp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [4]:
# Something is wrong with the column names
# Let's white space from column names
df_life_exp_renamed = df_life_exp.copy()
new_col_names = {}
for col_name in df_life_exp.columns:
    new_col_names[col_name] = col_name.strip().lower()
    print("'"+col_name+"'", '->', "'"+new_col_names[col_name]+"'")
_ = df_life_exp_renamed.rename(columns=new_col_names)

'Country' -> 'country'
'Year' -> 'year'
'Status' -> 'status'
'Life expectancy ' -> 'life expectancy'
'Adult Mortality' -> 'adult mortality'
'infant deaths' -> 'infant deaths'
'Alcohol' -> 'alcohol'
'percentage expenditure' -> 'percentage expenditure'
'Hepatitis B' -> 'hepatitis b'
'Measles ' -> 'measles'
' BMI ' -> 'bmi'
'under-five deaths ' -> 'under-five deaths'
'Polio' -> 'polio'
'Total expenditure' -> 'total expenditure'
'Diphtheria ' -> 'diphtheria'
' HIV/AIDS' -> 'hiv/aids'
'GDP' -> 'gdp'
'Population' -> 'population'
' thinness  1-19 years' -> 'thinness  1-19 years'
' thinness 5-9 years' -> 'thinness 5-9 years'
'Income composition of resources' -> 'income composition of resources'
'Schooling' -> 'schooling'


In [5]:
percent_missing = df_life_exp_renamed.isna().sum() * 100 / len(df_life_exp_renamed)
df_missing = pd.DataFrame({'column_name':df_life_exp_renamed.columns, 'percent_missing':percent_missing, 'dtype':df_life_exp_renamed.dtypes})
df_missing[df_missing.percent_missing > 0]

Unnamed: 0,column_name,percent_missing,dtype
Life expectancy,Life expectancy,0.340368,float64
Adult Mortality,Adult Mortality,0.340368,float64
Alcohol,Alcohol,6.603131,float64
Hepatitis B,Hepatitis B,18.822328,float64
BMI,BMI,1.15725,float64
Polio,Polio,0.646698,float64
Total expenditure,Total expenditure,7.692308,float64
Diphtheria,Diphtheria,0.646698,float64
GDP,GDP,15.248468,float64
Population,Population,22.191967,float64


In [6]:
# We need to impute the missing data
from sklearn.impute import SimpleImputer
df_life_exp_imputed = df_life_exp_renamed.copy()
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
missing_column_mask = (df_missing.percent_missing > 0).values
missing_column_names = df_missing[df_missing.percent_missing > 0].column_name
df_life_exp_imputed.iloc[:,missing_column_mask] = mean_imputer.fit_transform(df_life_exp_renamed[missing_column_names])