In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer

In [None]:
# Relative location of the raw life-expectancy CSV.
file_path = "data/Life Expectancy Data.csv"
# Read the raw table once; later cells build renamed/imputed copies from it.
df_life_exp = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# info() prints its per-column summary directly to the cell output ...
df_life_exp.info()
# ... and the first five rows are shown as the cell's displayed value.
df_life_exp.head(n=5)

In [None]:
# Something is wrong with the column names.
# Let's strip the surrounding white space from each name and lower-case it.
new_col_names = {
    original: original.strip().lower() for original in df_life_exp.columns
}
# Print every old -> new pair in quotes so the stray white space is visible.
for col_name, cleaned in new_col_names.items():
    print(f"'{col_name}'", '->', f"'{cleaned}'")
df_life_exp_renamed = df_life_exp.rename(columns=new_col_names)

In [None]:
# Percentage of rows in which each column is missing.
n_rows = len(df_life_exp_renamed)
percent_missing = df_life_exp_renamed.isna().sum().mul(100).div(n_rows)
# One summary row per column: its name, its missing percentage and its dtype.
df_missing = pd.DataFrame(
    {
        'column_name': df_life_exp_renamed.columns,
        'percent_missing': percent_missing,
        'dtype': df_life_exp_renamed.dtypes,
    }
)
# Display only the columns that actually have gaps.
has_missing = df_missing['percent_missing'] > 0
df_missing.loc[has_missing]

In [None]:
# We need to impute the missing data: every gap is replaced by the mean of
# its column.
# Work on a copy so the renamed frame stays untouched and this cell can be
# re-run safely.
df_life_exp_imputed = df_life_exp_renamed.copy()
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Columns with at least one missing value, as a boolean mask over the rows of
# df_missing (one row per column) and as the matching column labels.
missing_column_mask = (df_missing.percent_missing > 0).values
missing_column_names = df_missing.loc[missing_column_mask, 'column_name']
# Select by label on both sides of the assignment so the imputed values are
# written back to exactly the columns they were computed from.
# NOTE(review): the 'mean' strategy presumably requires these columns to be
# numeric - confirm against the dtype column of df_missing.
df_life_exp_imputed[missing_column_names] = mean_imputer.fit_transform(
    df_life_exp_renamed[missing_column_names]
)
# Sanity check: the imputed columns must not contain gaps any more.
assert not df_life_exp_imputed[missing_column_names].isna().any().any()

In [None]:
# Let's take a look at the correlation matrix
def plot_correlation_matrix(df, title=None):
    """Draw a heatmap of the pairwise column correlations of ``df``, in percent.

    Only the lower triangle is shown; the upper triangle and the diagonal are
    masked out.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to analyse. Only numeric and boolean columns enter the
        correlation; all other columns are ignored.
    title : str, optional
        Title for the plot. No title is set when it is None or empty.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 16x16 inch figure.
    """
    # Compute the correlation matrix on numeric/boolean columns only. Since
    # pandas 2.0 DataFrame.corr() defaults to numeric_only=False and raises on
    # text columns instead of silently dropping them.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr = numeric_df.corr() * 100
    # Generate mask for the upper triangle (np.triu keeps the diagonal too)
    mask = np.triu(np.ones_like(corr, dtype=bool))
    # Set up figure
    _fig, ax = plt.subplots(figsize=(16, 16))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(250, 30, l=65, center="dark", as_cmap=True)
    # Draw the masked heatmap on the axes created above. A fixed-point format
    # is used for the annotations because the default '.2g' renders
    # percentages close to 100 as '1e+02'.
    sns.heatmap(
        corr,
        mask=mask,
        center=0.0,
        cmap=cmap,
        linewidths=.5,
        cbar_kws={'shrink': .5},
        annot=True,
        fmt='.1f',
        ax=ax,
    )
    if title:
        ax.set_title(title)


# Plot the correlations of the imputed table, labelled as percentages.
plot_correlation_matrix(df_life_exp_imputed, title='Feature correlation %')

## The correlation matrix confirms some intuitive relationships between features:
- Adult mortality and life expectancy are negatively correlated.
- Infant deaths and under-five deaths are almost perfectly correlated (the correlation is about 100%), so we may even drop one of the two columns.
- Thinness at 5-9 years and thinness at 1-19 years are strongly correlated.
----
## There are also some new insights, based on positive correlations:
- Percentage of expenditure and GDP are positively correlated.
- Schooling and income composition of resources are positively correlated.
- Schooling and life expectancy are positively correlated.
- Most diseases are only weakly correlated with mortality rates; HIV/AIDS is the exception.