In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [None]:
# Relative path of the raw life-expectancy CSV
file_path = "data/Life Expectancy Data.csv"
# Load the file as-is; the cells below derive their own frames from this one
df_life_exp = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# A notebook cell renders only its final expression, so any intermediate result
# has to be handed to `display` explicitly or it is computed and thrown away.
# `display` is the helper IPython/Jupyter makes available in every notebook.
df_life_exp.info()
display(df_life_exp.head())
df_life_exp.describe()

### The column names have inconsistent casing and stray whitespace. Let's fix that.

In [None]:
# Map every header to its cleaned form: ends trimmed, lower-cased,
# and runs of spaces squeezed down to a single space
new_col_names = {
    col_name: re.sub(' +', ' ', col_name.strip().lower())
    for col_name in df_life_exp.columns
}
# Show each original header next to its replacement
for col_name, cleaned_name in new_col_names.items():
    print(f"'{col_name}'", '->', f"'{cleaned_name}'")
# `rename` returns a new data frame, so df_life_exp keeps the headers as loaded
df_life_exp_renamed = df_life_exp.rename(columns=new_col_names)

In [None]:
# infant deaths and under-five deaths look similar, so measure the gap between them
df_diff = df_life_exp_renamed['under-five deaths'].sub(df_life_exp_renamed['infant deaths'])
# Keep only the rows where the two columns disagree
df_diff = df_diff.loc[df_diff.ne(0)]
# Largest gap, smallest gap, total absolute gap, number of differing rows
df_diff.max(), df_diff.min(), df_diff.abs().sum(), len(df_diff)

In [None]:
# Let's drop the infant deaths column and keep under-five deaths.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: running it a second time is a no-op rather
# than a KeyError for the already-removed column.
df_life_exp_renamed = df_life_exp_renamed.drop(columns='infant deaths', errors='ignore')
df_life_exp_renamed.describe()

### There are some missing data points, so let's explore how much of the data is missing.

In [None]:
# Share of missing cells per column, as a percentage of the row count
percent_missing = df_life_exp_renamed.isna().sum().mul(100).div(len(df_life_exp_renamed))
# One (column name, % missing, dtype) record per column
data = zip(df_life_exp_renamed.columns, percent_missing, df_life_exp_renamed.dtypes)
# Tabulate the records
df_missing = pd.DataFrame(data=data, columns=['column name', '% missing', 'dtype'])
# Keep only the columns that actually have gaps
df_missing = df_missing.loc[df_missing['% missing'].gt(0)]
df_missing

### Since a lot of data is missing, we need to impute the missing values as follows:
1. Check the distribution of each column that has missing values.
2. Impute the missing values with a statistic that is appropriate for that distribution.

In [None]:
def plot(df, plotting_func, n_cols=4, n_rows=None, figsize=(20, 15)):
    """Draw one subplot per numeric column of ``df`` on a single grid of axes.

    Every column of ``df`` reserves one grid cell, in column order (row-major).
    Cells that belong to non-numeric columns, and surplus cells at the end of
    the grid, receive no plot and are hidden.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.
    plotting_func : callable
        Invoked as ``plotting_func(series, ax=axis)`` for each numeric column,
        e.g. ``sns.boxplot``.
    n_cols, n_rows : int or None
        Exactly one of the two must be None; the missing dimension is derived
        from the number of columns. Because ``n_cols`` defaults to 4, pass
        ``n_cols=None`` explicitly when choosing ``n_rows``.
    figsize : tuple
        Forwarded to ``plt.subplots``.

    Raises
    ------
    AssertionError
        If both, or neither, of ``n_rows`` and ``n_cols`` are set.
    """
    assert (n_cols is None) ^ (n_rows is None), 'Define only one of n_rows or n_cols'
    # Get columns
    columns = df.columns
    # Derive the other dimension
    if n_rows is not None:
        n_cols = math.ceil(len(columns) / n_rows)
    else:
        n_rows = math.ceil(len(columns) / n_cols)
    # squeeze=False always yields a 2-D array of axes; without it a grid with a
    # single row or column comes back 1-D (or as a bare Axes) and cannot be
    # indexed by (row, col).
    axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)[1]
    # `.flat` walks the grid row by row, so position == row * n_cols + col
    for index, ax in enumerate(axes.flat):
        if index < len(columns) and pd.api.types.is_numeric_dtype(df.iloc[:, index]):
            plotting_func(df.iloc[:, index], ax=ax)
        else:
            # Nothing to draw here (non-numeric column or surplus cell)
            ax.set_visible(False)
    plt.subplots_adjust(top=0.92, bottom=0.1, left=0.10, right=0.95, hspace=0.5, wspace=0.5)


# Check the distribution of the columns using box plots
plot(df_life_exp_renamed, sns.boxplot)
# Check the distribution of the columns using density histograms with a KDE overlay.
# sns.distplot is deprecated; this histplot call is its documented replacement.
plot(
    df_life_exp_renamed,
    lambda series, ax: sns.histplot(series, kde=True, stat='density', kde_kws={'cut': 3}, ax=ax),
)

In [None]:
# TODO: Impute properly
# Imputing missing data with the column mean.
# numeric_only=True restricts the means to numeric columns; without it, pandas >= 2.0
# raises on text columns. Columns without a computed mean are left untouched by fillna.
# fillna returns a new frame, so df_life_exp_renamed itself is not modified.
df_life_exp_imputed = df_life_exp_renamed.fillna(
    df_life_exp_renamed.mean(axis=0, numeric_only=True)
)

## Now let's take a look at the correlation between columns

In [None]:
# Let's take a look at the correlation matrix
def plot_correlation_matrix(df, title=None, cutoff=50):
    """Plot the correlation heatmap twice: in full, and thresholded at ``cutoff``.

    Two stacked heatmaps are drawn. The top one shows every pairwise
    correlation (in percent); the bottom one shows only those whose absolute
    value reaches ``cutoff``, with the rest set to 0. Both hide the upper
    triangle and the diagonal.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only its numeric columns are correlated.
    title : str, optional
        Title applied to both heatmaps.
    cutoff : number
        Threshold in percent, compared against the absolute correlation.

    Returns
    -------
    pandas.DataFrame
        The thresholded correlation matrix (in percent) with the upper
        triangle and the diagonal zeroed, i.e. each surviving pair appears once.
    """
    # Correlate numeric columns only: DataFrame.corr() raises on text columns
    # in pandas >= 2.0, and select_dtypes works on older versions as well.
    corr_full = df.select_dtypes(include='number').corr() * 100
    # Keep correlations at or beyond the cutoff in either direction; everything
    # else (NaN included) becomes 0
    corr_cut = corr_full.where(corr_full.abs() >= cutoff, 0)
    # Mask for the upper triangle (diagonal included), shared by both heatmaps
    mask = np.triu(np.ones_like(corr_full, dtype=bool))
    # Custom diverging colormap
    cmap = sns.diverging_palette(250, 30, l=65, center="dark", as_cmap=True)
    axs = plt.subplots(2, 1, figsize=(16, 32))[1]
    for (corr, ax) in zip([corr_full, corr_cut], axs):
        # Draw the heatmap with the mask applied
        sns.heatmap(corr, mask=mask, center=0.0, cmap=cmap, linewidths=.5, cbar_kws={'shrink':.5}, annot=True, ax=ax)
        if title:
            # Set the title on this heatmap's own axes; plt.title would target
            # the figure's current axes and leave the other heatmap untitled.
            ax.set_title(title)
    # Zero the redundant upper triangle and diagonal in the returned matrix
    corr_cut[mask] = 0
    return corr_cut


cutoff = 56
corr_cut = plot_correlation_matrix(df_life_exp_imputed, 'Feature correlation %', cutoff)
# Let's check the joint plots of every pair whose correlation is beyond the cutoff.
# Pairs within the cutoff were zeroed by plot_correlation_matrix, so `!= 0` selects the
# survivors in both directions; a `> 0` test would drop the strong negative correlations.
corr_cut = corr_cut[corr_cut != 0].reset_index()
# Long format: one row per (index, x2) pair; the cells blanked above are NaN
corr_cut = corr_cut.melt(id_vars='index', value_name='corr', var_name='x2')
corr_cut = corr_cut[corr_cut['corr'].notna()]
for x1, x2, _ in corr_cut.values:
    sns.jointplot(x=x1, y=x2, data=df_life_exp_imputed, kind='reg')

In [None]:
def plot_with_binning(df, x, y, bins=None, func=sns.barplot, title=None, figsize=(12, 7), omit_interval=True):
    """Plot the mean of column ``y`` within equal-width bins of column ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing columns ``x`` and ``y``.
    x, y : str
        Column to bin on, and column whose per-bin mean is plotted.
    bins : int, optional
        Number of bin edges spread evenly from ``floor(min(x))`` to
        ``ceil(max(x))`` (this gives ``bins - 1`` intervals). Defaults to the
        number of unique values in ``x``.
    func : callable
        Called as ``func(x=labels, y=means)``; it is not handed the axes, so it
        is expected to draw on the current axes (the one created here).
    title : str, optional
        Axes title; defaults to ``'<y> against <x>'``.
    figsize : tuple
        Forwarded to ``plt.subplots``.
    omit_interval : bool
        If True, label each bin with its right edge only (at most two
        decimals, trailing zeros removed) instead of the full interval.
    """
    # Create a subplot for the plot
    ax = plt.subplots(figsize=figsize)[1]
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    if title:
        ax.set_title(title)
    else:
        ax.set_title('{} against {}'.format(y, x))
    # set bin number
    if bins is None:
        bins = len(df[x].unique())
    # Get the range for the cut
    x_range = np.linspace(math.floor(df[x].min()), math.ceil(df[x].max()), bins)
    # Get a binned series. include_lowest=True keeps rows equal to the lowest
    # edge (the intervals are otherwise open on the left and would drop them);
    # observed=False keeps empty bins regardless of the pandas version default.
    binned_series = df.groupby(pd.cut(df[x], x_range, include_lowest=True), observed=False)[y].mean()
    # Get x-axis values
    x_values = binned_series.index.values
    # If omit interval flag is on, only show the right side of the intervals
    if omit_interval:
        # Strip trailing zeros first and only then a dangling decimal point:
        # rstrip('0.') would treat both as one character set and turn
        # '10.00' into '1' and '0.00' into ''.
        x_values = np.array(['{:.2f}'.format(i.right).rstrip('0').rstrip('.') for i in x_values])
    # Get y-axis values
    y_values = binned_series.values
    # Plot data
    func(x=x_values.astype('str'), y=y_values)