## Feature Engineering

This notebook contains the functions used to perform the feature engineering for our model. Although not every engineered feature was used directly in the final model, we have included all of them, because the outputs of every function were tested with VIF and stepwise selection to determine which variables were best suited to training the model.

In [None]:
# Importing the relevant packages and scalers needed for feature engineering.
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler, MaxAbsScaler, PowerTransformer, Normalizer, MinMaxScaler

In [None]:
# This function uses the Robust Scaler to scale the data in the chosen columns.

def robust_scaling(df: pd.DataFrame) -> pd.DataFrame:
    """Add RobustScaler-scaled copies of the outlier-heavy columns to ``df``.

    Every column of ``df`` that appears in the candidate list below is cast
    to float, scaled with a ``RobustScaler`` fitted on ``df`` itself, and
    returned under its original name plus a ``_rob`` suffix. Candidate
    columns that are missing from ``df`` are skipped.

    Args:
        df: Frame holding some or all of the candidate columns.

    Returns:
        A new frame with the ``*_rob`` columns first, followed by every
        original column of ``df``, on the same row index as ``df``. If
        ``df`` has none of the candidate columns, a plain copy of ``df``
        is returned.

    Raises:
        ValueError: If ``df`` already has a column with one of the
            ``*_rob`` names.
    """
    # These variables were chosen because their EDA histograms showed big
    # outliers; the robust scaler keeps the shape of each distribution while
    # reducing the impact of those outliers.
    possible_scalings = ['adult_mortality', 'hepatitis_b', 'measles', 'bmi', 'under_five_deaths', 'polio',
                         'diphtheria', 'gdp_per_capita', 'population_mln', 'thinness_ten_nineteen_years',
                         'thinness_five_nine_years']

    features = [feature for feature in possible_scalings if feature in df.columns]
    if not features:
        # Nothing to scale; the scaler cannot be fitted on zero columns.
        return df.copy()

    # Cast the selected features to float before scaling.
    df_robbed = df[features].astype(float)

    # Build the scaled frame directly on df's index so the rows line up
    # one-to-one even when index labels repeat or df has an 'index' column.
    df_scaled = pd.DataFrame(
        RobustScaler().fit_transform(df_robbed),
        columns=[col + '_rob' for col in df_robbed.columns],
        index=df.index,
    )

    # verify_integrity makes a clash with an existing '*_rob' column raise
    # ValueError instead of silently producing duplicate column names.
    return pd.concat([df_scaled, df], axis=1, verify_integrity=True)

In [None]:
# This function uses the Max-Abs Scaler to scale the data in the chosen columns.

def max_abs(df: pd.DataFrame) -> pd.DataFrame:
    """Add MaxAbsScaler-scaled copies of four sparse columns to ``df``.

    The columns ``adult_mortality``, ``incidents_hiv``, ``gdp_per_capita``
    and ``population_mln`` are scaled with a ``MaxAbsScaler`` fitted on
    ``df`` itself and returned under their original names plus a ``_ma``
    suffix.

    Args:
        df: Frame containing the four columns listed above.

    Returns:
        A new frame with the ``*_ma`` columns first, followed by every
        original column of ``df``, on the same row index as ``df``.

    Raises:
        KeyError: If any of the four columns is missing from ``df``.
        ValueError: If ``df`` already has a column with one of the
            ``*_ma`` names.
    """
    # We determined that these variables would benefit the most from this
    # scaler, as their histograms were zero-centred with a lot of sparsity.
    maxabs_columns = df[['adult_mortality', 'incidents_hiv', 'gdp_per_capita', 'population_mln']]

    scaled_values = MaxAbsScaler().fit_transform(maxabs_columns)

    # Build the scaled frame directly on df's index so the rows line up
    # one-to-one even when index labels repeat or df has an 'index' column.
    df_scaled_ma = pd.DataFrame(
        scaled_values,
        columns=[col + '_ma' for col in maxabs_columns.columns],
        index=df.index,
    )

    # verify_integrity makes a clash with an existing '*_ma' column raise
    # ValueError instead of silently producing duplicate column names.
    return pd.concat([df_scaled_ma, df], axis=1, verify_integrity=True)

In [None]:
# This function uses the PowerTransformer to transform the data in the chosen columns.

def power_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Add PowerTransformer-transformed copies of the skewed columns to ``df``.

    The columns listed below are transformed with a default
    ``PowerTransformer`` fitted on ``df`` itself and returned under their
    original names plus a ``_pt`` suffix.

    Args:
        df: Frame containing every column in the list below.

    Returns:
        A new frame with the ``*_pt`` columns first, followed by every
        original column of ``df``, on the same row index as ``df``.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
        ValueError: If ``df`` already has a column with one of the
            ``*_pt`` names.
    """
    # We selected these variables because we thought they would benefit the
    # most from this transform, their histograms being 'Non-Normal'.
    pt_columns = df[['infant_deaths', 'under_five_deaths', 'adult_mortality', 'alcohol_consumption', 'hepatitis_b', 'measles', 'polio', 'diphtheria', 'incidents_hiv', 'gdp_per_capita', 'population_mln', 'thinness_ten_nineteen_years', 'thinness_five_nine_years', 'economy_status_developed', 'economy_status_developing']]

    transformed_values = PowerTransformer().fit_transform(pt_columns)

    # Build the transformed frame directly on df's index so the rows line up
    # one-to-one even when index labels repeat or df has an 'index' column.
    df_scaled_pt = pd.DataFrame(
        transformed_values,
        columns=[col + '_pt' for col in pt_columns.columns],
        index=df.index,
    )

    # verify_integrity makes a clash with an existing '*_pt' column raise
    # ValueError instead of silently producing duplicate column names.
    return pd.concat([df_scaled_pt, df], axis=1, verify_integrity=True)

In [None]:
# This function uses the Normalizer to normalise the data in the chosen columns.

def normaliser(df: pd.DataFrame) -> pd.DataFrame:
    """Add Normalizer-transformed copies of the numeric columns to ``df``.

    The columns listed below are passed through a default ``Normalizer``
    and returned under their original names plus a ``_normed`` suffix.

    NOTE(review): scikit-learn's ``Normalizer`` rescales each *row* to unit
    norm across the selected columns; it does not scale each column
    independently like the other scalers in this notebook. Confirm that this
    is the intended transformation.

    Args:
        df: Frame containing every column in the list below.

    Returns:
        A new frame with the ``*_normed`` columns first, followed by every
        original column of ``df``, on the same row index as ``df``.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
        ValueError: If ``df`` already has a column with one of the
            ``*_normed`` names.
    """
    norm_cols = df[['year', 'infant_deaths', 'under_five_deaths','adult_mortality', 'alcohol_consumption', 'hepatitis_b', 'measles','bmi', 'polio', 'diphtheria', 'incidents_hiv', 'gdp_per_capita',
                    'population_mln', 'thinness_ten_nineteen_years','thinness_five_nine_years', 'schooling', 'economy_status_developed','economy_status_developing']]

    normed_values = Normalizer().fit_transform(norm_cols)

    # Build the normalised frame directly on df's index so the rows line up
    # one-to-one even when index labels repeat or df has an 'index' column.
    df_norm_scale = pd.DataFrame(
        normed_values,
        columns=[column + '_normed' for column in norm_cols.columns],
        index=df.index,
    )

    # verify_integrity makes a clash with an existing '*_normed' column raise
    # ValueError instead of silently producing duplicate column names.
    return pd.concat([df_norm_scale, df], axis=1, verify_integrity=True)

In [None]:
# This function uses the MinMax Scaler to scale the data in the chosen columns.

def min_max(df: pd.DataFrame) -> pd.DataFrame:
    """Add MinMaxScaler-scaled copies of the numeric columns to ``df``.

    The columns listed below are scaled with a default ``MinMaxScaler``
    fitted on ``df`` itself and returned under their original names plus a
    ``_min_max`` suffix.

    Args:
        df: Frame containing every column in the list below.

    Returns:
        A new frame with the ``*_min_max`` columns first, followed by every
        original column of ``df``, on the same row index as ``df``.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
        ValueError: If ``df`` already has a column with one of the
            ``*_min_max`` names.
    """
    mm_cols = df[['year', 'infant_deaths', 'under_five_deaths','adult_mortality', 'alcohol_consumption', 'hepatitis_b', 'measles','bmi', 'polio', 'diphtheria', 'incidents_hiv', 'gdp_per_capita',
                  'population_mln', 'thinness_ten_nineteen_years','thinness_five_nine_years', 'schooling', 'economy_status_developed','economy_status_developing']]

    scaled_values = MinMaxScaler().fit_transform(mm_cols)

    # Build the scaled frame directly on df's index so the rows line up
    # one-to-one even when index labels repeat or df has an 'index' column.
    df_min_max = pd.DataFrame(
        scaled_values,
        columns=[column + '_min_max' for column in mm_cols.columns],
        index=df.index,
    )

    # verify_integrity makes a clash with an existing '*_min_max' column raise
    # ValueError instead of silently producing duplicate column names.
    return pd.concat([df_min_max, df], axis=1, verify_integrity=True)