In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.tools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, Normalizer


In [None]:
## Collecting the input features for the detailed model, which also uses features that are considered ethically sensitive (protected data)

def input_features_detailed():
    """Interactively collect the nine inputs used by the detailed model.

    Every feature is prompted for in turn and re-prompted until the entry
    parses as a number and passes that feature's validity check. Typing
    'cancel' (any case, surrounding whitespace ignored) at any prompt aborts
    the whole collection.

    Returns:
        pandas.DataFrame | None: a single-row frame with one float column per
        feature, in the order prompted, or None if the user cancelled.
    """
    # feature name -> (validity check, hint shown to the user in the prompt)
    conditions = {
        'adult_mortality': (lambda x: 49 <= x <= 720, "Input Values between 49-720"),
        'alcohol_consumption': (lambda x: 0 <= x <= 18, "Input Values between 0-18"),
        'measles': (lambda x: 10 <= x <= 100, "Input Values between 10-100"),
        'polio': (lambda x: 8 <= x <= 100, "Input Values between 8-100"),
        'incidents_hiv': (lambda x: 0 <= x <= 22, "Input Values between 0-22"),
        'thinness_ten_nineteen_years': (lambda x: 0 <= x <= 30, "Input Values between 0-30"),
        'thinness_five_nine_years': (lambda x: 0 <= x <= 30, "Input Values between 0-30"),
        'schooling': (lambda x: 1 <= x <= 15, "Input Values between 1-15"),
        # Binary flag: only 0 or 1 is meaningful, so fractions are rejected.
        'economy_status_developed': (lambda x: x in (0, 1), "0 - for undeveloped | 1 for developed"),
    }

    features = {}  # validated values, keyed by feature name

    for feature, (condition, condition_range) in conditions.items():
        # The hint already says "between", so it is not repeated in the prompt.
        prompt = f"Enter value for {feature} ({condition_range}), or enter 'cancel' to stop: "
        while True:
            raw = input(prompt).strip()
            if raw.lower() == 'cancel':
                print("Input canceled by the user.")
                return None

            try:
                value = float(raw)
            except ValueError:
                value = None  # not a number at all

            if value is not None and condition(value):
                features[feature] = value
                break
            print("Invalid value! Try again or enter 'cancel' to get out of the loop")

    return pd.DataFrame([features])


In [None]:
## Collecting the input features for the simple model, which uses only features that are considered ethical to use

def input_features_simple():
    """Interactively collect the four inputs used by the simple model.

    Every feature is prompted for in turn and re-prompted until the entry
    parses as a number and passes that feature's validity check. Typing
    'cancel' (any case, surrounding whitespace ignored) at any prompt aborts
    the whole collection.

    Returns:
        pandas.DataFrame | None: a single-row frame with one float column per
        feature, in the order prompted, or None if the user cancelled.
    """
    # feature name -> (validity check, hint shown to the user in the prompt)
    conditions = {
        'adult_mortality': (lambda x: 49 <= x <= 720, "Input Values between 49-720"),
        'alcohol_consumption': (lambda x: 0 <= x <= 18, "Input Values between 0-18"),
        'schooling': (lambda x: 1 <= x <= 15, "Input Values between 1-15"),
        # Binary flag: only 0 or 1 is meaningful, so fractions are rejected.
        # (Entries are always converted with float() first, so no type check is needed.)
        'economy_status_developed': (lambda x: x in (0, 1), "0 - for undeveloped | 1 for developed"),
    }

    features = {}  # validated values, keyed by feature name

    for feature, (condition, condition_range) in conditions.items():
        # The hint already says "between", so it is not repeated in the prompt.
        prompt = f"Enter value for {feature} ({condition_range}), or enter 'cancel' to stop: "
        while True:
            raw = input(prompt).strip()
            if raw.lower() == 'cancel':
                print("Input canceled by the user.")
                return None

            try:
                value = float(raw)
            except ValueError:
                value = None  # not a number at all

            if value is not None and condition(value):
                features[feature] = value
                break
            print("Invalid value! Try again or enter 'cancel' to get out of the loop")

    return pd.DataFrame([features])


In [None]:
# Applying a power transform (fitted on the training data) to the skewed columns we will be using

def power_transform(train,test,input, scale_columns = ['adult_mortality','polio','incidents_hiv','thinness_five_nine_years','alcohol_consumption','measles']):
    """Power-transform ``scale_columns`` in the train, test and input frames.

    A ``PowerTransformer`` with its default settings is fitted on the training
    columns only and then applied to all three frames, so the test and input
    rows never influence the fitted parameters.

    The frames passed in are left untouched; new frames are returned.

    Args:
        train: training features; used to fit the transformer.
        test: held-out features.
        input: user-supplied row(s) to predict on (the name shadows the
            builtin inside this function only).
        scale_columns: names of the columns to transform. They must exist in
            all three frames. The default list is only read, never mutated.

    Returns:
        tuple of three DataFrames (train, test, input). In each, the
        transformed columns come first, renamed with a ``_pt`` suffix,
        followed by the remaining columns in their original order. Row labels
        are preserved and the index is named ``'index'``.
    """
    columns = list(scale_columns)
    scaled_names = [col + '_pt' for col in columns]

    pt = PowerTransformer()
    pt.fit(train[columns])

    def _scale(frame):
        # Build the scaled columns directly on the frame's own index instead of
        # round-tripping the labels through a temporary 'index' column.
        scaled = pd.DataFrame(pt.transform(frame[columns]), columns=scaled_names, index=frame.index)
        combined = pd.concat([scaled, frame.drop(columns=columns)], axis=1)
        # Earlier versions returned frames whose index was named 'index'; keep that.
        return combined.rename_axis('index')

    return _scale(train), _scale(test), _scale(input)

In [None]:
# Applying sklearn's Normalizer to the remaining columns we will be using

def normaliser(train,test,input, scale_columns = ['thinness_ten_nineteen_years', 'schooling']):
    """Apply sklearn's ``Normalizer`` to ``scale_columns`` in all three frames.

    The frames passed in are left untouched; new frames are returned. Nothing
    is printed (the earlier debugging dumps of whole frames were removed).

    NOTE(review): ``Normalizer`` rescales each *row* of the selected columns to
    unit norm; it does not scale each column. If per-column scaling was the
    intent, a different scaler is needed -- confirm before changing, since that
    would alter the fitted model.

    Args:
        train: training features; passed to ``fit``.
        test: held-out features.
        input: user-supplied row(s) to predict on (the name shadows the
            builtin inside this function only).
        scale_columns: names of the columns to normalise. They must exist in
            all three frames. The default list is only read, never mutated.

    Returns:
        tuple of three DataFrames (train, test, input). In each, the
        normalised columns come first, renamed with a ``_norm`` suffix,
        followed by the remaining columns in their original order. Row labels
        are preserved and the index is named ``'index'``.
    """
    columns = list(scale_columns)
    scaled_names = [col + '_norm' for col in columns]

    norm = Normalizer()
    norm.fit(train[columns])

    def _scale(frame):
        # Build the scaled columns directly on the frame's own index instead of
        # round-tripping the labels through a temporary 'index' column.
        scaled = pd.DataFrame(norm.transform(frame[columns]), columns=scaled_names, index=frame.index)
        combined = pd.concat([scaled, frame.drop(columns=columns)], axis=1)
        # Earlier versions returned frames whose index was named 'index'; keep that.
        return combined.rename_axis('index')

    return _scale(train), _scale(test), _scale(input)

In [None]:
def cleaning(df):
    """Normalise the column labels of ``df`` in place and return the same frame.

    Each label has its surrounding whitespace stripped, every remaining space
    character removed, and is lower-cased. Underscores already present in a
    label are kept; spaces are removed, not converted to underscores.
    """
    df.columns = [label.strip().replace(' ', '').lower() for label in df.columns]
    return df

In [None]:
# Our detailed model

def detailed_model(data_path='sample_data/Life_Expectancy_Data_Updated.csv'):
    """Fit the detailed OLS model and print a prediction for user-entered values.

    Steps: prompt for the inputs, load and clean the data set, split 80/20
    with a fixed seed, apply ``power_transform`` and ``normaliser`` (both
    fitted on the training split), fit an OLS model with an intercept, then
    print the prediction for the user's row and the RMSE on the test split.

    Args:
        data_path: CSV file to load. After ``cleaning`` it must contain every
            column named in ``features`` plus ``life_expectancy``.

    Returns:
        None. Results are printed. Returns early, without loading any data,
        if the user cancels the input prompts.
    """
    features = ['adult_mortality', 'alcohol_consumption', 'measles', 'polio', 'incidents_hiv', 'thinness_ten_nineteen_years', 'thinness_five_nine_years', 'schooling', 'economy_status_developed']

    # Take inputs as a one-row dataframe; None means the user cancelled.
    input_df = input_features_detailed()
    if input_df is None:
        return

    df_clean = cleaning(pd.read_csv(data_path))

    X = df_clean[features]
    y = df_clean['life_expectancy']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=63)

    # Hand the helpers copies so any in-place column drops they perform cannot
    # touch the split frames or trigger SettingWithCopyWarning.
    X_train_fe, X_test_fe, input_fe = power_transform(X_train.copy(), X_test.copy(), input_df)
    X_train_fe, X_test_fe, input_fe = normaliser(X_train_fe, X_test_fe, input_fe)

    X_train_fe = sm.add_constant(X_train_fe)
    X_test_fe = sm.add_constant(X_test_fe)
    # A single row is constant in every column, so the intercept must be forced.
    input_fe = sm.add_constant(input_fe, has_constant='add')
    # Make sure the input columns are in exactly the order the model was fitted on.
    input_fe = input_fe[X_train_fe.columns]

    model = sm.OLS(y_train, X_train_fe).fit()

    # Uncomment below to see summary statistics of the model
    # print(model.summary())

    y_pred_test = model.predict(X_test_fe)
    test_rmse = statsmodels.tools.eval_measures.rmse(y_test, y_pred_test)

    # Predict on the user's inputs; report the scalar, not the whole Series.
    input_pred = model.predict(input_fe)

    print(f"Predicted life expectancy of {input_pred.iloc[0]:.2f} years")
    print(f"RMSE value of {test_rmse}")

In [None]:
# Calling the detailed model function: prompts for its inputs, then prints the prediction and the test RMSE

detailed_model()

In [None]:
# Creating a simple model function that does not include columns that are considered unethical

def simple_model(data_path='Life-Expectancy-Data-Updated.csv'):
    """Fit the simple OLS model and print a prediction for user-entered values.

    Steps: prompt for the inputs, load and clean the data set, split 80/20
    with a fixed seed, apply ``power_transform`` to two of the columns
    (fitted on the training split), fit an OLS model with an intercept, then
    print the prediction for the user's row and the RMSE on the test split.

    Args:
        data_path: CSV file to load. After ``cleaning`` it must contain every
            column named in ``features`` plus ``life_expectancy``.

    Returns:
        None. Results are printed. Returns early, without loading any data,
        if the user cancels the input prompts.
    """
    features = ['adult_mortality','schooling','alcohol_consumption', 'economy_status_developed']

    # Take inputs as a one-row dataframe; None means the user cancelled.
    input_df = input_features_simple()
    if input_df is None:
        return

    df_clean = cleaning(pd.read_csv(data_path))

    X = df_clean[features]
    y = df_clean['life_expectancy']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=63)

    # Hand the helper copies so any in-place column drops it performs cannot
    # touch the split frames or trigger SettingWithCopyWarning.
    X_train_fe, X_test_fe, input_fe = power_transform(
        X_train.copy(), X_test.copy(), input_df,
        scale_columns=['adult_mortality', 'alcohol_consumption'],
    )

    X_train_fe = sm.add_constant(X_train_fe)
    X_test_fe = sm.add_constant(X_test_fe)
    # A single row is constant in every column, so the intercept must be forced.
    input_fe = sm.add_constant(input_fe, has_constant='add')
    # Make sure the input columns are in exactly the order the model was fitted on.
    input_fe = input_fe[X_train_fe.columns]

    model = sm.OLS(y_train, X_train_fe).fit()

    # Uncomment below to see summary statistics of the model
    # print(model.summary())

    y_pred_test = model.predict(X_test_fe)
    test_rmse = statsmodels.tools.eval_measures.rmse(y_test, y_pred_test)

    # Predict on the user's inputs; report the scalar, not the whole Series.
    input_pred = model.predict(input_fe)

    print(f"Predicted life expectancy of {input_pred.iloc[0]:.2f} years")
    print(f"RMSE of {test_rmse}")

In [None]:
# Calling the simple model: prompts for its inputs, then prints the prediction and the test RMSE

simple_model()

In [None]:
# Creating a main entry-point function that asks whether the user wants to use the protected (ethically sensitive) columns and then calls the corresponding model

def main():
    """Ask which model to run and dispatch to it.

    Keeps prompting until the answer is 'yes' or 'no' (any case, surrounding
    whitespace ignored), telling the user when an answer is not recognised.
    'no' runs ``simple_model``; 'yes' runs ``detailed_model``.
    """
    while True:
        choice = input("Would you like to use protected data (Yes/No): ").strip().lower()
        if choice in ('yes', 'no'):
            break
        # Without feedback the same prompt just reappears with no explanation.
        print("Please answer 'Yes' or 'No'.")

    if choice == 'no':
        simple_model()
    else:
        detailed_model()

In [None]:
# Start the interactive flow: main() asks which model to use and runs it.
# NOTE(review): earlier cells already call detailed_model() and simple_model() directly,
# so a full Run All prompts for input three times.
main()

Predicted life expectancy of index
0    73.686899
dtype: float64 years
RMSE of 2.8561936548662885


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(columns=scale_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns=scale_columns, inplace=True)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
