## Imports

In [2]:
import numpy as np #for logging GDP and HIV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #To scaler results
import pandas as pd  #df transformation for scaler

# Helper Function

Functions that map and validate the input from the question prompts.

## Mapping Functions

In [7]:
def map_region(value):
    """Translate a region menu choice (1-8) into its ``Region_*`` parameter name.

    Parameters
    ----------
    value : int
        Number selected in the region question (1-8).

    Returns
    -------
    str
        The ``Region_*`` key matching the selection.

    Raises
    ------
    ValueError
        If ``value`` is not one of the eight menu choices. Previously an
        unmapped value left the result unbound and crashed with
        ``UnboundLocalError``.
    """
    region_names = {
        1: 'Region_Asia',
        2: 'Region_Central America and Caribbean',
        3: 'Region_European Union',
        4: 'Region_Middle East',
        5: 'Region_North America',
        6: 'Region_Oceania',
        7: 'Region_Rest of Europe',
        8: 'Region_South America',
    }
    try:
        return region_names[value]
    except (KeyError, TypeError):  # TypeError covers unhashable input such as a list
        raise ValueError("Region must be a whole number between 1 and 8.") from None

## Validation Functions

In [10]:
def validate_positive_float(input_value):
    """Convert a prompt answer to a strictly positive, finite float.

    Parameters
    ----------
    input_value : str or number
        Raw response from a question prompt.

    Returns
    -------
    float
        The parsed value, guaranteed to be finite and greater than zero.

    Raises
    ------
    ValueError
        If the input cannot be converted to a float, or the result is
        zero, negative, NaN or infinite.
    """
    try:
        value = float(input_value)
    except (TypeError, ValueError):  # TypeError: e.g. None; ValueError: non-numeric text
        value = None

    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    # Infinity is rejected too: float("inf") would otherwise pass the `> 0` test.
    if value is None or not np.isfinite(value) or value <= 0:
        raise ValueError("Must be a positive number.")
    return value


def validate_ranged(input_value, minv=0, maxv=100000):
    """Convert a prompt answer to a float and check it lies within bounds.

    Parameters
    ----------
    input_value : str or number
        Raw response from a question prompt.
    minv : number, optional
        Lower bound, inclusive (default 0).
    maxv : number, optional
        Upper bound, inclusive (default 100000).

    Returns
    -------
    float
        The parsed value, with ``minv <= value <= maxv``.

    Raises
    ------
    ValueError
        If the input cannot be converted to a float or falls outside the
        bounds. NaN fails the comparison and is rejected as well.
    """
    try:
        value = float(input_value)
    except (TypeError, ValueError):  # TypeError: e.g. None; ValueError: non-numeric text
        value = None

    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    if value is None or not (minv <= value <= maxv):
        raise ValueError(f"Must be a number between {minv} and {maxv}.")
    return value

def validate_bmi(input_value):
    """Validate a BMI answer: a number between 1 and 40, inclusive.

    Parameters
    ----------
    input_value : str or number
        Raw response from the BMI question prompt.

    Returns
    -------
    float
        The validated BMI, as returned by ``validate_ranged``.
    """
    lowest_bmi, highest_bmi = 1, 40
    return validate_ranged(input_value, minv=lowest_bmi, maxv=highest_bmi)
    
def validate_region(input_value):
    """Validate a region menu answer and map it to its ``Region_*`` name.

    Parameters
    ----------
    input_value : str or number
        Raw response from the region question prompt.

    Returns
    -------
    str
        The ``Region_*`` parameter name produced by ``map_region``.

    Raises
    ------
    ValueError
        If the answer is not numeric, is outside 1-8, or is not a whole
        number.
    """
    choice = validate_ranged(input_value, 1, 8)
    # Reject fractional answers instead of truncating them: int(1.9) would
    # otherwise silently select region 1.
    if choice != int(choice):
        raise ValueError("Must be a whole number between 1 and 8.")
    return map_region(int(choice))

def validate_school(input_value):
    """Validate a schooling answer: a number between 0 and 20, inclusive.

    Parameters
    ----------
    input_value : str or number
        Raw response from the schooling question prompt.

    Returns
    -------
    float
        The validated value, as returned by ``validate_ranged``.
    """
    fewest_years, most_years = 0, 20
    return validate_ranged(input_value, minv=fewest_years, maxv=most_years)

def validate_consent(input_value):
    """Interpret a yes/no consent answer, ignoring case and surrounding spaces.

    Parameters
    ----------
    input_value : str
        Raw response from the consent prompt. It is passed through ``str()``
        first, so non-string input is handled as its text form.

    Returns
    -------
    bool
        ``True`` for "y"/"yes" (precise model), ``False`` for "n"/"no"
        (minimal model).

    Raises
    ------
    ValueError
        If the answer is anything else.
    """
    # str() always yields a string, so the former `except AttributeError`
    # branch around .strip()/.lower() could never trigger and was removed.
    answer = str(input_value).strip().lower()
    if answer in ("y", "yes"):
        return True
    if answer in ("n", "no"):
        return False
    raise ValueError("Please enter 'y' or 'n'")
    


## Feature engineering

In [35]:
 # Run the feature-engineering notebook; it is expected to define the *_scaler_* objects that scale_responses() uses
from IPython.utils.io import capture_output # context manager that captures output instead of displaying it
with capture_output() as captured: # keep the print/log output of the executed notebook out of this notebook
    %run ./fe_analytica.ipynb #run the feature engineering noteboook

In [36]:
def scale_responses(responses):
    """Apply the model's feature transformations to the user's answers.

    The HIV and GDP answers are log-transformed first; every feature that has
    a scaler is then passed through it. Features missing from ``responses``
    (the minimal model asks fewer questions) are skipped, and keys without a
    scaler, such as the one-hot ``Region_*`` entry, pass through unchanged.

    The scaler objects are module-level names that this notebook does not
    define itself; they are expected to come from the ``%run`` of
    ``fe_analytica.ipynb``.

    Parameters
    ----------
    responses : dict
        Validated and mapped answers, keyed by model parameter name.

    Returns
    -------
    dict
        Same keys as ``responses``, holding the transformed values.
    """
    # One-row frame so each feature can be handed to a scaler as a one-column frame.
    frame = pd.DataFrame([responses])

    # Log first: the min-max scalers below are applied to the logged values.
    for log_col in ('Incidents_HIV_log', 'GDP_per_capita_log'):
        if log_col in frame:
            frame[log_col] = np.log(frame[log_col])

    # standard scalers
    if 'BMI' in frame:
        frame['BMI'] = standard_scaler_bmi.transform(frame[['BMI']])
    if 'Schooling' in frame:
        frame['Schooling'] = standard_scaler_schooling.transform(frame[['Schooling']])

    # minmax scalers
    if 'GDP_per_capita_log' in frame:
        frame['GDP_per_capita_log'] = minmax_scaler_gdp.transform(frame[['GDP_per_capita_log']])
    if 'Incidents_HIV_log' in frame:
        frame['Incidents_HIV_log'] = minmax_scaler_hiv.transform(frame[['Incidents_HIV_log']])

    # robust scalers
    if 'Under_five_deaths' in frame:
        frame['Under_five_deaths'] = robust_scaler_under_five.transform(frame[['Under_five_deaths']])
    if 'Adult_mortality' in frame:
        frame['Adult_mortality'] = robust_scaler_adult_mortality.transform(frame[['Adult_mortality']])

    # Take the single row explicitly. squeeze() collapses a 1x1 frame to a
    # scalar, which has no to_dict(), so a one-key input used to fail.
    return frame.iloc[0].to_dict()

# Model 
Dictionaries holding each model's coefficients, keyed by the parameter names used to link them back to the question responses.

In [40]:
# Model 1: precise model. 'const' is the intercept; every other key is a feature
# coefficient that compute_prediction() multiplies by the response stored under the same key.
# NOTE(review): the eight Region_* keys match the names returned by map_region(); they look
# like one-hot columns with one baseline category dropped, and that baseline is not
# selectable in the region question - confirm.
model_1_params = {
    'const': 69.21197568956534, 
    'Under_five_deaths': -4.1822209941909385, 
    'Adult_mortality': -6.247798980442863, 
    'BMI': -0.5529768222701575, 
    'Schooling': 0.5469970557304903, 
    'Region_Asia': -0.2587562486476594, 
    'Region_Central America and Caribbean': 1.5132693392133942, 
    'Region_European Union': 0.718721630638919, 
    'Region_Middle East': -0.22551325872249203, 
    'Region_North America': 1.4279321152360975, 
    'Region_Oceania': -0.7937502455802756, 
    'Region_Rest of Europe': 0.3350217991614541, 
    'Region_South America': 1.0844795471719362, 
    'Incidents_HIV_log': -1.2380416175750373, 
    'GDP_per_capita_log': 5.0624745859883316
}


# Model 2: minimal model. 'const' is the intercept; the three remaining keys are the
# coefficients for the three questions asked by get_minimal_question_and_validation().
model_2_params = {
    'const': 72.1525761376697, 
    'Under_five_deaths': -5.494643384419958, 
    'Adult_mortality': -6.658005120051611, 
    'BMI': -0.13039081436056832
}

# Model Computation 

Takes the validated, mapped and scaled responses and applies the linear formula to them. At this point each key in the responses dictionary maps one-to-one to a key in the selected `model_x_params` dictionary.

$$ y = b_0 + x_1 b_1 + \dots + x_n b_n $$

In [43]:
def compute_prediction(responses, model_params):
    """Evaluate the linear model y = b0 + x1*b1 + ... + xn*bn.

    Parameters
    ----------
    responses : dict
        Transformed user responses, keyed by the same names as ``model_params``.
    model_params : dict
        Coefficients of the selected model; ``'const'`` holds the intercept b0.

    Returns
    -------
    number
        The intercept plus the sum of coefficient * response over every
        response key that also appears in ``model_params``. Response keys
        the model does not know are ignored.
    """
    total = model_params['const']  # start from the intercept b0

    for key in responses:
        if key not in model_params:
            continue  # this model has no coefficient for the feature
        total = total + model_params[key] * responses[key]

    return total
    

# Interface

## Questions to ask 

Each model has its own series of questions to ask. 
The questions are stored as an array of tuples: 

* $array[i][0]$: string : Question that will be asked, specifying conditions of input
* $array[i][1]$: function : Validation function used to check that the input for this question is correct
* $array[i][2]$: string : Parameter name in the coefficient model list to match
    * There is an exception with region, but this will be mapped later on and the name adjusted to match Region_* 

In [47]:
def get_precise_question_and_validation():
    """Return the questions asked for the precise model.

    Returns
    -------
    list of tuple
        One ``(prompt text, validator function, parameter name)`` tuple per
        question, in the order they are asked. The region entry uses the
        placeholder name ``"Region"``; its validator returns the actual
        ``Region_*`` parameter name.
    """
    under_five_q = (
        "Under five deaths (Number of under-five deaths per 1000 population)\n",
        validate_positive_float,
        "Under_five_deaths",
    )
    adult_mortality_q = (
        "Adult mortality (Probability of dying between 15 and 60 years per 1000 population)\n",
        validate_positive_float,
        "Adult_mortality",
    )
    bmi_q = (
        "BMI (Average Body Mass Index of entire population)\n",
        validate_bmi,
        "BMI",
    )
    schooling_q = (
        "Schooling (Number of years of Schooling(years)):\n",
        validate_school,
        "Schooling",
    )
    hiv_q = (
        "Incidents of HIV (Deaths per 1,000 live births HIV/AIDS (0-4 years))\n",
        validate_positive_float,
        "Incidents_HIV_log",
    )
    gdp_q = (
        "GDP per capita (Gross Domestic Product per capita (in USD))\n$",
        validate_positive_float,
        "GDP_per_capita_log",
    )

    # Numbered menu; the numbers correspond to the choices handled by map_region
    region_menu = (
        "Select a region for:\n"
        "  1: Asia\n"
        "  2: Central America and Caribbean\n"
        "  3: European Union\n"
        "  4: Middle East\n"
        "  5: North America\n"
        "  6: Oceania\n"
        "  7: Rest of Europe\n"
        "  8: South America\n"
    )
    region_q = (region_menu, validate_region, "Region")

    return [
        under_five_q,
        adult_mortality_q,
        bmi_q,
        schooling_q,
        hiv_q,
        gdp_q,
        region_q,
    ]

In [49]:
def get_minimal_question_and_validation():
    """Return the questions asked for the minimal model.

    Returns
    -------
    list of tuple
        One ``(prompt text, validator function, parameter name)`` tuple per
        question, in the order they are asked.
    """
    under_five_q = (
        "Under five deaths (Number of under-five deaths per 1000 population)\n",
        validate_positive_float,
        "Under_five_deaths",
    )
    adult_mortality_q = (
        "Adult mortality (Probability of dying between 15 and 60 years per 1000 population)\n",
        validate_positive_float,
        "Adult_mortality",
    )
    bmi_q = (
        "BMI (Average Body Mass Index of entire population)\n",
        validate_bmi,
        "BMI",
    )
    return [under_five_q, adult_mortality_q, bmi_q]

## Logic 

In [52]:
def get_model_wanted():
    """Ask the consent question until a valid y/n answer is given.

    This is the first prompt of the application; the answer decides which
    questions are asked and which model computes the life expectancy.

    Returns
    -------
    bool
        The result of ``validate_consent`` for the first valid answer.
    """
    consent_prompt = (
        "Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N)"
    )
    while True:  # repeat the question until validation succeeds
        try:
            return validate_consent(input(consent_prompt))
        except ValueError as problem:
            print(problem)  # show what was wrong, then ask again

In [54]:
def app():
    """Run the interactive life-expectancy questionnaire.

    Asks for consent to pick the precise or minimal model, asks that model's
    questions (repeating each until its validator accepts the answer), scales
    the answers, computes the prediction and prints it.

    Returns
    -------
    number
        The predicted life expectancy. The function used to return ``None``
        even though the notebook binds its result (``data = app()``).
    """
    use_precise = get_model_wanted()  # consent y/n decides the model
    if use_precise:
        print("***** Precise Model ***** \n\n\n")  # feedback so the user can verify the choice
        questions_and_validators = get_precise_question_and_validation()
        model_param = model_1_params
    else:
        print("***** Minimalistic Model *****\n\n\n")
        questions_and_validators = get_minimal_question_and_validation()
        model_param = model_2_params

    responses = {}
    for question, validator, name in questions_and_validators:
        while True:  # keep asking until the validator accepts the answer
            try:
                response = input(f"{question} ")
                validated_response = validator(response)
            except ValueError as e:
                print(e)  # explain what was wrong and ask again
                continue

            if name == 'Region':
                # The region validator returns the Region_* parameter name;
                # one-hot encoding means the selected region's value is 1.
                responses[validated_response] = 1
            else:
                responses[name] = validated_response
            break

    print("\n\n\n*****  *****")
    scaled_responses = scale_responses(responses)
    prediction = compute_prediction(scaled_responses, model_param)

    print(f"\n\nPredicted life expectancy is: \n\n{prediction:.2f}")  # shown to 2 decimal places
    return prediction

# Application 

In [57]:
data = app() # start the interactive questionnaire; data receives whatever app() returns

Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N) y


***** Precise Model ***** 





Under five deaths (Number of under-five deaths per 1000 population)
  1
Adult mortality (Probability of dying between 15 and 60 years per 1000 population)
  1
BMI (Average Body Mass Index of entire population)
  1
Schooling (Number of years of Schooling(years)):
  1
Incidents of HIV (Deaths per 1,000 live births HIV/AIDS (0-4 years))
  1
GDP per capita (Gross Domestic Product per capita (in USD))
$  1
Select a region for:
  1: Asia
  2: Central America and Caribbean
  3: European Union
  4: Middle East
  5: North America
  6: Oceania
  7: Rest of Europe
  8: South America
  1





*****  *****


Predicted life expectancy is: 

78.22
