In [58]:
import warnings

import pandas as pd
import numpy as np

In [59]:
input_dir = '.\\out\\'
data_input = "cleaned_world_bank_data.pkl"
data = pd.read_pickle(input_dir + data_input)

test_targets_file = "test_targets_data.pkl"
test_targets = pd.read_pickle(input_dir + test_targets_file)

In [60]:
predict_year = 2010
target = 'SI.POV.DDAY'
n_avg = 3

### Naive Predictor

Take the average of the last 'n_avg' values of the target variable

In [61]:
#Create an empty output dataframe
countries_in_data = list(data.index.levels[0]) 
naive_predictions = pd.DataFrame(index=countries_in_data, columns=[target])

In [62]:
for country in countries_in_data:
    with warnings.catch_warnings():
        # it's ok that we are getting the mean of empty arrays sometimes. It's the nature of..
        #our data that there is a lot of missing data.
        warnings.simplefilter("ignore") 
        naive_predictions.loc[country] = np.nanmean(data.loc[(country,str(predict_year-n_avg-1)):(country,str(predict_year-1)), 'SI.POV.DDAY' ].values)

In [63]:
naive_predictions.head(10)

Unnamed: 0,SI.POV.DDAY
Afghanistan,
Albania,0.75
Algeria,5.8
American Samoa,
Andorra,
Angola,31.2
Antigua and Barbuda,
Argentina,2.85
Armenia,2.325
Aruba,


### Evaluate Naive Predictor


If there is no actual value for the country then we should exclude this country from the score. 

If there is a actual value for the country but we haven't predicted one this should register as a poor score.

In [64]:
naive_predictions.loc['Thailand']

SI.POV.DDAY    0.325
Name: Thailand, dtype: object

In [65]:
test_targets.loc['Thailand']

SI.POV.DDAY    0.1
Name: Thailand, dtype: object

In [17]:
np.nanmean((naive_predictions - test_targets)**2)

4.54646701388889

In [None]:
#mse = ((naive_predictions - B)**2).mean(axis=ax)

In [66]:
def mse_countries(true_data, predictions_data, ignore_countries = None):  
    """
    Calculates MSE of country and returns meta data on missing true values and predictions 

    Args:
        true_data: dataframe of  true_data
        predictions_data: dataframe of prediction data for comparing with true_data. Needs to be the..
                          dimension as true_data.
        ignore_countries: Countries to have their predictions ignored. Their predictions are set to NaN
    
    Returns:
        mse: Mean Squared Eror value
        countries_no_true_value: list of countries with no true value
        countries_no_prediction: list of coutries that have a true value bit no prediction
    """
    predictions_data_local = predictions_data.copy()
    
    assert(true_data.shape == predictions_data_local.shape), "Input dataframes need to have same dimensions"
    
    if ignore_countries is not None:
        predictions_data_local.loc[ignore_countries] = np.NaN
    
    mse = np.nanmean((predictions_data_local - test_targets)**2)
    countries_no_true_value = list(true_data[true_data.isna().values].index)
    countries_with_true_value = list(true_data[~true_data.isna().values].index)
    total_countries_with_no_prediction = list(predictions_data_local[predictions_data_local.isna().values].index)
    #What we are interested in is countries that have real values but no predictions.
    countries_no_prediction =  set(countries_with_true_value) & set(total_countries_with_no_prediction)
    countries_predicted = set(countries_with_true_value)
    return mse, countries_no_true_value, countries_no_prediction

In [67]:
mse_result,no_true_value,countries_not_predicted = mse_countries(test_targets, naive_predictions)

In [68]:
print("MSE for Naive predictor:", mse_result)
print("Number of coutries with true value but that had no prediction:", len(countries_not_predicted) )

MSE for Naive predictor: 25.038072289156624
Number of coutries with true value but that had no prediction: 2
