In [1]:
import warnings

import pandas as pd
import numpy as np

In [15]:
input_dir = '.\\out\\'
data_input = "cleaned_world_bank_data.pkl"
data = pd.read_pickle(input_dir + data_input)

test_targets_file = "test_targets_data.pkl"
test_targets = pd.read_pickle(input_dir + test_targets_file)

In [16]:
predict_year = 2010
target = 'SI.POV.DDAY'
n_avg = 3

### Naive Predictor

Take the average of the last 'n_avg' values of the target variable

In [17]:
#Create an empty output dataframe
countries_in_data = list(data.index.levels[0]) 
naive_predictions = pd.DataFrame(index=countries_in_data, columns=[target])

In [18]:
for country in countries_in_data:
    with warnings.catch_warnings():
        # it's ok that we are getting the mean of empty arrays sometimes. It's the nature of..
        #our data that there is a lot of missing data.
        warnings.simplefilter("ignore") 
        naive_predictions.loc[country] = np.nanmean(data.loc[(country,str(predict_year-n_avg-1)):(country,str(predict_year-1)), 'SI.POV.DDAY' ].values)

In [19]:
naive_predictions.head(10)

Unnamed: 0,SI.POV.DDAY
Afghanistan,
Albania,0.4
Algeria,
American Samoa,
Andorra,
Angola,30.1
Antigua and Barbuda,
Argentina,2.85
Armenia,2.325
Aruba,


### Evaluate Naive Predictor


If there is no actual value for the country then we should exclude this country from the score. 

If there is a actual value for the country but we haven't predicted one this should register as a poor score.

In [46]:
naive_predictions.head(20)

Unnamed: 0,SI.POV.DDAY
Afghanistan,
Albania,0.4
Algeria,
American Samoa,
Andorra,
Angola,30.1
Antigua and Barbuda,
Argentina,2.85
Armenia,2.325
Aruba,


In [47]:
test_targets.head(20)

Unnamed: 0,SI.POV.DDAY
Afghanistan,
Albania,
Algeria,
American Samoa,
Andorra,
Angola,
Antigua and Barbuda,
Argentina,1.1
Armenia,1.9
Aruba,


In [24]:
np.nanmean((naive_predictions - test_targets)**2)

4.54646701388889

In [None]:
#mse = ((naive_predictions - B)**2).mean(axis=ax)

In [62]:
def mse_countries(true_data, predictions_data):  
    """
    Calculates MSE of country and returns meta data on missing true values and predictions 

    Args:
        true_data: dataframe of  true_data
        predictions_data: dataframe of prediction data for comparing with true_data. Needs to be the..
                          dimension as true_data.
    
    Returns:
        mse: Mean Squared Eror value
        countries_no_true_value: list of countries with no true value
        countries_no_prediction: list of coutries that have a true value bit no prediction
    """
    
    assert(true_data.shape == predictions_data.shape), "Input dataframes need to have same dimensions"
    
    mse = np.nanmean((naive_predictions - test_targets)**2)
    countries_no_true_value = list(true_data[true_data.isna().values].index)
    countries_with_true_value = list(true_data[~true_data.isna().values].index)
    total_countries_with_no_prediction = list(predictions_data[predictions_data.isna().values].index)
    #What we are interested in is countries that have real values but no predictions.
    countries_no_prediction =  set(countries_with_true_value) & set(total_countries_with_no_prediction)
    
    return mse, countries_no_true_value, countries_no_prediction

In [98]:
mse_result,_,countries_not_predicted = mse_countries(test_targets, naive_predictions)

In [99]:
print("MSE for Naive predictor:", mse_result)
print("Number of coutries with true value but that had no prediction:", len(countries_not_predicted) )

MSE for Naive predictor: 4.54646701388889
Number of coutries with true value but that had no prediction: 13
