In [14]:
import warnings

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

import sys
sys.path.append('..')
from utils import preprocess, missing, evaluate

In [2]:
# Location of the cleaned dataset, relative to this notebook. Forward slashes
# are used because Python accepts them on Windows, Linux and macOS alike,
# whereas a backslash-separated path only resolves on Windows.
# The trailing separator is kept so that `input_dir + <file name>` still works.
input_dir = '../data/'
data_input = "cleaned_data.pkl"
# NOTE: loading a pickle can execute arbitrary code -- only read pickle files
# that were produced by this project.
data = pd.read_pickle(input_dir + data_input)

In [44]:
# Forecast configuration shared by the cells below.
target = 'SI.POV.DDAY'   # column name of the indicator being predicted
predict_year = 2010      # year for which the target value is predicted

### Obtaining the targets

Here we read in the cleaned input data and apply the windowing method to get the targets (see the Linear regression file for more details on the windowing method).

In [4]:
%time data_regressors, data_targets = \
        preprocess.window_data(data, lag=2,num_windows=1, step=1, predict_year=2010, \
                         target=target, impute_type='interpolation')

Wall time: 42.6 s


In [6]:
# Keep only the target rows whose second index level equals 1
# (presumably the window number produced by the windowing step -- TODO confirm).
idx = pd.IndexSlice
data_test_targets = data_targets.loc[(slice(None), 1), :]

### Naive Predictor

The naive prediction for each country is the average of the last `n_avg` values of the target variable.

In [None]:
# Number of most recent target values averaged to make the naive prediction.
#
# Note on n_avg: the best result was obtained with n_avg = 1, i.e. the
# prediction is simply the value from last year. This is not, altogether,
# very surprising.
n_avg = 1

In [20]:
# Missing values are filled in first, before any averaging is done.
data_imputed = preprocess.impute_data_interpolation(
    data,
    predict_year - 1,  # the year immediately before the one being predicted
    'linear',
)

In [52]:
# Naive prediction per country: the mean of the target column over the years
# predict_year - n_avg .. predict_year - 1 (NaN when no values are available).

# Empty output frame, one row per country. float dtype keeps the column numeric
# (NaN-filled) instead of the default object dtype.
countries_in_data = list(data_imputed.index.levels[0])
naive_predictions = pd.DataFrame(index=countries_in_data, columns=[target], dtype=float)

idx = pd.IndexSlice
# The averaging window is the same for every country, so build its bounds once.
first_year = str(predict_year - n_avg)
last_year = str(predict_year - 1)

with warnings.catch_warnings():
    # It's ok that we sometimes take the mean of an empty / all-NaN slice: it is
    # the nature of our data that a lot of it is missing. Only the RuntimeWarning
    # numpy emits for that case is silenced, so other warnings stay visible.
    warnings.simplefilter("ignore", category=RuntimeWarning)
    for country in countries_in_data:
        # Use the configured `target` column rather than a repeated literal so
        # the values read always match the column being written.
        recent_values = data_imputed.loc[idx[country, first_year:last_year], target].values
        naive_predictions.loc[country, target] = np.nanmean(recent_values)

### Evaluate Naive Predictor


If there is no actual value for the country then we should exclude this country from the score. 

If there is an actual value for the country but we haven't predicted one, this should register as a poor score.

In [54]:
# Root-mean-squared error of the naive predictions against the true targets.
# sklearn's signature is mean_squared_error(y_true, y_pred), so the targets go
# first (the value is the same either way, since squared error is symmetric).
# NOTE(review): the two frames are compared row by row by position, not by
# index label -- confirm both list the same countries in the same order.
mse = mean_squared_error(data_test_targets, naive_predictions)
rmse = np.sqrt(mse)
# The printed number is the square root of the MSE, so label it as RMSE.
print("RMSE for Naive predictor:", rmse)

MSE for Naive predictor: 4.771564871748177


In [19]:
# Score the naive predictions with the project's per-country evaluation helper,
# which returns three values that are unpacked below.
# NOTE(review): `ignore` is not defined in any cell visible in this notebook;
# it must be defined before this cell for a fresh-kernel "Run All" to succeed
# -- confirm its intended value against evaluate.mse_countries.
(
    mse_result,
    no_true_value,
    countries_not_predicted,
) = evaluate.mse_countries(
    data_test_targets,
    naive_predictions,
    ignore,
)