In [1]:
import warnings

import pandas as pd
import numpy as np

import sys
sys.path.append('..')
from utils import preprocess, missing, evaluate

In [2]:
input_dir = '.\\..\\data\\'
data_input = "cleaned_data.pkl"
data = pd.read_pickle(input_dir + data_input)

In [3]:
predict_year = 2010
target = 'SI.POV.DDAY'
n_avg = 3

### Missing Value Imputation

In [12]:
#get rid of EN.POP.SLUM.UR.ZS and SI.SPR.PC40 since they have so many missing values
#This needs to be removed from both the train and test regressors file
remove = ['EN.POP.SLUM.UR.ZS', 'SI.SPR.PC40']
data = data.drop(remove, axis=1)

#Missing value imputation using forward fill folled by backfill 
missing.perform_ffill_bfill(data, predict_year, target)
    
#Set remaining missing values to 0. (This should be replaced by mean imputation ASAP)

#First, for all columns except the target 
all_cols_except_target = list(data.columns.values)
all_cols_except_target.remove(target)

data.loc[:,all_cols_except_target] = data.loc[:,all_cols_except_target].fillna(0)
#Then, for the target just cover years up to the year before our target year.  
idx = pd.IndexSlice
data.loc[idx[:, :str(predict_year-1)], target] = data.loc[idx[:, :str(predict_year-1)], target].fillna(0)

### Obtaining the targets

Here we read in the cleaned input data and apply the windowing method to get the targets (see the Linear regression file for more details on the windowing method.

In [13]:
_,_,_, data_test_targets =   preprocess.window_data2(data,predict_year=2010, target=target)

### Naive Predictor

Take the average of the last 'n_avg' values of the target variable

In [14]:
#Create an empty output dataframe
countries_in_data = list(data.index.levels[0]) 
naive_predictions = pd.DataFrame(index=countries_in_data, columns=[target])

In [15]:
for country in countries_in_data:
    with warnings.catch_warnings():
        # it's ok that we are getting the mean of empty arrays sometimes. It's the nature of..
        #our data that there is a lot of missing data.
        warnings.simplefilter("ignore") 
        naive_predictions.loc[country] = np.nanmean(data.loc[(country,str(predict_year-n_avg-1)):(country,str(predict_year-1)), 'SI.POV.DDAY' ].values)

In [16]:
naive_predictions.head(10)

Unnamed: 0,SI.POV.DDAY
Afghanistan,0.0
Albania,0.75
Algeria,5.8
American Samoa,0.0
Andorra,0.0
Angola,31.2
Antigua and Barbuda,0.0
Argentina,2.85
Armenia,2.325
Aruba,0.0


### Evaluate Naive Predictor


If there is no actual value for the country then we should exclude this country from the score. 

If there is a actual value for the country but we haven't predicted one this should register as a poor score.

In [19]:
ignore = ['Vanuatu','Tuvalu']
mse_result,no_true_value,countries_not_predicted = evaluate.mse_countries(data_test_targets, naive_predictions, ignore)

In [20]:
print("MSE for Naive predictor:", mse_result)
print("Number of coutries with true value but that had no prediction:", len(countries_not_predicted) )

MSE for Naive predictor: 21.86175986842105
Number of coutries with true value but that had no prediction: 2
