In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression


from matplotlib import pyplot as plt
import seaborn as sns
import missingno as msno

import sys
sys.path.append('..')
from utils import preprocess, missing, evaluate

### Global Variables and Load Data

In [2]:
target = 'SI.POV.DDAY'
predict_year=2010
#percent of input Indicators to use (set to 100 for full set of input features)
percent = 50

In [3]:
#Load the data from disk
input_dir = '.\\..\\data\\'
data_input = "cleaned_data.pkl"
data = pd.read_pickle(input_dir + data_input)

#Possible subset of data choosen to reduce calulation time
#For percetages less than 100% we try to choose a subset that represents the spread of variables

if percent == 100:
    pass
else: 
    num_indicators_original = data.shape[1]
    step = int(100/percent)
    data_new = data.iloc[:,::step].copy()
    #Add the target column if not already included
    if target not in data_new.columns:
        data_new[target] = data[target]
    data = data_new
    
print(data.shape[1], "indicators included")

411 indicators included


### Break data into windows

In [4]:
%time data_regressors, data_targets = \
        preprocess.window_data(data, lag=3,num_windows=10, step=1, predict_year=2010, \
                         target=target, impute_type='interpolation')

Wall time: 3min 35s


In [5]:
#Break up into training and testing data.

idx = pd.IndexSlice
data_train_regressors = data_regressors.loc[idx[:,2:10],:]
data_train_targets = data_targets.loc[idx[:,2:10],:]
data_test_regressors = data_regressors.loc[idx[:,1],:]
data_test_targets= data_targets.loc[idx[:,1],:]

In [6]:
#For Training, only consider windows that don't have a missing target as they offer nothing to training
#Therefore, remove those observations from both the training regressors and targets datasets.
data_train_regressors_subset = data_train_regressors[~np.isnan(list(data_train_targets.values.flatten()))]
data_train_targets_subset = data_train_targets[~np.isnan(list(data_train_targets.values.flatten()))]

#For testing, also remove windows with no target variable as it is impossible to measure preformance.
data_test_regressors_subset = data_test_regressors[~np.isnan(list(data_test_targets.values.flatten()))]
data_test_targets_subset = data_test_targets[~np.isnan(list(data_test_targets.values.flatten()))]

### Models

In [52]:
forest = RandomForestRegressor(n_estimators=100)
forest.fit( data_train_regressors_subset.values,data_train_targets_subset.values.ravel())
#Make predictions
predictions = forest.predict(data_test_regressors_subset.values) 

mse= mean_squared_error(data_test_targets_subset, predictions)
print("RMSE of random forest  is:", np.sqrt(mse))

RMSE of random forest  is: 4.89543775809411


In [53]:
gradboost = GradientBoostingRegressor(n_estimators=100)
gradboost.fit( data_train_regressors_subset.values,data_train_targets_subset.values.ravel())
#Make predictions
predictions = gradboost.predict(data_test_regressors_subset.values) 

mse= mean_squared_error(data_test_targets_subset, predictions)
print("RMSE of gradient boosting is:", np.sqrt(mse))

RMSE of gradient boosting is: 4.5152008405377995


#### Select K Best

In this section I consider using k best features according to some metric of measure to see if it gives any tangible gain in peroformance of the tree-based methods.

Using the F score to rank features did not result in much of an improvement. Some experimentation could be carried out on other metrics to mcompare featurs. There are selection available in [Scikit-Learn](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection)

In [61]:
kbest_data = data.copy()

k=50

In [62]:
#Some (if not all) of the feature selection algorithms dont deal with missing values
kbest_data = preprocess.impute_data_interpolation(kbest_data, predict_year-1, 'linear')

idx = pd.IndexSlice
kbest_y = kbest_data.loc[idx[:,'1972':'2009'],target]

kbest_X = kbest_data.loc[idx[:,'1972':'2009'],:]
kbest_X = kbest_X.drop(target, axis=1)

kbest_selector = SelectKBest(score_func=f_regression, k=k)
scores = kbest_selector.fit(kbest_X.values,kbest_y.values)

top_k_score = np.sort(scores.scores_)[-(k)]
map_of_top_k_scores = scores.scores_ > top_k_score

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  


In [63]:
small_data = data.copy()

In [64]:
#Reduce columns to just the k best
target_column = small_data[target]
small_data = small_data.drop(target,axis=1)

small_data = small_data.loc[:,map_of_top_k_scores]
small_data[target] = target_column.values

In [65]:
%time data_regressors, data_targets = \
        preprocess.window_data(small_data, lag=3,num_windows=10, step=1, predict_year=2010, \
                         target=target, impute_type='interpolation')

Wall time: 26.9 s


In [66]:
#Break up into training and testing data.

idx = pd.IndexSlice
data_train_regressors = data_regressors.loc[idx[:,2:10],:]
data_train_targets = data_targets.loc[idx[:,2:10],:]
data_test_regressors = data_regressors.loc[idx[:,1],:]
data_test_targets= data_targets.loc[idx[:,1],:]

#For Training, only consider windows that don't have a missing target as they offer nothing to training
#Therefore, remove those observations from both the training regressors and targets datasets.
data_train_regressors_subset = data_train_regressors[~np.isnan(list(data_train_targets.values.flatten()))]
data_train_targets_subset = data_train_targets[~np.isnan(list(data_train_targets.values.flatten()))]

#For testing, also remove windows with no target variable as it is impossible to measure preformance.
data_test_regressors_subset = data_test_regressors[~np.isnan(list(data_test_targets.values.flatten()))]
data_test_targets_subset = data_test_targets[~np.isnan(list(data_test_targets.values.flatten()))]

In [68]:
forest = RandomForestRegressor(n_estimators=100, min_samples_split=3)
forest.fit( data_train_regressors_subset.values,data_train_targets_subset.values.ravel())
#Make predictions
predictions = forest.predict(data_test_regressors_subset.values) 

mse= mean_squared_error(data_test_targets_subset, predictions)
print("MSE of random forest using subset of features is:", mse)

MSE of random forest using subset of features is: 26.0995689670536


In [69]:
gradboost = GradientBoostingRegressor(n_estimators=100)
gradboost.fit( data_train_regressors_subset.values,data_train_targets_subset.values.ravel())
#Make predictions
predictions = gradboost.predict(data_test_regressors_subset.values) 

mse= mean_squared_error(data_test_targets_subset, predictions)
print("MSE of gradient boosting using subset of features is:", mse)

MSE of gradient boosting using subset of features is: 26.94904740652549
