In [15]:
%load_ext jupyternotify
import numpy as np
import pandas as pd
import matplotlib
import math
from sklearn import model_selection, ensemble, metrics, linear_model, preprocessing, pipeline
from matplotlib import pyplot as plt
%matplotlib inline

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [2]:
data = pd.read_csv('dataset/undp_train.csv', na_values='nan')

In [6]:
test = pd.read_csv('dataset/undp_test.csv', na_values='nan')

In [83]:
numeric_data_indices = list(range(len(data.columns)-1))
numeric_data_indices.remove(1)

In [93]:
data_no_world = data[data['country'] != 'World']

In [94]:
# FOR ONE HOT ENCODING
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score as CVS

def RMSE(x,y):
    return MSE(x,y)**0.5

# Pipeline
pipe = pipeline.Pipeline(steps = [       
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [        
            #numeric
            ('numeric_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_data_indices])),
                ('imputing', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
                ('scaling', preprocessing.StandardScaler())            
                        ])),
        
            #categorical
            ('categorical_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, 1].reshape(-1, 1))),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown = 'ignore'))            
                        ])),
        ]))
    ]
)

# shortcut for scorer
nrmse = 'neg_root_mean_squared_error'

X = data_no_world.iloc[:, 0:-1].values
y = data_no_world.iloc[:, -1].values

X = pipe.fit_transform(X)

# split
(X_train, X_test, y_train,
 y_test) = model_selection.train_test_split(X, y,
          test_size = 0.2, shuffle=True, random_state=777)

In [95]:
#XGBoost WITHOUT target transform
import xgboost as xgb
XGB = xgb.XGBRegressor(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=1500,
    verbosity=1,
    silent=None,
    objective='reg:squarederror',
    booster='gbtree',
    n_jobs=3,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=0.3,
    reg_alpha=0,
    reg_lambda=0.11,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=42,
    seed=None,
    missing=None,
    importance_type='gain')
XGB.fit(X_train, y_train)

print(MSE(y_train, XGB.predict(X_train))**0.5)
print(MSE(y_test, XGB.predict(X_test))**0.5)

0.000944783712376604
0.007413320735632963


In [87]:
cvs = -CVS(XGB, X_train,y_train, scoring=nrmse, cv=5)
print(cvs.mean(), 'CV score 5-fold of XGB on train')

0.008529225967588486 CV score 5-fold of XGB on train


In [78]:
#Let's use RandomForest
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=500,
                              criterion='mse',
                              max_depth=None,
                              min_samples_split=2,
                              min_samples_leaf=1,
#                              max_features=26,
                              max_features=86,
                              n_jobs=3)

print(-CVS(forest, X_train,y_train, scoring=nrmse, cv=5).mean(), 'CV score 5-fold of RandomForest on train')
forest.fit(X_train, y_train)
print(RMSE(y_test, forest.predict(X_test)))

0.014048396146600828 CV score 5-fold of RandomForest on train
0.008773215580983748
