In [None]:
# regression model

In [None]:
# https://medium.com/analytics-vidhya/hyperparameter-tuning-hyperopt-bayesian-optimization-for-xgboost-and-neural-network-8aedf278a1c9#:~:text=HyperParameter%20Tuning%20%E2%80%94%20Hyperopt%20Bayesian%20Optimization%20for%20(Xgboost%20and%20Neural%20network),-Hyperparameters%3A%20These%20are&text=HYPEROPT%3A%20It%20is%20a%20powerful,TPE%20(Tree%20Parzen%20Estimators)

In [1]:
#------------------------------------------------------------------------------ 
# import packages
#------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import random
from datetime import date
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Classification imports
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score

#from hyperopt import fmin, hp, tpe, rand, Trials, STATUS_OK
import pickle

pd.options.display.max_columns = 50
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [2]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [3]:
import preprocess
import models
import visualization

preprocess_traintestsplit mod is imported into another module
train_predict_model_xgb_tpe mod is imported into another module
visualize mod is imported into another module


In [4]:
path_to_file = '../data/raw/'
df_labels = pd.read_csv(path_to_file + 'major_donor_labels.csv')
df_donations = pd.read_csv(path_to_file + 'donations.csv')
df_features = pd.read_csv(path_to_file + 'windfall_features.csv')

In [5]:
# clean df_labels
# rename and drop unnecessary columns
df_labels = df_labels.drop('Unnamed: 0', axis=1)
df_labels.set_axis(['candidate_id', 'ideal_donor'], axis=1, inplace=True)
df_labels.shape

(130114, 2)

In [6]:
df_donations.dropna(how='any', inplace=True)
df_donations.reset_index(drop=True, inplace=True)
df_donations.set_axis(['candidate_id', 'trans_date', 'amount'], axis=1, inplace=True)
df_donations['trans_date'] = pd.to_datetime(df_donations['trans_date']).copy()

In [7]:
# get original amount column for candidate_id's. Will use this to measure correlations of features to target.

# get "ideal donors" as specified by prompt 
df_donations_idealdonors = df_donations.loc[~(df_donations['amount'] < 0) & ((df_donations['trans_date'] < '2021-07-31') & (df_donations['trans_date'] > '2016-08-01')), ['candidate_id','trans_date','amount']]
df_donations_idealdonors['trans_date'].agg(['min','max'])

min   2016-08-03
max   2021-07-30
Name: trans_date, dtype: datetime64[ns]

In [8]:
df_donations_idealdonors_aggamount = df_donations_idealdonors.groupby('candidate_id')['amount'].sum().reset_index()
del df_donations_idealdonors

# create target label
fx = lambda x: (1 if x >= 20000 else 0)
df_donations_idealdonors_aggamount['target'] = df_donations_idealdonors_aggamount['amount'].apply(fx)

In [9]:
df_labels_idealdonors = df_labels.join(df_donations_idealdonors_aggamount.set_index('candidate_id'), on=['candidate_id'], how='left').fillna(0)
print('check = 0: %d' %(df_labels_idealdonors['ideal_donor'] != df_labels_idealdonors['target']).sum())
#df_labels_idealdonors.drop(columns=['ideal_donor','target'], inplace=True)

df_labels_idealdonors.drop(columns=['ideal_donor','amount'], inplace=True)

check = 0: 0


In [10]:
# independent variables
ind_features = [name for name in df_features.columns if name.find('Class') == -1 and name.find('Cause') == -1]
ind_features = ind_features[1:]
ind_features

['totalHouseholdDebt',
 'primaryPropertyLoanToValue',
 'primaryPropertyValue',
 'propertyCount',
 'NetWorth']

In [11]:
# preprocess and add feature engineered columns to datasets
df_features_addfeatures = preprocess.create_features_df_features(df_features, ind_features, ID='candidate_id')
df_donations_addfeatures = preprocess.create_features_df_donations(df_donations, PredictedOn='2016-08-01')

# join datasets
df_donorfeatures = df_features_addfeatures.join(df_donations_addfeatures.set_index('candidate_id'), on=['candidate_id'], how='inner')

# add col of random int to check baseline feature importance
df_donorfeatures['random_value'] = np.random.randint(0,100, size=len(df_donorfeatures))

# add scaled amount_prev by NetWorth
df_donorfeatures['amountscaled_prev360d3'] = df_donorfeatures['amount_prev360d3'] / df_donorfeatures['NetWorth']
df_donorfeatures['amountscaled_prev360d5'] = df_donorfeatures['amount_prev360d5'] / df_donorfeatures['NetWorth']

In [12]:
# join dependent variable
df_final = df_donorfeatures.join(df_labels_idealdonors.set_index('candidate_id'), on=['candidate_id'], how='left')

In [13]:
df_final.head()

Unnamed: 0,candidate_id,primaryPropertyValue,propertyCount,NetWorth,primaryPropertyLoanToValue_ideal,primaryPropertyValueToNetWorth_ratio,LoanAmount,amount_prev360d2,amount_prev360d3,amount_prev360d4,amount_prev360d5,count_trans_date_prev5y,random_value,amountscaled_prev360d3,amountscaled_prev360d5,target
0,candidate_0,2215000.0,4.0,14011369.0,1,0.158086,745082.0,2100.0,1800.01,4500.0,1100.0,2122.0,30,0.000128,7.9e-05,1.0
1,candidate_1,3650000.0,1.0,5812754.0,0,0.62793,3024625.0,0.0,0.0,0.0,0.0,0.0,8,0.0,0.0,1.0
2,candidate_2,625000.0,1.0,1060001.0,1,0.589622,1.0,0.0,0.0,0.0,0.0,0.0,12,0.0,0.0,0.0
3,candidate_3,903455.0,3.0,4237949.0,1,0.213182,26807.0,0.0,0.0,0.0,0.0,0.0,47,0.0,0.0,0.0
4,candidate_4,2608000.0,1.0,10013587.0,1,0.260446,1110278.0,0.0,0.0,0.0,0.0,0.0,50,0.0,0.0,0.0


In [14]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50610 entries, 0 to 50918
Data columns (total 16 columns):
candidate_id                            50610 non-null object
primaryPropertyValue                    50610 non-null float64
propertyCount                           50610 non-null float64
NetWorth                                50610 non-null float64
primaryPropertyLoanToValue_ideal        50610 non-null int64
primaryPropertyValueToNetWorth_ratio    50610 non-null float64
LoanAmount                              50610 non-null float64
amount_prev360d2                        50610 non-null float64
amount_prev360d3                        50610 non-null float64
amount_prev360d4                        50610 non-null float64
amount_prev360d5                        50610 non-null float64
count_trans_date_prev5y                 50610 non-null float64
random_value                            50610 non-null int64
amountscaled_prev360d3                  50610 non-null float64
amountscaled_p

In [15]:
def feature_target_split(df, cnames_to_drop=['ideal_donor', 'candidate_id'], target_col='ideal_donor'):
    '''
    Args:
        df (DataFrame): data to be analyzed
        cnames_to_drop (list): list of names to be dropped from df
        target_col (str): name of column of true values. 
    Returns:
        DataFrame of X values
        Series of y values
    ''' 
    # get feature names
    fset=[x for x in df if x not in cnames_to_drop]
    
    # Breaking up preprocessed data into predictor and target
    X=df[fset]
    y=df[target_col]
    
    print('Metadata about full dataset:')
    print('    number of members in full dataset: %d' % len(X))
    print('    number of features in full dataset = %d' % len(fset))
    print('    number of classes in full dataset : %d \n' %y.nunique())
    print('')
    
    return X, y

In [50]:
#--------------------------------------------------------------------------
# split data into train and test
#--------------------------------------------------------------------------

#target_col = 'amount'
#cnames_to_drop = ['amount', 'candidate_id']

target_col = 'target'
cnames_to_drop = ['target', 'candidate_id']
test_size = 0.30
random_state = 42

X, y = feature_target_split(df_final, cnames_to_drop=cnames_to_drop, target_col=target_col)

print('count per class: %d %d' %(y.value_counts()[0], y.value_counts()[1]))
#Xtrain, Xtest, ytrain, ytest = train_test_split(X.values, y.values, test_size=test_size, stratify=None, shuffle=True, random_state=random_state)

Metadata about full dataset:
    number of members in full dataset: 50610
    number of features in full dataset = 14
    number of classes in full dataset : 2 


count per class: 50225 385


In [17]:
print('number of target members in train: %d' %(ytrain>20000).astype('float').sum())
print('number of target members in test: %d' % (ytest>20000).astype('float').sum())

NameError: name 'ytrain' is not defined

In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
#from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import average_precision_score

In [20]:
#help(GradientBoostingClassifier)

In [71]:
#from scipy.stats import uniform #as sp_randFloat
#from scipy.stats import randint

In [72]:
%%time

random_state = 42
learning_rate = .1
n_estimators = 500
n_iter_no_change = 50
validation_fraction = 0.1 # fraction of the whole dataset that will be kept aside from training to assess the validation loss of the model.
tol = 0.02 # if the scores don't improve by at least 0.01 for the last X stages, stop fitting additional stages
verbose = 1

# configure the cross-validation procedure
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# enumerate splits
outer_results = list()
best_inner_params_list = []
for train_ix, test_ix in cv_outer.split(X.values, y.values):
    # split data
    X_train, X_test = X.values[train_ix, :], X.values[test_ix, :]
    y_train, y_test = y.values[train_ix], y.values[test_ix]
    
    # configure the cross-validation procedure
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    
    # define the model
    model = GradientBoostingClassifier(random_state=random_state, learning_rate=learning_rate, 
                                       n_estimators=n_estimators, n_iter_no_change=n_iter_no_change, 
                                       tol=tol, validation_fraction=validation_fraction, verbose=verbose)
    
    # define search space
    #params_fixed = {'learning_rate'    :[learning_rate],  
    #                'n_estimators'     :[n_estimators],  
    #                'random_state'     :[random_state],
    #                'n_iter_no_change ':[n_iter_no_change],
    #                'verbose'          :[1]
    #               } 
        
    param_space = {'max_depth'        : np.linspace(5, 20, 16, endpoint=True),
                   'subsample'        : np.linspace(.4, 1, 7, endpoint=True),
                   #'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
                   #'min_samples_leaf' : np.linspace(0.1, 1.0, 10, endpoint=True),
                   'max_features'     : list(range(1,X.shape[1]+1))
                  }
    #param_space.update(params_fixed)
    
    # define search
    scoring = 'average_precision' # 'neg_mean_squared_error'
    search = RandomizedSearchCV(model, param_distributions=param_space, scoring=scoring, n_jobs=-1, 
                                cv=cv_inner, refit=True, verbose=0)
    
    # execute search
    print('execute search')
    result = search.fit(X_train, y_train)
    
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    score = average_precision_score(y_test, yhat)
    
    # store the result
    outer_results.append(score)
    best_inner_params_list.append(result.best_params_)
    # report progress
    print('>outer_test=%.3f, inner_test=%.3f, cfg=%s' % (score, result.best_score_, result.best_params_))

# summarize the estimated performance of the model
print('mean estimated performance: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

execute search
      Iter       Train Loss   Remaining Time 
         1           0.0208            2.17m
         2           0.0127            1.47m
         3           0.0115            1.30m
         4           0.0104            1.18m
         5           0.0094            1.10m
         6           0.0085            1.05m
         7           0.0077           59.91s
         8           0.0070           58.09s
         9           0.0063           56.98s
        10           0.0057           56.37s
        20           0.0022           53.17s
        30           0.0009           47.73s
        40           0.0004           45.16s
        50           0.0002           44.73s
>outer_test=0.380, inner_test=0.628, cfg={'subsample': 1.0, 'max_features': 10, 'max_depth': 19.0}
execute search
      Iter       Train Loss   Remaining Time 
         1           0.0280            1.61m
         2           0.0148            1.31m
         3           0.0119            1.21m
         4    

In [75]:
best_inner_params_list[4]

{'subsample': 1.0, 'max_features': 7, 'max_depth': 11.0}

In [74]:
outer_results

[0.3795081081786021,
 0.2603938897324301,
 0.34997575039068807,
 0.48164101073917764,
 0.44698737542374284]

In [54]:
best_model

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=10.0,
                           max_features=13, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=100, presort='deprecated',
                           random_state=42, subsample=0.8999999999999999,
                           tol=0.01, validation_fraction=0.1, verbose=1,
                           warm_start=False)

In [76]:
xgb_model = GridSearchCV(best_model, param_grid=best_inner_params_list[4], scoring=scoring, n_jobs=-1, cv=cv_inner, refit=True, verbose=0)
xgb_model.fit(X_train, y_train)

# Use the fitted model to check training error
ytr_p = xgb_model.predict(X_train)
train_score = average_precision_score(y_train, ytr_p)

# Use the fitted model to check test error
yte_p = xgb_model.predict(X_test) 
test_score = average_precision_score(y_test, yte_p) #mse 

print('Test errors/scores of the tuned model:')
print('mean estimated performance: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

print('Training and test errors/scores of the fitted model:')
print('    score on training set (training error of tuned model) = %.8f' % train_score)
print('    score on test set (test error of tuned model)         = %.8f' % test_score)
print('')

ValueError: Parameter values for parameter (subsample) need to be a sequence(but not a string) or np.ndarray.

In [28]:
%%time
# configure the cross-validation procedure
cv_outer = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()

for train_ix, test_ix in cv_outer.split(X.values, y.values):
    # split data
    X_train, X_test = X.values[train_ix, :], X.values[test_ix, :]
    y_train, y_test = y.values[train_ix], y.values[test_ix]
    # configure the cross-validation procedure
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    # define the model
    model = GradientBoostingClassifier(random_state=1)
    # define search space
    parameters = {'learning_rate': [.1,.2,.3], #[0.01,0.02,0.03,0.04],
                  #'subsample'    : [0.9], #0.5, 0.2, 0.1],
                  'n_estimators' : [10,100,200], #,1000, 1500],
                  #'max_depth'    : [4,6], #,8,10]
                 }
    # define search
    scoring = 'average_precision' # 'neg_mean_squared_error'
    search = GridSearchCV(model, param_grid=parameters, scoring=scoring, n_jobs=-1, cv=cv_inner, refit=True, verbose=0)
    # execute search
    result = search.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    # evaluate the model
    #score = mean_squared_error(y_test, yhat)
    score = average_precision_score(y_test, yhat)
    # store the result
    outer_results.append(score)
    # report progress
    print('>outer_test=%.3f, inner_test=%.3f, cfg=%s' % (score, result.best_score_, result.best_params_))

# summarize the estimated performance of the model
print('mean estimated performance: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>outer_test=0.385, inner_test=0.487, cfg={'learning_rate': 0.1, 'n_estimators': 10}
>outer_test=0.236, inner_test=0.507, cfg={'learning_rate': 0.1, 'n_estimators': 10}
>outer_test=0.267, inner_test=0.487, cfg={'learning_rate': 0.1, 'n_estimators': 10}
>outer_test=0.255, inner_test=0.534, cfg={'learning_rate': 0.1, 'n_estimators': 10}
mean estimated performance: 0.286 (0.058)
CPU times: user 3.57 s, sys: 54.2 ms, total: 3.62 s
Wall time: 1min 49s


In [39]:
xgb_model = GridSearchCV(best_model, param_grid=result.best_params_, scoring=scoring, n_jobs=-1, cv=cv_inner, refit=True, verbose=0)
xgb_model.fit(X_train, y_train)

# Use the fitted model to check training error
ytr_p = xgb_model.predict(X_train)
train_score = average_precision_score(y_train, ytr_p)

# Use the fitted model to check test error
yte_p = xgb_model.predict(X_test) 
test_score = average_precision_score(y_test, yte_p) #mse 

print('Test errors/scores of the tuned model:')
print('mean estimated performance: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

print('Training and test errors/scores of the fitted model:')
print('    score on training set (training error of tuned model) = %.8f' % train_score)
print('    score on test set (test error of tuned model)         = %.8f' % test_score)
print('')

Training and test errors/scores of the fitted model:
    score on training set (training error of tuned model) = 0.42502365
    score on test set (test error of tuned model)         = 0.25484604



In [40]:
xgb_model.cv_results_

{'mean_fit_time': array([ 0.59126314,  5.83333588, 11.57723482,  0.58296092,  5.8234024 ,
        11.61663119,  0.6504883 ,  5.78328196, 11.23645409]),
 'std_fit_time': array([0.00764116, 0.00458663, 0.08935957, 0.00578645, 0.09298026,
        0.02168486, 0.06112856, 0.02091556, 0.14044283]),
 'mean_score_time': array([0.00538349, 0.01640201, 0.02796443, 0.00508531, 0.01494487,
        0.02603571, 0.00583172, 0.01539771, 0.02513051]),
 'std_score_time': array([8.71975550e-05, 5.73674671e-04, 1.82076742e-03, 1.32636798e-04,
        3.23495113e-04, 1.51055130e-03, 9.50139309e-04, 8.19477093e-04,
        1.46300410e-03]),
 'param_learning_rate': masked_array(data=[0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[10, 100, 200, 10, 100, 200, 10, 100, 200],
              mask=[False, False, False, Fa

In [41]:
xgb_model.best_params_

{'learning_rate': 0.1, 'n_estimators': 10}

In [64]:
%%time
# configure the cross-validation procedure
cv_outer = KFold(n_splits=4, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in cv_outer.split(X.values):
    # split data
    X_train, X_test = X.values[train_ix, :], X.values[test_ix, :]
    y_train, y_test = y.values[train_ix], y.values[test_ix]
    # configure the cross-validation procedure
    cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
    # define the model
    model = GradientBoostingRegressor(random_state=1)
    # define search space
    parameters = {'learning_rate': [.1,.2,.3], #[0.01,0.02,0.03,0.04],
                  #'subsample'    : [0.9], #0.5, 0.2, 0.1],
                  'n_estimators' : [10,100,200], #,1000, 1500],
                  #'max_depth'    : [4,6], #,8,10]
                 }
    # define search
    scoring = 'neg_mean_squared_error'
    search = GridSearchCV(model, param_grid=parameters, scoring=scoring, n_jobs=-1, cv=cv_inner, refit=True, verbose=0)
    # execute search
    result = search.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    # evaluate the model
    #score = mean_squared_error(y_test, yhat)
    score = average_precision_score(y_test, yhat)
    # store the result
    outer_results.append(score)
    # report progress
    print('>score_test=%.3f, score_est=%.3f, cfg=%s' % (score, result.best_score_, result.best_params_))

# summarize the estimated performance of the model
print('mean estimated performance: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>mse=0.638, est=0.590, cfg={'learning_rate': 0.2, 'n_estimators': 200}
>mse=0.637, est=0.648, cfg={'learning_rate': 0.2, 'n_estimators': 200}
>mse=0.689, est=0.612, cfg={'learning_rate': 0.3, 'n_estimators': 200}
>mse=0.653, est=0.602, cfg={'learning_rate': 0.1, 'n_estimators': 200}
neg_mean_squared_error: 0.654 (0.021)
CPU times: user 54.9 s, sys: 18.9 ms, total: 55 s
Wall time: 2min 29s


In [65]:
y_train.shape

(37958,)

In [66]:
ytrain.shape

(35427,)

In [40]:
xgb_model = GridSearchCV(model, param_grid=parameters, scoring='neg_mean_squared_error', n_jobs=-1, cv=cv_inner, refit=True, verbose=0)
xgb_model.fit(Xtrain, ytrain)

# Use the fitted model to check training error
ytr_p = xgb_model.predict(Xtrain)
train_score = mean_squared_error(ytrain, ytr_p)

# Use the fitted model to check test error
yte_p = xgb_model.predict(Xtest) 
test_score = mean_squared_error(ytest, yte_p) #mse 
print('Training and test errors/scores of the fitted model:')
print('    score on training set (training error of tuned model) = %.8f' % train_score)
print('    score on test set (test error of tuned model)         = %.8f' % test_score)
print('')

Training and test errors/scores of the fitted model:
    score on training set (training error of tuned model) = 606657075.47324610
    score on test set (test error of tuned model)         = 10548878589.98448753



In [28]:
sorted(outer_results)

[275978237.6719656, 279222869.1453383, 1702575340.9956436, 12235885515.391294]

In [27]:
dict(enumerate(outer_results))

{0: 1702575340.9956436,
 1: 275978237.6719656,
 2: 279222869.1453383,
 3: 12235885515.391294}

In [37]:
best_model.grid_scores_

AttributeError: 'GradientBoostingRegressor' object has no attribute 'grid_scores_'

In [None]:
ytr_p = xgb_model.predict(data = xgb.DMatrix(data=Xtrain.sort_index().values)) 
train_score = average_precision_score(ytrain.sort_index().values, ytr_p)  #roc_auc_score

# Use the fitted model to check test error
yte_p = xgb_model.predict_proba(Xtest.sort_index().values) #yhat
test_score = average_precision_score(ytest.sort_index().values, yte_p) #mse 
print('Training and test errors/scores of the fitted model:')
print('    score on training set (training error of tuned model) = %.8f' % train_score)
print('    score on test set (test error of tuned model)         = %.8f' % test_score)
print('')

In [112]:
best_model

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.2, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=1, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
param_bst

In [116]:
result.best_estimator_

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.2, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=1, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [38]:
result.best_params_

{'learning_rate': 0.1, 'n_estimators': 10}

In [39]:
result.cv_results_

{'mean_fit_time': array([ 0.52153858,  5.26300017, 10.58611965,  0.55092994,  5.26175094,
        10.49052215,  0.53461019,  5.2285769 , 10.07590501]),
 'std_fit_time': array([0.00211097, 0.01611831, 0.05989154, 0.02010799, 0.03768722,
        0.05657721, 0.01564212, 0.04863082, 0.20883988]),
 'mean_score_time': array([0.00233229, 0.01184551, 0.02213812, 0.00228357, 0.01141667,
        0.02137804, 0.00228087, 0.01139681, 0.01799226]),
 'std_score_time': array([7.58081940e-05, 1.90303268e-04, 3.30895957e-04, 5.84966677e-05,
        1.21579606e-04, 4.38652737e-04, 8.66255629e-05, 4.70253892e-04,
        8.81935579e-04]),
 'param_learning_rate': masked_array(data=[0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[10, 100, 200, 10, 100, 200, 10, 100, 200],
              mask=[False, False, False, Fa

In [96]:
result.cv_results_['mean_test_score']

array([-4.02632127e+09, -4.03794576e+09, -4.11614205e+09, -4.11641480e+09])

In [88]:
yhat

array([129.53761254, 129.53761254, 129.53761254, ..., 129.53761254,
       129.53761254, 129.53761254])

{0: 3357410081.3025613,
 1: 493778548.8933155,
 2: 97533439.17972018,
 3: 167438982.33600983,
 4: 740640387.6378227,
 5: 745941528.0623621,
 6: 145638275.0249669,
 7: 298658396.5743813,
 8: 27838745789.887886,
 9: 614994866.0176597}

In [None]:
%%time
#==========================================================================
# Keep the tuned parameters, reduce step size (eta), 
# and determine the number of trees (best_n_round).
# Train a model on Xtr with new eta and best_n_round, and test on Xte.
#==========================================================================
# prepare parameters
# parameters needed for boosters
small_eta = 0.1
print('reduce eta to %f and search for optimal boost rounds ......' % small_eta)

# booster parameters:
param_bst = {'booster'                : 'gbtree',     
             'verbosity'              : 1,            
             'objective'              : 'binary:logistic', 
             'eta'                    : small_eta,          
             'max_depth'              : tuned_params['max_depth'],           
             'min_child_weight'       : tuned_params['min_child_weight'],            
             'subsample'              : tuned_params['subsample'],          
             'colsample_bytree'       : tuned_params['colsample_bytree'],          
             'scale_pos_weight'       : tuned_params['scale_pos_weight'],
             'gamma'                  : tuned_params['gamma'],           
             'lambda'                 : tuned_params['lambda'],           
             'alpha'                  : 0,         
             'seed'                   : 0,
             'eval_metric'            : ['aucpr']
             }

# parameters needed for xgb.cv():
param_cv = {'kfold'       : 5, 
            'n_round'     : 10,#5000, 
            'n_earlystop' : 10,
            'seed_cv'     : seed_tuning, 
            'eval_metric' : ['aucpr']}

# use the tuned parameters, reduce eta, use xgb.cv on Xtr, ytr to determine
# the number of boosters
dtrain = xgb.DMatrix(data=Xtrain.sort_index().values, label=ytrain.sort_index().values)

#calibrated = CalibratedClassifierCV(xgb, method='isotonic', cv=5)
#calibrated.fit(Xtrain, ytrain)

print('start running xgb.cv ......\n')
cv_result = xgb.cv(params                = param_bst,  # a dictionary containing booster parameters 
                   dtrain                = dtrain, # DMatrix type
                   num_boost_round       = param_cv['n_round'],      # max number of boosting iterations
                   early_stopping_rounds = param_cv['n_earlystop'],  # stop if evaluation score is not improved after early_stopping_rounds of iterations
                   nfold                 = param_cv['kfold'],        # number of folds for cv
                   metrics               = param_cv['eval_metric'],  # 'mae', 'tweedie-nloglik@'+str(tweedie_varp), 
                   seed                  = param_cv['seed_cv'],      # seed to generate cv folds
                   shuffle               = True,
                   verbose_eval          = True)
best_n_round = cv_result.shape[0] 
print('CV result:')
print('    Best result obtained at n_round = %3d' % best_n_round)
print('    averaged score on training folds = %.6f (std dev = %.6f)' % (cv_result.iloc[-1,0], cv_result.iloc[-1,1]))
print('    averaged score on test folds     = %.6f (std dev = %.6f)' % (cv_result.iloc[-1,2], cv_result.iloc[-1,3]))
print('')

# train a model on Xtr, ytr  
print('Start training a model on Xtr ......')

xgb_model = xgb.train(params                = param_bst, 
                      dtrain                = dtrain, 
                      num_boost_round       = best_n_round, 
                      early_stopping_rounds = None)

# calibrate the fit
#calibrated = CalibratedClassifierCV(xgb_model, method='isotonic', cv=3)
#calibrated.fit(dtrain)
#ytr_p = calibrated.predict_proba(Xtrain.sort_index().values)
#yte_p = xgb_model.predict(data = xgb.DMatrix(data=Xtest.sort_index().values))

# Use the fitted model to check training error
ytr_p = xgb_model.predict(data = xgb.DMatrix(data=Xtrain.sort_index().values)) 
train_score = average_precision_score(ytrain.sort_index().values, ytr_p)  #roc_auc_score

# Use the fitted model to check test error
yte_p = calibrated.predict_proba(Xtest.sort_index().values)
test_score = average_precision_score(ytest.sort_index().values, yte_p) #roc_auc_score
print('Training and test errors/scores of the fitted model:')
print('    score on training set (training error of tuned model) = %.8f' % train_score)
print('    score on test set (test error of tuned model)         = %.8f' % test_score)
print('')