In [1]:
#------------------------------------------------------------------------------ 
# import packages
#------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import random
from datetime import date
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
from hyperopt import fmin, hp, tpe, rand, Trials, STATUS_OK
import pickle

pd.options.display.max_columns = 50
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [2]:
import preprocess
import models
import visualization

preprocess_traintestsplit mod is imported into another module
train_predict_model_xgb_tpe mod is imported into another module
visualize mod is imported into another module


In [3]:
#help(models)

In [4]:
#help(preprocess)

In [5]:
# Load the three raw CSV inputs from the raw-data directory.
# The generator is unpacked in the same order as the file names are listed:
# labels, donations, features.
path_to_file = '../data/raw/'
df_labels, df_donations, df_features = (
    pd.read_csv(path_to_file + csv_name)
    for csv_name in ('major_donor_labels.csv', 'donations.csv', 'windfall_features.csv')
)

In [6]:
# clean df_labels
# Drop the 'Unnamed: 0' column (presumably a leftover index column from the
# CSV export -- verify against the raw file) and rename the two remaining
# columns to their working names.
# errors='ignore' keeps this cell idempotent: it overwrites df_labels, so on a
# second run the column is already gone and a plain drop() would raise KeyError.
df_labels = df_labels.drop('Unnamed: 0', axis=1, errors='ignore')
# set_axis still raises if the frame does not have exactly two columns, so an
# unexpected input layout is not silently accepted.
df_labels = df_labels.set_axis(['candidate_id', 'ideal_donor'], axis=1)
df_labels.shape

(130114, 2)

In [7]:
# independent variables
# Keep every df_features column whose name contains neither 'Class' nor
# 'Cause', then discard the first remaining column
# (presumably the candidate id column -- verify against the CSV header).
ind_features = [
    column
    for column in df_features.columns
    if 'Class' not in column and 'Cause' not in column
][1:]
ind_features

['totalHouseholdDebt',
 'primaryPropertyLoanToValue',
 'primaryPropertyValue',
 'propertyCount',
 'NetWorth']

In [8]:
# preprocess and add feature engineered columns to datasets
df_features_addfeatures = preprocess.create_features_df_features(df_features, ind_features, ID='candidate_id')
df_donations_addfeatures = preprocess.create_features_df_donations(df_donations, PredictedOn='2016-08-01')

# join datasets
# inner join on candidate_id: only candidates present in both frames are kept
df_donorfeatures = df_features_addfeatures.join(
    df_donations_addfeatures.set_index('candidate_id'),
    on=['candidate_id'],
    how='inner',
)

# add col of random int to check baseline feature importance
# The column is drawn from a dedicated, seeded generator so that the feature
# matrix (and therefore the tuned model) is reproducible across kernel
# restarts; the previous unseeded np.random.randint call produced a different
# column on every run.
RANDOM_FEATURE_SEED = 42
df_donorfeatures['random_value'] = np.random.RandomState(RANDOM_FEATURE_SEED).randint(
    0, 100, size=len(df_donorfeatures)
)

# add scaled amount_prev by NetWorth
# NOTE(review): a NetWorth of 0 would yield inf/NaN here -- confirm the
# preprocessing guarantees a non-zero NetWorth.
df_donorfeatures['amountscaled_prev360d3'] = df_donorfeatures['amount_prev360d3'] / df_donorfeatures['NetWorth']
df_donorfeatures['amountscaled_prev360d5'] = df_donorfeatures['amount_prev360d5'] / df_donorfeatures['NetWorth']

# join dependent variable
# left join: every row of df_donorfeatures is kept; a candidate without a
# matching label row would get a missing ideal_donor value
df_final = df_donorfeatures.set_index('candidate_id').join(df_labels.set_index('candidate_id'), how='left').reset_index()

In [9]:
# Preview the first five rows of df_final as a quick sanity check.
df_final.head()

Unnamed: 0,candidate_id,primaryPropertyValue,propertyCount,NetWorth,primaryPropertyLoanToValue_ideal,primaryPropertyValueToNetWorth_ratio,LoanAmount,amount_prev360d2,amount_prev360d3,amount_prev360d4,amount_prev360d5,count_trans_date_prev5y,random_value,amountscaled_prev360d3,amountscaled_prev360d5,ideal_donor
0,candidate_0,2215000.0,4.0,14011369.0,1,0.158086,745082.0,2100.0,1800.01,4500.0,1100.0,2122.0,13,0.000128,7.9e-05,1
1,candidate_1,3650000.0,1.0,5812754.0,0,0.62793,3024625.0,0.0,0.0,0.0,0.0,0.0,99,0.0,0.0,1
2,candidate_2,625000.0,1.0,1060001.0,1,0.589622,1.0,0.0,0.0,0.0,0.0,0.0,44,0.0,0.0,0
3,candidate_3,903455.0,3.0,4237949.0,1,0.213182,26807.0,0.0,0.0,0.0,0.0,0.0,22,0.0,0.0,0
4,candidate_4,2608000.0,1.0,10013587.0,1,0.260446,1110278.0,0.0,0.0,0.0,0.0,0.0,61,0.0,0.0,0


In [10]:
#--------------------------------------------------------------------------
# split data into train and test
#--------------------------------------------------------------------------

# Split configuration. The target column is also listed in cnames_to_drop,
# together with the id column.
target_col = 'ideal_donor'
cnames_to_drop = [target_col, 'candidate_id']
test_size = 0.30
random_state = 42

# Pass the configured random_state through instead of repeating the literal,
# so changing the value above actually changes the split.
# NOTE(review): the helper name suggests a stratified split on target_col --
# confirm in preprocess.train_test_stratifysplit.
Xtrain, Xtest, ytrain, ytest = preprocess.train_test_stratifysplit(
    df_final,
    cnames_to_drop=cnames_to_drop,
    target_col=target_col,
    test_size=test_size,
    random_state=random_state,
)

Metadata about full dataset:
    number of members in full dataset: 50610
    number of features in full dataset = 14
    number of classes in full dataset : 2 


Metadata about train and test features:
    size of training feature (Xtr): 35427 x 14
    size of test feature (Xte)    : 15183 x 14

Metadata about target:
    size of training target (ytr) : 35427
    size of test target (yte)     : 15183
    training target: neg/pos = 130.21
    test target    : neg/pos = 131.03

SIZE full dataset: positive class = (385); negative class = (50225)
PROPORTION full dataset: positive class = (0.008); negative class = (0.992) 

SIZE train: positive class = (270); negative class = (35157)
PROPORTION train: train positive (0.008); train negative (0.992) class

SIZE test: positive class = (115); negative class = (15068)
PROPORTION test: test  positive (0.008); test negative (0.992) class


In [11]:
#--------------------------------------------------------------------------
# candidate_id values of the test rows, kept as a Series on df_final's index.
# They are used later to label the saved predictions.
# Rows are selected by index membership, which assumes Xtest still carries
# df_final's index labels -- verify against the split helper.
#--------------------------------------------------------------------------
ID_test = df_final.loc[df_final.index.isin(Xtest.index), 'candidate_id']

In [12]:
%%time
# Xtrain, ytrain, Xtest and ytest are available at this point.
# This cell tunes, trains and tests the model:
#
#==========================================================================
# tuning (cv on Xtrain, ytrain) ==> parameters
# training (on Xtrain, ytrain)  ==> model
# testing (on Xtest, ytest)     ==> score of the model
#==========================================================================

# cross-validation splitter handed to the tuning step
num_fold_tuning = 5
seed_tuning = 2021
skf_tuning = StratifiedKFold(
    n_splits=num_fold_tuning,
    shuffle=True,
    random_state=seed_tuning,
)

print('Tune and train a model on Xtrain ......')
print('Make predictions on Xtest ......')
# All frames are ordered by index before being converted to plain arrays, so
# the returned predictions line up positionally with ytest.sort_index() below.
# opt_method: tpe.suggest is hyperopt's tree of Parzen estimators search;
# rand.suggest would switch to random search.
yte_pred, tuning_cv_score, tuned_params, model = models.xgb_pipeline(
    Xtrain=Xtrain.sort_index().values,
    ytrain=ytrain.sort_index().values,
    Xtest=Xtest.sort_index().values,
    cv=skf_tuning,
    opt_method=tpe.suggest,
)

print('Test predicted results on Xtest ......')
# Average precision is the reported test metric; for ROC AUC use
# roc_auc_score(ytest, yte_pred) instead.
test_score = average_precision_score(ytest.sort_index(), yte_pred)

print('    best tuning cv score on the training set (Xtrain, ytrain) = %.8f' % tuning_cv_score)
print('    test score on the test set (Xtest, ytest)                 = %.8f' % test_score)

print('tuning, training and test done\n')

Tune and train a model on Xtrain ......
Make predictions on Xtest ......
[0]	train-aucpr:0.20978+0.01717	test-aucpr:0.20672+0.04413
[1]	train-aucpr:0.23698+0.02740	test-aucpr:0.24009+0.04354
[2]	train-aucpr:0.25340+0.04173	test-aucpr:0.25031+0.05310
[3]	train-aucpr:0.28097+0.03524	test-aucpr:0.27019+0.04797
[4]	train-aucpr:0.28725+0.04070	test-aucpr:0.28238+0.05619
[5]	train-aucpr:0.28769+0.04177	test-aucpr:0.28460+0.05751
[6]	train-aucpr:0.28827+0.04247	test-aucpr:0.28510+0.05748
[7]	train-aucpr:0.28897+0.04258	test-aucpr:0.28540+0.05810
[8]	train-aucpr:0.29400+0.04906	test-aucpr:0.29108+0.06104
[9]	train-aucpr:0.29413+0.04854	test-aucpr:0.29065+0.06036
[10]	train-aucpr:0.29477+0.04718	test-aucpr:0.29220+0.06036
[11]	train-aucpr:0.29423+0.04713	test-aucpr:0.29214+0.06045
[12]	train-aucpr:0.29459+0.04653	test-aucpr:0.29277+0.06050
[13]	train-aucpr:0.30776+0.04586	test-aucpr:0.30683+0.06618
[14]	train-aucpr:0.30775+0.04607	test-aucpr:0.30696+0.06631
[15]	train-aucpr:0.30811+0.04643	test

In [13]:
#--------------------------------------------------------------------------
# save model, parameters and predictions
#--------------------------------------------------------------------------
print('Save results to file ......')
dir_output = '../data/predictions/'

# every artefact shares the prefix <project_name><YYYYMMDD>
project_name = '1.0-akr-idealdonor-XGB-'
project_date = date.today().strftime("%Y%m%d")

# save a dataframe with columns (ID, Ytrue, Ypred) to csv
# ID and Ytrue are ordered by index; Ypred is inserted as-is, so it must
# already be in that same row order.
dict_prediction = {'ID'    : ID_test.sort_index(),
                   'Ytrue' : ytest.sort_index(),
                   'Ypred' : yte_pred
                  }
df_prediction = pd.DataFrame(data = dict_prediction)
csvfile_name = project_name + project_date + '-prediction.csv'
df_prediction.to_csv(dir_output + csvfile_name, index=False)

# save the tuned parameters used to train the model
#tuned_params['eta'] = small_eta
#tuned_params['num_boost_round'] = best_n_round
dir_output = '../data/params/'
npyfile_name = project_name + project_date + '-params.npy'
np.save(dir_output + npyfile_name, tuned_params)
# presumably tuned_params is a dict, which np.save stores as a pickled object;
# to reload it:
# adict = np.load('../data/params/' + npyfile_name, allow_pickle=True).item()

# save the tuned model
dir_output = '../models/'
model_name = project_name + project_date + '-model.pickle.dat'
xgb_model = model
# the with-block closes the file handle (and flushes the pickle to disk) even
# if pickle.dump raises
with open(dir_output + model_name, "wb") as model_file:
    pickle.dump(xgb_model, model_file)
# to reload (only unpickle files you trust):
# with open('../models/' + model_name, "rb") as model_file:
#     model = pickle.load(model_file)

print('All files saved.')
print('===============\n')

Save results to file ......
All files saved.

