In [9]:
%pylab inline

from lib.KDTreeEncoding import *

import xgboost as xgb
from lib.XGBHelper import *
from lib.XGBoost_params import *
from lib.score_analysis import *

from lib.logger import logger

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from numpy import load
from glob import glob
import pandas as pd
import pickle as pkl
import os

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Seed NumPy's global RNG so the random train/test masks drawn later are repeatable.
# (Per the original note, the helper libs set their own seed as well.)
SEED = 123
np.random.seed(SEED)

In [6]:
# Input locations: the label table, the shared dataset root, and the folder of .npz images.
# poverty_dir already ends with '/', so image_dir contains a (harmless) double slash.
train_table = 'public_tables/train.csv'
poverty_dir = '/datasets/cs255-sp22-a00-public/poverty/'
image_dir = f'{poverty_dir}/anon_images'

In [7]:
# Load the training table and index it by image filename (the 'filename' column
# is kept as a regular column too), then list every .npz image on disk.
df = pd.read_csv(train_table, index_col=0)
df = df.set_index('filename', drop=False)
npz_pattern = f'{image_dir}/*.npz'
files = glob(npz_pattern)

In [11]:
## Encode every image as a fixed-length feature vector with a KD-tree, caching the result on disk.
## If both pickle files exist they are loaded and the (slow) encoding is skipped;
## delete them (or change the file names) after modifying anything in the else-branch.
## Ideas to explore: a different feature extraction, or changing how the images used
## to train the encoder are chosen.
## Memory note from earlier runs: max_images = 610 works, 615 runs out of memory and the kernel dies.
## tree_depth controls the number of encoded columns. The original note gives the
## formula (2^(tree_depth+1))+1 and also says "1024+1 columns"; with tree_depth = 8 the
## formula yields 513, so one of the two is stale -- TODO confirm against encoded_dataset.
encoded_dataset_file_name = 'encoded_dataset.pk'
encoded_tree_file_name = 'encoder_tree.pk'

if os.path.isfile(encoded_dataset_file_name) and os.path.isfile(encoded_tree_file_name):
    # NOTE: unpickling executes arbitrary code -- only load cache files written by this notebook.
    with open(encoded_dataset_file_name, 'rb') as cache_fh:
        Enc_data = pkl.load(cache_fh)
    with open(encoded_tree_file_name, 'rb') as cache_fh:
        tree = pkl.load(cache_fh)

else:
    tree_depth = 8
    train_size, tree = train_encoder(files, max_images=500, tree_depth=tree_depth)
    Enc_data = encoded_dataset(image_dir, df, tree, depth=tree_depth, label_col='label')
    # Save the encoded dataset and the tree so the encoding does not have to be redone every run.
    # Context managers make sure the files are flushed and closed before anything reads them back.
    with open(encoded_dataset_file_name, 'wb') as cache_fh:
        pkl.dump(Enc_data, cache_fh)
    with open(encoded_tree_file_name, 'wb') as cache_fh:
        pkl.dump(tree, cache_fh)

In [12]:
# D wraps the encoded feature matrix and is used below to carve out row subsets
# (train/test masks, urban/rural masks).
encoded_matrix = Enc_data.data
D = DataSplitter(encoded_matrix)

In [13]:
# Custom F1 evaluation metric for our XGBoost runs
from sklearn.metrics import f1_score, recall_score

def calc_f1(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Return the F1 score of thresholded predictions as an XGBoost custom metric.

    Parameters
    ----------
    predt : np.ndarray
        Predictions handed over by XGBoost. They are thresholded at 0.5, which
        assumes they are probabilities rather than raw margins -- TODO confirm
        how the helper library registers this metric.
    dtrain : xgb.DMatrix
        Matrix being evaluated; its labels are used as the ground truth.

    Returns
    -------
    tuple
        ``('f1', score)`` -- the metric name and its value.
    """
    preds = (predt > 0.5).astype(np.int64)
    # sklearn's signature is f1_score(y_true, y_pred): labels first, predictions second.
    res = f1_score(dtrain.get_label(), preds)
    return 'f1', res



In [28]:
# XGBoost settings, one dictionary per area type.
# lib.XGBoost_params already ships a default param dictionary; the values below are the
# customised ones, and further keys can be added as needed.
# Rural and Urban currently differ only in the shrinkage (eta).
def _area_params(eta):
    """Build a fresh settings dict for one area; only the shrinkage `eta` varies."""
    return {
        'max_depth': 3,                          # depth of tree
        'eta': eta,                              # shrinkage parameter
        'verbosity': 0,                          # 0= no logging 3=max logging
        'feature_selector': 'shuffle',
        'verbose_eval': 1,
        'custom_metric': calc_f1,
        'disable_default_eval_metric': True,
        'eval_metric': ['error', 'logloss', 'auc'],
    }


GLOBAL_PARAMS = {
    'Rural': _area_params(0.15),
    'Urban': _area_params(0.3),
}

# Later cells read the module-level `param`; it is left pointing at the Urban settings.
param = GLOBAL_PARAMS['Urban']

GLOBAL_PARAMS

{'Rural': {'max_depth': 3,
  'eta': 0.15,
  'verbosity': 0,
  'feature_selector': 'shuffle',
  'verbose_eval': 1,
  'custom_metric': <function __main__.calc_f1(predt:numpy.ndarray, dtrain:xgboost.core.DMatrix)>,
  'disable_default_eval_metric': True,
  'eval_metric': ['error', 'logloss', 'auc']},
 'Urban': {'max_depth': 3,
  'eta': 0.3,
  'verbosity': 0,
  'feature_selector': 'shuffle',
  'verbose_eval': 1,
  'custom_metric': <function __main__.calc_f1(predt:numpy.ndarray, dtrain:xgboost.core.DMatrix)>,
  'disable_default_eval_metric': True,
  'eval_metric': ['error', 'logloss', 'auc']}}

In [29]:
# Draw one uniform number per row of df; rows whose draw exceeds 0.35 go to the
# training set (boolean mask such as [True, False, False, True, ...]).
n_rows = len(df)
row_draws = np.random.rand(n_rows)
train_selector = row_draws > 0.35

# Rows where the mask is True form the training subset ...
Train = D.get_subset(train_selector)
# ... and the complementary rows form the test subset.
Test = D.get_subset(np.logical_not(train_selector))

# Sanity check on the sizes: the training subset should be the larger one.
print("Training subset has: ", Train.shape[0], " samples")
print("Testing subset has: ", Test.shape[0], " samples")

Training subset has:  7443  samples
Testing subset has:  3922  samples


In [30]:
# Train on the random split of the whole dataset ("All") and checkpoint the results.
# NOTE(review): `param` is whatever dictionary an earlier cell left bound to that name
# (after the parameter cell it is GLOBAL_PARAMS['Urban']); the assignment below mutates
# that dictionary in place. Confirm this is the intended configuration for the "All" run.
param['num_round']=200
log200=simple_bootstrap('xgb',Train,Test,param,ensemble_size=1)

styled_logs=[
    {   'log':log200,
        'style':['r:','r-'],
        'label':'200 iterations',
        'label_color':'r'
    }
]
_mean,_std=plot_scores(styled_logs,title='All')

# Persist the logs, the encoder tree and the score statistics.
pickle_file='data/Checkpoint.pk'
Dump={'styled_logs':styled_logs,
     'tree':tree,
     'mean':_mean,
     'std':_std}
# Create the output folder on a fresh checkout and close the file deterministically.
os.makedirs(os.path.dirname(pickle_file), exist_ok=True)
with open(pickle_file,'wb') as checkpoint_fh:
    pkl.dump(Dump,checkpoint_fh)



KeyboardInterrupt: 

In [31]:
# Urban areas only: restrict the data to urban rows, split at random, train, checkpoint.
urban=True
area= 'Urban' if urban else 'Rural'
param = GLOBAL_PARAMS[area]
# Boolean mask over df rows selecting the requested area type.
selector=df['urban']==urban
subData=D.get_subset(selector)
subD=DataSplitter(subData)

# Rows whose uniform draw exceeds 0.3 go to the training set, the rest to the test set.
train_selector=np.random.rand(subData.shape[0]) > 0.3
Train=subD.get_subset(train_selector)
Test=subD.get_subset(~train_selector)

param['num_round']=200
log200=simple_bootstrap('xgb',Train,Test,param,ensemble_size=30)

styled_logs=[
    {   'log':log200,
        'style':['b:','b-'],
        'label':'200 iterations',
        'label_color':'b'
    }
]

_mean,_std=plot_scores(styled_logs,title=f'{area}Only: Split into train and test at random')

# Persist the logs, the encoder tree and the score statistics for this area.
pickle_file=f'data/Checkpoint_{area}.pk'
Dump={'styled_logs':styled_logs,
     'tree':tree,
     'mean':_mean,
     'std':_std}
# Create the output folder on a fresh checkout and close the file deterministically.
os.makedirs(os.path.dirname(pickle_file), exist_ok=True)
with open(pickle_file,'wb') as checkpoint_fh:
    pkl.dump(Dump,checkpoint_fh)



[0]	train-error:0.20901	train-logloss:0.59322	train-auc:0.74554	train-f1:0.44927	eval-error:0.26148	eval-logloss:0.61021	eval-auc:0.69317	eval-f1:0.31983
[1]	train-error:0.20522	train-logloss:0.53007	train-auc:0.80390	train-f1:0.55281	eval-error:0.24508	eval-logloss:0.56068	eval-auc:0.73981	eval-f1:0.47452
[2]	train-error:0.19766	train-logloss:0.48973	train-auc:0.83299	train-f1:0.58722	eval-error:0.24590	eval-logloss:0.53233	eval-auc:0.75297	eval-f1:0.48630
[3]	train-error:0.18288	train-logloss:0.46059	train-auc:0.84901	train-f1:0.56536	eval-error:0.24344	eval-logloss:0.51451	eval-auc:0.76611	eval-f1:0.42105
[4]	train-error:0.16638	train-logloss:0.43907	train-auc:0.85625	train-f1:0.61218	eval-error:0.24016	eval-logloss:0.50534	eval-auc:0.76611	eval-f1:0.44821
[5]	train-error:0.16363	train-logloss:0.42117	train-auc:0.86659	train-f1:0.62579	eval-error:0.23689	eval-logloss:0.50069	eval-auc:0.76914	eval-f1:0.46382
[6]	train-error:0.16088	train-logloss:0.40724	train-auc:0.87876	train-f1:0.6

KeyboardInterrupt: 

In [None]:
# Rural areas only: restrict the data to rural rows, split at random, train, checkpoint.
urban=False
area= 'Urban' if urban else 'Rural'
# Bind the settings for THIS area; without this line the cell silently reuses whatever
# `param` the previous cell left behind (the Urban settings when run top to bottom).
param = GLOBAL_PARAMS[area]
# Boolean mask over df rows selecting the requested area type.
selector=df['urban']==urban
subData=D.get_subset(selector)
subD=DataSplitter(subData)

# Rows whose uniform draw exceeds 0.3 go to the training set, the rest to the test set.
train_selector=np.random.rand(subData.shape[0]) > 0.3
Train=subD.get_subset(train_selector)
Test=subD.get_subset(~train_selector)

param['num_round']=200
log200=simple_bootstrap('xgb',Train,Test,param,ensemble_size=30)

styled_logs=[
    {   'log':log200,
        'style':['b:','b-'],
        'label':'200 iterations',
        'label_color':'b'
    }
]


_mean,_std=plot_scores(styled_logs,title=f'{area}Only: Split into train and test at random')

# Persist the logs, the encoder tree and the score statistics for this area.
pickle_file=f'data/Checkpoint_{area}.pk'
Dump={'styled_logs':styled_logs,
     'tree':tree,
     'mean':_mean,
     'std':_std}
# Create the output folder on a fresh checkout and close the file deterministically.
os.makedirs(os.path.dirname(pickle_file), exist_ok=True)
with open(pickle_file,'wb') as checkpoint_fh:
    pkl.dump(Dump,checkpoint_fh)



In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Grid search over max_depth, number of boosting rounds and eta with 5-fold CV,
# keeping the combination that reaches the highest mean test AUC.
parameters = {
    # Booster parameters; max_depth and eta are overwritten for every grid point.
    # The number of rounds is NOT a booster parameter -- it is passed to xgb.cv
    # as num_boost_round below.
    'eta':0.15,
    'max_depth': 3,
    'verbosity': 0,
    'nthread': 7,
    'objective':'binary:logistic',
}
# Reuse the training matrix of the last bootstrap run from the most recent training cell.
# NOTE(review): assumes each log200 entry is a dict holding a 'bst' booster and a
# 'dtrain' DMatrix -- confirm against simple_bootstrap.
gs_xgb, X = log200[-1]['bst'], log200[-1]['dtrain']

gridsearch_params = [
    (max_depth, num_rounds, eta/100)
    for max_depth in range(2,4)
    for num_rounds in range(50,250,50)
    for eta in range(10,35,5)
]

best_auc = float("-Inf")
best_params = None
for max_depth, num_rounds, eta in gridsearch_params:
    print("CV with max_depth={}, num_rounds={}, eta={}".format(
                             max_depth, num_rounds, eta))
    # Update our parameters
    parameters['max_depth'] = max_depth
    parameters['eta'] = eta

    # Run CV. cv is a module-level function (xgb.cv), not a Booster method.
    # 'auc' is listed last so that it is the metric early stopping monitors.
    cv_results = xgb.cv(
        parameters,
        X,
        num_boost_round=num_rounds,
        nfold=5,
        metrics=['error','logloss','auc'],
        early_stopping_rounds=10
    )
    print(cv_results)
    # Higher AUC is better: take the best round of this run.
    auc_by_round = cv_results['test-auc-mean'].to_numpy()
    mean_auc = auc_by_round.max()
    boost_rounds = int(auc_by_round.argmax()) + 1  # rows are 0-based, rounds 1-based
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    # Update best AUC
    if mean_auc > best_auc:
        best_auc = mean_auc
        best_params = (max_depth, num_rounds, eta)
print("Best params: max_depth={}, num_rounds={}, eta={}, AUC: {}".format(
    best_params[0], best_params[1], best_params[2], best_auc))