In [1]:
%pylab inline

from lib.KDTreeEncoding import *

import xgboost as xgb
from lib.XGBHelper import *
from lib.XGBoost_params import *
from lib.score_analysis import *

from lib.logger import logger

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from numpy import load
from glob import glob
import pandas as pd
import pickle as pkl

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Seed NumPy's global RNG so the random train/test masks drawn later are repeatable.
# (Original note: the helper libs are said to set the seed as well; not visible from here.)
np.random.seed(123)

In [3]:
# Dataset locations. Every path is derived from `poverty_dir`, so switching
# environments only requires changing that one assignment.
#poverty_dir = '/datasets/cs255-sp22-a00-public/poverty/'
poverty_dir = '/home/tlaud/climate_glue/cse255'
# Directory holding the image files.
image_dir = poverty_dir + '/anon_images'
#train_table = '../public_tables/train.csv'
# CSV with the training table; built from the root instead of repeating it,
# so it can never point at a different tree than image_dir.
train_table = poverty_dir + '/public_tables/train.csv'

In [4]:
# Load the training table and index it by image filename; the 'filename'
# column itself stays available as a regular column.
df = pd.read_csv(train_table, index_col=0)
df.index = df['filename']
# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# the list handed to the encoder is identical on every run and every machine.
files = sorted(glob(f'{image_dir}/*.npz'))

In [None]:
## Train the KD-tree encoder and encode the images of the main dataframe into Enc_data.
## The encoded dataset is pickled afterwards, so this cell can be skipped if nothing
## changes here (pickle again if something does).
## Ideas: extract the features from the images in a different manner, or change the
## randomness used while choosing which images train the encoder.
## max_images = 610 works, but 615 runs out of memory and the kernel dies.
## tree_depth: per the original note, (2^(tree_depth+1))+1 decides how many columns
## the encoded dataset has, i.e. 1024+1 columns for tree_depth = 9.
tree_depth = 9
train_size,tree=train_encoder(files,max_images=610,tree_depth=tree_depth)
Enc_data=encoded_dataset(image_dir,df,tree, depth=tree_depth, label_col='label')

In [None]:
# Save the encoded dataset and the encoder tree so the images do not have to be
# re-encoded every time. Run this cell only after changing the encoding design.
# Do not call the path variable `encoded_dataset`: that name is the callable used
# to build Enc_data, and rebinding it to a string breaks re-running that cell.
encoded_dataset_file = 'encoded_dataset.pk'
encoder_tree_file = 'encoder_tree.pk'
# `with` flushes and closes each file even if pickling fails part-way.
with open(encoded_dataset_file, 'wb') as fh:
    pkl.dump(Enc_data, fh)
with open(encoder_tree_file, 'wb') as fh:
    pkl.dump(tree, fh)

In [6]:
# Load the previously pickled encoded dataset and encoder tree.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
# Do not call the path variable `encoded_dataset`: that name is the callable used
# to build Enc_data, and rebinding it to a string breaks re-running that cell.
encoded_dataset_file = 'encoded_dataset.pk'
encoder_tree_file = 'encoder_tree.pk'
# `with` closes each file as soon as it has been read.
with open(encoded_dataset_file, 'rb') as fh:
    Enc_data = pkl.load(fh)
with open(encoder_tree_file, 'rb') as fh:
    tree = pkl.load(fh)

In [7]:
# Wrap the encoded data in a DataSplitter object D, which handles dataset
# functionality such as taking subsets (D.get_subset is used further down).
# (Original note: it also covers bootstrapping samples, etc.)
D = DataSplitter(Enc_data.data)

In [20]:
from sklearn.metrics import f1_score, recall_score
# Set all params for XGBoost here.
# The `param` dictionary is already defined in lib.XGBoost_params with default values;
# those defaults can be customized and new params can be added.

def calc_f1(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Custom XGBoost evaluation metric: F1 score at a 0.5 decision threshold.

    Args:
        predt: Predictions for the rows of ``dtrain``. Values above 0.5 count
            as the positive class (assumes probabilities, as produced by a
            logistic objective; confirm if the objective changes).
        dtrain: The DMatrix being evaluated; its labels are the ground truth.

    Returns:
        The tuple ``('f1', score)``: metric name and metric value.
    """
    labels = dtrain.get_label()
    preds = (predt > 0.5).astype(np.int64)
    # sklearn's signature is f1_score(y_true, y_pred): ground truth goes first.
    return 'f1', f1_score(labels, preds)

# Override / extend the shared parameter dictionary in one step; keys are
# applied in the order listed.
param.update({
    'max_depth': 3,   # depth of tree
    'eta': 0.18,      # shrinkage parameter
    #COPY THIS
    'verbose_eval': 1,
    'early_stopping_rounds': 10,
    'eval_metric': ['auc'],
    'feature_selector': 'shuffle',
    'custom_metric': calc_f1,
    'disable_default_eval_metric': True,
})
# param['objective'] = 'binary:logistic'
# param['nthread'] = 7 # Number of threads used
# param['eval_metric'] = ['error','logloss']
# Show the resulting dictionary as the cell output.
param

{'max_depth': 3,
 'eta': 0.18,
 'verbosity': 0,
 'objective': 'binary:logistic',
 'nthread': 7,
 'eval_metric': ['auc'],
 'verbose_eval': 1,
 'early_stopping_rounds': 10,
 'feature_selector': 'shuffle',
 'custom_metric': <function __main__.calc_f1(predt: numpy.ndarray, dtrain: xgboost.core.DMatrix)>,
 'disable_default_eval_metric': True,
 'num_round': 200}

In [21]:
# Boolean mask with one entry per row of df: a uniform random draw above 0.35
# marks the row as training data, e.g. [True, False, False, True, ...]
train_selector = np.random.rand(len(df)) > 0.35
# Rows flagged True make up the Train set ...
Train = D.get_subset(train_selector)
# ... and the rows flagged False make up the Test set.
Test = D.get_subset(~train_selector)

# Report the size of both subsets; the training subset should be the larger one.
for _caption, _subset in (("Training subset has: ", Train), ("Testing subset has: ", Test)):
    print(_caption, _subset.shape[0], " samples")

Training subset has:  7387  samples
Testing subset has:  3978  samples


In [22]:
# Run simple_bootstrap on the full train/test split with num_round = 200 and
# an ensemble of 10, using the 'xgb' model type.
param['num_round']=200
log200=simple_bootstrap('xgb',Train,Test,param,ensemble_size=10)

# One entry per curve to plot: the log plus its line styles and legend label.
styled_logs=[    
    {   'log':log200,
        'style':['r:','r-'],
        'label':'200 iterations',
        'label_color':'r'
    }
]
_mean,_std=plot_scores(styled_logs,title='All')

# Checkpoint the logs, the encoder tree and the values returned by plot_scores.
# The 'data' directory must already exist.
pickle_file='data/Checkpoint.pk'
Dump={'styled_logs':styled_logs,
     'tree':tree,
     'mean':_mean,
     'std':_std}
# `with` flushes and closes the checkpoint file even if pickling fails.
with open(pickle_file,'wb') as fh:
    pkl.dump(Dump,fh)



[0]	train-auc:0.64443	train-f1:0.59841	eval-auc:0.59166	eval-f1:0.56187
[1]	train-auc:0.68177	train-f1:0.58285	eval-auc:0.61717	eval-f1:0.52375




[2]	train-auc:0.70320	train-f1:0.56902	eval-auc:0.63069	eval-f1:0.50406
[3]	train-auc:0.71916	train-f1:0.61027	eval-auc:0.63621	eval-f1:0.53721
[4]	train-auc:0.72842	train-f1:0.57409	eval-auc:0.64626	eval-f1:0.48912
[5]	train-auc:0.73974	train-f1:0.60811	eval-auc:0.64985	eval-f1:0.52487
[6]	train-auc:0.74803	train-f1:0.60855	eval-auc:0.65268	eval-f1:0.50546
[7]	train-auc:0.75289	train-f1:0.59209	eval-auc:0.65388	eval-f1:0.49030
[8]	train-auc:0.76205	train-f1:0.61197	eval-auc:0.65783	eval-f1:0.50876
[9]	train-auc:0.76920	train-f1:0.62190	eval-auc:0.65812	eval-f1:0.50910
[10]	train-auc:0.77104	train-f1:0.62131	eval-auc:0.65727	eval-f1:0.51332
[11]	train-auc:0.77632	train-f1:0.62313	eval-auc:0.66027	eval-f1:0.51132
[12]	train-auc:0.78117	train-f1:0.63192	eval-auc:0.66189	eval-f1:0.51873
[13]	train-auc:0.78888	train-f1:0.63904	eval-auc:0.66483	eval-f1:0.51883
[14]	train-auc:0.79606	train-f1:0.65081	eval-auc:0.66494	eval-f1:0.52541
[15]	train-auc:0.80054	train-f1:0.65707	eval-auc:0.66832	ev

[115]	train-auc:0.94656	train-f1:0.84697	eval-auc:0.69638	eval-f1:0.58468
[116]	train-auc:0.94724	train-f1:0.84953	eval-auc:0.69677	eval-f1:0.58513
[117]	train-auc:0.94796	train-f1:0.84885	eval-auc:0.69652	eval-f1:0.58553
[118]	train-auc:0.94850	train-f1:0.85100	eval-auc:0.69653	eval-f1:0.58553
[119]	train-auc:0.94896	train-f1:0.85432	eval-auc:0.69689	eval-f1:0.58560
[120]	train-auc:0.94944	train-f1:0.85468	eval-auc:0.69686	eval-f1:0.58612
[121]	train-auc:0.95006	train-f1:0.85504	eval-auc:0.69721	eval-f1:0.58672
[122]	train-auc:0.95062	train-f1:0.85750	eval-auc:0.69744	eval-f1:0.58866
[123]	train-auc:0.95095	train-f1:0.85768	eval-auc:0.69767	eval-f1:0.58925
[124]	train-auc:0.95154	train-f1:0.85942	eval-auc:0.69779	eval-f1:0.58788
[125]	train-auc:0.95221	train-f1:0.86009	eval-auc:0.69827	eval-f1:0.58704
[126]	train-auc:0.95270	train-f1:0.86090	eval-auc:0.69797	eval-f1:0.58627
[127]	train-auc:0.95318	train-f1:0.86058	eval-auc:0.69790	eval-f1:0.58592
[128]	train-auc:0.95369	train-f1:0.861



[1]	train-auc:0.66046	train-f1:0.60807	eval-auc:0.59703	eval-f1:0.55926
[2]	train-auc:0.68933	train-f1:0.56801	eval-auc:0.61955	eval-f1:0.51152
[3]	train-auc:0.70897	train-f1:0.56945	eval-auc:0.63357	eval-f1:0.50709
[4]	train-auc:0.71693	train-f1:0.55683	eval-auc:0.64108	eval-f1:0.49331
[5]	train-auc:0.72639	train-f1:0.58168	eval-auc:0.64103	eval-f1:0.51261
[6]	train-auc:0.73907	train-f1:0.55442	eval-auc:0.64817	eval-f1:0.46822
[7]	train-auc:0.74662	train-f1:0.56748	eval-auc:0.64782	eval-f1:0.46614
[8]	train-auc:0.75180	train-f1:0.57803	eval-auc:0.64987	eval-f1:0.47743
[9]	train-auc:0.75863	train-f1:0.60720	eval-auc:0.65125	eval-f1:0.49638
[10]	train-auc:0.76216	train-f1:0.60157	eval-auc:0.65405	eval-f1:0.49270
[11]	train-auc:0.76497	train-f1:0.60355	eval-auc:0.65509	eval-f1:0.49438
[12]	train-auc:0.77156	train-f1:0.61313	eval-auc:0.65841	eval-f1:0.50347
[13]	train-auc:0.77815	train-f1:0.62602	eval-auc:0.66091	eval-f1:0.51367
[14]	train-auc:0.77924	train-f1:0.63045	eval-auc:0.66148	eva

[114]	train-auc:0.94820	train-f1:0.85081	eval-auc:0.69212	eval-f1:0.58092
[115]	train-auc:0.94883	train-f1:0.85409	eval-auc:0.69204	eval-f1:0.58039
[116]	train-auc:0.94920	train-f1:0.85195	eval-auc:0.69230	eval-f1:0.57850
[117]	train-auc:0.94957	train-f1:0.85349	eval-auc:0.69213	eval-f1:0.57815
[118]	train-auc:0.94989	train-f1:0.85460	eval-auc:0.69194	eval-f1:0.57755
[119]	train-auc:0.95033	train-f1:0.85329	eval-auc:0.69255	eval-f1:0.58002
[120]	train-auc:0.95087	train-f1:0.85589	eval-auc:0.69245	eval-f1:0.58115
[121]	train-auc:0.95161	train-f1:0.85853	eval-auc:0.69249	eval-f1:0.58030
[122]	train-auc:0.95232	train-f1:0.86048	eval-auc:0.69229	eval-f1:0.58125
[123]	train-auc:0.95326	train-f1:0.86182	eval-auc:0.69184	eval-f1:0.58271
[124]	train-auc:0.95373	train-f1:0.86326	eval-auc:0.69176	eval-f1:0.58200
[125]	train-auc:0.95465	train-f1:0.86521	eval-auc:0.69077	eval-f1:0.57857
[126]	train-auc:0.95502	train-f1:0.86562	eval-auc:0.69131	eval-f1:0.57823
[127]	train-auc:0.95545	train-f1:0.866



[2]	train-auc:0.69033	train-f1:0.34341	eval-auc:0.60454	eval-f1:0.27261
[3]	train-auc:0.71392	train-f1:0.63263	eval-auc:0.62549	eval-f1:0.55676
[4]	train-auc:0.72860	train-f1:0.63144	eval-auc:0.63808	eval-f1:0.54821
[5]	train-auc:0.74117	train-f1:0.62780	eval-auc:0.64280	eval-f1:0.53042
[6]	train-auc:0.75276	train-f1:0.59859	eval-auc:0.65065	eval-f1:0.49372
[7]	train-auc:0.75629	train-f1:0.60080	eval-auc:0.65432	eval-f1:0.49486
[8]	train-auc:0.76272	train-f1:0.63690	eval-auc:0.65535	eval-f1:0.53092
[9]	train-auc:0.76738	train-f1:0.62161	eval-auc:0.65924	eval-f1:0.51265
[10]	train-auc:0.77447	train-f1:0.63127	eval-auc:0.66093	eval-f1:0.51667
[11]	train-auc:0.77680	train-f1:0.62698	eval-auc:0.66230	eval-f1:0.51914
[12]	train-auc:0.78276	train-f1:0.64400	eval-auc:0.66319	eval-f1:0.52712
[13]	train-auc:0.78528	train-f1:0.65016	eval-auc:0.66232	eval-f1:0.52767
[14]	train-auc:0.79106	train-f1:0.65533	eval-auc:0.66799	eval-f1:0.53408
[15]	train-auc:0.79914	train-f1:0.66765	eval-auc:0.66964	ev

[115]	train-auc:0.95015	train-f1:0.86457	eval-auc:0.70415	eval-f1:0.60065
[116]	train-auc:0.95044	train-f1:0.86523	eval-auc:0.70456	eval-f1:0.60207
[117]	train-auc:0.95084	train-f1:0.86545	eval-auc:0.70454	eval-f1:0.60189
[118]	train-auc:0.95121	train-f1:0.86641	eval-auc:0.70444	eval-f1:0.60148
[119]	train-auc:0.95202	train-f1:0.86856	eval-auc:0.70447	eval-f1:0.60071
[120]	train-auc:0.95245	train-f1:0.86904	eval-auc:0.70472	eval-f1:0.60212
[121]	train-auc:0.95266	train-f1:0.86957	eval-auc:0.70471	eval-f1:0.60324
[122]	train-auc:0.95340	train-f1:0.86948	eval-auc:0.70426	eval-f1:0.60006
[123]	train-auc:0.95370	train-f1:0.87019	eval-auc:0.70481	eval-f1:0.59882
[124]	train-auc:0.95441	train-f1:0.87284	eval-auc:0.70467	eval-f1:0.60118
[125]	train-auc:0.95470	train-f1:0.87320	eval-auc:0.70468	eval-f1:0.60154
[126]	train-auc:0.95504	train-f1:0.87368	eval-auc:0.70471	eval-f1:0.60237
[127]	train-auc:0.95548	train-f1:0.87471	eval-auc:0.70516	eval-f1:0.60190
[128]	train-auc:0.95621	train-f1:0.876



[2]	train-auc:0.69000	train-f1:0.61310	eval-auc:0.60930	eval-f1:0.53869
[3]	train-auc:0.71301	train-f1:0.62846	eval-auc:0.61918	eval-f1:0.54604
[4]	train-auc:0.73167	train-f1:0.61616	eval-auc:0.63402	eval-f1:0.52727
[5]	train-auc:0.74142	train-f1:0.61607	eval-auc:0.64605	eval-f1:0.53837
[6]	train-auc:0.75300	train-f1:0.63316	eval-auc:0.64950	eval-f1:0.54243
[7]	train-auc:0.75550	train-f1:0.63442	eval-auc:0.65345	eval-f1:0.54660
[8]	train-auc:0.76305	train-f1:0.64142	eval-auc:0.65577	eval-f1:0.54308
[9]	train-auc:0.76999	train-f1:0.64680	eval-auc:0.65901	eval-f1:0.54303
[10]	train-auc:0.77622	train-f1:0.65448	eval-auc:0.66213	eval-f1:0.54831
[11]	train-auc:0.78236	train-f1:0.65814	eval-auc:0.66361	eval-f1:0.53822
[12]	train-auc:0.78976	train-f1:0.66325	eval-auc:0.66334	eval-f1:0.53849
[13]	train-auc:0.79448	train-f1:0.66345	eval-auc:0.66337	eval-f1:0.53595
[14]	train-auc:0.79961	train-f1:0.66822	eval-auc:0.66464	eval-f1:0.53185
[15]	train-auc:0.80238	train-f1:0.67120	eval-auc:0.66593	ev

[115]	train-auc:0.94994	train-f1:0.85995	eval-auc:0.69200	eval-f1:0.58323
[116]	train-auc:0.95040	train-f1:0.86124	eval-auc:0.69251	eval-f1:0.58306
[117]	train-auc:0.95073	train-f1:0.86173	eval-auc:0.69278	eval-f1:0.58440
[118]	train-auc:0.95133	train-f1:0.86337	eval-auc:0.69341	eval-f1:0.58380
[119]	train-auc:0.95191	train-f1:0.86417	eval-auc:0.69294	eval-f1:0.58363
[120]	train-auc:0.95220	train-f1:0.86564	eval-auc:0.69296	eval-f1:0.58244
[121]	train-auc:0.95294	train-f1:0.86676	eval-auc:0.69357	eval-f1:0.58336
[122]	train-auc:0.95332	train-f1:0.86856	eval-auc:0.69276	eval-f1:0.58608
[123]	train-auc:0.95349	train-f1:0.87065	eval-auc:0.69252	eval-f1:0.58397
[124]	train-auc:0.95412	train-f1:0.87253	eval-auc:0.69312	eval-f1:0.58442
[125]	train-auc:0.95450	train-f1:0.87296	eval-auc:0.69262	eval-f1:0.58501
[126]	train-auc:0.95502	train-f1:0.87347	eval-auc:0.69290	eval-f1:0.58434
[127]	train-auc:0.95529	train-f1:0.87364	eval-auc:0.69270	eval-f1:0.58619
[128]	train-auc:0.95548	train-f1:0.873

KeyboardInterrupt: 

In [None]:
# Repeat the experiment on urban areas only.
urban=True
area= 'Urban' if urban else 'Rural'
# Keep only the rows whose 'urban' flag matches and wrap them in their own splitter.
selector=df['urban']==urban
subData=D.get_subset(selector)
subD=DataSplitter(subData)

# Random train/test mask over the rows of the subset.
# NOTE(review): "> 0.7" puts only ~30% of the rows into Train, whereas the
# full-dataset split uses "> 0.35" (~65% in Train); confirm this is intended.
train_selector=np.random.rand(subData.shape[0]) > 0.7
Train=subD.get_subset(train_selector)
Test=subD.get_subset(~train_selector)

# simple_bootstrap is called with the model type ('xgb') as its first argument,
# matching the full-dataset run; without it the positional arguments are shifted.
param['num_round']=10
log10=simple_bootstrap('xgb',Train,Test,param,ensemble_size=30)
param['num_round']=100
log100=simple_bootstrap('xgb',Train,Test,param,ensemble_size=30)

# One entry per curve to plot: the log plus its line styles and legend label.
styled_logs=[
    {   'log':log10,
        'style':['g:','g-'],
        'label':'10 iterations',
        'label_color':'g'
    },
    {   'log':log100,
        'style':['b:','b-'],
        'label':'100 iterations',
        'label_color':'b'
    }
]

_mean,_std=plot_scores(styled_logs,title=f'{area}Only: Split into train and test at random')

# Save the logs, the encoder tree and the values returned by plot_scores.
# The 'data' directory must already exist.
pickle_file=f'data/Dump{area}.pk'
Dump={'styled_logs':styled_logs,
     'tree':tree,
     'mean':_mean,
     'std':_std}
# `with` flushes and closes the dump file even if pickling fails.
with open(pickle_file,'wb') as fh:
    pkl.dump(Dump,fh)



In [None]:
# Display _mean and _std as returned by the most recent plot_scores(...) call.
_mean, _std

In [None]:
# Repeat the experiment on rural areas only.
urban=False
area= 'Urban' if urban else 'Rural'
# Keep only the rows whose 'urban' flag matches and wrap them in their own splitter.
selector=df['urban']==urban
subData=D.get_subset(selector)
subD=DataSplitter(subData)

# Random train/test mask over the rows of the subset.
# NOTE(review): "> 0.7" puts only ~30% of the rows into Train, whereas the
# full-dataset split uses "> 0.35" (~65% in Train); confirm this is intended.
train_selector=np.random.rand(subData.shape[0]) > 0.7
Train=subD.get_subset(train_selector)
Test=subD.get_subset(~train_selector)

# simple_bootstrap is called with the model type ('xgb') as its first argument,
# matching the full-dataset run; without it the positional arguments are shifted.
param['num_round']=10
log10=simple_bootstrap('xgb',Train,Test,param,ensemble_size=30)
param['num_round']=100
log100=simple_bootstrap('xgb',Train,Test,param,ensemble_size=30)

# One entry per curve to plot: the log plus its line styles and legend label.
styled_logs=[
    {   'log':log10,
        'style':['y:','y-'],
        'label':'10 iterations',
        'label_color':'y'
    },
    {   'log':log100,
        'style':['m:','m-'],
        'label':'100 iterations',
        'label_color':'m'
    }
]

_mean,_std=plot_scores(styled_logs,title=f'{area}Only: Split into train and test at random')

# Save the logs, the encoder tree and the values returned by plot_scores.
# The 'data' directory must already exist.
pickle_file=f'data/Dump{area}.pk'
Dump={'styled_logs':styled_logs,
     'tree':tree,
     'mean':_mean,
     'std':_std}
# `with` flushes and closes the dump file even if pickling fails.
with open(pickle_file,'wb') as fh:
    pkl.dump(Dump,fh)

