In [1]:
import argparse
import numpy as np
import pandas as pd

from os import path, makedirs
from datasets import load_dataset
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
# ------------------------------------------------------------------
# Run configuration
# ------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument(
    '--model_id',
    type=str,
    default='xgboost',
)
parser.add_argument(
    '--dataset',
    type=str,
    default='beanham/spatial_join_dataset',
)

# Parse an explicit empty argument list rather than sys.argv, so the
# defaults declared above are always the values in effect.
args = parser.parse_args(args=[])

# Path prefix derived from the model id.
args.save_path = f'inference_results/{args.model_id}/'

## Full Training is Powerful

In [82]:
## load every split and convert it to a DataFrame
data = load_dataset(args.dataset)
train, val, test = (data[split].to_pandas() for split in ('train', 'val', 'test'))

## feature matrix and true-label target for each split
feature_cols = ['min_angle', 'max_angle', 'min_euc_dist', 'max_euc_dist', 'min_area', 'max_area']
train_X, train_y = train[feature_cols], train['label']
val_X, val_y = val[feature_cols], val['label']
test_X, test_y = test[feature_cols], test['label']

## standardize all three splits with the training-set mean / std only
mean, std = train_X.mean(), train_X.std()
train_X = train_X.sub(mean).div(std)
val_X = val_X.sub(mean).div(std)
test_X = test_X.sub(mean).div(std)

In [85]:
## 500 boosting rounds at a 0.05 learning rate, with a fixed seed
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'objective': 'binary:logistic',
    'random_state': 100,
}
xgboost = XGBClassifier(**xgb_params)

## the validation split is supplied as eval_set; this call sets no
## early-stopping option
xgboost.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=False)

In [86]:
## test accuracy of the model fitted on the true training labels
pred = xgboost.predict(test_X)
## accuracy_score takes (y_true, y_pred); keep that order so the call stays
## correct if it is ever swapped for an asymmetric metric
accuracy_score(test_y, pred)

0.9954382535027696

## Weak Labels Don't Work

In [87]:
## reload every split as a DataFrame
data = load_dataset(args.dataset)
train, val, test = (data[split].to_pandas() for split in ('train', 'val', 'test'))

## feature matrices
feature_cols = ['min_angle', 'max_angle', 'min_euc_dist', 'max_euc_dist', 'min_area', 'max_area']
train_X, val_X, test_X = train[feature_cols], val[feature_cols], test[feature_cols]

## weak labels: 1 where min_angle is at or below the threshold, 0 otherwise.
## train / val targets come from this rule; the test target stays the real label.
threshold = 10
train_y = (train['min_angle'] <= threshold).astype(int)
val_y = (val['min_angle'] <= threshold).astype(int)
test_y = test['label']

## standardize all three splits with the training-set mean / std only
mean, std = train_X.mean(), train_X.std()
train_X = train_X.sub(mean).div(std)
val_X = val_X.sub(mean).div(std)
test_X = test_X.sub(mean).div(std)

In [88]:
## same hyper-parameters as before: 500 rounds, 0.05 learning rate, fixed seed
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'objective': 'binary:logistic',
    'random_state': 100,
}
xgboost = XGBClassifier(**xgb_params)

## the validation split is supplied as eval_set; this call sets no
## early-stopping option
xgboost.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=False)

In [89]:
## heuristic baseline: predict 1 whenever min_angle <= threshold and score it
## against the true test labels
## cast to int so the predictions use the same 0/1 encoding as the labels,
## and pass arguments in accuracy_score's (y_true, y_pred) order
heuristic_pred = (test['min_angle'] <= threshold).astype(int).to_numpy()
accuracy_score(test_y, heuristic_pred)

0.9433040078201369

In [90]:
## test accuracy (against the true labels) of the model fitted in the previous cell
pred = xgboost.predict(test_X)
## accuracy_score takes (y_true, y_pred); keep that order so the call stays
## correct if it is ever swapped for an asymmetric metric
accuracy_score(test_y, pred)

0.9436298468556533

## Does XGBoost Generalize Well?

In [3]:
## fetch the dataset and convert each split to a DataFrame
data = load_dataset(args.dataset)
train, val, test = (data[split].to_pandas() for split in ('train', 'val', 'test'))

In [4]:
## Partition each split by whether the angle heuristic
## (min_angle <= threshold -> predict 1) agrees with the true label:
##   idd_* : heuristic prediction equals the label
##   odd_* : heuristic prediction differs from the label
## "positive" / "negative" in the names refer to the heuristic's prediction.
threshold = 10
## fixed seed so the shuffled row order is identical on every run
shuffle_seed = 100

## train: only rows where the heuristic is right
idd_positive_train = train[(train['label'] == 1) & (train['min_angle'] <= threshold)]
idd_negative_train = train[(train['label'] == 0) & (train['min_angle'] > threshold)]
idd_train = pd.concat([idd_positive_train, idd_negative_train])
idd_train = idd_train.sample(frac=1, random_state=shuffle_seed)

## val: only rows where the heuristic is right
idd_positive_val = val[(val['label'] == 1) & (val['min_angle'] <= threshold)]
idd_negative_val = val[(val['label'] == 0) & (val['min_angle'] > threshold)]
idd_val = pd.concat([idd_positive_val, idd_negative_val])
idd_val = idd_val.sample(frac=1, random_state=shuffle_seed)

## test: split into the rows where the heuristic is right (idd) and wrong (odd)
idd_positive_test = test[(test['label'] == 1) & (test['min_angle'] <= threshold)]
idd_negative_test = test[(test['label'] == 0) & (test['min_angle'] > threshold)]
odd_positive_test = test[(test['label'] == 0) & (test['min_angle'] <= threshold)]
odd_negative_test = test[(test['label'] == 1) & (test['min_angle'] > threshold)]
idd_test = pd.concat([idd_positive_test, idd_negative_test])
idd_test = idd_test.sample(frac=1, random_state=shuffle_seed)
odd_test = pd.concat([odd_positive_test, odd_negative_test])
odd_test = odd_test.sample(frac=1, random_state=shuffle_seed)

In [10]:
## features and labels for the idd train / val subsets
feature_cols = ['min_angle', 'max_angle', 'min_euc_dist', 'max_euc_dist', 'min_area', 'max_area']
train_X, train_y = idd_train[feature_cols], idd_train['label']
val_X, val_y = idd_val[feature_cols], idd_val['label']

## three evaluation views: the full test split, its idd part, and its odd part
test_X, test_y = test[feature_cols], test['label']
idd_test_X, idd_test_y = idd_test[feature_cols], idd_test['label']
odd_test_X, odd_test_y = odd_test[feature_cols], odd_test['label']

## standardize every matrix with the mean / std of the training subset
mean, std = train_X.mean(), train_X.std()
train_X = train_X.sub(mean).div(std)
val_X = val_X.sub(mean).div(std)

test_X = test_X.sub(mean).div(std)
idd_test_X = idd_test_X.sub(mean).div(std)
odd_test_X = odd_test_X.sub(mean).div(std)

In [6]:
## same hyper-parameters as the earlier runs: 500 rounds, 0.05 learning rate, fixed seed
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'objective': 'binary:logistic',
    'random_state': 100,
}
xgboost = XGBClassifier(**xgb_params)

## the validation subset is supplied as eval_set; this call sets no
## early-stopping option
xgboost.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=False)

In [9]:
## idd accuracy on the validation subset
pred = xgboost.predict(val_X)
## accuracy_score takes (y_true, y_pred); keep that order so the call stays
## correct if it is ever swapped for an asymmetric metric
accuracy_score(val_y, pred)

1.0

In [11]:
## overall accuracy on the full test split (test_X / test_y are built from
## the whole test split, not from the idd or odd subsets)
pred = xgboost.predict(test_X)
## accuracy_score takes (y_true, y_pred); keep that order so the call stays
## correct if it is ever swapped for an asymmetric metric
accuracy_score(test_y, pred)

0.945584881068752

In [12]:
## idd accuracy on the idd subset of the test split
pred = xgboost.predict(idd_test_X)
## accuracy_score takes (y_true, y_pred); keep that order so the call stays
## correct if it is ever swapped for an asymmetric metric
accuracy_score(idd_test_y, pred)

0.9996545768566494

In [13]:
## odd accuracy on the odd subset of the test split
pred = xgboost.predict(odd_test_X)
## accuracy_score takes (y_true, y_pred); keep that order so the call stays
## correct if it is ever swapped for an asymmetric metric
accuracy_score(odd_test_y, pred)

0.04597701149425287