In [2]:
import argparse
import numpy as np
import pandas as pd

from os import path, makedirs
from datasets import load_dataset
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

In [3]:
#-------------------
# Parameters
#-------------------    
parser = argparse.ArgumentParser()
parser.add_argument('--model_id', type=str, default='xgboost')
parser.add_argument('--dataset', type=str, default='beanham/spatial_join_dataset')
args = parser.parse_args(args=[])
args.save_path=f'inference_results/{args.model_id}/'

## Full Training is Powerful

In [4]:
## load data
data = load_dataset(args.dataset)
train, val, test =data['train'], data['val'], data['test']
train=train.to_pandas()
val=val.to_pandas()
## evaluate on a subset
np.random.seed(100)
index=np.random.randint(0, len(test), 1000)
test=test.select(index).to_pandas()

## extract X,y
#train_X=train[['min_angle', 'max_angle', 'min_euc_dist', 'max_euc_dist', 'min_area', 'max_area']]
#val_X=val[['min_angle', 'max_angle', 'min_euc_dist', 'max_euc_dist', 'min_area', 'max_area']]
#test_X=test[['min_angle', 'max_angle', 'min_euc_dist', 'max_euc_dist', 'min_area', 'max_area']]
train_X=train[['min_angle', 'min_euc_dist', 'max_area']]
val_X=val[['min_angle', 'min_euc_dist', 'max_area']]
test_X=test[['min_angle', 'min_euc_dist', 'max_area']]
train_y=train['label']
val_y=val['label']
test_y=test['label']

## normalization
mean=train_X.mean()
std=train_X.std()
train_X=(train_X-mean)/std
val_X=(val_X-mean)/std
test_X=(test_X-mean)/std

In [5]:
train_X.shape, val_X.shape, test_X.shape

((6442, 3), (716, 3), (1000, 3))

In [6]:
xgboost = XGBClassifier(n_estimators = 500,
                        learning_rate = 0.05, 
                        objective = 'binary:logistic',
                        random_state = 100)
#subset=index=np.random.randint(0, len(train), int(0.005*len(train)))
#xgboost.fit(train_X.iloc[subset], train_y.iloc[subset], eval_set = [(val_X, val_y)], verbose=False)
xgboost.fit(train_X, train_y, eval_set = [(val_X, val_y)], verbose=False)

In [7]:
pred = xgboost.predict(test_X)
accuracy_score(pred, test_y)

0.993

In [8]:
f1_score(pred, test_y)

0.9945011783189317

In [9]:
xgboost.feature_importances_

array([0.7438962 , 0.04230634, 0.21379751], dtype=float32)

In [6]:
from lightgbm import LGBMClassifier

In [8]:
model = LGBMClassifier(objective='binary',random_state=100,verbose=-1)
model.fit(train_X, train_y, eval_set = [(val_X, val_y)])

In [9]:
pred = model.predict(test_X)
accuracy_score(pred, test_y)

0.998

## Weak Labels Don't Work

In [32]:
## load data
data = load_dataset(args.dataset)
train, val, test =data['train'], data['val'], data['test']
train=train.to_pandas()
val=val.to_pandas()
test=test.select(index).to_pandas()

## extract X,y
train_X=train[['min_angle', 'min_euc_dist', 'min_area', 'max_area']]
val_X=val[['min_angle', 'min_euc_dist', 'min_area', 'max_area']]
test_X=test[['min_angle', 'min_euc_dist', 'min_area', 'max_area']]
test_y=test['label']

## normalization
mean=train_X.mean()
std=train_X.std()
train_X=(train_X-mean)/std
val_X=(val_X-mean)/std
test_X=(test_X-mean)/std

In [60]:
## weak labels
threshold=1
train_y=1*(train['min_angle']<=threshold)
val_y=1*(val['min_angle']<=threshold)
xgboost = XGBClassifier(n_estimators = 500,
                        learning_rate = 0.05, 
                        objective = 'binary:logistic',
                        random_state = 100)
xgboost.fit(train_X, train_y, eval_set = [(val_X,val_y)], verbose=False)

In [61]:
pred = xgboost.predict(test_X)
accuracy_score(pred, test_y)

0.863

In [68]:
test.iloc[[0,4]]

Unnamed: 0,sidewalk,road,min_angle,max_angle,min_euc_dist,max_euc_dist,min_area,max_area,hausdorff_dist,label
0,"{'coordinates': [[-122.2017203, 47.6112977], [...","{'coordinates': [[-122.2016161, 47.6114907], [...",1.094691,178.905309,8.916691,8.916691,0.174975,0.538494,72.147724,1
4,"{'coordinates': [[-122.1292819, 47.54715379999...","{'coordinates': [[-122.1291831, 47.5470408], [...",1.383903,178.616097,9.362863,9.362863,0.241637,0.348247,47.349345,1


In [70]:
pred[[0,4]]

array([0, 0])