In [1]:
import pandas as pd
import numpy as np

import os
# Export compiler and linker flags before the import hook below is installed.
# Presumably these enable OpenMP when the Cython extensions are built -- TODO confirm.
os.environ['CFLAGS'] = '-fopenmp'
os.environ['LDFLAGS'] = '-fopenmp'

# Overwrite C_INCLUDE_PATH with the directory holding NumPy's C headers.
os.environ["C_INCLUDE_PATH"] = np.get_include()

# Install the pyximport import hook. This has to run before the rocauc_pairwise
# imports below, assuming those modules are compiled from .pyx on import -- verify.
import pyximport
pyximport.install()

from rocauc_pairwise.sigmoid_pairwise_auc_cpu import sigmoid_pairwise_diff_hess_auc_cpu, sigmoid_pairwise_diff_hess_auc_exact_cpu
from rocauc_pairwise.sigmoid_pairwise_cpu import sigmoid_pairwise_diff_hess

In [2]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score
import tqdm
from lightgbm import LGBMClassifier, Dataset
import lightgbm
from sklearn.model_selection import train_test_split
import numpy as np
import math

# Seed constant for the notebook's stochastic steps.
# NOTE(review): the train/test split in this notebook passes a literal
# random_state=43 instead of this value -- confirm which seed is intended.
RANDOM_STATE = 42

In [3]:
#X = pd.read_csv('./data/breast-cancer.csv', index_col='id')

In [4]:
#X['class'] = X.diagnosis.apply(lambda x: int(x == 'M'))

In [5]:
#X.drop('diagnosis', axis=1, inplace=True)

In [6]:
#labelencoder=OrdinalEncoder()
#X[X.columns] = labelencoder.fit_transform(X[X.columns])

In [7]:
# Read the churn training table, using its ID column as the index.
X = pd.read_csv('./data/orange_small_churn_train_data.csv', index_col='ID')

# Column groups: Var1..Var190 are handled as non-categorical, Var191..Var230 as categorical.
non_cat_features = [f'Var{n}' for n in range(1, 191)]
cat_features = [f'Var{n}' for n in range(191, 231)]

# Boolean Series flagging the categorical columns with fewer than 50 distinct values.
mask = X[cat_features].nunique().lt(50)

# Encoders. Only the ordinal encoder is applied in this cell.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and
# later removed, so this constructor call ties the notebook to an older release.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
ohe = OneHotEncoder(sparse=False)

# Replace every categorical column with its ordinal code.
X[cat_features] = oe.fit_transform(X[cat_features])

# Rescale the target via (label + 1) / 2, which maps -1 -> 0 and +1 -> 1.
X['labels'] = (X['labels'] + 1) / 2

# NOTE(review): the reason for discarding the row with ID 18298 is not recorded here -- document it.
X = X.drop(index=18298)

In [8]:
#X = X.sample(5_000)

In [9]:
# Hold out 20% of the rows for validation; the target column is split off from the features.
# NOTE(review): the seed is the literal 43 rather than RANDOM_STATE (42) -- confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns='labels'),
    X['labels'],
    test_size=0.2,
    random_state=43,
)

In [10]:
# Cast both target vectors to 32-bit integers (the label rescaling upstream used true division).
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)

In [11]:
# Wrap the training and validation splits as LightGBM datasets.
fit = Dataset(data=X_train, label=y_train, free_raw_data=True)
val = Dataset(data=X_test, label=y_test, free_raw_data=True)

In [12]:
from sklearn.metrics import roc_auc_score

In [13]:
def roc_auc_lgbm(preds, train_data):
    """ROC AUC evaluation callback, passed to ``lightgbm.train`` as ``feval``.

    Reads the labels from ``train_data`` via ``get_label()`` and scores
    ``preds`` against them with ``roc_auc_score``.

    Returns a ``(metric_name, value, is_higher_better)`` triple. The metric is
    reported under the name ``'default_rate'`` and flagged as higher-is-better.
    """
    labels = train_data.get_label()
    return 'default_rate', roc_auc_score(labels, preds), True

In [14]:
# Baseline booster trained with the built-in binary objective and scored by the custom AUC callback.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` were removed from
# `lightgbm.train` in LightGBM 4.0 (replaced by callbacks), so this call needs lightgbm<4.
# The patience (150) also equals the round budget (150), so early stopping cannot fire.
model = lightgbm.train(
    params=dict(learning_rate=0.001, num_leaves=58, objective='binary'),
    train_set=fit,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    feval=roc_auc_lgbm,
    num_boost_round=150,
    early_stopping_rounds=150,
    verbose_eval=50,
)



[LightGBM] [Info] Number of positive: 1110, number of negative: 13528
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12878
[LightGBM] [Info] Number of data points in the train set: 14638, number of used features: 212
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075830 -> initscore=-2.500402
[LightGBM] [Info] Start training from score -2.500402
Training until validation scores don't improve for 150 rounds
[50]	fit's binary_logloss: 0.26223	fit's default_rate: 0.768806	val's binary_logloss: 0.258469	val's default_rate: 0.688975
[100]	fit's binary_logloss: 0.257008	fit's default_rate: 0.77475	val's binary_logloss: 0.256418	val's default_rate: 0.69023
[150]	fit's binary_logloss: 0.252433	fit's default_rate: 0.781188	val's binary_logloss: 0.254539	val's default_rate: 0.6938
Did not meet early stopping. Best iteration is:
[150]	fit's binary_logloss: 0.252433	fit's default_rate: 0.781188	val's binary_logloss: 0.254539	val's default_rate: 0.6938


In [15]:
def sigmoid_pairwise_loss(preds, train_data):
    """Custom LightGBM objective built on the pairwise sigmoid AUC kernel.

    Parameters
    ----------
    preds : array-like
        Current scores handed over by LightGBM, one per training row.
    train_data : object exposing ``get_label()``
        Training dataset; its labels are cast to ``int64`` before use.

    Returns
    -------
    tuple of (ndarray, ndarray)
        The negated pair of arrays returned by
        ``sigmoid_pairwise_diff_hess_auc_exact_cpu`` (gradient, Hessian).
    """
    y = np.array(train_data.get_label(), dtype=np.int64)
    # np.array makes a copy, so the tie-breaking write below never touches the
    # buffer owned by the caller.
    preds = np.array(preds, dtype=np.float64)

    # Fully tied scores make the kernel return an all-zero gradient, as the
    # sanity check later in this notebook shows for a constant input, which
    # would leave boosting with nothing to fit. Detect the tie explicitly
    # rather than through `mean == 0`: that proxy misses constant non-zero
    # scores and also fires on non-constant scores that average exactly zero.
    # For the all-zero start this is identical to the old behaviour
    # (preds[0] becomes 1.0).
    if preds.size > 0 and np.all(preds == preds[0]):
        preds[0] += 1.0

    # NOTE(review): the trailing 12 is presumably a worker/thread count -- confirm
    # against the kernel's signature.
    grad, hess = sigmoid_pairwise_diff_hess_auc_exact_cpu(y, np.exp(preds), 12)

    return -grad, -hess

In [16]:
# Booster trained with the custom pairwise objective; no built-in objective is set in params.
# NOTE(review): `fobj` and `verbose_eval` were removed from `lightgbm.train` in
# LightGBM 4.0, so this call needs lightgbm<4. verbose_eval=1 also prints a line
# for each of the 800 rounds.
model = lightgbm.train(
    params=dict(learning_rate=0.01, num_leaves=31, boosting_type='gbdt'),
    train_set=fit,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    fobj=sigmoid_pairwise_loss,
    feval=roc_auc_lgbm,
    num_boost_round=800,
    verbose_eval=1,
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12878
[LightGBM] [Info] Number of data points in the train set: 14638, number of used features: 212
[1]	fit's default_rate: 0.498099	val's default_rate: 0.504432
[2]	fit's default_rate: 0.673246	val's default_rate: 0.656408
[3]	fit's default_rate: 0.731487	val's default_rate: 0.682645
[4]	fit's default_rate: 0.741912	val's default_rate: 0.68541
[5]	fit's default_rate: 0.745968	val's default_rate: 0.686886
[6]	fit's default_rate: 0.748238	val's default_rate: 0.688966
[7]	fit's default_rate: 0.749889	val's default_rate: 0.689596
[8]	fit's default_rate: 0.750363	val's default_rate: 0.689258
[9]	fit's default_rate: 0.751441	val's default_rate: 0.689248
[10]	fit's default_rate: 0.751225	val's default_rate: 0.69063
[11]	fit's default_rate: 0.750721	val's default_rate: 0.690986
[12]	fit's default_rate: 0.750678	val's default_rate: 0.690995
[13]	fit's default_rate: 0.75103	val's default_rate: 0.691867
[14]	f

In [61]:
# Sanity-check inputs: random binary labels paired with fully tied scores.
# The labels are drawn from a generator seeded with the notebook-level
# RANDOM_STATE so that Restart & Run All reproduces the same vector.
y_true = np.random.RandomState(RANDOM_STATE).randint(0, 2, 1000)
# exp(0) == 1 for every row; this mirrors the np.exp(preds) transform that
# sigmoid_pairwise_loss applies before calling the kernel.
y_pred = np.exp(np.zeros(1000))

In [62]:
# Evaluate the kernel on fully tied scores (y_pred is 1.0 for every row).
# The displayed result is a tuple whose first array is all zeros.
sigmoid_pairwise_diff_hess_auc_exact_cpu(y_true, y_pred, 12)

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 