In [1]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

import pandas as pd


import sys
sys.path.append('..')
from utils import plot_counts_and_proportion, read_train_transaction, get_categorical_from_df, preprocessing

In [2]:
learning_rate = 0.05


params_tree = {
    'application': 'binary',
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'boosting_type': 'gbdt',
    'boosting': 'gbdt',
    # 'categorical_feature': get_categorical_from_df(X_train)[0],
    'learning_rate': learning_rate,
    'metric': 'auc',
    'min_data': 50,
    'max_depth': 10,
    'is_unbalance': 'true',
    'num_leaves': 31,
    # 'feature_fraction': 0.5,
    'objective': 'binary',
    'sub_feature': .5,
    'verbose': 0,
}

lgb_fit_params = {}



In [3]:
class TrainLGB():
    def __init__(self, params_tree, lgb_fit_params):
        self.params_tree = params_tree
        self.params_iters = lgb_fit_params
        
    def fit(self, X, y):
        cats_index = get_categorical_from_df(X)[0]
        train_data = lgb.Dataset(X, label=y, categorical_feature=cats_index)
        clf = lgb.train(self.params_tree, train_data, **self.params_iters)
        return clf

In [31]:
not_nan

TransactionID
2987000     True
2987001    False
2987002    False
2987003    False
2987004    False
           ...  
2991995    False
2991996    False
2991997    False
2991998    False
2991999    False
Name: card2, Length: 5000, dtype: bool

In [45]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
X, y = read_train_transaction(nrows = 5000,folder_path = '../../input/')
print(X.loc[:, col].head())

TransactionID
2987000      NaN
2987001    404.0
2987002    490.0
2987003    567.0
2987004    514.0
Name: card2, dtype: float64


In [66]:
X, y = read_train_transaction(nrows = 5000,folder_path = '../../input/')
print(X.loc[:, col].head())
le = LabelEncoder()

col = 'card2'
not_nan = ~ X[col].isna() 
X.loc[not_nan, col] = le.fit_transform(X.loc[not_nan, col])
X.loc[~not_nan, col] = -1
X.loc[:, col] =  X.loc[:, col].astype('int8')
print(X.loc[:, col].head())
# le.fit_transform(X['card2'])
# sorted(np.unique(le.fit_transform(X['card2'])))

TransactionID
2987000      NaN
2987001    404.0
2987002    490.0
2987003    567.0
2987004    514.0
Name: card2, dtype: float64
TransactionID
2987000    -1
2987001   -58
2987002    -3
2987003    53
2987004    14
Name: card2, dtype: int8


In [63]:
X.loc[:, col].fillna(-1)

TransactionID
2987000     -1.0
2987001    198.0
2987002    253.0
2987003    309.0
2987004    270.0
           ...  
2991995     46.0
2991996    142.0
2991997    191.0
2991998    142.0
2991999    300.0
Name: card2, Length: 5000, dtype: float64

In [24]:
le.classes_

array([100., 101., 102., 103., 104., 105., 106., 108., 110., 111., 112.,
       113., 114., 115., 117., 118., 122., 123., 126., 127., 128., 130.,
       133., 134., 135., 136., 142., 143., 144., 145., 146., 147., 148.,
       150., 152., 155., 158., 159., 160., 161., 162., 163., 165., 166.,
       167., 168., 170., 171., 172., 174., 176., 177., 180., 181., 183.,
       184., 191., 192., 194., 197., 198., 199., 200., 201., 202., 203.,
       204., 205., 206., 210., 214., 215., 216., 218., 219., 222., 225.,
       226., 229., 231., 234., 236., 239., 240., 242., 243., 245., 246.,
       247., 248., 250., 251., 253., 254., 255., 257., 258., 260., 262.,
       264., 265., 266., 268., 269., 270., 271., 272., 275., 276., 278.,
       280., 281., 283., 284., 285., 286., 287., 290., 291., 294., 295.,
       296., 297., 298., 299., 300., 301., 302., 303., 304., 307., 308.,
       309., 310., 311., 313., 314., 315., 316., 317., 318., 320., 321.,
       322., 324., 325., 327., 330., 332., 333., 33

In [23]:
le.inverse_transform([np.nan])

ValueError: y contains previously unseen labels: [nan]

In [21]:
dir(le)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 'classes_',
 'fit',
 'fit_transform',
 'get_params',
 'inverse_transform',
 'set_params',
 'transform']

In [4]:
X, y = read_train_transaction(nrows = 5000,folder_path = '../../input/')
X, y = preprocessing(X, y, detect_outliers=False, convert_DT=False, create_features_props_over_cats = False, group_cat_prop=False,
                    is_nan_indicators=False)

In [5]:
from sklearn.model_selection import KFold

def cross_val_scores(X, y, params_tree, lgb_fit_params, score=roc_auc_score):
    score = roc_auc_score
    kf = KFold(n_splits=5)
    scores = []
    for train, test in kf.split(X):
        X.iloc[train, :]
        X.iloc[test, :]
        y.iloc[train]
        X_train, X_test, y_train, y_test = X.iloc[train, :], X.iloc[test, :], y.iloc[train], y.iloc[test]
        lgb_mod = TrainLGB(params_tree, lgb_fit_params).fit(X_train, y_train)
        y_pred = lgb_mod.predict(X_test)
        scores.append(score(y_test, y_pred))
    return scores

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

import warnings
warnings.filterwarnings("ignore", message='Using categorical_feature in Dataset.')

# Forward stepwise selection
all_cols = set(X_train.columns)
possible_aumentations = set(X_train.columns)
selected_vars = []
for k in range(len(all_cols)):
    # Choose best model among all p-k possible models
    best_score = 0
    best_augmentation = None
    for poss_col in possible_aumentations:
        subset_predictors = [*selected_vars, poss_col]
        scores = cross_val_scores(X_train[subset_predictors], y, params_tree, lgb_fit_params, roc_auc_score)
        if np.mean(scores) > best_score:
            best_score = np.mean(scores)
            best_augmentation = poss_col
    # The best possible augmentation was chosen
    selected_vars.append(best_augmentation)
    possible_aumentations.remove(best_augmentation)
    print(best_augmentation)

NameError: name 'np' is not defined