In [79]:
import numpy as np, pandas as pd

%matplotlib inline

import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns

import lightgbm as lgb

import gc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score
import random

In [2]:
train = pd.read_csv('train_3.csv')
test = pd.read_csv('test_3.csv')
train_labels = pd.read_csv('train_labels.csv')
train_ids = pd.read_csv('train_ids.csv')
test_ids = pd.read_csv('test_ids.csv')

# 10. Manual LGBM Fit 
Precision on Validation Set - 78.3

In [15]:
def model(train, test, encoding = 'ohe', n_folds = 5):
    
    train_ids = train['SK_ID_CURR']
    test_ids = test['SK_ID_CURR']
    
    labels = train['TARGET']
    
    if encoding == 'ohe':
        train = pd.get_dummies(train)
        test = pd.get_dummies(test)
        
        train, tets = train.align(test, join = 'inner', axis = 1)
        cat_indices = 'auto'
        
    elif encoding == 'le':
        
        lab_enc = LabelEncoder()
        cat_indices = []
        
        for i, col in enumerate(train):
            if train[col].dtype == 'object':
                features[col] = label_encoder.fit_transform(np.array(train[col].astype(str)).reshape((-1,)))
                test[col] = label_encoder.transform(np.array(test[col].astype(str)).reshape((-1,)))

                cat_indices.append(i)
                
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    columns = train.columns
    
    k_fold = KFold(n_splits = n_folds, shuffle = False)
    feat_importances = np.zeros(len(columns))
    test_preds = np.zeros(test.shape[0])
    val_preds = np.zeros(train.shape[0])
    
    valid_scores = []
    train_scores = []
    
    for train_indices, valid_indices in k_fold.split(train):
        
        train_features, train_labels = train.iloc[train_indices, :], labels[train_indices]
        valid_features, valid_labels = train.iloc[valid_indices, :], labels[valid_indices]
        
        model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', boosting_type='goss',
                                       class_weight = 'balanced', learning_rate = 0.05, 
                                       reg_alpha = 0.1, reg_lambda = 0.1, n_jobs = -1, random_state = 50)

        model.fit(train_features, train_labels, eval_metric = 'auc',
                      eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                      eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  callbacks = [lgb.early_stopping(stopping_rounds = 100), lgb.log_evaluation(period = 200)])

        best_iteration = model.best_iteration_
        
        feat_importances += model.feature_importances_ / k_fold.n_splits

        test_preds += model.predict_proba(test, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        val_preds[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)

        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_preds})

    feature_importances = pd.DataFrame({'feature': columns, 'importance': feat_importances})
    valid_auc = roc_auc_score(labels, val_preds)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

    

In [16]:
train['TARGET'] = train_labels
train['SK_ID_CURR'] = train_ids
test['SK_ID_CURR'] = test_ids

In [17]:
submission, feature_importances, metrics = model(train, test)

Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.829374	train's binary_logloss: 0.516316	valid's auc: 0.782281	valid's binary_logloss: 0.54001
[400]	train's auc: 0.867329	train's binary_logloss: 0.473039	valid's auc: 0.782889	valid's binary_logloss: 0.518335
Early stopping, best iteration is:
[312]	train's auc: 0.852069	train's binary_logloss: 0.490454	valid's auc: 0.78345	valid's binary_logloss: 0.526771
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.828858	train's binary_logloss: 0.516735	valid's auc: 0.783643	valid's binary_logloss: 0.537869
[400]	train's auc: 0.867003	train's binary_logloss: 0.473471	valid's auc: 0.786537	valid's binary_logloss: 0.510784
Early stopping, best iteration is:
[439]	train's auc: 0.873033	train's binary_logloss: 0.466434	valid's auc: 0.786835	valid's binary_logloss: 0.506309
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.829993	train's binary_logloss

In [18]:
metrics

Unnamed: 0,fold,train,valid
0,0,0.852069,0.78345
1,1,0.873033,0.786835
2,2,0.890091,0.779406
3,3,0.870663,0.785253
4,4,0.871418,0.78608
5,overall,0.871455,0.78396


In [20]:
submission.to_csv('submission.csv', index= False) #Kaggle result 0.783

# 11. Cross Validation

In [21]:
train_s = train.sample(n = 16000)
train_s = train_s.select_dtypes('number') 
#As practice we limit ourselves to numerical columns to speed up search. In real world application we don't do this!

labels = np.array(train_s['TARGET'].astype(np.int32)).reshape((-1, ))
train_s = train_s.drop(columns = ['TARGET', 'SK_ID_CURR'])

train_s_x, test_s_x, train_s_y, test_s_y = train_test_split(train_s, labels, test_size = 6000)

In [57]:
train_set = lgb.Dataset(data = train_s_x, label = train_s_y)
test_set = lgb.Dataset(data = test_s_x, label = test_s_y)

In [44]:
model = lgb.LGBMClassifier()
default_params = model.get_params()


del default_params['n_estimators']
default_params['verbose'] = -1
default_params['silent'] = True
N_FOLDS = 5
# Cross validation with early stopping
cv_results = lgb.cv(default_params, train_set, num_boost_round = 10000, 
                    callbacks = [lgb.early_stopping(stopping_rounds = 100), lgb.log_evaluation(period = -1)],                     
                    metrics = 'auc', nfold = N_FOLDS)
print('The maximum validation ROC AUC was: {:.5f}.'.format(cv_results['auc-mean'][-1]))



[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077125
[LightGBM] [Info] Start training from score 0.077125
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[31]	cv_agg's auc: 0.710248 + 0.0163919
The maximum validation ROC AUC was: 0.71025.


In [46]:
model.n_estimators = len(cv_results['auc-mean'])

model.fit(train_s_x, train_s_y)
preds = model.predict_proba(test_s_x)[:, 1]
baseline_auc = roc_auc_score(test_s_y, preds)

print('The baseline model scores {:.5f} ROC AUC on the test set.'.format(baseline_auc))

The baseline model scores 0.73344 ROC AUC on the test set.


# 12. Hyperparameter Tuning

## 12.1 Evaluation Function

In [97]:
def objective(hyperparameters, i):
    
    if 'n_estimators' in hyperparameters.keys():
        del hyperparameters['n_estimators']
    
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10000, nfold = N_FOLDS, metrics = 'auc', seed = 42, callbacks = [lgb.early_stopping(stopping_rounds = 100)])
    
    score = cv_results['auc-mean'][-1]
    estimators = len(cv_results['auc-mean'])
    hyperparameters['n_estimators'] = estimators 
    
    return [score, hyperparameters, i]

## 12.2 Grid

In [103]:
param_grid = {
    'boosting_type': ['gbdt', 'goss'], #, 'dart'
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    #'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
    'subsample': list(np.linspace(0.5, 1, 100)),
    'is_unbalance': [True, False]
}

## 12.3 Algorithm
We will be focusing on Grid Search and Random Search
### 12.3.1 Grid Search

In [59]:
#subsample = 1.0 if boosting_type == 'goss' else random.sample(param_grid['subsample'], 1)[0]

MAX_EVALS = 5
random_results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                              index = list(range(MAX_EVALS)))

grid_results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                              index = list(range(MAX_EVALS)))

In [68]:
import itertools

def grid_search(param_grid, max_evals = MAX_EVALS):
    
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                              index = list(range(MAX_EVALS)))
    
    # https://codereview.stackexchange.com/questions/171173/list-all-possible-permutations-from-a-python-dictionary-of-lists
    keys, values = zip(*param_grid.items())
    i = 0
    
    for v in itertools.product(*values):
        
        hyperparameters = dict(zip(keys, v))
        
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']
        
        eval_results = objective(hyperparameters, i)
        
        results.loc[i, :] = eval_results
        
        i += 1
        
        # Normally would not limit iterations - we do for purpose of speed
        if i > MAX_EVALS:
            break
       
    # Sort with best score on top
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    
    return results    

In [84]:
grid_results = grid_search(param_grid)




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info]



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info]



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info]



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info]



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info]



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info]

In [85]:
print('The best validation score was {:.5f}'.format(grid_results.loc[0, 'score']))
print('\nThe best hyperparameters were:')

import pprint
pprint.pprint(grid_results.loc[0, 'params'])

The best validation score was 0.73558

The best hyperparameters were:
{'boosting_type': 'gbdt',
 'colsample_bytree': 0.6,
 'is_unbalance': True,
 'learning_rate': 0.004999999999999999,
 'min_child_samples': 20,
 'n_estimators': 767,
 'num_leaves': 20,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 0.5}


In [86]:
grid_search_params = grid_results.loc[0, 'params']

# Create, train, test model
model = lgb.LGBMClassifier(**grid_search_params, random_state=42)
model.fit(train_s_x, train_s_y)

LGBMClassifier(colsample_bytree=0.6, is_unbalance=True,
               learning_rate=0.004999999999999999, n_estimators=767,
               num_leaves=20, random_state=42, subsample=0.5)

In [87]:
preds = model.predict_proba(test_s_x)[:, 1]
print('The best model from grid search scores {:.5f} ROC AUC on the test set.'.format(roc_auc_score(test_s_y, preds)))


The best model from grid search scores 0.74865 ROC AUC on the test set.


### 12.3.2 Random Search

In [107]:
def random_search(param_grid, max_evals = MAX_EVALS):
    
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                                  index = list(range(MAX_EVALS)))
    
    
    for i in range(MAX_EVALS):
        print('============================= {} ============='.format(i))
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        print(hyperparameters)
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']

        eval_results = objective(hyperparameters, i)
        print('asdasd')
        results.loc[i, :] = eval_results
    
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    return results 

In [108]:
random_results = random_search(param_grid)

print('The best validation score was {:.5f}'.format(random_results.loc[0, 'score']))
print('\nThe best hyperparameters were:')

pprint.pprint(random_results.loc[0, 'params'])

{'boosting_type': 'goss', 'num_leaves': 21, 'learning_rate': 0.2797162853084688, 'min_child_samples': 380, 'reg_alpha': 0.9183673469387754, 'reg_lambda': 0.2040816326530612, 'colsample_bytree': 0.8222222222222222, 'subsample': 0.9797979797979799, 'is_unbalance': False}
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Using GOSS
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Using GOSS
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Using GOSS
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bin



[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077125
[LightGBM] [Info] Start training from score 0.077125
Training until validation scores don't improve for 100 rounds






Early stopping, best iteration is:
[31]	cv_agg's auc: 0.731368 + 0.0195143
asdasd
{'boosting_type': 'gbdt', 'num_leaves': 97, 'learning_rate': 0.009621754876151651, 'min_child_samples': 90, 'reg_alpha': 0.3877551020408163, 'reg_lambda': 0.7346938775510203, 'colsample_bytree': 0.9555555555555555, 'subsample': 0.8838383838383839, 'is_unbalance': True}
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077125
[LightGBM] [Info] Start training from score 0.077125
Training until validation scores don't improve for 100 rounds










































asdasd
{'boosting_type': 'gbdt', 'num_leaves': 51, 'learning_rate': 0.28888950589852513, 'min_child_samples': 85, 'reg_alpha': 0.2040816326530612, 'reg_lambda': 0.12244897959183673, 'colsample_bytree': 0.6888888888888889, 'subsample': 0.9747474747474748, 'is_unbalance': True}




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info]



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077125
[LightGBM] [Info] Start training from score 0.077125
Training until validation scores don't improve for 100 rounds
asdasd
{'boosting_type': 'gbdt', 'num_leaves': 112, 'learning_rate': 0.011358640665134524, 'min_child_samples': 185, 'reg_alpha': 0.673469387755102, 'reg_lambda': 0.5306122448979591, 'colsample_bytree': 0.9555555555555555, 'subsample': 0.7474747474747475, 'is_unbalance': False}
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55557
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 538
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077250
[LightGBM] [Info] Start training from score 0.077125
[LightGBM] [Info] Start training from score 0.077125
Training until validation scores don't improve for 100 rounds






























asdasd
The best validation score was 0.73137

The best hyperparameters were:
{'boosting_type': 'goss',
 'colsample_bytree': 0.8222222222222222,
 'is_unbalance': False,
 'learning_rate': 0.2797162853084688,
 'min_child_samples': 380,
 'n_estimators': 31,
 'num_leaves': 21,
 'reg_alpha': 0.9183673469387754,
 'reg_lambda': 0.2040816326530612,
 'subsample': 1.0}


In [109]:
random_search_params = random_results.loc[0, 'params']

model = lgb.LGBMClassifier(**random_search_params, random_state = 42)
model.fit(train_s_x, train_s_y)

preds = model.predict_proba(test_s_x)[:, 1]

print('The best model from random search scores {:.5f} ROC AUC on the test set.'.format(roc_auc_score(test_s_y, preds)))

The best model from random search scores 0.74403 ROC AUC on the test set.
