In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import gc
import os
import pickle
import random

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

In [2]:
# One seed drives both Python's and NumPy's global RNGs; the same value is
# later handed to LightGBM as `random_state`.
seed = 2626
random.seed(seed)
np.random.seed(seed)

In [3]:
def my_read_csv(path):
    """Load a training CSV as a de-duplicated 2-D array with the label first.

    This helper used to exist only as commented-out code even though the
    data-loading cell calls it when ``DEBUG`` is True, which raised a
    ``NameError``. It is restored here as a real definition.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``. The CSV must contain the
        columns ``id``, ``match_id`` and ``label``.

    Returns
    -------
    numpy.ndarray
        The unique rows of the table (sorted, as ``np.unique`` does), with
        ``id`` / ``match_id`` removed and ``label`` moved to column 0.
        Assumes the remaining columns are numeric so that row-wise
        ``np.unique`` can compare them -- TODO confirm against the real CSV.
    """
    # Drop the identifier columns without mutating an intermediate frame in place.
    df = pd.read_csv(path).drop(columns=['id', 'match_id'])
    # Move the label to the front: downstream cells read y from column 0.
    label = df.pop('label')
    df.insert(0, 'label', label)
    return np.unique(df.values, axis=0)

In [4]:
# DEBUG=True loads a single CSV through `my_read_csv` (which must be defined
# before this cell runs); otherwise the five pickled splits are loaded and
# stacked row-wise into one array.
DEBUG = False

DATA_DIR = '../input/mytrainingdata2'

if DEBUG:
    data = my_read_csv(f'{DATA_DIR}/train_data1.csv')
else:
    # Collect the splits under their own name so `data` is only ever the
    # final array, never a list.
    chunks = []
    for i in tqdm(range(1, 6)):
        # Build the path from DATA_DIR so relocating the dataset is a one-line edit.
        # NOTE: pickle.load can execute arbitrary code -- only load files you trust.
        with open(f'{DATA_DIR}/train_split{i}.pkl', 'rb') as handle:
            chunks.append(pickle.load(handle))
    data = np.concatenate(chunks, axis=0)
    # Drop the per-split references so only the concatenated copy stays alive.
    del chunks

100%|██████████| 5/5 [00:27<00:00,  5.44s/it]


In [5]:
# Shuffled row positions; later cells slice consecutive `len_valid`-sized
# windows out of this array to form the validation folds.
n_rows = len(data)
idx_full = np.arange(n_rows)
random.shuffle(idx_full)  # shuffles in place using Python's seeded RNG
len_valid = int(n_rows * 0.2)

In [6]:
# Force a full garbage-collection pass before training. Because this is the
# cell's last expression, the value returned by gc.collect() is what the
# notebook displays underneath.
gc.collect()

72

In [7]:
# Hyper-parameters shared by every fold's LightGBM classifier.
params = {
    'objective': "binary",
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': seed,

    'max_depth': 8,
    'num_leaves': 32,
    'n_estimators': 1600,  # upper bound; early stopping may end training sooner
    "colsample_bytree": 0.9,
}

# Train one model per fold and persist it as lgb{fold_idx}.pkl.
for fold_idx in range(5):
    print(f'=== fold_idx {fold_idx} ===')

    # Fold k validates on the k-th `len_valid`-sized window of the shuffled
    # index and trains on every remaining position.
    s, e = fold_idx * len_valid, (fold_idx + 1) * len_valid

    valid_idx = idx_full[s: e]
    train_idx = np.delete(idx_full, np.arange(s, e))

    model = lgb.LGBMClassifier(**params)
    # Column 0 is the target, the remaining columns are the features.
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
    # deprecated in LightGBM 3.3 and removed in 4.0. Requires LightGBM >= 3.3.
    model.fit(
        data[train_idx, 1:], data[train_idx, 0],
        eval_set=[(data[valid_idx, 1:], data[valid_idx, 0])],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=20),
            lgb.log_evaluation(period=50),
        ],
    )

    with open(f'lgb{fold_idx}.pkl', 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Release the fitted model before the next fold starts.
    del model
    gc.collect()

=== fold_idx 0 ===
[50]	valid_0's auc: 0.981457	valid_0's binary_logloss: 0.0707262
[100]	valid_0's auc: 0.982833	valid_0's binary_logloss: 0.0682088
[150]	valid_0's auc: 0.98352	valid_0's binary_logloss: 0.0669009
[200]	valid_0's auc: 0.984021	valid_0's binary_logloss: 0.0659714
[250]	valid_0's auc: 0.984377	valid_0's binary_logloss: 0.0652809
[300]	valid_0's auc: 0.984716	valid_0's binary_logloss: 0.0646042
[350]	valid_0's auc: 0.984986	valid_0's binary_logloss: 0.0640457
[400]	valid_0's auc: 0.985236	valid_0's binary_logloss: 0.0634865
[450]	valid_0's auc: 0.985475	valid_0's binary_logloss: 0.0630288
[500]	valid_0's auc: 0.985699	valid_0's binary_logloss: 0.0625571
[550]	valid_0's auc: 0.985878	valid_0's binary_logloss: 0.0621707
[600]	valid_0's auc: 0.986059	valid_0's binary_logloss: 0.0617842
[650]	valid_0's auc: 0.986203	valid_0's binary_logloss: 0.0614617
[700]	valid_0's auc: 0.986367	valid_0's binary_logloss: 0.061153
[750]	valid_0's auc: 0.986554	valid_0's binary_logloss: 0.06

In [8]:
# Read the five per-fold classifiers back from disk, in fold order.
models = []
for fold_no in range(5):
    model_path = f'lgb{fold_no}.pkl'
    with open(model_path, 'rb') as fh:
        models.append(pickle.load(fh))

In [9]:
# Out-of-fold predictions: each model scores only the validation window it
# was evaluated on during training.
preds = []   # per-fold positive-class probabilities
valids = []  # per-fold ground-truth labels, aligned with `preds`
for fold_idx in range(5):
    print(f'=== fold_idx {fold_idx} ===')

    # Rebuild the same validation window the training loop used for this fold.
    # The training indices are not needed for prediction, so they are no
    # longer materialised here.
    s, e = fold_idx * len_valid, (fold_idx + 1) * len_valid
    valid_idx = idx_full[s: e]

    # Column 0 is the label, the remaining columns are the features.
    valid_x, valid_y = data[valid_idx, 1:], data[valid_idx, 0]
    # Keep only the probability of the positive class (second column).
    pred = models[fold_idx].predict_proba(valid_x)[:, 1]

    valids.append(valid_y)
    preds.append(pred)

=== fold_idx 0 ===
=== fold_idx 1 ===
=== fold_idx 2 ===
=== fold_idx 3 ===
=== fold_idx 4 ===


In [10]:
# def analysis(valid_y, pred, threshold):
#     print(f'=== th {threshold} ===')
#     _pred = pred > threshold
#     print(f'f1: {f1_score(valid_y, _pred):.6f}')
#     print(f'precision: {precision_score(valid_y, _pred)}')
#     print(f'recall: {recall_score(valid_y, _pred)}')
#     print(f'acc: {accuracy_score(valid_y, _pred)}')

# Coarse sweep over decision thresholds: for every fold report the threshold
# with the highest F1 (the lowest such threshold wins on ties).
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in range(5):
    scored = [(f1_score(valids[i], preds[i] > th), th) for th in thresholds]
    # max() returns the first maximal element, so ties resolve to the earliest threshold.
    best_f1, best_th = max(scored, key=lambda pair: pair[0])
    print(f'fold{i}, best_f1: {best_f1}, best_th: {best_th}')

fold0, best_f1: 0.8259396057561195, best_th: 0.4
fold1, best_f1: 0.8262430479235355, best_th: 0.4
fold2, best_f1: 0.8261678503551066, best_th: 0.4
fold3, best_f1: 0.8248032287909466, best_th: 0.4
fold4, best_f1: 0.8257143240955925, best_th: 0.4


In [11]:
# with open('../input/my-training-data/train_data1.csv', 'r') as f:
#     fl = f.readline()
#     columns = [s.strip() for s in fl.split(',')]
# columns