In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

In [2]:
# Name of the dataset snapshot to use; it is interpolated into the
# ../../data/kalapa/ input and output paths used by this notebook.
data_version = "02-01-2021-v6"

In [3]:
# Load the train and test CSVs of the selected data version.
df_train = pd.read_csv(f"../../data/kalapa/{data_version}/train.csv")
df_test = pd.read_csv(f"../../data/kalapa/{data_version}/test.csv")

In [4]:
# Separate the target column from the remaining training columns.
y = df_train["label"]
train = df_train.drop(columns=["label"])

In [5]:
def gini(y_true, y_score):
    """Return the gini coefficient of ``y_score``, computed as ``2 * AUC - 1``."""
    auc = roc_auc_score(y_true, y_score)
    return 2 * auc - 1

def lgb_gini(y_pred, dataset_true):
    """Custom evaluation function handed to ``lgb.train`` through ``feval``.

    Reads the true labels from ``dataset_true`` and returns the tuple
    ``('gini', value, True)``; the trailing ``True`` marks the metric as
    higher-is-better.
    """
    score = gini(dataset_true.get_label(), y_pred)
    return 'gini', score, True

In [6]:
def to_category(df_fe, columns=None):
    """Cast object-dtype columns of ``df_fe`` to pandas ``category`` dtype.

    Parameters
    ----------
    df_fe : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    columns : iterable of str, optional
        Columns to inspect. When omitted, the notebook-level ``cols`` global
        is used, so existing ``to_category(df)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df_fe`` object that was passed in.

    Raises
    ------
    KeyError
        If one of the inspected columns is not present in ``df_fe``.
    """
    if columns is None:
        # Backward-compatible fallback to the global defined elsewhere in the
        # notebook; pass ``columns`` explicitly to avoid relying on cell order.
        columns = cols
    for col in columns:
        # Only string-like (object) columns are converted; others are left alone.
        if df_fe[col].dtype.name == "object":
            df_fe[col] = df_fe[col].astype('category')
    return df_fe

In [7]:
# Feature columns: everything from the third column of the raw training frame onward.
cols = df_train.iloc[:, 2:].columns

# Object columns become categoricals. Note that `test` is the same object as
# `df_test`, because to_category converts in place and returns its argument.
train = to_category(train)
test = to_category(df_test)

# Columns with at most three distinct values in train are also treated as
# categorical, in both frames.
col2 = [col for col in cols if len(train[col].value_counts()) <= 3]
for col in col2:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [8]:
# LightGBM parameters passed to lgb.train inside kfold.
lgbm_param = dict(
    boosting_type='gbdt',
    colsample_bytree=0.6602479798930369,
    is_unbalance=False,
    learning_rate=0.00746275526696824,
    max_depth=15,
    metric='auc',
    min_child_samples=25,
    num_leaves=60,
    objective='binary',
    reg_alpha=0.4693391197064131,
    reg_lambda=0.16175478669541327,
    subsample_for_bin=60000,
)

In [9]:
# Upper bound on boosting iterations passed to lgb.train in kfold; training can
# end earlier because kfold also requests early stopping (400 rounds).
NUM_BOOST_ROUND= 10000

In [10]:
def kfold(train_fe, y_label, test_fe, seeds=(6484,), n_splits=5):
    """Train LightGBM with stratified K-fold CV and average the test predictions.

    For every seed the training rows are split into ``n_splits`` stratified
    folds. One model is trained per fold, with early stopping evaluated on the
    held-out fold, and its prediction for ``test_fe`` is added to a running
    mean. Per-fold, per-seed and overall train/validation gini are printed.

    Parameters
    ----------
    train_fe : pandas.DataFrame
        Training features. Must contain an ``id`` column, which is dropped
        before fitting.
    y_label : array-like
        Target values aligned row by row with ``train_fe``.
    test_fe : pandas.DataFrame
        Test features. Must contain an ``id`` column, which is dropped before
        predicting.
    seeds : iterable of int, optional
        Random seeds; each one drives both the fold shuffling and LightGBM's
        ``random_state``. The default reproduces the value that was
        previously hard-coded.
    n_splits : int, optional
        Number of stratified folds per seed.

    Returns
    -------
    numpy.ndarray
        Mean of the test predictions of all ``len(seeds) * n_splits`` models.

    Raises
    ------
    ValueError
        If ``seeds`` is empty.
    """
    seeds = [int(seed) for seed in seeds]
    if not seeds:
        raise ValueError("seeds must contain at least one value")

    # StratifiedKFold yields positional indices, so index a plain array rather
    # than a Series (whose [] lookup is label based).
    y_all = np.asarray(y_label)
    X_all = train_fe.drop(columns=["id"])
    # The test matrix is identical for every model, so build it once.
    X_test = test_fe.drop(columns=["id"])

    n_models = len(seeds) * n_splits
    preds = 0
    avg_train_gini = 0
    avg_val_gini = 0

    for s in seeds:
        skf = StratifiedKFold(n_splits=n_splits, random_state=s, shuffle=True)
        # Per-seed copy so the shared lgbm_param dict is not mutated.
        params = dict(lgbm_param, random_state=s)
        seed_train_gini = 0
        seed_val_gini = 0
        for i, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(y_all)), y_all)):
            X_train, X_val = X_all.iloc[train_idx], X_all.iloc[val_idx]
            y_train, y_val = y_all[train_idx], y_all[val_idx]

            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_val, y_val)

            # NOTE(review): early_stopping_rounds / verbose_eval are the keyword
            # form this notebook was run with; LightGBM >= 4.0 expects the
            # lgb.early_stopping / lgb.log_evaluation callbacks instead.
            model = lgb.train(params,
                              lgb_train,
                              num_boost_round=NUM_BOOST_ROUND,
                              early_stopping_rounds=400,
                              feval=lgb_gini,
                              verbose_eval=200,
                              valid_sets=[lgb_train, lgb_eval])

            # valid_sets order is [train, eval], hence "training" / "valid_1".
            train_gini = model.best_score["training"]["gini"]
            val_gini = model.best_score["valid_1"]["gini"]

            seed_train_gini += train_gini / n_splits
            seed_val_gini += val_gini / n_splits
            avg_train_gini += train_gini / n_models
            avg_val_gini += val_gini / n_models

            preds += model.predict(X_test) / n_models

            print("Fold {}: {}/{}".format(i, train_gini, val_gini))
        print("Seed {}: {}/{}".format(s, seed_train_gini, seed_val_gini))

    print("-" * 30)
    print("Avg train gini: {}".format(avg_train_gini))
    print("Avg valid gini: {}".format(avg_val_gini))
    print("=" * 30)
    return preds

In [11]:
# Run the cross-validated training; `preds` is the model output for the test
# rows, averaged over all folds.
preds = kfold(train, y, test)

Training until validation scores don't improve for 400 rounds
[200]	training's auc: 0.776093	training's gini: 0.552185	valid_1's auc: 0.734756	valid_1's gini: 0.469513
[400]	training's auc: 0.799804	training's gini: 0.599609	valid_1's auc: 0.741512	valid_1's gini: 0.483023
[600]	training's auc: 0.821082	training's gini: 0.642164	valid_1's auc: 0.743951	valid_1's gini: 0.487901
[800]	training's auc: 0.840003	training's gini: 0.680005	valid_1's auc: 0.745462	valid_1's gini: 0.490925
[1000]	training's auc: 0.855501	training's gini: 0.711001	valid_1's auc: 0.746179	valid_1's gini: 0.492358
[1200]	training's auc: 0.868135	training's gini: 0.736269	valid_1's auc: 0.746693	valid_1's gini: 0.493387
[1400]	training's auc: 0.878805	training's gini: 0.75761	valid_1's auc: 0.747079	valid_1's gini: 0.494158
[1600]	training's auc: 0.888098	training's gini: 0.776197	valid_1's auc: 0.747101	valid_1's gini: 0.494201
[1800]	training's auc: 0.896315	training's gini: 0.79263	valid_1's auc: 0.74744	valid_1

Fold 4: 0.8035950470944138/0.49597683869803455
Seed 2877: 0.8212204521083153/0.4995512788672322
------------------------------
Avg train gini: 0.8212204521083153
Avg valid gini: 0.4995512788672322


In [12]:
# The averaged predictions are stored as-is under "label"; the earlier 0.5
# thresholding into hard 0/1 labels is intentionally not applied.
new_label = pd.DataFrame({"label": preds})

In [13]:
# Assemble id, predicted label, then every test column after the first one.
# NOTE(review): assumes `id` is the first column of df_test - confirm.
new_data = pd.concat(
    [df_test["id"], new_label, df_test.iloc[:, 1:]],
    axis=1,
)

In [14]:
# Write the test rows with their predicted labels next to the input data.
# NOTE(review): the file name suggests it is reused as extra training data
# (pseudo-labelling) - confirm with the downstream notebook.
new_data.to_csv(f"../../data/kalapa/{data_version}/new_train.csv", index = False)