In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/LightGBM warnings that may point at real problems.
warnings.filterwarnings(action='ignore')

# Prefix for the input CSV files; it is concatenated directly with the file
# names, so the trailing slash is required.
PATH = 'data/'

In [2]:
# Load the error-log table and the problem table.
train_err  = pd.read_csv(PATH + 'train_err_data.csv')
train_prob = pd.read_csv(PATH + 'train_problem_data.csv')

# Map every distinct model name to an integer label and overwrite the column
# with those labels; `encoder` keeps the fitted mapping.
encoder = LabelEncoder().fit(sorted(train_err['model_nm'].unique()))
train_err['model_nm'] = encoder.transform(train_err['model_nm'])

In [3]:
# Display the error log; model_nm has been replaced by its integer label in the previous cell.
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,3,05.15.2138,15,1
1,10000,20201101030309,3,05.15.2138,12,1
2,10000,20201101030309,3,05.15.2138,11,1
3,10000,20201101050514,3,05.15.2138,16,1
4,10000,20201101050515,3,05.15.2138,4,0
...,...,...,...,...,...,...
16554658,24999,20201130163051,3,05.15.2138,15,1
16554659,24999,20201130172625,3,05.15.2138,16,1
16554660,24999,20201130172625,3,05.15.2138,4,0
16554661,24999,20201130172631,3,05.15.2138,4,0


In [40]:
# Keep only the first 8 characters of each timestamp (the YYYYMMDD date part of
# values such as 20201101025616). The vectorised .str accessor replaces a
# Python-level loop over every row of the error log and yields the same strings.
train_err['time'] = train_err['time'].astype(str).str[:8]

In [41]:
# Keep only the first 8 characters of each problem timestamp, using the
# vectorised .str accessor instead of a Python-level loop (same resulting strings).
train_prob['time'] = train_prob['time'].astype(str).str[:8]

In [42]:
# Number of distinct error codes kept as features.
err_space = 30

# The `err_space` most frequent error codes (value_counts orders by descending count).
temp = list(train_err['errcode'].value_counts().index[:err_space])

# Rows whose error code is one of the kept codes. The explicit .copy() makes
# `train` an independent frame, so the column assignment below is not a write
# to a filtered selection of `train_err` (the SettingWithCopy pattern).
train = train_err[train_err['errcode'].isin(temp)].copy()

# Map the kept error codes to integer labels 0 .. n_codes-1.
encoder1 = LabelEncoder()
encoder1.fit(sorted(train['errcode'].unique()))

train['errcode'] = encoder1.transform(train['errcode'])

In [43]:
# (user_id, encoded errcode) pairs, one row per logged error.
id_error = train[['user_id','errcode']].values

# Count matrix: one row per user (user_id - 10000 is used as the row index,
# 15000 rows in total) and one column per encoded error code.
error = np.zeros((15000, err_space))

# np.add.at performs unbuffered in-place addition, so an index pair that occurs
# several times is counted several times, exactly like incrementing inside a
# Python loop, without iterating over millions of rows in the interpreter.
user_rows = id_error[:, 0].astype(np.intp) - 10000
code_cols = id_error[:, 1].astype(np.intp)
np.add.at(error, (user_rows, code_cols), 1)

error.shape

  0%|          | 0/16516166 [00:00<?, ?it/s]

(15000, 30)

In [44]:
# Binary target per user: 1 when the user appears in train_prob, otherwise 0.
# Row index is user_id - 10000, matching the 15000-row layout used for `error`.
problem_rows = train_prob['user_id'].unique() - 10000

problem = np.zeros(15000)
problem[problem_rows] = 1
problem.shape

(15000,)

In [45]:
# Train
#-------------------------------------------------------------------------------------
# Custom evaluation function, defined to monitor a validation score during training
def f_pr_auc(probas_pred, y_true):
    """Compute the area under the precision-recall curve for LightGBM's `feval`.

    Args:
        probas_pred: predicted scores, passed to precision_recall_curve.
        y_true: object providing get_label(); the true labels are read from it.

    Returns:
        A tuple ("pr_auc", score, True). The trailing True is the
        is-higher-better flag of LightGBM's custom-metric convention.
    """
    precision, recall, _ = precision_recall_curve(y_true.get_label(), probas_pred)
    return "pr_auc", auc(recall, precision), True
#-------------------------------------------------------------------------------------
# Per-fold results collected by the cross-validation loop
models, recalls, precisions, auc_scores = [], [], [], []

# Probability cut-off used to turn predicted probabilities into 0/1 labels
threshold = 0.5

# LightGBM parameter settings
# (an alternative 'objective': 'tweedie' had been left commented out here)
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    seed=1015,
)
#-------------------------------------------------------------------------------------
# 5-fold stratified cross-validation
# NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed from
# lgb.train() in LightGBM 4.0 in favour of the log_evaluation / early_stopping
# callbacks; this cell requires an older release - confirm the pinned version.
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(error, problem):

    # Features and labels for the training part and the held-out part of this fold
    X, valid_x = error[train_idx], error[val_idx]
    y, valid_y = problem[train_idx], problem[val_idx]

    d_train = lgb.Dataset(X, y)
    d_val = lgb.Dataset(valid_x, valid_y)

    # Train; the held-out part is evaluated every round and drives early stopping
    model = lgb.train(
        params,
        train_set=d_train,
        num_boost_round=1000,
        valid_sets=d_val,
        feval=f_pr_auc,
        verbose_eval=20,
        early_stopping_rounds=5,
    )

    # Predicted probabilities on the held-out part, and 0/1 labels at `threshold`
    valid_prob = model.predict(valid_x)
    valid_pred = (valid_prob > threshold).astype(int)

    # Fold scores
    recall = recall_score(valid_y, valid_pred)
    precision = precision_score(valid_y, valid_pred)
    auc_score = roc_auc_score(valid_y, valid_prob)

    # Store the fitted model together with its scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3458
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training until validation scores don't improve for 5 rounds
[20]	valid_0's auc: 0.728232	valid_0's pr_auc: 0.590658
[40]	valid_0's auc: 0.732032	valid_0's pr_auc: 0.592537
Early stopping, best iteration is:
[36]	valid_0's auc: 0.731389	valid_0's pr_auc: 0.595136
[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3489
[LightGBM] [Info] Number of data points in the train set: 12

In [46]:
# Mean ROC-AUC over the cross-validation folds.
mean_auc = np.mean(auc_scores)
print(mean_auc)

0.7416520999999999
