In [None]:
# Mount Google Drive at /content/drive so the data files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Directory on the mounted Drive that holds the CSV files. Kept in one place so
# the notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = '/content/drive/MyDrive/데이콘/data'

# train_err / test_err: one row per logged error (user_id, time, model_nm, fwver,
# errtype, errcode). train_problem: rows keyed by user_id, used below for labels.
train_err = pd.read_csv(f'{DATA_DIR}/train_err_data.csv')
train_problem = pd.read_csv(f'{DATA_DIR}/train_problem_data.csv')
test_err = pd.read_csv(f'{DATA_DIR}/test_err_data.csv')

In [None]:
# Peek at the first two rows of the training error log.
train_err.head(2)

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1


In [None]:
# Summarise the categorical columns of the training error log:
# distinct model names, sorted distinct error types, and the number of users.
model_names = train_err['model_nm'].unique()
err_types_sorted = np.sort(train_err['errtype'].unique())
n_train_users = len(train_err['user_id'].unique())

print('model_nm의 종류:', model_names, '\n')
print('errtype의 종류:', err_types_sorted, '\n')
print('user_id 종류의 개수:', n_train_users)

model_nm의 종류: ['model_3' 'model_2' 'model_0' 'model_1' 'model_7' 'model_4' 'model_5'
 'model_8' 'model_6'] 

errtype의 종류: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 30 31 32 33 34 35 36 37 38 39 40 41 42] 

user_id 종류의 개수: 15000


In [None]:
# Peek at the first two rows of the training problem table.
train_problem.head(2)

In [None]:
# Peek at the first two rows of the test error log.
test_err.head(2)

In [None]:
# Hard-coded user_id bounds for the training set. Subtracting the minimum from a
# user_id gives that user's zero-based row index in the feature/label arrays.
train_user_id_min, train_user_id_max = 10000, 24999
train_user_number = train_user_id_max - train_user_id_min + 1  # 15000 rows

In [None]:
# Per-user error-type histogram: error[u, t] is the number of rows in train_err
# with user_id == u + train_user_id_min and errtype == t + 1.
# 42 columns cover errtype values 1..42.
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

# np.add.at is an unbuffered in-place add, so every repeated (user, errtype)
# pair is counted; a plain `error[rows, cols] += 1` would count each distinct
# pair only once. This replaces a Python-level loop over every log row.
user_rows = id_error[:, 0] - train_user_id_min
errtype_cols = id_error[:, 1] - 1
np.add.at(error, (user_rows, errtype_cols), 1)
error

100%|██████████| 16554663/16554663 [00:40<00:00, 403898.17it/s]


array([[  0.,   0.,   8., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ..., 113.,  56.,   1.],
       [  0.,   0.,   2., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,  58.,   8.,   5.],
       [  0.,   0.,   0., ...,   6.,   0.,   0.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.]])

In [None]:
# Sanity check: expect (train_user_number, 42).
error.shape

(15000, 42)

In [None]:
# train_problem.csv: smallest / largest user_id present and the number of rows
problem_user_ids = train_problem['user_id']
print('user_id 최소값 :', problem_user_ids.min())
print('user_id 최대값 :', problem_user_ids.max())
print('train_problem 개수:', len(train_problem))

user_id 최소값 : 10001
user_id 최대값 : 24998
train_problem 개수: 5429


In [None]:
# Binary label per training user: 1 if the user_id appears in train_problem at
# least once, otherwise 0. Row index = user_id - train_user_id_min, and the
# length comes from train_user_number (instead of a repeated literal) so the
# labels always line up with the rows of the feature matrix.
problem = np.zeros(train_user_number)
problem[train_problem['user_id'].unique()-train_user_id_min] = 1
problem.shape

(15000,)

In [None]:
# Train the LightGBM model: features are the per-user errtype counts,
# labels are the per-user problem flags.
train_x, train_y = error, problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)
(15000,)


In [None]:
def f_pr_auc(probas_pred,y_true):
  """Custom LightGBM evaluation metric: area under the precision-recall curve.

  Args:
    probas_pred: predicted scores handed in by LightGBM for the evaluated set.
    y_true: despite the name, not a label array but the evaluation dataset
      object; the ground-truth labels are read from it via `.get_label()`.

  Returns:
    A tuple `('pr_auc', score, True)`: metric name, metric value, and the
    flag LightGBM interprets as "higher is better".
  """
  precision, recall, _thresholds = precision_recall_curve(y_true.get_label(), probas_pred)
  return 'pr_auc', auc(recall, precision), True

# Per-fold artifacts: the fitted boosters (reused later for the test ensemble)
# and the validation scores of each fold.
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5  # probability cut-off used to turn scores into 0/1 predictions

# Parameter settings
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }

# Logging / early stopping are passed as callbacks: the `verbose_eval` and
# `early_stopping_rounds` keyword arguments of lgb.train were removed in
# LightGBM 4.x. Older releases name the logging callback `print_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# 6-fold cross validation (shuffled, fixed random_state so splits are repeatable)
k_fold = KFold(n_splits=6, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)

    # run training
    # Callbacks are created fresh for every fold so that no early-stopping
    # state is carried over from the previous fold.
    # NOTE(review): both 'auc' and the custom pr_auc are reported on the
    # validation set; per the LightGBM docs early stopping then checks every
    # metric. Consider first_metric_only if only AUC should drive stopping.
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = [d_val],
                        feval           = f_pr_auc,
                        callbacks       = [
                            lgb.early_stopping(5),  # stop after 5 rounds without improvement
                            log_evaluation(20),     # log validation metrics every 20 rounds
                        ],
                       )

    # validation predictions: probabilities and thresholded 0/1 labels
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)

    # fold scores: recall / precision on hard labels, ROC AUC on probabilities
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # keep the fold's model and scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

Training until validation scores don't improve for 3 rounds.
[20]	valid_0's auc: 0.797673	valid_0's pr_auc: 0.792522
Early stopping, best iteration is:
[25]	valid_0's auc: 0.801171	valid_0's pr_auc: 0.797338
Training until validation scores don't improve for 3 rounds.
Early stopping, best iteration is:
[2]	valid_0's auc: 0.788479	valid_0's pr_auc: 0.318902
Training until validation scores don't improve for 3 rounds.
Early stopping, best iteration is:
[2]	valid_0's auc: 0.758498	valid_0's pr_auc: 0.358092
Training until validation scores don't improve for 3 rounds.
Early stopping, best iteration is:
[10]	valid_0's auc: 0.807468	valid_0's pr_auc: 0.551358
Training until validation scores don't improve for 3 rounds.
Early stopping, best iteration is:
[14]	valid_0's auc: 0.802301	valid_0's pr_auc: 0.974445


In [None]:
# Check the cross-validation score (mean of the per-fold ROC AUC values)
print(np.mean(auc_scores))

0.7915834292563759


In [None]:
# Create the submission file: start by peeking at the first two rows of the test error log
test_err.head(2)

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2


In [None]:
# Smallest / largest user_id in the test error log and the number of distinct users.
test_user_ids = test_err['user_id']
print('test_err의 user_id 최소값 :' , test_user_ids.min())
print('test_err의 user_id 최대값 :' , test_user_ids.max())
print('test_err의 user_id 개수 :' , len(test_user_ids.unique()))

test_err의 user_id 최소값 : 30000
test_err의 user_id 최대값 : 44998
test_err의 user_id 개수 : 14998


In [None]:
# Hard-coded user_id bounds for the test set. The row count spans the whole id
# range (max - min + 1), so an id with no log rows still gets an all-zero row.
test_user_id_min, test_user_id_max = 30000, 44998
test_user_number = test_user_id_max - test_user_id_min + 1  # 14999 rows

In [None]:
# Per-user error-type histogram for the test set: test_x[u, t] is the number of
# rows in test_err with user_id == u + test_user_id_min and errtype == t + 1.
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))

# np.add.at is an unbuffered in-place add, so repeated (user, errtype) pairs
# are each counted. This replaces a Python-level loop over every log row.
# The array is already 2-D, so no reshape is needed afterwards.
user_rows = id_error[:, 0] - test_user_id_min
errtype_cols = id_error[:, 1] - 1
np.add.at(test_x, (user_rows, errtype_cols), 1)
test_x.shape


  0%|          | 0/16532648 [00:00<?, ?it/s][A
  0%|          | 40347/16532648 [00:00<00:40, 403468.46it/s][A
  0%|          | 74487/16532648 [00:00<00:43, 382599.35it/s][A
  1%|          | 110321/16532648 [00:00<00:43, 374981.11it/s][A
  1%|          | 150828/16532648 [00:00<00:42, 383526.27it/s][A
  1%|          | 190843/16532648 [00:00<00:42, 388366.37it/s][A
  1%|▏         | 230590/16532648 [00:00<00:41, 391051.55it/s][A
  2%|▏         | 268584/16532648 [00:00<00:41, 387649.14it/s][A
  2%|▏         | 309316/16532648 [00:00<00:41, 393347.53it/s][A
  2%|▏         | 349928/16532648 [00:00<00:40, 397093.11it/s][A
  2%|▏         | 388071/16532648 [00:01<00:41, 391339.73it/s][A
  3%|▎         | 426128/16532648 [00:01<00:42, 378178.44it/s][A
  3%|▎         | 463288/16532648 [00:01<00:43, 373656.69it/s][A
  3%|▎         | 500518/16532648 [00:01<00:42, 373236.98it/s][A
  3%|▎         | 537528/16532648 [00:01<00:43, 371481.37it/s][A
  3%|▎         | 574459/16532648 [00:01<00:

(14999, 42)

In [None]:
# Prediction: score the test features with every fold model and average the
# resulting probability columns into one ensemble score per user.
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    # keep each fold's scores as a column vector, shape (n_users, 1)
    pred_y_list.append(pred_y.reshape(len(pred_y), 1))
pred_ensemble = np.mean(pred_y_list, axis=0)
pred_ensemble

array([[0.58413718],
       [0.26337558],
       [0.31536181],
       ...,
       [0.38028413],
       [0.6155689 ],
       [0.33541155]])

In [None]:
# Turn the averaged probabilities into 0/1 labels using the shared cut-off.
pred = (pred_ensemble > threshold).astype(int)
pred

array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])