In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
train_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_err_data.csv')
train_problem = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_problem_data.csv')
test_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/test_err_data.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/data/sample_submission.csv')

In [4]:
train_err.head(2)

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1


In [5]:
print('model_nm의 종류:',train_err['model_nm'].unique(),'\n')
print('errtype의 종류:',np.sort(train_err['errtype'].unique()),'\n')
print('user_id 종류의 개수:',len(train_err['user_id'].unique()))

model_nm의 종류: ['model_3' 'model_2' 'model_0' 'model_1' 'model_7' 'model_4' 'model_5'
 'model_8' 'model_6'] 

errtype의 종류: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 30 31 32 33 34 35 36 37 38 39 40 41 42] 

user_id 종류의 개수: 15000


In [6]:
train_problem.head(2)

Unnamed: 0,user_id,time
0,19224,20201102200000
1,23664,20201116140000


In [7]:
test_err.head(2)

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2


In [8]:
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

In [9]:
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

for person_idx, err in tqdm(id_error):
  error[person_idx-train_user_id_min,err-1] +=1
error

100%|██████████| 16554663/16554663 [00:47<00:00, 349067.92it/s]


array([[  0.,   0.,   8., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ..., 113.,  56.,   1.],
       [  0.,   0.,   2., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,  58.,   8.,   5.],
       [  0.,   0.,   0., ...,   6.,   0.,   0.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.]])

In [10]:
error.shape

(15000, 42)

In [11]:
# train_problem.csv
print('user_id 최소값 :',train_problem['user_id'].min())
print('user_id 최대값 :',train_problem['user_id'].max())
print('train_problem 개수:',len(train_problem))

user_id 최소값 : 10001
user_id 최대값 : 24998
train_problem 개수: 5429


In [12]:
problem = np.zeros(15000)
problem[train_problem['user_id'].unique()-train_user_id_min] = 1
problem.shape

(15000,)

In [13]:
# light_gbm 모델 훈련
train_x = error
train_y = problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)
(15000,)


In [41]:
def f_pr_auc(probas_pred,y_true):
  labels = y_true.get_label()
  p, r, _ = precision_recall_curve(labels,probas_pred)
  score = auc(r,p)
  return 'pr_auc',score,True

models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5

# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }
                
# 5 Kfold cross validation
k_fold = KFold(n_splits=3, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 10
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

Training until validation scores don't improve for 10 rounds.
[20]	valid_0's auc: 0.797673	valid_0's pr_auc: 0.792522
[40]	valid_0's auc: 0.803789	valid_0's pr_auc: 0.799645
Early stopping, best iteration is:
[44]	valid_0's auc: 0.80377	valid_0's pr_auc: 0.800085
Training until validation scores don't improve for 10 rounds.
[20]	valid_0's auc: 0.805071	valid_0's pr_auc: 0.314401
Early stopping, best iteration is:
[16]	valid_0's auc: 0.805031	valid_0's pr_auc: 0.378328
Training until validation scores don't improve for 10 rounds.
[20]	valid_0's auc: 0.780506	valid_0's pr_auc: 0.38051
Early stopping, best iteration is:
[10]	valid_0's auc: 0.769998	valid_0's pr_auc: 0.384243
Training until validation scores don't improve for 10 rounds.
[20]	valid_0's auc: 0.811374	valid_0's pr_auc: 0.545099
Early stopping, best iteration is:
[10]	valid_0's auc: 0.807468	valid_0's pr_auc: 0.551358
Training until validation scores don't improve for 10 rounds.
[20]	valid_0's auc: 0.802906	valid_0's pr_auc: 0

In [42]:
# 교차검증 점수 확인
print(np.mean(auc_scores))

0.7982904396441357


In [27]:
# 제출 파일 생성
test_err.head(2)

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2


In [28]:
print('test_err의 user_id 최소값 :' ,test_err['user_id'].min())
print('test_err의 user_id 최대값 :' ,test_err['user_id'].max())
print('test_err의 user_id 개수 :' ,len(test_err['user_id'].unique()))

test_err의 user_id 최소값 : 30000
test_err의 user_id 최대값 : 44998
test_err의 user_id 개수 : 14998


In [29]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [21]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
  test_x[person_idx-test_user_id_min,err-1]+=1
test_x = test_x.reshape(test_x.shape[0],-1)
test_x.shape

100%|██████████| 16532648/16532648 [00:48<00:00, 343841.96it/s]


(14999, 42)

In [43]:
# 예측
pred_y_list =[]
for model in models:
  pred_y = model.predict(test_x)
  pred_y_list.append(pred_y.reshape(-1,1))
pred_ensemble = np.mean(pred_y_list,axis=0)
pred_ensemble

array([[0.7376916 ],
       [0.22536515],
       [0.30857069],
       ...,
       [0.44003607],
       [0.7402628 ],
       [0.35199055]])

In [44]:
pred = np.where(pred_ensemble>threshold, 1,0)
pred

array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [45]:
sample_submission['problem'] = pred
sample_submission.to_csv('/content/drive/MyDrive/데이콘/submission/baseline_kfold3_submission.csv')