In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import warnings
warnings.filterwarnings(action='ignore')
import lightgbm as lgb
from sklearn.metrics import *
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix

In [5]:
train_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_err_data.csv')
train_problem = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_problem_data.csv')
test_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/test_err_data.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/data/sample_submission.csv')

In [6]:
problem = np.zeros(15000)
problem[train_problem.user_id.unique()-10000] = 1 
problem.shape

(15000,)

In [7]:
id_error = train_err[['user_id','errtype']].values
error = np.zeros((15000,42))

for person_idx, err in tqdm(id_error):
  error[person_idx-10000,err-1]+=1
error.shape

100%|██████████| 16554663/16554663 [00:47<00:00, 347746.53it/s]


(15000, 42)

In [8]:
problem_df = pd.DataFrame(problem)
problem_df.rename({0:'problem'},axis=1,inplace=True)
problem_df['problem'].value_counts()

0.0    10000
1.0     5000
Name: problem, dtype: int64

In [9]:
train_x = error
train_y = problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)
(15000,)


In [10]:
smote = SMOTE(random_state=0)
x_train_over, y_train_over = smote.fit_sample(train_x,train_y)
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트:',x_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 레이블 값 분포:\n',pd.Series(y_train_over).value_counts())

SMOTE 적용 전 학습용 피처/레이블 데이터 세트: (20000, 42) (20000,)
SMOTE 적용 후 레이블 값 분포:
 1.0    10000
0.0    10000
dtype: int64


In [27]:
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
accuracies   = []

threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=12, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(x_train_over):

    # split train, validation set
    X = x_train_over[train_idx]
    y = y_train_over[train_idx]
    valid_x = x_train_over[val_idx]
    valid_y = y_train_over[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 50
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)
    accuracy  = accuracy_score(valid_y,valid_pred)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    accuracies.append(accuracy)
    auc_scores.append(auc_score)

    print('==========================================================')

Training until validation scores don't improve for 50 rounds.
[20]	valid_0's auc: 0.905156	valid_0's pr_auc: 0.913827
[40]	valid_0's auc: 0.914372	valid_0's pr_auc: 0.923381
[60]	valid_0's auc: 0.915938	valid_0's pr_auc: 0.925776
[80]	valid_0's auc: 0.916335	valid_0's pr_auc: 0.926579
[100]	valid_0's auc: 0.915931	valid_0's pr_auc: 0.926356
[120]	valid_0's auc: 0.916233	valid_0's pr_auc: 0.926783
Early stopping, best iteration is:
[72]	valid_0's auc: 0.916771	valid_0's pr_auc: 0.92684
Training until validation scores don't improve for 50 rounds.
[20]	valid_0's auc: 0.896695	valid_0's pr_auc: 0.912348
[40]	valid_0's auc: 0.904529	valid_0's pr_auc: 0.920076
[60]	valid_0's auc: 0.906748	valid_0's pr_auc: 0.922779
[80]	valid_0's auc: 0.908503	valid_0's pr_auc: 0.924174
[100]	valid_0's auc: 0.908134	valid_0's pr_auc: 0.92413
[120]	valid_0's auc: 0.908174	valid_0's pr_auc: 0.924109
Early stopping, best iteration is:
[89]	valid_0's auc: 0.909251	valid_0's pr_auc: 0.924673
Training until valid

In [28]:
print(np.mean(auc_scores))
print(np.mean(precision))
print(np.mean(recall))
print(np.mean(accuracies))

0.9106728171597444
0.9966044142614601
0.7291925465838509
0.7901985029164637


In [14]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [15]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:48<00:00, 339529.24it/s]

(14999, 42)





In [29]:
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [17]:
# pred = np.where(pred_ensemble>0.5,1,0)
# pred

array([[1],
       [0],
       [0],
       ...,
       [1],
       [1],
       [0]])

In [32]:
sample_submission['problem'] = pred_ensemble
sample_submission.tail()

Unnamed: 0,user_id,problem
14994,44994,0.266987
14995,44995,0.385795
14996,44996,0.679784
14997,44997,0.860752
14998,44998,0.439538


In [None]:
sample_submission.to_csv('/content/drive/MyDrive/데이콘/submission/lgb_smote_submission_threshold0.5.csv')