## 기초작업

In [72]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings(action='ignore')
PATH = 'data/'

In [73]:
train_qua = pd.read_csv(PATH+'train_quality_data.csv', thousands = ',')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

train_qua = train_qua.drop(['quality_3', 'quality_4'], axis=1)
train_qua = train_qua.drop_duplicates()
train_qua = train_qua.reset_index(drop=True)

In [74]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# transformer = RobustScaler().fit(train_qua.iloc[:, 3:14])
# train_qua.iloc[:, 3:14] = transformer.transform(train_qua.iloc[:, 3:14])
train_qua['quality_sum'] = train_qua.iloc[:, 3:14].sum()
train_qua['quality_mean'] = train_qua.iloc[:, 3:14].mean()

## quality 데이터화

In [75]:
train = np.zeros((15000,2612,13))

for idx in tqdm(train_qua['user_id'].unique()):
    train[idx-10000] = np.vstack((np.array(train_qua[train_qua['user_id']==idx].iloc[:, 3:16]), np.zeros((2612-train_qua[train_qua['user_id']==idx].shape[0],13))))

train.shape

  0%|          | 0/8281 [00:00<?, ?it/s]

(15000, 2612, 13)

In [76]:
quality_ui = np.zeros(15000)
quality_ui[train_qua.user_id.unique()-10000] = 1
quality_ui = quality_ui.reshape(-1,1)

In [77]:
quality_str = np.hstack([np.max(train, axis=1) - np.min(train, axis=1),
                         np.mean(train, axis=1), quality_ui])
quality_str.shape

(15000, 27)

## problem 데이터화

In [78]:
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
# error와 동일한 방법으로 person_idx - 10000 위치에 
# person_idx의 problem이 한 번이라도 발생했다면 1
# 없다면 0
problem[train_prob.user_id.unique()-10000] = 1 
problem.shape

(15000,)

## 훈련

In [79]:
train_x = quality_str
train_y = problem
del quality_str, problem
print(train_x.shape)
print(train_y.shape)

(15000, 27)
(15000,)


In [80]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x, train_y):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 3
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2690
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.609975	valid_0's pr_auc: 0.475709
[LightGBM] [Info] Number of positive: 4000, number of negative: 8000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2715
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training u

In [81]:
print(np.mean(auc_scores))

0.5996113
