In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import warnings
warnings.filterwarnings(action='ignore')
import lightgbm as lgb
from sklearn.metrics import *
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [None]:
train_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_err_data.csv')
train_problem = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_problem_data.csv')
test_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/test_err_data.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/data/sample_submission.csv')

In [None]:
problem = np.zeros(15000)
problem[train_problem.user_id.unique()-10000] = 1 
problem.shape

(15000,)

In [None]:
id_error = train_err[['user_id','errtype']].values
error = np.zeros((15000,42))

for person_idx, err in tqdm(id_error):
  error[person_idx-10000,err-1]+=1
error.shape

100%|██████████| 16554663/16554663 [00:39<00:00, 424269.59it/s]


(15000, 42)

In [None]:
problem_df = pd.DataFrame(problem)
problem_df.rename({0:'problem'},axis=1,inplace=True)
problem_df['problem'].value_counts()

0.0    10000
1.0     5000
Name: problem, dtype: int64

In [None]:
train_x = error
train_y = problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)
(15000,)


In [None]:
smote = SMOTE(random_state=0)
x_train_over, y_train_over = smote.fit_sample(train_x,train_y)
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트:',x_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 레이블 값 분포:\n',pd.Series(y_train_over).value_counts())

SMOTE 적용 전 학습용 피처/레이블 데이터 세트: (20000, 42) (20000,)
SMOTE 적용 후 레이블 값 분포:
 1.0    10000
0.0    10000
dtype: int64


In [None]:
log_x_train_over = np.log1p(x_train_over)

In [None]:
# gkf = KFold(n_splits=10, shuffle=True, random_state=42).split(X=log_x_train_over, y=y_train_over)

# param_grid = {
#     'num_leaves': [50, 100, 150, 200],
#     'min_data_in_leaf': [100, 300, 400],
#     'learning_rate' :[ 0.01, 0.05, 0.1],
#     'max_bin' : [200,300,500],
#     'num_boost_round' : [500,1000,2000,5000],
#     }

# lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='auc')

# gsearch = RandomizedSearchCV(estimator=lgb_estimator, param_distributions=param_grid, cv=gkf)
# lgb_model = gsearch.fit(X=log_x_train_over, y=y_train_over)

# print(lgb_model.best_params_, lgb_model.best_score_)

KeyboardInterrupt: ignored

In [None]:
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
accuracies   = []

threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'learning_rate' : '0.01',
                'max_depth'     : '-1',
                'max_bin': 512,
                'num_leaves'    : 100,
                'seed': 1015
                }
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=12, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(log_x_train_over):

    # split train, validation set
    X = log_x_train_over[train_idx]
    y = y_train_over[train_idx]
    valid_x = log_x_train_over[val_idx]
    valid_y = y_train_over[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 200, 
                        early_stopping_rounds = 100
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)
    accuracy  = accuracy_score(valid_y,valid_pred)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    accuracies.append(accuracy)
    auc_scores.append(auc_score)

    print('==========================================================')

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.912313	valid_0's pr_auc: 0.921245
[400]	valid_0's auc: 0.915937	valid_0's pr_auc: 0.92472
[600]	valid_0's auc: 0.917533	valid_0's pr_auc: 0.92685
Early stopping, best iteration is:
[681]	valid_0's auc: 0.917726	valid_0's pr_auc: 0.926957
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.901619	valid_0's pr_auc: 0.915704
[400]	valid_0's auc: 0.908269	valid_0's pr_auc: 0.922987
[600]	valid_0's auc: 0.90948	valid_0's pr_auc: 0.925281
[800]	valid_0's auc: 0.910077	valid_0's pr_auc: 0.925818
Early stopping, best iteration is:
[722]	valid_0's auc: 0.910114	valid_0's pr_auc: 0.925864
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.892811	valid_0's pr_auc: 0.906808
[400]	valid_0's auc: 0.90097	valid_0's pr_auc: 0.91501
[600]	valid_0's auc: 0.904352	valid_0's pr_auc: 0.918439
Early stopping, best iteration is:
[625]	valid_0's auc: 0.9045

In [None]:
print(np.mean(auc_scores))
print(np.mean(precision))
print(np.mean(recall))
print(np.mean(accuracies))

0.912650131733758
0.9989637305699481
0.5987577639751552
0.8302918047843012


In [2]:
2*(0.99896*0.59872)/(0.99896+0.59872)

0.7487072895698763

In [None]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [None]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:39<00:00, 414862.06it/s]

(14999, 42)





In [None]:
log_x_test_over = np.log1p(test_x)

In [None]:
pred_y_list = []
for model in models:
    pred_y = model.predict(log_x_test_over)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [None]:
pred = np.where(pred_ensemble>0.5,1,0)
pred

array([[1],
       [0],
       [0],
       ...,
       [1],
       [1],
       [0]])

In [None]:
sample_submission['problem'] = pred
sample_submission.head()

Unnamed: 0,user_id,problem
0,30000,1
1,30001,0
2,30002,0
3,30003,1
4,30004,1


In [None]:
sample_submission.to_csv('/content/drive/MyDrive/데이콘/submission/lgb_log(x)_smote_kfold12_submission_threshold0.5_ver2.csv')