In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import warnings
warnings.filterwarnings(action='ignore')
import lightgbm as lgb
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE

In [4]:
train_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_err_data.csv')
train_problem = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_problem_data.csv')
test_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/test_err_data.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/data/sample_submission.csv')

In [5]:
id_error = train_err[['user_id','errtype']].values
error = np.zeros((15000,42))

for person_idx, err in tqdm(id_error):
  error[person_idx-10000,err-1]+=1
error.shape

100%|██████████| 16554663/16554663 [00:46<00:00, 352277.72it/s]


(15000, 42)

In [6]:
problem = np.zeros(15000)
problem[train_problem.user_id.unique()-10000] = 1 
problem.shape

(15000,)

In [7]:
problem_df = pd.DataFrame(problem)
problem_df.rename({0:'problem'},axis=1,inplace=True)
problem_df['problem'].value_counts()

0.0    10000
1.0     5000
Name: problem, dtype: int64

In [8]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [9]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████| 16532648/16532648 [00:47<00:00, 347828.77it/s]

(14999, 42)





In [10]:
train_x = error
train_y = problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)
(15000,)


In [11]:
smote = SMOTE(random_state=0)
x_train_over, y_train_over = smote.fit_sample(train_x,train_y)
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트:',x_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 레이블 값 분포:\n',pd.Series(y_train_over).value_counts())

SMOTE 적용 전 학습용 피처/레이블 데이터 세트: (20000, 42) (20000,)
SMOTE 적용 후 레이블 값 분포:
 1.0    10000
0.0    10000
dtype: int64


In [33]:
lgb_param_dart = {'objective': 'binary', 
 'boosting_type': 'dart', 
 'subsample_freq': 5, 
 'min_data_in_leaf': 64, 
 'max_depth': -1, 
 'feature_fraction': 0.302, 
 'bagging_fraction': 0.904, 
 'nthread': 32, 
 'metric': 'auc', 
 'learning_rate': 0.01, 
 'max_drop': 65,
 'seed': 1015,
 'n_estimators': 1000}

In [34]:
NFOLD = 5
SEED = 1015
threshold = 0.5

models     = []
recalls    = []
precisions = []
auc_scores   = []
accuracies   = []
f1_scores = []


folds = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=42)
for train_idx, val_idx in folds.split(x_train_over,y_train_over):

    # split train, validation set
    X = x_train_over[train_idx]
    y = y_train_over[train_idx]
    valid_x = x_train_over[val_idx]
    valid_y = y_train_over[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    #run traning
    model = lgb.train(
                        lgb_param_dart,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = [d_train,d_val],
                        verbose_eval    = 200
                       )
    
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)


    # cal scores
    recall    = recall_score(valid_y, valid_pred)
    precision = precision_score(valid_y, valid_pred)
    auc_score = roc_auc_score(valid_y, valid_prob)
    accuracy  = accuracy_score(valid_y,valid_pred)
    f1__score = f1_score(valid_y,valid_pred)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    accuracies.append(accuracy)
    auc_scores.append(auc_score)
    f1_scores.append(f1__score)

    print('==========================================================')

[200]	training's auc: 0.896869	valid_1's auc: 0.880239
[400]	training's auc: 0.900891	valid_1's auc: 0.883976
[600]	training's auc: 0.904129	valid_1's auc: 0.886799
[800]	training's auc: 0.908422	valid_1's auc: 0.889207
[1000]	training's auc: 0.913102	valid_1's auc: 0.891814
[200]	training's auc: 0.894978	valid_1's auc: 0.890426
[400]	training's auc: 0.899702	valid_1's auc: 0.893999
[600]	training's auc: 0.902891	valid_1's auc: 0.896321
[800]	training's auc: 0.906993	valid_1's auc: 0.899412
[1000]	training's auc: 0.911625	valid_1's auc: 0.901486
[200]	training's auc: 0.895664	valid_1's auc: 0.886012
[400]	training's auc: 0.900127	valid_1's auc: 0.889264
[600]	training's auc: 0.903165	valid_1's auc: 0.890942
[800]	training's auc: 0.90712	valid_1's auc: 0.8938
[1000]	training's auc: 0.911914	valid_1's auc: 0.896626
[200]	training's auc: 0.89403	valid_1's auc: 0.89477
[400]	training's auc: 0.898285	valid_1's auc: 0.898917
[600]	training's auc: 0.902191	valid_1's auc: 0.90133
[800]	trainin

In [35]:
print(np.mean(auc_scores))
print(np.mean(accuracies))
print(np.mean(precision))
print(np.mean(recall))
print(np.mean(f1_scores))

0.897300975
0.8249500000000001
0.9004438807863031
0.71
0.8063516394473013


In [36]:
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [37]:
sample_submission['problem'] = pred_ensemble
sample_submission[-10:]

Unnamed: 0,user_id,problem
14989,44989,0.362294
14990,44990,0.532165
14991,44991,0.169911
14992,44992,0.202882
14993,44993,0.379803
14994,44994,0.439802
14995,44995,0.377687
14996,44996,0.502917
14997,44997,0.784203
14998,44998,0.408558


In [38]:
sample_submission.to_csv('/content/drive/MyDrive/데이콘/submission/lgb_smote_StratifiedKFold5_dart_submission.csv')