In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
from imblearn.over_sampling import SMOTE
warnings.filterwarnings(action='ignore')

In [3]:
train_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_err_data.csv')
train_problem = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_problem_data.csv')
test_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/test_err_data.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/data/sample_submission.csv')

In [4]:
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

In [5]:
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

for person_idx, err in tqdm(id_error):
  error[person_idx-train_user_id_min,err-1] +=1

100%|██████████| 16554663/16554663 [01:09<00:00, 238585.37it/s]


In [6]:
problem = np.zeros(15000)
problem[train_problem['user_id'].unique()-train_user_id_min] = 1

In [7]:
train_x = error
train_y = problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)
(15000,)


In [16]:
def f_pr_auc(probas_pred,y_true):
  labels = y_true.get_label()
  p, r, _ = precision_recall_curve(labels,probas_pred)
  score = auc(r,p)
  return 'pr_auc',score,True

models     = []
recalls    = []
precisions = []
auc_scores   = []
accuracies   = []
f1_scores = []
threshold = 0.5

# 파라미터 설정
params =      {
                'objective'     : 'binary:logistic',
                'eval_metric'        : 'auc',
               'early_stoppings'  :   100
                }

k_fold = KFold(n_splits=5,shuffle=True,random_state=42)
for train_idx, val_idx in k_fold.split(train_x):
  X = train_x[train_idx]
  y = train_y[train_idx]
  valid_x = train_x[val_idx]
  valid_y = train_y[val_idx]

  d_train = xgb.DMatrix(X,y)
  d_val = xgb.DMatrix(valid_x)


  xgb_model = xgb.train(params=params,
                        dtrain=d_train,
                        feval=f_pr_auc,
                        num_boost_round=1000,
                        verbose_eval=False)
 
  valid_prob = xgb_model.predict(d_val)
  valid_pred = np.where(valid_prob>threshold,1,0)

  recall = recall_score(valid_y,valid_pred)
  precision =precision_score(valid_y,valid_pred)
  auc_score = roc_auc_score(valid_y,valid_prob)

  # cal scores
  recall    = recall_score(    valid_y, valid_pred)
  precision = precision_score( valid_y, valid_pred)
  auc_score = roc_auc_score(   valid_y, valid_prob)
  accuracy  = accuracy_score(valid_y,valid_pred)
  f1__score = f1_score(valid_y,valid_pred)

  # append scores
  models.append(xgb_model)
  recalls.append(recall)
  precisions.append(precision)
  accuracies.append(accuracy)
  auc_scores.append(auc_score)
  f1_scores.append(f1__score)

  print('==========================================================')



In [17]:
# 교차검증 점수 확인
print(np.mean(auc_scores))
print(np.mean(accuracies))
print(np.mean(precision))
print(np.mean(recall))
print(np.mean(f1_scores))

0.888696843479176
0.8120999999999998
0.9390304847576212
0.6855162349507479
0.8034930866924055


In [18]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [19]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
  test_x[person_idx-test_user_id_min,err-1]+=1
test_x = test_x.reshape(test_x.shape[0],-1)
test_x.shape

100%|██████████| 16532648/16532648 [01:09<00:00, 239367.13it/s]


(14999, 42)

In [20]:
dtest = xgb.DMatrix(data=test_x)

In [21]:
# 예측
pred_y_list =[]
for model in models:
  pred_y = model.predict(dtest)
  pred_y_list.append(pred_y.reshape(-1,1))
pred_ensemble = np.mean(pred_y_list,axis=0)
pred_ensemble

array([[0.9869126 ],
       [0.2743775 ],
       [0.01808419],
       ...,
       [0.52288795],
       [0.93618184],
       [0.5746795 ]], dtype=float32)

In [22]:
sample_submission['problem'] = pred_ensemble
sample_submission.head()

Unnamed: 0,user_id,problem
0,30000,0.986913
1,30001,0.274377
2,30002,0.018084
3,30003,0.955967
4,30004,0.982405


In [None]:
sample_submission.to_csv('/content/drive/MyDrive/데이콘/submission/xgboost_kfold5_submission_threshold0.5.csv')