In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings(action='ignore')

In [None]:
train_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_err_data.csv')
train_problem = pd.read_csv('/content/drive/MyDrive/데이콘/data/train_problem_data.csv')
test_err = pd.read_csv('/content/drive/MyDrive/데이콘/data/test_err_data.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/data/sample_submission.csv')

In [None]:
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

In [None]:
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

for person_idx, err in tqdm(id_error):
  error[person_idx-train_user_id_min,err-1] +=1
error

100%|██████████| 16554663/16554663 [00:47<00:00, 349611.56it/s]


array([[  0.,   0.,   8., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ..., 113.,  56.,   1.],
       [  0.,   0.,   2., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,  58.,   8.,   5.],
       [  0.,   0.,   0., ...,   6.,   0.,   0.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.]])

In [None]:
# error_df = pd.DataFrame(error)
# error_df

In [20]:
# scaler = StandardScaler()
# scaler.fit(error_df)
# error_scaled = scaler.transform(error_df)
# error_scaled

array([[-1.96898297e-01, -3.01227754e-02,  8.43453622e-02, ...,
        -6.02484448e-01, -2.97800178e-01, -6.44055997e-01],
       [-1.96898297e-01, -3.01227754e-02, -2.90703329e-02, ...,
         7.95710924e-01,  2.10432152e+00, -3.63005204e-01],
       [-1.96898297e-01, -3.01227754e-02, -7.16409141e-04, ...,
        -6.02484448e-01, -2.97800178e-01, -6.44055997e-01],
       ...,
       [-1.96898297e-01, -3.01227754e-02, -2.90703329e-02, ...,
         1.15173354e-01,  4.53600647e-02,  7.61197967e-01],
       [-1.96898297e-01, -3.01227754e-02, -2.90703329e-02, ...,
        -5.28243986e-01, -2.97800178e-01, -6.44055997e-01],
       [-1.96898297e-01, -3.01227754e-02,  2.76375146e-02, ...,
        -6.02484448e-01, -2.97800178e-01, -6.44055997e-01]])

In [None]:
problem = np.zeros(15000)
problem[train_problem['user_id'].unique()-train_user_id_min] = 1
problem

array([0., 1., 0., ..., 1., 1., 0.])

In [24]:
train_x = error
train_y = problem
print(train_x.shape)
print(train_y.shape)

(15000, 42)
(15000,)


In [25]:
def f_pr_auc(probas_pred,y_true):
  labels = y_true.get_label()
  p, r, _ = precision_recall_curve(labels,probas_pred)
  score = auc(r,p)
  return 'pr_auc',score,True

models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5

# 파라미터 설정
params =      {
                'objective'     : 'binary:logistic',
                'eval_metric'        : 'auc',
               'early_stoppings'  :   50
                }

k_fold = KFold(n_splits=5,shuffle=True,random_state=42)
for train_idx, val_idx in k_fold.split(train_x):
  X = train_x[train_idx]
  y = train_y[train_idx]
  valid_x = train_x[val_idx]
  valid_y = train_y[val_idx]

  d_train = xgb.DMatrix(X,y)
  d_val = xgb.DMatrix(valid_x)


  xgb_model = xgb.train(params=params,
                        dtrain=d_train,
                        feval=f_pr_auc,
                        num_boost_round=1000,
                        verbose_eval=20)
  
  valid_prob = xgb_model.predict(d_val)
  valid_pred = np.where(valid_prob>threshold,1,0)

  recall = recall_score(valid_y,valid_pred)
  precision =precision_score(valid_y,valid_pred)
  auc_score = roc_auc_score(valid_y,valid_prob)

  models.append(xgb_model)
  recalls.append(recall)
  precisions.append(precision)
  auc_scores.append(auc_score)

  print('==========================================================')



In [26]:
# 교차검증 점수 확인
print(np.mean(auc_scores))

0.7625310516298501


In [None]:
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [None]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
  test_x[person_idx-test_user_id_min,err-1]+=1
test_x = test_x.reshape(test_x.shape[0],-1)
test_x.shape

100%|██████████| 16532648/16532648 [00:46<00:00, 352646.79it/s]


(14999, 42)

In [None]:
dtest = xgb.DMatrix(data=test_x)

In [None]:
# 예측
pred_y_list =[]
for model in models:
  pred_y = model.predict(dtest)
  pred_y_list.append(pred_y.reshape(-1,1))
pred_ensemble = np.mean(pred_y_list,axis=0)
pred_ensemble

array([[0.9218465 ],
       [0.15814725],
       [0.03062022],
       ...,
       [0.549404  ],
       [0.88255847],
       [0.6183587 ]], dtype=float32)

In [None]:
sample_submission['problem'] = pred_ensemble
sample_submission.head()

Unnamed: 0,user_id,problem
0,30000,0.921847
1,30001,0.158147
2,30002,0.03062
3,30003,0.805593
4,30004,0.78881


In [None]:
sample_submission.to_csv('/content/drive/MyDrive/데이콘/submission/xgboost_kfold5_submission_threshold0.5.csv')