In [1]:
import pandas as pd

# Kaggle input directory that holds the competition CSV files
data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'

# Read the three competition files, each indexed by its 'id' column
train, test, submission = (
    pd.read_csv(data_path + file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## 8.4.1 피처 엔지니어링
* 데이터 합치기

In [2]:
# Stack train and test rows into one frame (fresh 0..n-1 index) so every
# feature transformation is applied to both sets at once, then drop the label.
all_data = (
    pd.concat([train, test], ignore_index=True)
    .drop(columns='target')
)

# Names of all original feature columns
all_features = all_data.columns

* 명목형 피처 원-핫 인코딩

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Nominal (categorical) features: every column whose name contains 'cat'
cat_features = [column for column in all_features if 'cat' in column]

# One-hot encode the nominal features
onehot_encoder = OneHotEncoder()
cat_frame = all_data[cat_features]
encoded_cat_matrix = onehot_encoder.fit_transform(cat_frame)

In [4]:
# Derived feature: number of missing values per row, where a missing value
# is encoded as -1 in this dataset
all_data['num_missing'] = all_data.eq(-1).sum(axis=1)

In [5]:
# Keep every original feature that is neither nominal ('cat') nor in the
# 'calc' group, and add the derived num_missing column at the end
remaining_features = [
    column for column in all_features
    if not ('cat' in column or 'calc' in column)
] + ['num_missing']

* 파생 피처 만들기

In [6]:
# Build mix_ind: glue the values of all 'ind' features into one string key

# Features belonging to the 'ind' group
ind_features = [column for column in all_features if 'ind' in column]

for position, ind_feature in enumerate(ind_features):
    # Each value is rendered as text and followed by '_' as a separator
    piece = all_data[ind_feature].astype(str) + '_'
    if position == 0:
        all_data['mix_ind'] = piece      # first feature creates the column
    else:
        all_data['mix_ind'] += piece     # later features are appended to it

* 고윳값별 개수를 새로운 피처로 추가

In [7]:
# Example: map each distinct value of ps_ind_02_cat to the number of rows holding it
all_data['ps_ind_02_cat'].value_counts().to_dict()

{1: 1079327, 2: 309747, 3: 70172, 4: 28259, -1: 523}

In [8]:
# For every nominal feature (plus mix_ind) add a '<feature>_count' column that
# holds, for each row, how many rows in all_data share that row's value.
cat_count_features = []
for feature in cat_features + ['mix_ind']:
    # value -> number of occurrences across train + test
    val_counts_dict = all_data[feature].value_counts().to_dict()
    # Vectorized lookup; replaces a per-row Python lambda and yields the same counts
    all_data[f'{feature}_count'] = all_data[feature].map(val_counts_dict)
    cat_count_features.append(f'{feature}_count')

In [9]:
# Display the names of the generated '<feature>_count' columns
cat_count_features

['ps_ind_02_cat_count',
 'ps_ind_04_cat_count',
 'ps_ind_05_cat_count',
 'ps_car_01_cat_count',
 'ps_car_02_cat_count',
 'ps_car_03_cat_count',
 'ps_car_04_cat_count',
 'ps_car_05_cat_count',
 'ps_car_06_cat_count',
 'ps_car_07_cat_count',
 'ps_car_08_cat_count',
 'ps_car_09_cat_count',
 'ps_car_10_cat_count',
 'ps_car_11_cat_count',
 'mix_ind_count']

* 필요 없는 피처 제거  
지금까지 만든 피처들   
 encoded_cat_matrix : 원-핫 인코딩된 명목형 피처  
 remaining_features : 명목형 피처와 calc 분류의 피처를 제외한 피처들 (+num_missing)  
 cat_count_features : mix_ind를 포함한 명목형 피처의 고윳값별 개수 파생 피처

In [10]:
from scipy import sparse
# Features judged unnecessary and excluded from the model input
drop_features = [
    'ps_ind_14',
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_car_14',
]

# remaining_features + cat_count_features, minus the columns listed above
all_data_remaining = (
    all_data[remaining_features + cat_count_features]
    .drop(columns=drop_features)
)

# Combine the dense columns with the one-hot matrix into one CSR matrix
all_data_sprs = sparse.hstack(
    [sparse.csr_matrix(all_data_remaining), encoded_cat_matrix],
    format='csr',
)

* 데이터 나누기

In [11]:
# Number of training rows; they occupy the top of the stacked matrix
num_train = len(train)

# Slice the combined matrix back into its training and test parts
X, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Target values as a NumPy array
y = train['target'].to_numpy()

## 8.4.2 하이퍼파라미터 최적화

* 데이터셋

In [12]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as validation data for the
# Bayesian-optimization runs (80:20 split, fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0)

# LightGBM datasets used during Bayesian optimization
bayes_dtrain = lgb.Dataset(data=X_train, label=y_train)
bayes_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

* 하이퍼파라미터 범위 설정

In [13]:
# Search ranges (lower, upper) explored by Bayesian optimization
param_bounds = dict(
    num_leaves=(30, 40),
    lambda_l1=(0.7, 0.9),
    lambda_l2=(0.9, 1),
    feature_fraction=(0.6, 0.7),
    bagging_fraction=(0.6, 0.9),
    min_child_samples=(6, 10),
    min_child_weight=(10, 40),
)

# Hyperparameters held constant for every trial
fixed_params = dict(
    objective='binary',    # binary classification task
    learning_rate=0.005,   # learning rates are commonly chosen in the 0.001-0.01 range
    bagging_freq=1,        # learning_rate / bagging_freq could also be searched or hand-tuned
    force_row_wise=True,   # set to silence a LightGBM warning message
    random_state=1993,     # fixed seed for reproducible results
)

In [14]:
import numpy as np

def eval_gini(y_true, y_pred):
    """Return the normalized Gini coefficient of predictions against targets.

    The score is the Gini coefficient obtained when the targets are ordered
    by the predictions, divided by the Gini coefficient obtained when they
    are ordered by the targets themselves (a perfect ranking).

    Parameters
    ----------
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted scores, same shape as ``y_true``.

    Returns
    -------
    float
        ``G_pred / G_true``; 1.0 when the prediction ordering matches the
        perfect ordering. The result is NaN when ``y_true`` sums to zero,
        because the Lorenz curves are normalized by that sum.

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Coerce to ndarrays so lists work and so the fancy indexing below is
    # always positional (a pandas Series would otherwise be indexed by label).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Targets and predictions must line up one-to-one
    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0]                       # number of samples
    L_mid = np.linspace(1 / n_samples, 1, n_samples)  # diagonal (line of equality)

    # 1) Gini coefficient of the predictions
    pred_order = y_true[y_pred.argsort()]                # targets ordered by prediction
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)  # Lorenz curve
    G_pred = np.sum(L_mid - L_pred)

    # 2) Gini coefficient of a perfect prediction
    true_order = y_true[y_true.argsort()]                # targets ordered by themselves
    L_true = np.cumsum(true_order) / np.sum(true_order)  # Lorenz curve
    G_true = np.sum(L_mid - L_true)

    # Normalized Gini coefficient
    return G_pred / G_true

In [15]:
def gini(preds, dtrain):
    """Custom LightGBM evaluation function reporting the normalized Gini.

    Reads the labels from ``dtrain`` via ``get_label()`` and scores ``preds``
    against them with ``eval_gini``.

    Returns a tuple of (metric name, metric value, whether higher is better).
    """
    score = eval_gini(dtrain.get_label(), preds)
    return 'gini', score, True

* 베이지안 최적화용 평가지표 계산 함수 작성

In [16]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction, 
                  bagging_fraction, min_child_samples, min_child_weight):
    """Objective for Bayesian optimization: validation Gini of one trial.

    Trains a LightGBM model on ``bayes_dtrain`` with the proposed
    hyperparameters merged with ``fixed_params``, predicts on ``X_valid``
    and returns the normalized Gini coefficient against ``y_valid``.
    ``num_leaves`` and ``min_child_samples`` arrive as floats and are
    rounded to integers before use. The parameter set and the resulting
    score are printed for every trial.
    """
    # Hyperparameters proposed by the optimizer for this trial
    tuned_params = dict(
        num_leaves=int(round(num_leaves)),
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2,
        feature_fraction=feature_fraction,
        bagging_fraction=bagging_fraction,
        min_child_samples=int(round(min_child_samples)),
        min_child_weight=min_child_weight,
        # presumably disabled because min_child_samples changes between trials
        # while the same Dataset objects are reused -- TODO confirm
        feature_pre_filter=False,
    )

    # Fixed hyperparameters are merged in last
    params = {**tuned_params, **fixed_params}

    print('하이퍼파라미터:', params)

    # Train the LightGBM model.
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0 (callbacks replace them);
    # confirm the installed version before re-running.
    booster = lgb.train(
        params=params,
        train_set=bayes_dtrain,
        num_boost_round=2500,
        valid_sets=bayes_dvalid,
        feval=gini,
        early_stopping_rounds=300,
        verbose_eval=False,
    )

    # Score the held-out validation predictions with the normalized Gini
    gini_score = eval_gini(y_valid, booster.predict(X_valid))
    print(f'지니계수 : {gini_score}\n')

    return gini_score

* 최적화 수행

In [17]:
from bayes_opt import BayesianOptimization

# Create the Bayesian optimization object
optimizer = BayesianOptimization(
    f=eval_function,       # function that computes the score to maximize
    pbounds=param_bounds,  # hyperparameter search ranges
    random_state=0,
)

In [18]:
# Run Bayesian optimization.
# The objective is evaluated init_points + n_iter times in total.
optimizer.maximize(
    init_points=3,  # number of random exploration trials
    n_iter=6,       # number of Bayesian optimization iterations
)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
하이퍼파라미터: {'num_leaves': 34, 'lambda_l1': 0.8205526752143287, 'lambda_l2': 0.9544883182996897, 'feature_fraction': 0.6715189366372419, 'bagging_fraction': 0.7646440511781974, 'min_child_samples': 8, 'min_child_weight': 29.376823391999682, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1993}




[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1557
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2853830357519995

| [0m 1       [0m | [0m 0.2854  [0m | [0m 0.7646  [0m | [0m 0.6715  [0m | [0m 0.8206  [0m | [0m 0.9545  [0m | [0m 7.695   [0m | [0m 29.38   [0m | [0m 34.38   [0m |
하이퍼파라미터: {'num_leaves': 39, 'lambda_l1': 0.7766883037651555, 'lambda_l2': 0.9791725038082665, 'feature_fraction': 0.6963662760501029, 'bagging_fraction': 0.867531900234624, 'min_child_samples': 8, 'min_child_weight': 27.04133683281797, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1993}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightG



[LightGBM] [Info] Total Bins 1557
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2833674422512079

| [0m 4       [0m | [0m 0.2834  [0m | [0m 0.8978  [0m | [0m 0.6594  [0m | [0m 0.8445  [0m | [0m 0.9234  [0m | [0m 8.619   [0m | [0m 10.55   [0m | [0m 30.09   [0m |
하이퍼파라미터: {'num_leaves': 34, 'lambda_l1': 0.8439576755399246, 'lambda_l2': 0.9071389562807886, 'feature_fraction': 0.6141666844038982, 'bagging_fraction': 0.780941422121342, 'min_child_samples': 9, 'min_child_weight': 38.792062858582554, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1993}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1557
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2857146952724652

| [0m 5       [0m | [0m 0.2857  [0m | [0m 0.7809  [0m | [0m 0.6142  [0m | [0m 0.844   [0m | [0m 0.9071  [0m | [0m 8.837   [0m | [0m 38.79   [0m | [0m 33.75   [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7113567244294035, 'lambda_l2': 0.9992148463611682, 'feature_fraction': 0.6823972673568225, 'bagging_fraction': 0.6452323984860321, 'min_child_samples': 9, 'min_child_weight': 36.23198396337493, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1993}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1557
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2859932988125352

| [0m 6       [0m | [0m 0.286   [0m | [0m 0.6452  [0m | [0m 0.6824  [0m | [0m 0.7114  [0m | [0m 0.9992  [0m | [0m 9.083   [0m | [0m 36.23   [0m | [0m 39.59   [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.8329533491733405, 'lambda_l2': 0.9070423594822995, 'feature_fraction': 0.643096339913785, 'bagging_fraction': 0.8810601650808936, 'min_child_samples': 9, 'min_child_weight': 34.00813892373257, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1993}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1557
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2853460692271328

| [0m 7       [0m | [0m 0.2853  [0m | [0m 0.8811  [0m | [0m 0.6431  [0m | [0m 0.833   [0m | [0m 0.907   [0m | [0m 9.16    [0m | [0m 34.01   [0m | [0m 39.91   [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7375143924554232, 'lambda_l2': 0.9447377797391991, 'feature_fraction': 0.6749422364975074, 'bagging_fraction': 0.7037668349812904, 'min_child_samples': 7, 'min_child_weight': 36.336982968130606, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1993}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1557
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2850006897964863

| [0m 8       [0m | [0m 0.285   [0m | [0m 0.7038  [0m | [0m 0.6749  [0m | [0m 0.7375  [0m | [0m 0.9447  [0m | [0m 7.172   [0m | [0m 36.34   [0m | [0m 39.95   [0m |
하이퍼파라미터: {'num_leaves': 40, 'lambda_l1': 0.7309829153275589, 'lambda_l2': 0.9606913515746498, 'feature_fraction': 0.6312923791828471, 'bagging_fraction': 0.8515224104845123, 'min_child_samples': 10, 'min_child_weight': 35.29426485484539, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1993}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1557
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2856048470319783

| [0m 9       [0m | [0m 0.2856  [0m | [0m 0.8515  [0m | [0m 0.6313  [0m | [0m 0.731   [0m | [0m 0.9607  [0m | [0m 9.864   [0m | [0m 35.29   [0m | [0m 39.77   [0m |


* 결과 확인

In [19]:
# 평가함수 점수가 최대일 때 하이퍼파라미터
max_params = optimizer.max['params']
max_params

{'bagging_fraction': 0.6213108174593661,
 'feature_fraction': 0.608712929970154,
 'lambda_l1': 0.7040436794880651,
 'lambda_l2': 0.9832619845547939,
 'min_child_samples': 9.112627003799401,
 'min_child_weight': 36.10036444740457,
 'num_leaves': 39.78618342232764}

In [20]:
# Convert the integer-valued hyperparameters from the float values
# stored in max_params to rounded ints
for int_param in ('num_leaves', 'min_child_samples'):
    max_params[int_param] = int(round(max_params[int_param]))