In [None]:
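# Uncomment and run once if the `optuna.integration` import below fails because the package is missing.
# !pip install optuna-integration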

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from tqdm import tqdm
import xgboost as xgb
import optuna
from optuna import Trial
from optuna.logging import set_verbosity, INFO
from optuna.integration import XGBoostPruningCallback

In [None]:
# Load the raw training table from the current working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Drop the `id` column.
train = train.drop(columns=['id'])

In [None]:
# Convert Gender, Vehicle_Age and Vehicle_Damage to integer codes, using
# LabelEncoder for all three columns.
#
# One encoder is fitted per column and kept in `label_encoders`, so every
# column's mapping stays available afterwards (e.g. for `inverse_transform`
# or for encoding another table the same way). Refitting a single shared
# encoder would keep only the mapping of the last column.
cat = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
label_encoders = {}
for column in cat:
    # `label_encoder` is rebound on every pass; after the loop it refers to
    # the encoder fitted on the last column in `cat`.
    label_encoder = LabelEncoder()
    train[column] = label_encoder.fit_transform(train[column])
    label_encoders[column] = label_encoder

In [None]:
# Min-max scale the numeric feature columns of `train` in place.
# Note: the scaler is fitted on every row of `train`, i.e. before any
# train/test split has been made.
numerical = [
    'Region_Code',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
    'Age',
]
scaler = MinMaxScaler()
scaler.fit(train[numerical])
train[numerical] = scaler.transform(train[numerical])
train.head()

In [None]:
# `df` is another name for the same DataFrame object as `train` (no copy is made).
df = train

# Target vector y and feature matrix X (every column except the target).
y = df['Response']
X = df.drop(columns='Response')

In [None]:
# Split the data: 20% of the rows are held out as the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Rescale the features with MinMaxScaler: the scaler is fitted on the
# training split only and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
set_verbosity(INFO)

# Early-stopping patience (in boosting rounds) used for every trial.
EARLY_STOPPING_ROUNDS = 50

# Hold out part of the training split as a validation fold. Early stopping and
# trial ranking use this fold only, so X_test / y_test stay untouched by the
# hyperparameter search and can still give an unbiased final estimate.
# The split settings mirror the train/test split made earlier in the notebook.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


def objective(trial):
    """Train one CatBoost model for an Optuna trial and score it.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation fold
        (`X_valid`, `y_valid`); the study maximizes this value.
    """
    # `od_type` / `od_wait` are intentionally not part of the search space:
    # CatBoost documents `early_stopping_rounds` as switching the overfitting
    # detector to `Iter` with that many rounds of patience, so values sampled
    # for them would not influence training.
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
    }

    cat_model = CatBoostClassifier(
        **param,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=0,
        logging_level='Silent',
    )
    # The validation fold drives early stopping; the test split is not seen here.
    cat_model.fit(
        X_fit,
        y_fit,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )

    # Probability of the positive class (second column of predict_proba).
    y_pred = cat_model.predict_proba(X_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, y_pred)

    return roc_auc

# Callback that prints progress after each trial.
def logging_callback(study, trial):
    """Print the number, objective value and sampled parameters of a finished trial.

    `study` is accepted because Optuna passes it to callbacks; it is not used.
    """
    message = f"Trial {trial.number} finished with value: {trial.value} and parameters: {trial.params}"
    print(message)

# Create the Optuna study and run the optimization.
# The sampler is seeded so that repeated runs propose the same sequence of
# hyperparameters (TPE is Optuna's default sampler; only the seed is new).
# NOTE(review): exact score reproducibility also depends on the model training
# itself being deterministic — confirm if bit-identical results are required.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, callbacks=[logging_callback])

# Print the best result.
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")