In [1]:
!pip install optuna-integration

Collecting optuna-integration
  Downloading optuna_integration-3.6.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna_integration-3.6.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.4/93.4 kB[0m [31m897.0 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-3.6.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from tqdm import tqdm
import lightgbm as lgb
import optuna
from optuna import Trial
from optuna.logging import set_verbosity, INFO
from optuna.integration import XGBoostPruningCallback
from lightgbm import early_stopping

In [3]:
# Read the training CSV from the Kaggle input directory into a DataFrame.
train = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s4e7/train.csv')

In [4]:
# Drop the id column.
train = train.drop(columns=['id'])

In [5]:
# Change the data type of Gender, Vehicle_Age and Vehicle_Damage,
# using LabelEncoder for all three so the encoding is uniform.
cat = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
label_encoder = LabelEncoder()
for column in cat:
    # The single encoder is refit on every pass, so after the loop it only
    # holds the classes of the last column in `cat`.
    encoded = label_encoder.fit(train[column]).transform(train[column])
    train[column] = encoded

In [6]:
# Convert Age into a categorical AgeGroup column.

# Bin edges: 5-year steps from 20 through 80, followed by an open-ended last bin.
bins = [*range(20, 85, 5), float('inf')]

# One label per bin: the strings '0' through '12'.
labels = [str(group) for group in range(len(bins) - 1)]

# right=False makes each bin closed on the left and open on the right
# ([20, 25), [25, 30), ...). Values that fall outside every bin (below 20) become NaN.
train['AgeGroup'] = pd.cut(x=train['Age'], bins=bins, right=False, labels=labels)

In [7]:
# Drop the Age column (AgeGroup was derived from it in the previous cell).
train = train.drop(columns=['Age'])

In [8]:
# Remove Annual_Premium outliers using the 1.5 * IQR rule.
Q1 = train['Annual_Premium'].quantile(0.25)
Q3 = train['Annual_Premium'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose premium lies within [lower_bound, upper_bound], both ends inclusive.
# .copy() makes deleted_train an independent frame: the next cell assigns scaled columns
# into it, and doing that on a filtered selection of `train` is what pandas flags with
# SettingWithCopyWarning (silenced in this notebook by the global warnings filter).
within_bounds = train['Annual_Premium'].between(lower_bound, upper_bound)
deleted_train = train[within_bounds].copy()

deleted_train.shape

(9127525, 11)

In [9]:
# Min-max scale the numeric columns of the outlier-filtered frame.
# The scaler is fit on every row of deleted_train, i.e. before any train/test split.
numerical = ['Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
scaler = MinMaxScaler()
numeric_block = deleted_train[numerical]
deleted_train[numerical] = scaler.fit_transform(numeric_block)
deleted_train.head()

Unnamed: 0,Gender,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,AgeGroup
1,1,1,0.538462,0,2,1,0.968248,0.154321,0.961938,1,4
2,0,1,0.269231,1,1,0,0.600141,0.932099,0.844291,0,1
4,0,1,0.288462,1,0,0,0.492679,0.932099,0.982699,0,3
5,0,1,0.903846,1,1,0,0.425631,0.932099,0.647059,0,2
6,1,1,0.865385,1,1,0,0.407603,0.932099,0.622837,0,0


In [10]:
# Use the full `train` frame for modelling. This binds a second name to the same
# DataFrame object; it does not copy it.
# The outlier-filtered, min-max-scaled `deleted_train` built in the earlier cells is
# not referenced again in the cells shown here.
# NOTE(review): confirm that modelling on `train` rather than `deleted_train` is intended.
df = train

In [11]:
# Separate the target (Response) from the feature columns.
y = df['Response']
X = df.drop(columns=['Response'])

In [12]:
# Split the data: 80% for training, 20% held out, with a fixed seed for a repeatable split.
split_parts = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Rescale the features with MinMaxScaler.
# The scaler learns its min/max from the training split only and the same
# transformation is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# LightGBM hyperparameters passed to the classifier below.
# NOTE(review): the numeric values look like the output of a hyperparameter search
# (the notebook imports optuna) -- provenance is not shown here; confirm and record it.
params = dict(
    # Regularisation
    lambda_l1=3.111893931999387e-05,
    lambda_l2=5.1187331713238715e-06,
    # Tree shape and row/column subsampling
    num_leaves=119,
    feature_fraction=0.3404466623446993,
    bagging_fraction=0.9487585809243285,
    bagging_freq=2,
    min_child_samples=70,
    learning_rate=0.02817637192423954,
    # Task definition and logging
    objective='binary',
    metric='auc',
    verbosity=-1,
    boosting_type='gbdt',
)

In [15]:
# Create the LightGBM classifier from `params`; this only constructs the estimator,
# it is not fitted here.
model = LGBMClassifier(**params)

In [16]:
# Fit on the training split; the held-out split is passed as the evaluation set.
validation_sets = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=validation_sets)

In [17]:
# Score the held-out split: take column 1 of the predicted probabilities
# (presumably the Response == 1 class -- assumes the labels are 0/1) and compute ROC AUC.
class_probabilities = model.predict_proba(X_test)
y_pred = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred)

In [18]:
# Print the ROC AUC computed in the previous cell for the held-out split.
print(roc_auc)

0.871204275277996
