In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [4]:
# 1. 데이터 로드
data = pd.read_csv('../tabular_text_image_data.csv')  # 실제 경로로 변경

  data = pd.read_csv('../tabular_text_image_data.csv')  # 실제 경로로 변경


In [7]:
# 2. 결측치 제거
data = data.dropna()

In [11]:
# 3. 데이터 타입 변환
data = data.astype({'is_find': 'bool'})  # 'is_find' 컬럼을 bool 타입으로 변환

In [13]:
# 4. 특징 데이터(X)와 라벨(y) 분리
X = data.drop(columns='is_fraud')  # 'is_fraud'는 타겟 컬럼명, 실제 데이터셋에 맞게 변경
y = data['is_fraud']

In [14]:
# 5. 라벨 인코딩 (필요한 경우)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [15]:
# 6. 학습용 및 테스트용 데이터셋 분리 (70% 학습, 30% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

In [16]:
# 7. SMOTE를 활용한 오버샘플링
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [20]:
# 8. 파이프라인 정의 (StandardScaler + XGBoost 모델)
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler 추가
    ('classifier', XGBClassifier())
])

In [21]:
# 9. 하이퍼파라미터 탐색 범위 설정
param_distributions = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__subsample': [0.7, 0.8, 1.0]
}

In [22]:
# 10. RandomizedSearchCV를 사용해 하이퍼파라미터 최적화 (verbose=0으로 설정)
search = RandomizedSearchCV(pipeline, param_distributions, n_iter=10, scoring='f1', n_jobs=-1, cv=3, random_state=42, verbose=0)
search.fit(X_train_resampled, y_train_resampled)

KeyboardInterrupt: 