In [2]:
import numpy as np
import pandas as pd
import warnings
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve

from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
warnings.filterwarnings('ignore')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
test=pd.read_csv('drive/MyDrive/data/test.csv')

In [3]:
train = pd.read_csv('drive/MyDrive/deleted_train1.csv')

In [5]:
train.head()

Unnamed: 0,id,Gender,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,AgeGroup
0,1,1,1,0.538462,0,2,1,0.968248,0.154321,0.961938,1,0.333333
1,2,0,1,0.269231,1,1,0,0.600141,0.932099,0.844291,0,0.083333
2,4,0,1,0.288462,1,0,0,0.492679,0.932099,0.982699,0,0.25
3,5,0,1,0.903846,1,1,0,0.425631,0.932099,0.647059,0,0.166667
4,6,1,1,0.865385,1,1,0,0.407603,0.932099,0.622837,0,0.0


In [6]:
train.drop(['id'],axis=1)

Unnamed: 0,Gender,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,AgeGroup
0,1,1,0.538462,0,2,1,0.968248,0.154321,0.961938,1,0.333333
1,0,1,0.269231,1,1,0,0.600141,0.932099,0.844291,0,0.083333
2,0,1,0.288462,1,0,0,0.492679,0.932099,0.982699,0,0.250000
3,0,1,0.903846,1,1,0,0.425631,0.932099,0.647059,0,0.166667
4,1,1,0.865385,1,1,0,0.407603,0.932099,0.622837,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
9127520,1,1,0.538462,1,0,0,0.457806,0.944444,0.660900,0,0.166667
9127521,1,1,0.115385,0,0,1,0.412612,0.154321,0.719723,0,0.416667
9127522,0,1,0.692308,0,1,1,0.449603,0.932099,0.363322,1,0.083333
9127523,0,1,0.538462,0,0,1,0.783595,0.154321,0.913495,1,0.500000


In [4]:
cat_cols = ["Gender", "Driving_License", "Previously_Insured", "Vehicle_Age", "Vehicle_Damage"]
num_cols = ["AgeGroup", "Region_Code", "Annual_Premium", "Policy_Sales_Channel", "Vintage"]
target = "Response"

In [5]:
X = train[cat_cols + num_cols]



y = train[target]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7, stratify=y)

## xgboost

In [9]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

# XGBoost 모델 설정 및 훈련
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='auc',
    n_estimators=1500,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.75,
    gamma=0.2,
    min_child_weight=1,
    reg_alpha=0.01,
    reg_lambda=0.8,
    tree_method='gpu_hist'  # device="gpu"는 더 이상 사용되지 않음, 대신 tree_method 사용
)

xgb_clf.fit(X_train, y_train)
xgb_pred = xgb_clf.predict_proba(X_test)[:, 1]

xgb_auc = roc_auc_score(y_test, xgb_pred)
print(f"XGBoost AUC: {xgb_auc}")



XGBoost AUC: 0.8837698494002837


## light gbm

In [10]:
# LightGBM 모델 설정 및 훈련
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',  # Change metric to logloss for probability predictions
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Early stopping을 위한 callback 설정
callbacks = [lgb.early_stopping(stopping_rounds=10)]

bst = lgb.train(params, train_data, num_boost_round=100, valid_sets=[test_data], callbacks=callbacks)

y_prob = bst.predict(X_test, num_iteration=bst.best_iteration)
lgb_auc = roc_auc_score(y_test, y_prob)
print(f'LightGBM AUC: {lgb_auc}')


[LightGBM] [Info] Number of positive: 993308, number of negative: 7221464
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.225283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 668
[LightGBM] [Info] Number of data points in the train set: 8214772, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120917 -> initscore=-1.983772
[LightGBM] [Info] Start training from score -1.983772
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.249123
LightGBM AUC: 0.8794111293372895


## ensemble1: voting

In [13]:
# XGBoost와 LightGBM 예측 결과를 앙상블
ensemble_pred = (xgb_pred + y_prob) / 2

# 앙상블 모델의 AUC 계산
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
fpr, tpr, thresholds = roc_curve(y_test, ensemble_pred)

print(f'Ensemble ROC AUC: {ensemble_auc}')

Ensemble ROC AUC: 0.8826170467995988


## ensemble2: stacking->실행해야 할 모델이 많아 지금처럼 큰 데이터셋에는 적합하지 않아보임


In [11]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [16]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# XGBoost 모델 설정
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='auc',
    n_estimators=1500,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.75,
    gamma=0.2,
    min_child_weight=1,
    reg_alpha=0.01,
    reg_lambda=0.8,
    tree_method='gpu_hist'  # GPU 가속을 위한 설정
)

# LightGBM 모델 설정
lgb_clf = lgb.LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.75
)

# 스태킹 모델 설정
estimators = [
    ('xgb', xgb_clf),
    ('lgb', lgb_clf)
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    passthrough=True  # 기본 모델의 입력 데이터도 메타 모델에 전달
)

# 스태킹 모델 훈련
stacking_clf.fit(X_train, y_train)

# 예측 및 평가
stacking_pred = stacking_clf.predict_proba(X_test)[:, 1]
stacking_auc = roc_auc_score(y_test, stacking_pred)

fpr, tpr, thresholds = roc_curve(y_test, stacking_pred)

print(f'Stacking ROC AUC: {stacking_auc}')


[LightGBM] [Info] Number of positive: 993649, number of negative: 7221123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.725162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 8214772, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120959 -> initscore=-1.983382
[LightGBM] [Info] Start training from score -1.983382


KeyboardInterrupt: 

## ensemble3: random forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# 랜덤 포레스트 모델 생성
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# 모델 훈련
rf_clf.fit(X_train, y_train)

# 예측 수행
y_pred = rf_clf.predict(X_test)

# 정확도와 성능 평가
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=data.target_names)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

KeyboardInterrupt: 

## Catboost(전처리 하지 않은 버전)

In [4]:
data = pd.read_csv('drive/MyDrive/data/train.csv')

In [6]:
data["Region_Code"] = data["Region_Code"].astype(np.int8)
test["Region_Code"] = test["Region_Code"].astype(np.int8)

data["Policy_Sales_Channel"] = data["Policy_Sales_Channel"].astype(np.int16)
test["Policy_Sales_Channel"] = test["Policy_Sales_Channel"].astype(np.int16)

In [8]:
target="Response"
a= data.drop(target, axis=1)
b = data[target]

In [9]:
skfold = StratifiedKFold(5, shuffle=True, random_state=42)

In [10]:
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 3000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False
}

In [11]:
import catboost as cb

In [None]:
oof_preds = []
oof_aucs = []

test_pool = cb.Pool(test.astype(str), cat_features=a.columns.values)

for fold, (train_idx, test_idx) in enumerate(skfold.split(a, b)):
    X_train, y_train = a.iloc[train_idx], b[train_idx]
    X_test, y_test = a.iloc[test_idx], b[test_idx]

    X_train_pool = cb.Pool(X_train.astype(str), y_train, cat_features=a.columns.values)
    X_test_pool = cb.Pool(X_test.astype(str), y_test, cat_features=a.columns.values)

    cat_clf = cb.CatBoostClassifier(**cat_params)
    cat_clf = cat_clf.fit(X=X_train_pool,
                          eval_set=X_test_pool,
                          verbose=500,
                          early_stopping_rounds=200)

    test_pred = cat_clf.predict_proba(test_pool)[:, 1]

    oof_preds.append(test_pred)
    auc = cat_clf.best_score_['validation']['AUC']
    oof_aucs.append(auc)
    print(f"\n---- Fold {fold}: ROC-AUC Score: {auc:.6f}\n")

    del X_train, y_train, X_test, y_test
    del X_train_pool, X_test_pool
    del cat_clf
    gc.collect()

auc_mean = np.mean(oof_aucs)
auc_std = np.std(oof_aucs)
print(f"\n---> ROC-AUC Score: {auc_mean:.6f} \xB1 {auc_std:.6f}\n")

test_pred_cat = np.mean(oof_preds, axis=0)