In [10]:
import numpy as np
import pandas as pd
import warnings
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve

from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import lightgbm as lgb

warnings.filterwarnings('ignore')


In [11]:
dataset = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

# Training Model

# XGBClassifier

In [None]:
cat_cols = ["Gender", "Driving_License", "Previously_Insured", "Vehicle_Age", "Vehicle_Damage"]
num_cols = ["Age", "Region_Code", "Annual_Premium", "Policy_Sales_Channel", "Vintage"]
target = "Response"

In [None]:
X = dataset[cat_cols + num_cols]
X["Vehicle_Age"] = X["Vehicle_Age"].replace({"1-2 Year": "Between", "< 1 Year": "less", "> 2 Years": "greater"})
X = pd.get_dummies(X, columns=cat_cols)

test = df_test[cat_cols + num_cols]
test["Vehicle_Age"] = test["Vehicle_Age"].replace({"1-2 Year": "Between", "< 1 Year": "less", "> 2 Years": "greater"})
test = pd.get_dummies(test, columns=cat_cols)

y = dataset[target]

In [None]:
X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7, stratify=y)

In [None]:
xgb_clf =xgb_clf = XGBClassifier(
    use_label_encoder=False, 
    eval_metric='auc', 
    n_estimators=1500, 
    learning_rate=0.02, 
    max_depth=6, 
    subsample=0.85, 
    colsample_bytree=0.75, 
    gamma=0.2, 
    min_child_weight=1,
    reg_alpha=0.01,
    reg_lambda=0.8,
    device="gpu"
)


xgb_clf.fit(X_train, y_train)


In [None]:

xgb_pred = xgb_clf.predict_proba(X_test)[:, 1]

xgb_auc = roc_auc_score(y_test, xgb_pred)
print(f"XGBoost AUC: {xgb_auc}")


# lightgbm

In [None]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define the parameters
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',  # Change metric to logloss for probability predictions
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
bst = lgb.train(params, train_data, num_boost_round=100, valid_sets=[test_data])

# Predictions
y_prob = bst.predict(X_test, num_iteration=bst.best_iteration)


In [None]:


# Evaluate using AUC-ROC
roc_auc = roc_auc_score(y_test, y_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

print(f'ROC AUC: {roc_auc}')


# validation set으로 판단

## logistic regression

In [None]:
# Logistic Regression
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_val_pred_lr = model_lr.predict_proba(X_test)[:, 1]
roc_auc_lr = roc_auc_score(y_test, y_val_pred_lr)
print(f"Logistic Regression Validation ROC AUC: {roc_auc_lr}")

## naive bayes

In [None]:
# Logistic Regression
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_val_pred_lr = model_lr.predict_proba(X_test)[:, 1]
roc_auc_lr = roc_auc_score(y_test, y_val_pred_lr)
print(f"Logistic Regression Validation ROC AUC: {roc_auc_lr}")

## catboost

In [19]:


# 데이터 준비
# X = pd.DataFrame(...)  # 적절한 데이터프레임으로 대체하세요.
# y = pd.Series(...)     # 적절한 시리즈로 대체하세요.
# X_test = pd.DataFrame(...)  # 테스트 데이터프레임으로 대체하세요.

# 학습 및 검증 데이터로 분할
X_train = X
y_train = y.values
X_test_pool = Pool(X_test)

# CatBoost 모델 초기화
model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    class_names=[0, 1],
    learning_rate=0.075,
    iterations=5000,
    depth=9,
    random_strength=0,
    l2_leaf_reg=0.5,
    max_leaves=512,
    fold_permutation_block=64,
    task_type='GPU',
    random_seed=42,
    verbose=False
)

# 학습 데이터 풀 생성
X_train_pool = Pool(X_train, y_train)

# 모델 학습
model.fit(X=X_train_pool, 
          verbose=500, 
          early_stopping_rounds=200)


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 122ms	remaining: 10m 10s
500:	total: 39.4s	remaining: 5m 53s
1000:	total: 1m 19s	remaining: 5m 19s
1500:	total: 2m 1s	remaining: 4m 43s
2000:	total: 2m 43s	remaining: 4m 5s
2500:	total: 3m 25s	remaining: 3m 25s
3000:	total: 4m 8s	remaining: 2m 45s
3500:	total: 4m 50s	remaining: 2m 4s
4000:	total: 5m 33s	remaining: 1m 23s
4500:	total: 6m 16s	remaining: 41.7s
4999:	total: 6m 59s	remaining: 0us


KeyError: 'AUC'

In [28]:
# 검증 데이터에 대해 예측 수행
val_preds = model.predict_proba(X_test_pool)[:, 1]

# AUC 계산
val_auc = roc_auc_score(y_test, val_preds)
print('Validation ROC-AUC score: ', val_auc)

Validation ROC-AUC score:  0.8817075826679148


## adaboost

In [None]:
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=9),
    n_estimators=500,
    learning_rate=0.075,
    random_state=42
)

# 모델 학습
model.fit(X_train, y_train)

# 학습 데이터에 대한 AUC 출력 (참고용)
# 검증 데이터에 대해 예측 수행
val_preds = model.predict_proba(X_test_pool)[:, 1]

# AUC 계산
val_auc = roc_auc_score(y_test, val_preds)
print('Validation ROC-AUC score: ', val_auc)