# ***Install Depedencies***

In [None]:
!pip install optuna lightgbm scikit-learn pandas numpy matplotlib seaborn tqdm catboost xgboost category_encoders

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import GridSearchCV

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve, f1_score, classification_report, roc_auc_score


# ***Import Data Cleaned***

In [None]:
df_train = pd.read_csv('/content/new_train_2_no_encode.csv')
df_test = pd.read_csv('/content/new_test_2_no_encode.csv')

df_train.shape, df_test.shape

((7000, 25), (3000, 25))

In [None]:
df_train.drop(columns=['ID'])
df_train.fillna('NA', inplace=True)
df_train['coppaRisk'].value_counts()

Unnamed: 0_level_0,count
coppaRisk,Unnamed: 1_level_1
0.0,6304
1.0,696


# ***HyperParameter Tuning With Optuna***

we train LGBM, GBC, CatBoost, and XGBoost with hyperparameter to compare the metrics

## LGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np
import warnings
warnings.filterwarnings('ignore', category=UserWarning)  # Ignore DART warning

X = df_train.drop(columns=['coppaRisk'])
y = df_train['coppaRisk']

# Handle categorical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
for col in categorical_cols:
    X[col] = X[col].astype('category')

# Imbalance handling
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'max_depth': trial.suggest_int('max_depth', -1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 300),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': scale_pos_weight,
        'random_state': 42,
        'n_jobs': -1
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # Dynamic callbacks based on boosting_type
        callbacks = [lgb.log_evaluation(0)]
        if param['boosting_type'] == 'gbdt':
            callbacks.append(lgb.early_stopping(stopping_rounds=50, verbose=False))

        model = lgb.LGBMClassifier(**param)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            callbacks=callbacks,
            categorical_feature=categorical_cols
        )

        preds = model.predict_proba(X_valid)[:, 1]
        aucs.append(roc_auc_score(y_valid, preds))

    return np.mean(aucs)

# Run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Results
best_params = study.best_params
print("Best AUC:", study.best_value)
print("Best params:", best_params)

[I 2025-04-15 19:53:43,858] A new study created in memory with name: no-name-1e0320d9-8774-453a-a8f6-0edbd4d25806
[I 2025-04-15 19:53:44,551] Trial 0 finished with value: 0.8745712818406688 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.0001661447902813353, 'num_leaves': 238, 'max_depth': 8, 'min_child_samples': 70, 'reg_alpha': 2.006354521425547, 'reg_lambda': 0.7828416911656316, 'subsample': 0.5974901746616017, 'colsample_bytree': 0.6283598292273906}. Best is trial 0 with value: 0.8745712818406688.
[I 2025-04-15 19:53:45,170] Trial 1 finished with value: 0.8737623662289803 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.00023279737047044786, 'num_leaves': 107, 'max_depth': 14, 'min_child_samples': 208, 'reg_alpha': 0.0011780893253384413, 'reg_lambda': 0.001772091057815134, 'subsample': 0.8080399876958773, 'colsample_bytree': 0.79505196688829}. Best is trial 0 with value: 0.8745712818406688.
[I 2025-04-15 19:53:49,355] Trial 2 finished with value: 0.87609731

Best AUC: 0.8922775611293321
Best params: {'boosting_type': 'gbdt', 'learning_rate': 0.06135072923663973, 'num_leaves': 218, 'max_depth': 9, 'min_child_samples': 158, 'reg_alpha': 6.740032554952442e-06, 'reg_lambda': 0.00393802503933637, 'subsample': 0.7004990484061593, 'colsample_bytree': 0.6647822214185323}


### *AUC SCORE: 0.8922775611293321*

## GBC

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import optuna
import numpy as np
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Prepare data
X = df_train.drop(columns=['coppaRisk'])
y = df_train['coppaRisk']

# Encode categorical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Imbalance handling
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])

def objective(trial):
    param = {
        'loss': 'log_loss',  # For probabilistic output
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_state': 42
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # Handle class imbalance
        if scale_pos_weight > 1:
            class_weight = {1: scale_pos_weight, 0: 1.0}
        else:
            class_weight = None

        model = GradientBoostingClassifier(**param)
        model.fit(X_train, y_train)

        preds = model.predict_proba(X_valid)[:, 1]
        aucs.append(roc_auc_score(y_valid, preds))

    return np.mean(aucs)

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=3600)  # 1 hour timeout

# Results
best_params = study.best_params
print("Best AUC:", study.best_value)
print("Best params:", best_params)

# Train final model with best parameters
final_model = GradientBoostingClassifier(**best_params)
final_model.fit(X, y)

[I 2025-04-16 15:56:23,261] A new study created in memory with name: no-name-82eeff55-1836-4523-af28-c9e42ad1621f
[I 2025-04-16 15:56:27,947] Trial 0 finished with value: 0.8832679714617339 and parameters: {'learning_rate': 0.08401786894741851, 'n_estimators': 53, 'max_depth': 13, 'min_samples_split': 135, 'min_samples_leaf': 9, 'max_features': 0.8414279823346444, 'subsample': 0.5784829699000998}. Best is trial 0 with value: 0.8832679714617339.
[I 2025-04-16 15:56:41,050] Trial 1 finished with value: 0.8780580240496141 and parameters: {'learning_rate': 0.0018202937720081976, 'n_estimators': 283, 'max_depth': 7, 'min_samples_split': 29, 'min_samples_leaf': 29, 'max_features': 0.3503087701818477, 'subsample': 0.645519838668831}. Best is trial 0 with value: 0.8832679714617339.
[I 2025-04-16 15:57:07,729] Trial 2 finished with value: 0.8829208758129038 and parameters: {'learning_rate': 0.00013543478530360096, 'n_estimators': 290, 'max_depth': 7, 'min_samples_split': 20, 'min_samples_leaf':

Best AUC: 0.8881233410979545
Best params: {'learning_rate': 0.007520874517350156, 'n_estimators': 482, 'max_depth': 6, 'min_samples_split': 95, 'min_samples_leaf': 22, 'max_features': 0.8607127566705017, 'subsample': 0.7052450020816448}


### *AUC SCORE: 0.8881233410979545*

## Catboost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np

# Prepare data
X = df_train.drop(columns=['coppaRisk'])
y = df_train['coppaRisk']

# Identify categorical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()

# Imbalance handling
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])

def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'auto_class_weights': 'Balanced',
        'eval_metric': 'AUC',
        'task_type': 'CPU',
        'random_seed': 42,
        'verbose': False
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = CatBoostClassifier(**param)
        model.fit(
            X_train, y_train,
            eval_set=(X_valid, y_valid),
            cat_features=categorical_cols,
            early_stopping_rounds=100,
            use_best_model=True,
            verbose=False
        )

        preds = model.predict_proba(X_valid)[:, 1]
        aucs.append(roc_auc_score(y_valid, preds))

    return np.mean(aucs)

# Run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=3600)

# Results
print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

# Train final model
final_cb = CatBoostClassifier(
    **study.best_params,
    cat_features=categorical_cols,
    random_seed=42,
    verbose=100
)
final_cb.fit(X, y)

[I 2025-04-17 05:38:59,352] A new study created in memory with name: no-name-ed7fceea-4b1a-4f45-9fde-160cc3f53c5f
[I 2025-04-17 05:39:17,685] Trial 0 finished with value: 0.8731112538120145 and parameters: {'iterations': 1517, 'learning_rate': 0.22520332336085705, 'depth': 7, 'l2_leaf_reg': 8.330736569793979e-08, 'random_strength': 3.0730437882299593, 'bagging_temperature': 0.840962705350022, 'border_count': 66, 'grow_policy': 'Depthwise'}. Best is trial 0 with value: 0.8731112538120145.
[I 2025-04-17 05:39:27,740] Trial 1 finished with value: 0.8855155782218486 and parameters: {'iterations': 901, 'learning_rate': 0.14127913454662266, 'depth': 7, 'l2_leaf_reg': 0.00687894507426123, 'random_strength': 2.889028127013081, 'bagging_temperature': 0.03822588539365357, 'border_count': 89, 'grow_policy': 'Lossguide'}. Best is trial 1 with value: 0.8855155782218486.
[I 2025-04-17 05:39:39,498] Trial 2 finished with value: 0.8881680842090962 and parameters: {'iterations': 1897, 'learning_rate': 

Best AUC: 0.8910796072459226
Best params: {'iterations': 937, 'learning_rate': 0.07072666970267609, 'depth': 8, 'l2_leaf_reg': 3.6670271829213004, 'random_strength': 3.113436307243024, 'bagging_temperature': 0.8353210873300276, 'border_count': 233, 'grow_policy': 'Lossguide'}
0:	learn: 0.6273055	total: 37.6ms	remaining: 35.2s
100:	learn: 0.2055836	total: 1.6s	remaining: 13.3s
200:	learn: 0.1722158	total: 3.37s	remaining: 12.3s
300:	learn: 0.1450428	total: 5.23s	remaining: 11s
400:	learn: 0.1212444	total: 7s	remaining: 9.36s
500:	learn: 0.1037964	total: 9.35s	remaining: 8.14s
600:	learn: 0.0899387	total: 12s	remaining: 6.71s
700:	learn: 0.0776062	total: 13.7s	remaining: 4.61s
800:	learn: 0.0693911	total: 15.4s	remaining: 2.62s
900:	learn: 0.0609713	total: 17.1s	remaining: 684ms
936:	learn: 0.0579499	total: 17.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7a09403ccf50>

### *AUC SCORE: 0.8910796072459226*

## Xgboost

In [None]:
df_train = pd.read_csv('/content/new_train_2_no_encode.csv')

df_train.drop(columns=['ID'])
df_train.shape
# df_train.fillna('NA', inplace=True)

(7000, 25)

In [None]:
from xgboost import XGBClassifier
from category_encoders import TargetEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# 1. Prepare Data
X = df_train.drop(columns=['coppaRisk'])
y = df_train['coppaRisk']

# 2. Handle Missing Values for Categorical Columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
X[categorical_cols] = X[categorical_cols].fillna('__MISSING__')

# 3. Target Encoding
encoder = TargetEncoder(
    cols=categorical_cols,
    handle_missing='value',
    handle_unknown='value',
    smoothing=0.1
)
X_encoded = encoder.fit_transform(X, y)

# 4. Handle Class Imbalance
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])

# 5. Optuna Objective Function
def objective(trial):
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'eta': trial.suggest_float('eta', 0.005, 0.3, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'scale_pos_weight': scale_pos_weight,
        'random_state': 42,
        'n_jobs': -1,
        'early_stopping_rounds': 100  # MOVED FROM FIT TO CONSTRUCTOR
    }

    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_float('rate_drop', 1e-8, 1.0, log=True)
        param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(X_encoded, y):
        X_train, X_valid = X_encoded.iloc[train_idx], X_encoded.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = XGBClassifier(**param)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )

        preds = model.predict_proba(X_valid)[:, 1]
        aucs.append(roc_auc_score(y_valid, preds))

    return np.mean(aucs)

# 6. Run Optuna Optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, timeout=3600)

# Results
print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

[I 2025-04-17 06:51:29,121] A new study created in memory with name: no-name-85752102-d8c2-4a7b-8fc6-fdb21b36265c
[I 2025-04-17 06:52:53,478] Trial 0 finished with value: 0.8962032392729912 and parameters: {'booster': 'dart', 'lambda': 1.432862667204198e-06, 'alpha': 0.6363742673235065, 'max_depth': 6, 'eta': 0.07830728233500409, 'gamma': 0.01222946813715198, 'grow_policy': 'lossguide', 'subsample': 0.780860778040575, 'colsample_bytree': 0.6795466865528991, 'min_child_weight': 54, 'max_delta_step': 8, 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 7.2692100542392055e-06, 'skip_drop': 1.2148935499331546e-08}. Best is trial 0 with value: 0.8962032392729912.
[I 2025-04-17 06:52:54,715] Trial 1 finished with value: 0.8932750751671721 and parameters: {'booster': 'gbtree', 'lambda': 3.251454044728646e-08, 'alpha': 0.5896328467262978, 'max_depth': 10, 'eta': 0.022447362510726918, 'gamma': 0.00015450346067538093, 'grow_policy': 'depthwise', 'subsample': 0.8924127034799609, 

Best AUC: 0.8978442342178952
Best params: {'booster': 'dart', 'lambda': 3.4952982478372176e-08, 'alpha': 3.640883392151904e-05, 'max_depth': 4, 'eta': 0.0660631547926777, 'gamma': 6.433364192096545e-07, 'grow_policy': 'depthwise', 'subsample': 0.6286254970393418, 'colsample_bytree': 0.6681196170084813, 'min_child_weight': 17, 'max_delta_step': 7, 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 2.9882344667672615e-08, 'skip_drop': 0.24540010020182235}


### *AUC SCORE: 0.8978442342178952*

# Predict df_test with best Model

*BEST MODEL:* LGBM with AUC = 0.8922

*best_params* = 
```Python
{'boosting_type': 'gbdt',
                'learning_rate': 0.08538198953344486,
                'num_leaves': 16,
                'max_depth': 18,
                'min_child_samples': 72,
                'reg_alpha': 0.09583530070653148,
                'reg_lambda': 0.00018088945132819172,
                'subsample': 0.504004645051515,
                'colsample_bytree': 0.7372947543289512}
```

In [None]:
best_params = {'boosting_type': 'gbdt',
                'learning_rate': 0.08538198953344486,
                'num_leaves': 16,
                'max_depth': 18,
                'min_child_samples': 72,
                'reg_alpha': 0.09583530070653148,
                'reg_lambda': 0.00018088945132819172,
                'subsample': 0.504004645051515,
                'colsample_bytree': 0.7372947543289512}



In [None]:
df_test = pd.read_csv('/content/new_test_2_no_encode.csv')

# preprocess
df_temp = df_test.copy()
df_test = df_test.drop(columns=['coppaRisk'])
df_test.fillna('NA', inplace=True)
df_test.shape

(3000, 24)

In [None]:
# prompt: untuk mengecek kolom mana saja yang meupakan numerik
# Check for numeric columns
numeric_cols = df_train.select_dtypes(include=['number']).columns
print("Numeric columns:")
numeric_cols


Numeric columns:


Index(['userRatingCount', 'downloads', 'hasTermsOfServiceLinkRating',
       'isCorporateEmailScore', 'adSpent', 'appAge', 'averageUserRating',
       'appContentBrandSafetyRating', 'appDescriptionBrandSafetyRating',
       'mfaRating', 'ID', 'coppaRisk', 'developerCountry_missing',
       'downloads_min', 'downloads_max', 'isCorporateEmailScore_missing',
       'adSpent_missing', 'appAge_missing', 'averageUserRating_missing'],
      dtype='object')

In [None]:
df_train.shape

(7000, 25)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve
import numpy as np
import pandas as pd

# 1. Persiapan Data
X = df_train.drop(columns=['coppaRisk'])
y = df_train['coppaRisk']

# Handle kolom kategorikal
categorical_cols = X.select_dtypes(include='object').columns.tolist()
for col in categorical_cols:
    X[col] = X[col].astype('category')
    df_test[col] = df_test[col].astype('category')

# 2. Train-Test Split (untuk validasi)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 3. Training Model Final dengan Best Params
final_model = lgb.LGBMClassifier(
    **best_params,
    n_estimators=2000,
    importance_type='gain',
    random_state=42
)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='auc',
    categorical_feature=categorical_cols,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(50)
    ]
)

# 4. Threshold Optimization
y_pred_proba = final_model.predict_proba(X_valid)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_proba)
optimal_threshold = thresholds[np.argmax(precision + recall)]

# Prediksi probabilities
test_probs = final_model.predict_proba(df_test)[:, 1]

# Konversi ke label dengan optimal threshold
test_preds = (test_probs >= optimal_threshold).astype(int)

# 5. Membuat Submission File
submission = pd.DataFrame({
    'ID': df_test['ID'],
    'coppaRisk': test_preds
})

# 7. Evaluasi Model (jika diperlukan)
print("\nModel Performance:")
print(f"Optimal Threshold: {optimal_threshold:.4f}")
print(f"Validation AUC: {roc_auc_score(y_valid, y_pred_proba):.4f}")

# 8. Save Submission File
submission.to_csv('submission_lgbm_hyperparameter_optuna.csv', index=False)
print("\nSubmission file saved as 'submission_lgbm_hyperparameter_optuna.csv'")

[LightGBM] [Info] Number of positive: 557, number of negative: 5043
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1045
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.099464 -> initscore=-2.203191
[LightGBM] [Info] Start training from score -2.203191
Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.887842	valid_0's binary_logloss: 0.218586
[100]	valid_0's auc: 0.87705	valid_0's binary_logloss: 0.227531
Early stopping, best iteration is:
[27]	valid_0's auc: 0.891895	valid_0's binary_logloss: 0.221455

Model Performance:
Optimal Threshold: 0.0919
Validation AUC: 0.8919

Submission file saved as 'submission_lgbm_hyperparameter_optuna.csv'


In [None]:
submission.drop(columns=['coppaRisk'], inplace= True)
submission

Unnamed: 0,ID
0,2807
1,1742
2,806
3,2635
4,9047
...,...
2995,2288
2996,5541
2997,9259
2998,3477


In [None]:
submission['coppaRisk'] = test_probs

In [None]:
submission

Unnamed: 0,ID,coppaRisk
0,2807,0.020374
1,1742,0.218612
2,806,0.416965
3,2635,0.043896
4,9047,0.018722
...,...,...
2995,2288,0.021574
2996,5541,0.020934
2997,9259,0.018416
2998,3477,0.017856


In [None]:
submission.to_csv('submission_lgbm_hyperparameter_optuna_probs.csv', index=False)