In [1]:
import sys

# Make the helper modules under utils/ importable from this notebook.
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
if "utils/" not in sys.path:
    sys.path.append("utils/")

In [None]:
# %pip install -qU xgboost lightgbm catboost optuna

In [2]:
from data_loader import load_final_data

# Folder handed to the loader; the loader yields the train and test frames as a pair.
dataset_path = "../dataset/"
loaded_frames = load_final_data(dataset_path)
train_df, test_df = loaded_frames

‚úÖ File: ../dataset/train\balance.parquet Completed!
‚úÖ File: ../dataset/train\bill.parquet Completed!
‚úÖ File: ../dataset/train\channel.parquet Completed!
‚úÖ File: ../dataset/train\credit.parquet Completed!
‚úÖ File: ../dataset/train\marketing.parquet Completed!
‚úÖ File: ../dataset/train\member.parquet Completed!
‚úÖ File: ../dataset/train\perf.parquet Completed!
‚úÖ File: ../dataset/train\tx.parquet Completed!
üîπ Shape : (2400000, 191)

‚úÖ File: ../dataset/test\balance.parquet Completed!
‚úÖ File: ../dataset/test\bill.parquet Completed!
‚úÖ File: ../dataset/test\channel.parquet Completed!
‚úÖ File: ../dataset/test\credit.parquet Completed!
‚úÖ File: ../dataset/test\marketing.parquet Completed!
‚úÖ File: ../dataset/test\member.parquet Completed!
‚úÖ File: ../dataset/test\perf.parquet Completed!
‚úÖ File: ../dataset/test\tx.parquet Completed!
üîπ Shape : (600000, 190)


In [3]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Columns dropped from the feature matrices of both train and test.
non_feature_cols = ['ID', 'Í∏∞Ï§ÄÎÖÑÏõî']

# Target column; it only exists in the training frame.
y = train_df['Segment']

X = train_df.drop(columns=non_feature_cols + ['Segment'])
X_test = test_df.drop(columns=non_feature_cols)

# Encode the Segment labels as small integers; `le` is kept so predictions
# can be mapped back to the original labels later.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y).astype(np.uint8)

In [4]:
# Overwrite +/-inf with 0 in both feature matrices (done in place).
for feature_frame in (X, X_test):
    feature_frame.replace([np.inf, -np.inf], 0, inplace=True)

In [17]:
# Inverse-frequency class weights:
#   weight[c] = total_samples / (n_classes * count[c])
counts = np.bincount(y_encoded)
n_classes = counts.size
total_samples = counts.sum()

# Same formula as above, computed for all classes at once and keyed by encoded label.
class_weights = dict(enumerate(total_samples / (n_classes * counts)))
class_weights

{0: 493.82716049382714,
 1: 3333.3333333333335,
 2: 3.7620503174229953,
 3: 1.3744051402752246,
 4: 0.2497330977517778}

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Candidate classifiers, keyed by a short name.
# NOTE: random_forest and lightgbm use class_weight='balanced', catboost uses the
# explicit `class_weights` dict; decision_tree and xgboost are left unweighted.
models = dict(
    decision_tree=DecisionTreeClassifier(
        max_depth=10,
        random_state=42,
    ),
    random_forest=RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    ),
    xgboost=XGBClassifier(
        n_estimators=100,
        random_state=42,
    ),
    lightgbm=LGBMClassifier(
        n_estimators=100,
        max_depth=10,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    ),
    catboost=CatBoostClassifier(
        n_estimators=100,
        max_depth=10,
        class_weights=class_weights,
        random_state=42,
    ),
)

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np

def kfold_train_eval(X, y, model, k=5, groups=None):
    """Cross-validate ``model`` with stratified folds and report macro-F1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc``.
    y : array-like
        Class labels aligned row-for-row with ``X``. Converted to a NumPy
        array so the positional fold indices also work when ``y`` is a pandas
        Series whose index is not ``0..n-1``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted on
        every fold, so after the call it holds the fit from the last fold.
    k : int, default 5
        Number of folds.
    groups : array-like, optional
        Group label per row (for example a customer ID). When given, folds are
        built with ``StratifiedGroupKFold`` so that all rows of one group stay
        on the same side of the split. When omitted, plain ``StratifiedKFold``
        is used, exactly as before.

    Returns
    -------
    tuple of (float, float)
        Mean and standard deviation of the per-fold macro-F1 scores.
    """
    labels = np.asarray(y)

    if groups is None:
        fold_iter = StratifiedKFold(n_splits=k).split(X, labels)
    else:
        # Imported lazily so the default path keeps working on scikit-learn
        # versions that do not ship StratifiedGroupKFold.
        from sklearn.model_selection import StratifiedGroupKFold

        fold_iter = StratifiedGroupKFold(n_splits=k).split(
            X, labels, groups=np.asarray(groups)
        )

    scores = []
    for i, (train_idx, val_idx) in enumerate(fold_iter):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = labels[train_idx], labels[val_idx]

        model.fit(X_train, y_train)
        # Flatten in case the estimator returns predictions as an (n, 1) column.
        y_pred = np.asarray(model.predict(X_val)).ravel()

        score = f1_score(y_val, y_pred, average='macro')
        scores.append(score)
        print(f"Fold {i+1}/{k} - F1 Score: {score:.4f}")

    return np.mean(scores), np.std(scores)

In [20]:
# Score every candidate with the shared k-fold routine and print its mean/std macro-F1.
for name, model in models.items():
    header = f"\nüìå Model: {name}"
    print(header)
    cv_result = kfold_train_eval(X, y_encoded, model)
    mean_score, std_score = cv_result
    print(f"Mean F1 Score: {mean_score:.4f} ¬± {std_score:.4f}")


üìå Model: decision_tree
Fold 1/5 - F1 Score: 0.4686
Fold 2/5 - F1 Score: 0.4645
Fold 3/5 - F1 Score: 0.4686
Fold 4/5 - F1 Score: 0.4646
Fold 5/5 - F1 Score: 0.4334
Mean F1 Score: 0.4599 ¬± 0.0134

üìå Model: random_forest
Fold 1/5 - F1 Score: 0.5755
Fold 2/5 - F1 Score: 0.6088
Fold 3/5 - F1 Score: 0.6389
Fold 4/5 - F1 Score: 0.6054
Fold 5/5 - F1 Score: 0.5737
Mean F1 Score: 0.6005 ¬± 0.0242

üìå Model: xgboost
Fold 1/5 - F1 Score: 0.7017
Fold 2/5 - F1 Score: 0.7490
Fold 3/5 - F1 Score: 0.7929
Fold 4/5 - F1 Score: 0.7599
Fold 5/5 - F1 Score: 0.7073
Mean F1 Score: 0.7422 ¬± 0.0340

üìå Model: lightgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.342860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30963
[LightGBM] [Info] Number of data points in the train set: 1920000, number of used features: 188
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from sc

| Model         | Mean F1 Score   | Std (±)           |
| ------------- | --------------- | ----------------- |
| Decision Tree | 0.4599          | ±0.0134           |
| Random Forest | 0.6005          | ±0.0242           |
| XGBoost       | 0.7422          | ±0.0340           |
| LightGBM      | 0.7372          | ±0.0181           |
| CatBoost      | **0.7501**      | ±0.0324           |


In [21]:
# Using catboost for final training

model = CatBoostClassifier(random_state=42, n_estimators=100, max_depth=10, class_weights=class_weights)
model.fit(X, y_encoded, verbose=20)

model.save_model("catboost_final_model.cbm")
print("Î™®Îç∏Ïù¥ Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§...!")

# Predicting on test data
# Flatten the predictions to 1-D before decoding: passing a column-shaped array to
# LabelEncoder.inverse_transform triggers scikit-learn's `column_or_1d` warning.
y_pred = np.asarray(model.predict(X_test)).ravel()
y_pred_labels = le.inverse_transform(y_pred)

test_data = test_df.copy()  # keep the original test frame untouched
test_data["pred_label"] = y_pred_labels

# One row per ID: take the most frequent predicted label among that ID's rows.
submission = test_data.groupby("ID")["pred_label"].agg(lambda x: x.value_counts().idxmax()).reset_index()
submission.columns = ["ID", "Segment"]

submission.to_csv("submission.csv", index=False)

Learning rate set to 0.5
0:	learn: 0.9298213	total: 12.2s	remaining: 20m 5s
20:	learn: 0.3238663	total: 4m 59s	remaining: 18m 45s
40:	learn: 0.2787143	total: 10m 5s	remaining: 14m 31s
60:	learn: 0.2528473	total: 15m 5s	remaining: 9m 38s
80:	learn: 0.2333696	total: 20m 8s	remaining: 4m 43s
99:	learn: 0.2199032	total: 25m 4s	remaining: 0us
Î™®Îç∏Ïù¥ Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§...!


  y = column_or_1d(y, warn=True)


In [23]:
# Final XGBoost fit on the full training set, followed by a per-ID submission file.
xgb_model = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, verbosity=2)
xgb_model.fit(X, y_encoded)

xgb_model.save_model("xgboost_final_model.json")
print("Î™®Îç∏Ïù¥ Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§...!")

# Predict encoded classes, then map them back to the original Segment labels.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_labels_xgb = le.inverse_transform(y_pred_xgb)

# assign() builds a new frame, so test_df itself is not modified.
test_data = test_df.assign(pred_label=y_pred_labels_xgb)

# One row per ID: the most frequent predicted label among that ID's rows.
submission_xgb = (
    test_data.groupby("ID")["pred_label"]
    .agg(lambda labels: labels.value_counts().idxmax())
    .reset_index()
    .rename(columns={"pred_label": "Segment"})
)
submission_xgb.to_csv("submission_xgb.csv", index=False)

[14:12:19] INFO: C:\actions-runner\_work\xgboost\xgboost\src\data\iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (2400000, 188, 451200000).
Î™®Îç∏Ïù¥ Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§...!


In [24]:
# Final LightGBM fit on the full training set, followed by a per-ID submission file.

lgb_model = LGBMClassifier(class_weight='balanced', random_state=42, n_jobs=-1, verbosity=2)
lgb_model.fit(X, y_encoded)

# Predict encoded classes, then map them back to the original Segment labels.
y_pred_lgb = lgb_model.predict(X_test)
y_pred_labels_lgb = le.inverse_transform(y_pred_lgb)

# assign() builds a new frame, so test_df itself is not modified.
test_data = test_df.assign(pred_label=y_pred_labels_lgb)

# One row per ID: the most frequent predicted label among that ID's rows.
votes_by_id = test_data.groupby("ID")["pred_label"]
submission_lgb = votes_by_id.agg(lambda labels: labels.value_counts().idxmax()).reset_index()
submission_lgb = submission_lgb.rename(columns={"pred_label": "Segment"})
submission_lgb.to_csv("submission_lgb.csv", index=False)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.818184
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.485316
[LightGBM] [Debug] init for col-wise cost 0.312528 seconds, init for row-wise cost 1.072043 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.270136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30942
[LightGBM] [Info] Number of data points in the train set: 2400000, number of used features: 188
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 

In [28]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def cat_objective(trial):
    """Optuna objective: fit one CatBoost configuration and return its macro-F1.

    The trial samples the loss function, learning rate, tree depth and number
    of iterations. The model is trained on a stratified 80% split of
    ``X`` / ``y_encoded`` (fixed ``random_state=42``, so every trial sees the
    same split) and scored on the remaining 20%.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Macro-averaged F1 score on the hold-out split (higher is better).
    """
    param = {
        # Only multi-class losses are searched: "Logloss" is CatBoost's binary
        # loss and fails on the 5-class target, which would abort the study.
        "objective": trial.suggest_categorical("objective", ["MultiClass", "MultiClassOneVsAll"]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "random_state": 42,
        "depth": trial.suggest_int("depth", 1, 15),
        "class_weights": class_weights,
        "iterations": trial.suggest_int("iterations", 100, 1000),
    }

    catboost = CatBoostClassifier(**param)

    X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
    # NOTE(review): the same hold-out split is passed as eval_set and used for the
    # reported score, so the score may be optimistic — confirm this is intended.
    catboost.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

    # Flatten in case predictions come back as an (n, 1) column.
    y_pred = np.asarray(catboost.predict(X_val)).ravel()
    f1 = f1_score(y_val, y_pred, average='macro')
    return f1

In [None]:
# Seed the sampler so the hyper-parameter search is reproducible across runs.
# TPESampler is Optuna's default sampler, so only the seed changes.
cat_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
cat_study.optimize(cat_objective, n_trials=100)

# Report the best trial found by the study.
print("Best trial:")
trial = cat_study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-05-08 00:14:25,778] A new study created in memory with name: no-name-519770ba-b61d-4db8-9aa7-d21490ae91d7
[I 2025-05-08 00:36:45,608] Trial 0 finished with value: 0.7415044833679177 and parameters: {'objective': 'MultiClassOneVsAll', 'learning_rate': 0.2391117216058121, 'depth': 6, 'iterations': 692}. Best is trial 0 with value: 0.7415044833679177.
