<a href="https://colab.research.google.com/github/09334677600f/Code-for-Chapter-4/blob/main/%DA%AF%D8%B1%D8%A7%D8%AF%DB%8C%D8%A7%D9%86_%D8%AA%D9%82%D9%88%DB%8C%D8%AA_%D8%B4%D8%AF%D9%87_%D8%A8%D9%82%D8%A7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-survival

In [None]:
import numpy as np
import pandas as pd
import gc

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

import optuna


In [None]:
# Location of the raw Excel workbook (Colab upload directory).
DATA_PATH = "/content/final_file1 (2) (1).xlsx"

# Read the workbook into the working frame used by the rest of the notebook.
df = pd.read_excel(DATA_PATH)

In [None]:
# Data preprocessing: discard identifier/timestamp columns that are present,
# then remove every row that still contains a missing value.
columns_to_drop = [
    "ID", "Accident_ID", "تعداد تصادف", "تاریخ",
    "زمان تماس", "ساعت رسیدن به محل حادثه"
]
present_drop_cols = [col for col in columns_to_drop if col in df.columns]

df = df.drop(columns=present_drop_cols).dropna()
print(f"تعداد رکورد بعد از حذف NaN: {df.shape[0]}")

# Numeric and categorical feature groups; the model features are the numeric
# column followed by the categorical ones.
numeric_cols = ["سن بیمار"]
categorical_cols = ["جنسیت", "زمان شب و روز", "نوع روز", "فصل", "ساعت اوج ترافیک"]
X_cols = numeric_cols + categorical_cols

# Feature frame and target frame (duration + status).
X = df[X_cols].copy()
y = df[["مدت زمان", "وضعیت"]].copy()   # 1 = event, 0 = censored

# Scale the numeric column, one-hot encode the categorical ones (dropping the
# first level of each), and discard any column not listed.
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, numeric_cols),
        ('cat', category_encoder, categorical_cols),
    ],
    remainder='drop',
)

# NOTE(review): the preprocessor is fitted on all rows here, i.e. before any
# train/test split is made, so scaling statistics and encoder categories are
# learned from the full data set.
X_processed = preprocessor.fit_transform(X)
print(f"شکل داده پس از پیش‌پردازش: {X_processed.shape}")

# Column names of the processed matrix: numeric names first, then the names
# generated by the fitted one-hot encoder.
fitted_encoder = preprocessor.named_transformers_['cat']
encoded_names = fitted_encoder.get_feature_names_out(categorical_cols).tolist()
feature_names = numeric_cols + encoded_names
print("ویژگی‌ها:", feature_names)

In [None]:
# Convert the targets into the structured (event, time) array that sksurv expects.
event_indicator = y["وضعیت"].astype(bool)
time = y["مدت زمان"].astype(float)

survival_dtype = np.dtype([('event', bool), ('time', float)])
y_structured = np.empty(len(y), dtype=survival_dtype)
y_structured['event'] = event_indicator.to_numpy()
y_structured['time'] = time.to_numpy()

# Train/test split: 20% held out, stratified on the event flag, fixed seed.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=event_indicator,
)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_structured, **split_options
)

print(f"Train: {X_train.shape[0]} رکورد | Test: {X_test.shape[0]} رکورد")
print(f"نرخ رویداد در train: {y_train['event'].mean():.3f}")

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold concordance index of a GBM survival model.

    Samples ``n_estimators``, ``learning_rate`` and ``max_depth`` from the
    trial, keeps ``max_features="sqrt"`` and ``random_state=42`` fixed, and
    cross-validates on the module-level ``X_train`` / ``y_train`` arrays.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Average of the per-fold concordance indices (higher is better).
    """
    search_point = dict(
        n_estimators=trial.suggest_int("n_estimators", 50, 200, step=25),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 8),
        max_features="sqrt",
        random_state=42,
    )
    estimator = GradientBoostingSurvivalAnalysis(**search_point)
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_train):
        fold_X_fit, fold_y_fit = X_train[fit_rows], y_train[fit_rows]
        fold_X_holdout, fold_y_holdout = X_train[holdout_rows], y_train[holdout_rows]

        # Refit the same estimator on this fold and score the held-out rows.
        estimator.fit(fold_X_fit, fold_y_fit)
        holdout_risk = estimator.predict(fold_X_holdout)

        # The first element of the returned tuple is the concordance index.
        concordance, *_ = concordance_index_censored(
            fold_y_holdout["event"], fold_y_holdout["time"], holdout_risk
        )
        fold_scores.append(concordance)

        # Release the fold copies before the next iteration.
        del fold_X_fit, fold_y_fit, fold_X_holdout, fold_y_holdout
        gc.collect()

    average_concordance = np.mean(fold_scores)

    del estimator
    gc.collect()

    return average_concordance

In [None]:
# Hyper-parameter search. The sampler is seeded explicitly: create_study()
# otherwise builds an unseeded sampler, so every run would explore different
# trials and could select different "best" parameters.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40, show_progress_bar=True)

# Report the best hyper-parameters and their mean cross-validated C-index.
print("\n" + "="*60)
print("بهترین پارامترها:", study.best_params)
print(f"بهترین میانگین C-index: {study.best_value:.4f}")
print("="*60 + "\n")

In [None]:
# Final model: the best searched hyper-parameters plus the two settings that
# were held fixed during the search.
best_params = {
    **study.best_params,
    "max_features": "sqrt",
    "random_state": 42,
}

# Fit on the full training split (fit() returns the fitted estimator itself).
gbs_final = GradientBoostingSurvivalAnalysis(**best_params).fit(X_train, y_train)

print("مدل نهایی با موفقیت آموزش دید.")

In [None]:
# ───────────────────────────────────────────────────────────────
# 6) MAE and MAPE computation
# ───────────────────────────────────────────────────────────────

def _restricted_mean_survival_time(times, surv_probs, tau):
    """Integrate a survival step function over [0, tau].

    The curve is treated as a right-continuous step function: it equals 1 on
    [0, times[0]) and ``surv_probs[i]`` on [times[i], times[i + 1]).

    Parameters
    ----------
    times : array-like
        Step locations; assumed sorted ascending and non-negative.
    surv_probs : array-like
        Survival probability taken at each step location.
    tau : float
        Upper integration limit (restriction time).

    Returns
    -------
    float
        Area under the step curve between 0 and ``tau``.
    """
    times = np.asarray(times, dtype=float)
    surv_probs = np.asarray(surv_probs, dtype=float)

    keep = times <= tau
    # Segment boundaries: 0, every step location up to tau, and tau itself.
    knots = np.concatenate(([0.0], times[keep], [tau]))
    # Height of the curve on each segment; the leading 1.0 covers the stretch
    # before the first step, which would otherwise be left out of the integral.
    heights = np.concatenate(([1.0], surv_probs[keep]))
    return float(np.sum(heights * np.diff(knots)))


# Predicted survival functions for the test rows.
surv_funcs = gbs_final.predict_survival_function(X_test)

# tau = largest time observed in the training split.
tau = y_train["time"].max()

# Predicted time per test row = restricted mean survival time up to tau.
predicted_times = np.array([
    _restricted_mean_survival_time(fn.x, fn.y, tau) for fn in surv_funcs
])

# Ground truth.
time_true = y_test["time"]
event_true = y_test["event"]

# For censored rows the recorded time is only a lower bound on the true
# duration, so the error metrics are computed on rows with an observed event.
if event_true.any():
    observed_true = time_true[event_true]
    observed_pred = predicted_times[event_true]

    # MAE
    mae = mean_absolute_error(observed_true, observed_pred)

    # MAPE (denominator floored to avoid division by zero)
    mape = np.mean(
        np.abs((observed_true - observed_pred) / np.maximum(observed_true, 1e-6))
    ) * 100
else:
    # No uncensored test rows: the error metrics are undefined.
    mae = mape = float("nan")

# C-index (uses all rows; censoring is handled by the metric itself).
risk_test = gbs_final.predict(X_test)
c_index = concordance_index_censored(
    event_true,
    time_true,
    risk_test
)[0]

print("\n" + "="*60)
print("        نتایج نهایی مدل GBM-Survival")
print("="*60)
print(f"MAE          : {mae:.3f} دقیقه")
print(f"MAPE         : {mape:.2f}%")
print(f"C-index      : {c_index:.4f}")
print("="*60)
