<a href="https://colab.research.google.com/github/09334677600f/Code-for-Chapter-4/blob/main/RSF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-survival optuna

In [None]:
# 1) کتابخانه‌ها

import numpy as np
import pandas as pd
import gc

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored

import optuna

In [None]:
# 2) Load the data and build the feature matrix

df = pd.read_excel("/content/final_file1 (2) (1).xlsx")

# Identifier / timestamp columns that are removed when present.
columns_to_drop = [
    "ID", "Accident_ID", "تعداد تصادف", "تاریخ",
    "زمان تماس", "ساعت رسیدن به محل حادثه"
]

categorical_cols = ["جنسیت", "زمان شب و روز", "نوع روز", "فصل", "ساعت اوج ترافیک"]
numeric_cols = ["سن بیمار"]

# Feature columns are derived from the two lists above so the three
# definitions cannot drift apart; the resulting order is numeric first,
# then categorical.
X_cols = numeric_cols + categorical_cols
# Survival target: duration column and status column.
target_cols = ["مدت زمان", "وضعیت"]

df = df.drop(columns=columns_to_drop, errors="ignore")
# Only rows with a missing value in a column that is actually used are
# removed; a NaN in some unrelated column no longer discards the record.
df = df.dropna(subset=X_cols + target_cols)
print(f"تعداد رکورد بعد از حذف NaN: {df.shape[0]}")


X = df[X_cols].copy()
y = df[target_cols].copy()


# Scale the numeric column and one-hot encode the categorical columns
# (first level dropped); any other column is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(drop="first", sparse_output=False), categorical_cols)
    ],
    remainder="drop"
)

# NOTE(review): the preprocessor is fitted on the full dataset, i.e. before
# the train/test split performed later in the notebook, so scaling statistics
# and category levels are learned from test rows as well. Consider fitting on
# the training rows only.
X_processed = preprocessor.fit_transform(X)
print(f"شکل داده پس از پیش‌پردازش: {X_processed.shape}")

# Column names of X_processed, in the order the transformers are listed.
feature_names = (
    numeric_cols +
    preprocessor.named_transformers_["cat"]
    .get_feature_names_out(categorical_cols)
    .tolist()
)
print("ویژگی‌ها:", feature_names)

In [None]:
# 3) Build the survival target

# NOTE(review): astype(bool) treats every non-zero status as an event —
# confirm the status column is coded 0 = censored / 1 = event.
event_indicator = y["وضعیت"].astype(bool).to_numpy()
time = y["مدت زمان"].astype(float).to_numpy()

# Structured array with an "event" flag and a "time" value per record.
y_structured = np.empty(
    len(event_indicator),
    dtype=[("event", bool), ("time", float)]
)
y_structured["event"] = event_indicator
y_structured["time"] = time


# 4) Train / test split

# 80/20 split with a fixed seed, stratified on the event flag.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": event_indicator,
}
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_structured, **split_options
)

print(f"Train: {X_train.shape[0]} | Test: {X_test.shape[0]}")
print(f"نرخ رویداد در Train: {y_train['event'].mean():.3f}")

In [None]:

# 5) جستجوی ابرپارامترهای RSF با 5-Fold CV



def objective(trial):
    """Return the mean 5-fold concordance index for one Optuna trial.

    The trial proposes ``n_estimators``, ``min_samples_split`` and
    ``min_samples_leaf``; ``max_features``, ``random_state`` and ``n_jobs``
    are fixed. The forest is fitted on each training fold of the
    module-level ``X_train`` / ``y_train`` and scored on the held-out fold
    with ``concordance_index_censored``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the five per-fold concordance indices.
    """
    # Keep the suggestion order stable: n_estimators, split, leaf.
    n_estimators = trial.suggest_int("n_estimators", 50, 100, step=10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 8)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

    forest = RandomSurvivalForest(
        n_estimators=n_estimators,
        max_features="sqrt",
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=1,
    )

    # Same shuffled folds for every trial, so trials are comparable.
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fit_rows, held_out_rows in splitter.split(X_train):
        X_fit, X_held_out = X_train[fit_rows], X_train[held_out_rows]
        y_fit, y_held_out = y_train[fit_rows], y_train[held_out_rows]

        forest.fit(X_fit, y_fit)
        held_out_risk = forest.predict(X_held_out)

        concordance = concordance_index_censored(
            y_held_out["event"], y_held_out["time"], held_out_risk
        )
        # The first element of the returned tuple is the concordance index.
        fold_scores.append(concordance[0])

        # Release the fold copies before the next iteration.
        del X_fit, X_held_out, y_fit, y_held_out
        gc.collect()

    trial_score = np.mean(fold_scores)

    del forest
    gc.collect()

    return trial_score


In [None]:
# 6) Run Optuna

# Seed the sampler: the split, the CV folds and the forest are all seeded
# with 42, so an unseeded sampler was the only thing making repeated runs
# select different hyperparameters.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n" + "="*60)
print("بهترین پارامترها:", study.best_params)
print(f"بهترین میانگین C-index: {study.best_value:.4f}")
print("="*60 + "\n")

# study.best_params only contains the values suggested through the trial.
# The settings that objective() hard-codes (max_features, random_state,
# n_jobs) are re-applied here so the final model is built with exactly the
# configuration that was evaluated during the search.
best_params = {
    **study.best_params,
    "max_features": "sqrt",
    "random_state": 42,
    "n_jobs": 1,
}


In [None]:
# 7) Train the final RSF model

# Fit on the complete training split with the selected hyperparameters;
# fit() returns the fitted estimator, so construction and training chain.
rsf_final = RandomSurvivalForest(**best_params).fit(X_train, y_train)

print("مدل نهایی با موفقیت آموزش یافت.")

In [None]:

# 8) Final evaluation


def _restricted_mean_survival_time(times, surv_probs, tau):
    """Integrate a survival step function from 0 up to ``tau``.

    The curve is treated as S(t) = 1 on [0, times[0]) and as holding
    ``surv_probs[i]`` from ``times[i]`` until the next grid point, so the
    area is an exact sum of rectangles. Integration stops at
    ``min(tau, times[-1])``; the curve is not extrapolated past its grid.

    Args:
        times: Increasing, non-negative time grid of the step function.
        surv_probs: Survival probability at each grid point.
        tau: Upper integration limit (restriction horizon).

    Returns:
        The restricted mean survival time as a float.
    """
    times = np.asarray(times, dtype=float)
    surv_probs = np.asarray(surv_probs, dtype=float)

    within_horizon = times <= tau
    upper = min(tau, times[-1]) if times.size else tau

    # Prepending t = 0 with S = 1 accounts for the time before the first
    # grid point, which the previous trapezoid integration left out and
    # which made every predicted time too small.
    grid = np.concatenate(([0.0], times[within_horizon], [upper]))
    heights = np.concatenate(([1.0], surv_probs[within_horizon]))
    return float(np.sum(heights * np.diff(grid)))


surv_funcs = rsf_final.predict_survival_function(X_test)

# Restriction horizon: the longest time observed in the training split.
tau = y_train["time"].max()

# Expected time per test record, taken as the area under its predicted
# survival curve.
predicted_times = np.array([
    _restricted_mean_survival_time(fn.x, fn.y, tau) for fn in surv_funcs
])

time_true = y_test["time"]
event_true = y_test["event"]

# NOTE(review): MAE and MAPE are computed over every test record, including
# censored ones, for which the recorded time is presumably only a lower
# bound on the true duration — confirm whether these metrics should be
# restricted to records with event_true == True.
mae = mean_absolute_error(time_true, predicted_times)

# The denominator is floored at 1e-6 to avoid division by zero.
mape = np.mean(
    np.abs((time_true - predicted_times) / np.maximum(time_true, 1e-6))
) * 100

# The concordance index uses the forest's risk scores, not the RMST values.
risk_test = rsf_final.predict(X_test)
c_index = concordance_index_censored(
    event_true,
    time_true,
    risk_test
)[0]

print("\n" + "="*60)
print("       نتایج نهایی Random Survival Forest")
print("="*60)
print(f"MAE          : {mae:.3f} دقیقه")
print(f"MAPE         : {mape:.2f}%")
print(f"C-index      : {c_index:.4f}")
print("="*60)
