Group Name   : Lucid_Jespen

Members      : Xiao SHEN, Yuanye XU

Project Name : Delay Rate of DB 20-25.07.2024

In [None]:
!pip install geopy

In [None]:
# Data Cleaning

import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim
import time



# Load the input CSV into the working DataFrame used by the rest of this cell.
df = pd.read_csv("trains_db_hbfs.csv")


# make data buckets for scheduled time by hour
# errors='coerce' turns values that cannot be parsed into NaT, so those rows
# get a missing hour_bucket instead of raising.
df['scheduled_time'] = pd.to_datetime(df['scheduled_time'], errors='coerce')
df['hour_bucket'] = df['scheduled_time'].dt.hour


# Derive departure_city from column 'Hbf': drop a trailing "Hbf" (and any
# whitespace after it), then trim surrounding whitespace.
# NOTE(review): astype(str) renders missing values as the literal string "nan"
# in classic pandas, so the derived columns below may contain "nan" — confirm.
if 'Hbf' in df.columns:
    df['departure_city'] = df['Hbf'].astype(str).str.replace(r'Hbf\s*$', '', regex=True).str.strip()
else:
    print("Warning: Cannot find column 'Hbf' ")


# eg. route: someplace,sometime ; train_model: AB 123
# create columns route_cleaned and train_model_cleaned by removing every run of
# digits from the raw text and trimming the result
if 'route' in df.columns:
    df['route_cleaned'] = df['route'].astype(str).str.replace(r'\d+', '', regex=True).str.strip()
else:
    print("Warning: Cannot find 'route'")

if 'train_model' in df.columns:
    df['train_model_cleaned'] = df['train_model'].astype(str).str.replace(r'\d+', '', regex=True).str.strip()
else:
    print("Warning: Cannot find 'train_model'")


# Remove the run of ':' and ',' left at the end of route_cleaned once the
# digits are gone. If 'route' was missing, route_cleaned was never
# created and a second warning is printed here.
if 'route_cleaned' in df.columns:
    df['route_cleaned'] = (
        df['route_cleaned']
        .astype(str)
        .str.replace(r'[:,]+$', '', regex=True)
        .str.strip()
    )
else:
    print("Warning: Cannot find 'route_cleaned'")

# Remove an empty pair of parentheses "()" (optionally containing whitespace)
# left at the end of train_model_cleaned once the digits are gone.
if 'train_model_cleaned' in df.columns:
    df['train_model_cleaned'] = (
        df['train_model_cleaned']
        .astype(str)
        .str.replace(r'\(\s*\)$', '', regex=True)
        .str.strip()
    )
else:
    print("Warning: Cannot find 'train_model_cleaned'")


# eg. real_time_due_to_delay: 13.05, Grund: Gleiswechseln
# Keep the time in the column and move the text after the first comma into a
# new 'causes' column.
if 'real_time_due_to_delay' in df.columns:
    # n=1 splits at the first comma only, so commas inside the cause text stay intact.
    split_real = df['real_time_due_to_delay'].astype(str).str.split(',', n=1, expand=True)

    df['real_time_due_to_delay'] = split_real[0].str.strip()

    # When no value contains a comma the split yields a single column,
    # so there is no cause text to extract.
    if split_real.shape[1] > 1:
        df['causes'] = split_real.iloc[:, 1].str.strip()
    else:
        df['causes'] = np.nan
else:
    print(" Warning: cannot find 'real_time_due_to_delay'")




# calculate delay time

# Convert both time columns to datetime.time objects; values that cannot be
# parsed become missing (errors='coerce').
# NOTE(review): these two lines index the columns unconditionally, so a missing
# 'real_time_due_to_delay' column raises here even after the warning above.
df['real_time_due_to_delay'] = pd.to_datetime(df['real_time_due_to_delay'], errors='coerce').dt.time
df['expected_time'] = pd.to_datetime(df['expected_time'], errors='coerce').dt.time

# time into min, make calculation easier
def time_to_minutes(t):
    """Convert a time-of-day value to whole minutes since midnight.

    Args:
        t: An object exposing ``hour`` and ``minute`` (e.g. ``datetime.time``),
            or a missing value (None / NaN / NaT).

    Returns:
        ``hour * 60 + minute`` as an int, or None when ``t`` is missing.
        Seconds are ignored.
    """
    return None if pd.isna(t) else t.hour * 60 + t.minute

# Temporary helper columns holding minutes since midnight; they are dropped
# again once real_delay_min has been computed.
df['real_minutes'] = df['real_time_due_to_delay'].apply(time_to_minutes)
df['expected_minutes'] = df['expected_time'].apply(time_to_minutes)

# calculate delay time(min) by real arrival time - expected arrival time
def calculate_delay(row):
    """Return the delay in minutes for one row (real minus expected).

    Args:
        row: Mapping-like object with keys ``'real_minutes'`` and
            ``'expected_minutes'`` (minutes since midnight).

    Returns:
        The difference in minutes, or None when either value is missing.
        A negative difference is taken to mean the real time falls on the
        next day, so 24 hours are added.
    """
    actual = row['real_minutes']
    if pd.isna(actual):
        return None
    planned = row['expected_minutes']
    if pd.isna(planned):
        return None

    minutes_late = actual - planned
    # A negative difference is interpreted as crossing midnight.
    return minutes_late + 24 * 60 if minutes_late < 0 else minutes_late

# Row-wise delay in minutes; rows with a missing time get a missing delay.
df['real_delay_min'] = df.apply(calculate_delay, axis=1)
# The minute columns were only needed for the subtraction above.
df.drop(columns=['real_minutes', 'expected_minutes'], inplace=True)



# return arrival city by station name
# Module-level memo used by get_city_from_station: station name -> city.
cache = {}
def get_city_from_station(station):
    """Look up the city for a station name with the module-level geocoder.

    Results, including "not found", are memoised in the module-level
    ``cache`` dict so each station name triggers at most one successful
    request.

    Args:
        station: Station name. ", Germany" is appended before geocoding.

    Returns:
        The fourth-from-last comma-separated component of the geocoder's
        address string, stripped of whitespace. None when the geocoder finds
        nothing, when the address has fewer than four components, or when
        the lookup raises.
    """
    if station in cache:
        return cache[station]

    try:
        location = geolocator.geocode(station + ", Germany")
    except Exception:
        # Best-effort lookup: a failed request yields no city. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt stop a long geocoding
        # run. Failures are not cached, so a later call can retry.
        return None

    city = None
    if location:
        parts = location.address.split(",")
        # Guard the positional pick instead of relying on a swallowed IndexError.
        if len(parts) >= 4:
            city = parts[-4].strip()

    # Cache misses too, so an unknown station is not requested again.
    cache[station] = city
    return city
# Geocoder used by get_city_from_station.
geolocator = Nominatim(user_agent="db_station_locator")

# Resolve every distinct station name once instead of once per row: far fewer
# network requests, and the result is assigned in a single vectorised step.
city_by_station = {}
for station in df["route_cleaned"].dropna().unique():
    name = str(station).strip()
    # Skip empty names and the literal "nan" that astype(str) produces for
    # missing routes; geocoding those would only return noise.
    if not name or name.lower() == "nan":
        continue

    needs_request = station not in cache
    city_by_station[station] = get_city_from_station(station)
    print(station, "->", city_by_station[station])

    # Nominatim's usage policy allows at most one request per second.
    if needs_request:
        time.sleep(1)

# dict.get leaves rows without a resolved station as None.
df["city"] = df["route_cleaned"].map(city_by_station.get)

df.to_csv("trains_db_wash_1.csv", index=False)
print("new document created：trains_db_wash_1.csv")




In [None]:
# random_forest_punctuality.py
# This script trains a RandomForestClassifier to predict punctuality (is_punctual = 1 for on-time)
# It uses the same train/test split (random_state=42, stratify by target) so results are directly
# comparable to a Logistic Regression baseline trained with the same split.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn

# ---------- CONFIG ----------
INPUT_PATH = r"E:\3model.xlsx"      # change to your file path
RANDOM_STATE = 42                   # seed used for both the split and the forest
TEST_SIZE = 0.2                     # fraction of rows held out for testing
N_ESTIMATORS = 200                  # number of trees in the forest
OUTPUT_CM = "./rf_confusion_matrix.png"
OUTPUT_IMPORTANCES = "./rf_feature_importances.csv"
# ----------------------------

print("scikit-learn version:", sklearn.__version__)

# 1) Load data
df = pd.read_excel(INPUT_PATH)

# 2) clean column names (strip leading/trailing whitespace from headers)
df.columns = df.columns.str.strip()

# 3) ensure punctuality target column exists
# If you have only 'has_delay' (1 = delayed, 0 = on-time), create is_punctual = 1 - has_delay
if "is_punctual" not in df.columns:
    if "has_delay" in df.columns:
        df["is_punctual"] = 1 - df["has_delay"]
    else:
        raise KeyError("No 'is_punctual' or 'has_delay' column found in the input file.")

# 4) Define features and target
features = ["Hbf", "arrive_station", "train_category", "depart_hour_bucket"]
target = "is_punctual"

# sanity check: fail early with a clear message if any required column is absent
for col in features + [target]:
    if col not in df.columns:
        raise ValueError(f"Missing column: {col} in the input file")

X = df[features].copy()
y = df[target].copy()

# 5) Create train/test split ONCE so it matches the one used for logistic regression baseline
# stratify=y keeps the class ratio of the target the same in both parts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# 6) Build OneHotEncoder with compatibility for different sklearn versions
# newer sklearn (>=1.2) uses sparse_output, older uses sparse
# handle_unknown="ignore": categories unseen during fit are ignored at
# transform time rather than raising.
ohe_kwargs = {"handle_unknown": "ignore"}
try:
    # try the newer parameter name
    enc = OneHotEncoder(**ohe_kwargs, sparse_output=False)
except TypeError:
    # fall back to older parameter name
    enc = OneHotEncoder(**ohe_kwargs, sparse=False)

# All four feature columns are one-hot encoded; any other column is dropped.
column_transformer = ColumnTransformer(
    transformers=[("cat", enc, features)],
    remainder="drop",
    verbose_feature_names_out=False  # keep feature names clean if supported
)

# 7) Fit encoder on training raw data and transform both train and test
# We fit the encoder only on train to avoid leakage
column_transformer.fit(X_train_raw)
X_train = column_transformer.transform(X_train_raw)
X_test = column_transformer.transform(X_test_raw)

# 8) Train Random Forest on encoded features
rf = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# 9) Predict and evaluate on the held-out test rows
y_pred = rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, digits=4))
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# 10) Plot and save confusion matrix (labels: Delay=0, On-time=1)
# NOTE(review): the two tick labels assume both classes occur in y_test/y_pred,
# i.e. that cm is 2x2 — confirm for small or heavily imbalanced inputs.
labels = ["Delay (0)", "On-time (1)"]
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels, cbar=False)
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.title("Random Forest Confusion Matrix (Punctuality Prediction)")
plt.tight_layout()
plt.savefig(OUTPUT_CM, dpi=300)
plt.close()
print(f"Saved confusion matrix to {os.path.abspath(OUTPUT_CM)}")

# 11) Map feature importances back to one-hot feature names
# get feature names from the fitted OneHotEncoder
ohe = column_transformer.named_transformers_["cat"]
try:
    feature_names = ohe.get_feature_names_out(features)
except Exception:
    # fallback: manually build feature names as "<column>__<category>"
    categories = ohe.categories_
    feature_names = []
    for col, cats in zip(features, categories):
        feature_names += [f"{col}__{str(c)}" for c in cats]

importances = rf.feature_importances_
# NOTE(review): this only prints a warning; building the DataFrame below from
# two sequences of different length is still expected to fail — confirm.
if len(feature_names) != len(importances):
    print("Warning: feature names length does not match importances length.")
# create dataframe of importances, most important feature first
imp_df = pd.DataFrame({"feature": feature_names, "importance": importances})
imp_df = imp_df.sort_values("importance", ascending=False).reset_index(drop=True)
imp_df.to_csv(OUTPUT_IMPORTANCES, index=False)
print(f"Saved feature importances to {os.path.abspath(OUTPUT_IMPORTANCES)}")

# 12) Optional: plot top 30 importances
top_k = 30
# Ascending order so the most important feature ends up at the top of the
# horizontal bar chart.
top_importances = imp_df.head(top_k).sort_values("importance")

# Create the Axes explicitly and hand it to pandas. DataFrame.plot opens a new
# figure of its own when no `ax` is given, which would ignore the figsize
# chosen here and leave an empty, never-closed figure behind.
fig, ax = plt.subplots(figsize=(8, max(4, 0.25 * len(top_importances))))
top_importances.plot.barh(x="feature", y="importance", legend=False, color="skyblue", ax=ax)
ax.set_title("Top feature importances (Random Forest)")
ax.set_xlabel("Importance")
fig.tight_layout()
fig.savefig("./rf_top_importances.png", dpi=300)
plt.close(fig)
print(f"Saved top feature importances plot to {os.path.abspath('./rf_top_importances.png')}")


In [None]:

# logreg_punctuality.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ---------- CONFIG ----------
INPUT_PATH = r"E:\3model.xlsx"
OUTPUT_IMAGE = r"./confusion_matrix.png"
OUTPUT_COEF_CSV = r"./logreg_feature_coefs.csv"
RANDOM_STATE = 42   # same seed as the Random Forest script so the split matches
TEST_SIZE = 0.2
# ----------------------------

# 1. Load data
df = pd.read_excel(INPUT_PATH)

# Strip whitespace from the headers exactly as the Random Forest script does,
# so both scripts resolve the same columns from the same file.
df.columns = df.columns.str.strip()

# 2. Ensure target column 'is_punctual' exists (1 = on-time, 0 = delay)
# If your file only has 'has_delay' (1 = delay, 0 = on-time), create is_punctual = 1 - has_delay
if "is_punctual" not in df.columns:
    if "has_delay" in df.columns:
        df["is_punctual"] = 1 - df["has_delay"]
    else:
        raise KeyError("No 'is_punctual' or 'has_delay' column found in the input file.")

# 3. Define features and target
features = ["Hbf", "arrive_station", "train_category", "depart_hour_bucket"]
target = "is_punctual"

# Sanity check (same as the Random Forest script): fail early with a clear
# message instead of an opaque KeyError from the column selection below.
for col in features + [target]:
    if col not in df.columns:
        raise ValueError(f"Missing column: {col} in the input file")

X = df[features].copy()
y = df[target].copy()

# 4. Build column transformer with OneHotEncoder for categorical columns
# handle_unknown='ignore' prevents errors when test set contains unseen categories
# newer sklearn (>=1.2) uses sparse_output, older uses sparse; try both so this
# script runs on the same sklearn versions as the Random Forest script.
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_transformer = ColumnTransformer(
    transformers=[
        ("cat", encoder, features)
    ],
    remainder="drop",
    verbose_feature_names_out=False  # available in newer sklearn; keeps names clean
)

# 5. Build pipeline: encoder -> classifier
# Because the encoder lives inside the pipeline it is fitted on training rows only.
pipeline = Pipeline([
    ("encoder", cat_transformer),
    ("clf", LogisticRegression(max_iter=1000, solver="lbfgs"))
])

# 6. Train-test split (randomly mix July and September data to reduce seasonality bias)
# stratify=y keeps the class ratio of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# 7. Fit model (encoder and classifier are fitted together on the training rows)
pipeline.fit(X_train, y_train)

# 8. Predict and evaluate on the held-out test rows
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# 9. Plot and save confusion matrix
# NOTE(review): the two tick labels assume both classes occur in y_test/y_pred,
# i.e. that cm is 2x2 — confirm for small or heavily imbalanced inputs.
labels = ["Delay (0)", "On-time (1)"]
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels, cbar=False)
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.title("Confusion Matrix - Logistic Regression (Punctuality Prediction)")
plt.tight_layout()
plt.savefig(OUTPUT_IMAGE, dpi=300)
plt.close()
print(f"\nSaved confusion matrix to {os.path.abspath(OUTPUT_IMAGE)}")

# 10. Extract one-hot feature names and coefficients, then save to CSV
# Get the fitted OneHotEncoder from the pipeline
ohe = pipeline.named_steps["encoder"].named_transformers_["cat"]
# sklearn >=1.0: use get_feature_names_out
try:
    feat_names = ohe.get_feature_names_out(features)
except AttributeError:
    # fallback: construct names manually as "<column>__<category>"
    categories = ohe.categories_
    feat_names = []
    for col, cats in zip(features, categories):
        feat_names += [f"{col}__{str(c)}" for c in cats]

# Get coefficients from the logistic regression step
# coef_[0] is the first (for a binary target, the only) row of coefficients.
coefs = pipeline.named_steps["clf"].coef_[0]
# Sort by absolute value so the strongest effects, positive or negative, come first.
coef_df = pd.DataFrame({
    "feature": feat_names,
    "coefficient": coefs
}).sort_values(by="coefficient", key=abs, ascending=False)

coef_df.to_csv(OUTPUT_COEF_CSV, index=False)
print(f"Saved logistic regression feature coefficients to {os.path.abspath(OUTPUT_COEF_CSV)}")



In [None]:
#Basic Visualization and Descriptive Analysis

import pandas as pd
import matplotlib.pyplot as plt

# 1) Load the dataset
df = pd.read_excel(r"E:\3model.xlsx")

# 2) Verify that essential columns exist
required_cols = ["depart_hour_bucket", "train_category", "has_delay"]
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f"Missing column: {col}")

# 3) Calculate hourly on-time rates (1 - delay rate), expressed in percent.
# The mean of has_delay is the delay rate, assuming has_delay is coded 0/1
# as the other scripts in this notebook describe — TODO confirm.
on_time_by_hour = (1 - df.groupby("depart_hour_bucket")["has_delay"].mean()) * 100
print("Hourly Punctuality Rate (%):")
print(on_time_by_hour.round(2))
print("\n")

# 4) Calculate on-time rates by train category, in percent
on_time_by_type = (1 - df.groupby("train_category")["has_delay"].mean()) * 100
print("Punctuality Rate by Train Category (%):")
print(on_time_by_type.round(2))
print("\n")

# 5) Create a pivot table for hour x category on-time rates, in percent
# (rows: departure hour bucket, columns: train category)
pivot_on_time = (1 - pd.pivot_table(
    df,
    values="has_delay",
    index="depart_hour_bucket",
    columns="train_category",
    aggfunc="mean"
)) * 100
print("Punctuality Rate by Hour × Train Category (%):")
print(pivot_on_time.round(2))
print("\n")

# 6) Plot hourly on-time rate (line chart)
# NOTE(review): the axis label says 1–24 while the ticks run 0..24 in steps of
# two — confirm how depart_hour_bucket is encoded.
plt.figure(figsize=(10, 5))
plt.plot(on_time_by_hour.index, on_time_by_hour.values, marker='o', color='seagreen')
plt.title("Train Punctuality by Hour", fontsize=14)
plt.xlabel("Hour of the Day (1–24)")
plt.ylabel("Punctuality Rate (%)")
plt.grid(True, linestyle="--", alpha=0.6)
plt.xticks(range(0, 25, 2))
plt.tight_layout()
plt.show()

# 7) Plot on-time rate by train category (bar chart)
# Three colours are listed — presumably one per train category; confirm the
# number of categories in the data.
plt.figure(figsize=(7, 5))
plt.bar(on_time_by_type.index, on_time_by_type.values, color=['teal', 'orange', 'tomato'])
plt.title("Average Punctuality by Train Category", fontsize=14)
plt.xlabel("Train Category")
plt.ylabel("Punctuality (%)")
plt.grid(axis='y', linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# multi_model_same_split.py
"""
Train multiple models using the exact same train/test split (random split).
Target is 'is_punctual' (1 = on-time). If original file has 'has_delay' (1=delay),
the script creates is_punctual = 1 - has_delay.

LogisticRegression uses OneHotEncoder (fit on X_train only).
Tree models (RandomForest, LightGBM, optional XGBoost) use integer label mapping.
All models use the same train/test rows to ensure fair comparison.
"""

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Gradient-boosting libraries: LightGBM is required (a failed import aborts the script); XGBoost is optional
try:
    from lightgbm import LGBMClassifier
except Exception as e:
    raise ImportError("Please install lightgbm: pip install lightgbm") from e

try:
    import xgboost as xgb
except Exception:
    xgb = None

print("scikit-learn version:", sklearn.__version__)

# ---------------- CONFIG ----------------
FILE_PATH = r"E:\3model.xlsx"   # change to your path
RANDOM_STATE = 42
TEST_SIZE = 0.2
FEATURES = ["Hbf", "arrive_station", "train_category", "depart_hour_bucket"]
ORIG_TARGET = "has_delay"   # if present, 1=delay
TARGET = "is_punctual"      # 1 = on-time
# ----------------------------------------

# 1) load data
df = pd.read_excel(FILE_PATH)
df.columns = df.columns.str.strip()

# 2) build target
if TARGET not in df.columns:
    if ORIG_TARGET in df.columns:
        df[TARGET] = 1 - df[ORIG_TARGET].astype(int)
        print("Created 'is_punctual' from 'has_delay'")
    else:
        raise KeyError("Provide 'has_delay' (1=delay) or 'is_punctual' in the input file.")

# 3) basic cleaning for categorical columns
for c in FEATURES:
    if c not in df.columns:
        raise ValueError(f"Missing feature column: {c}")
    # Record missing cells BEFORE the string conversion: astype(str) turns NaN
    # into the literal text "nan", so a fillna() applied afterwards never
    # fires and missing values would silently become a "nan" category.
    missing_mask = df[c].isna()
    df[c] = df[c].astype(str).str.strip().mask(missing_mask, "MISSING")

# 4) Compose dataset to use for splitting
#    If you want only July+Sept rows, filter first; otherwise use full df.
#    Example: filter to July+Sept (uncomment and adapt column name if you have a date column)
# date_col = "date"  # change if applicable
# df = df[df[date_col].dt.month.isin([7,9])]

X_full = df[FEATURES].copy()
y_full = df[TARGET].astype(int).copy()

# 5) single random split used for all models
# stratify keeps the on-time/delay ratio the same in train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_full, y_full, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_full
)
print(f"Train rows: {len(X_train_raw)}, Test rows: {len(X_test_raw)}")

# 6) Prepare OneHot for Logistic Regression (fit on training rows only)
# newer sklearn uses sparse_output, older uses sparse; try the newer name first
ohe_kwargs = {"handle_unknown": "ignore"}
try:
    encoder = OneHotEncoder(**ohe_kwargs, sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(**ohe_kwargs, sparse=False)

ohe_cols = FEATURES  # treat all features as categorical for logistic model
encoder.fit(X_train_raw[ohe_cols])
X_train_ohe = encoder.transform(X_train_raw[ohe_cols])
X_test_ohe  = encoder.transform(X_test_raw[ohe_cols])

# 7) Prepare integer label mapping for tree models (fit mapping on train only)
label_maps = {}
X_train_tree = pd.DataFrame(index=X_train_raw.index)
X_test_tree  = pd.DataFrame(index=X_test_raw.index)

for col in FEATURES:
    # Codes follow the order in which values first appear in the training rows.
    uniques = X_train_raw[col].unique().tolist()
    mapping = {v: i for i, v in enumerate(uniques)}
    # One extra code, past the last training code, reserved for values that
    # do not occur in the training rows.
    unknown_idx = len(mapping)
    # apply mapping
    X_train_tree[col + "_idx"] = X_train_raw[col].map(mapping).fillna(unknown_idx).astype(int)
    X_test_tree[col + "_idx"]  = X_test_raw[col].map(lambda v: mapping.get(v, unknown_idx)).astype(int)
    label_maps[col] = {"mapping": mapping, "unknown_index": unknown_idx}
# 8) Train models on the same train split
print("\nTraining models on identical train/test split...")

# Logistic Regression (One-Hot input)
logreg = LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs')
logreg.fit(X_train_ohe, y_train)

# Random Forest (integer-coded input)
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, class_weight='balanced')
rf.fit(X_train_tree, y_train)

# LightGBM (integer-coded input)
lgbm = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=31, random_state=RANDOM_STATE, class_weight='balanced')
lgbm.fit(X_train_tree, y_train)

# XGBoost (optional): trained only when the import succeeded; any failure here
# is reported and the model is skipped rather than aborting the script.
# NOTE(review): unlike the three models above, no class weighting is passed to
# XGBoost — confirm this difference is intended for the comparison.
# NOTE(review): the test split is passed as eval_set; no early-stopping
# argument is given here, so it looks like it is only tracked, not used to
# select the model — confirm.
xgb_model = None
if xgb is not None:
    try:
        xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.05, use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
        xgb_model.fit(X_train_tree, y_train, eval_set=[(X_test_tree, y_test)], verbose=False)
    except Exception as e:
        print("Warning: XGBoost training failed:", e)
        xgb_model = None

# 9) Evaluate helper
def evaluate(name, model, X, y_true, is_prob_model=True):
    """Score a fitted classifier on one dataset, print a report and return it.

    Args:
        name: Label used in the printed header and in the returned dict.
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X: Feature matrix in the encoding the model was trained on.
        y_true: True labels for ``X``.
        is_prob_model: When False, probabilities (and hence AUC) are skipped
            even if the model offers ``predict_proba``.

    Returns:
        Dict with keys ``name``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``auc`` (NaN without probabilities), ``cm`` (confusion matrix)
        and ``proba`` (second ``predict_proba`` column, or None).
    """
    y_pred = model.predict(X)

    y_proba = None
    if is_prob_model and hasattr(model, "predict_proba"):
        # Second column of predict_proba.
        y_proba = model.predict_proba(X)[:, 1]

    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "auc": np.nan if y_proba is None else roc_auc_score(y_true, y_proba),
    }
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n=== {name} ===")
    print(
        "Accuracy: {accuracy:.4f}  Precision: {precision:.4f}  Recall: {recall:.4f}  "
        "F1: {f1:.4f}  AUC: {auc:.4f}".format(**scores)
    )
    print("Confusion matrix:\n", cm)

    return {"name": name, **scores, "cm": cm, "proba": y_proba}

# Evaluate every trained model on the same held-out rows. Logistic regression
# gets the one-hot matrix, the tree models get the integer-coded frame.
results = []
results.append(evaluate("Logistic Regression", logreg, X_test_ohe, y_test, is_prob_model=True))
results.append(evaluate("Random Forest", rf, X_test_tree, y_test, is_prob_model=True))
results.append(evaluate("LightGBM", lgbm, X_test_tree, y_test, is_prob_model=True))
if xgb_model is not None:
    results.append(evaluate("XGBoost", xgb_model, X_test_tree, y_test, is_prob_model=True))

# 10) ROC plot (if probabilities exist)
plt.figure(figsize=(8,6))
plotted = False  # becomes True once at least one curve has been drawn
for res in results:
    proba = res["proba"]
    if proba is not None and not np.all(np.isnan(proba)):
        fpr, tpr, _ = roc_curve(y_test, proba)
        auc_val = roc_auc_score(y_test, proba)
        plt.plot(fpr, tpr, label=f"{res['name']} (AUC={auc_val:.3f})")
        plotted = True
if plotted:
    # Diagonal reference line.
    plt.plot([0,1],[0,1],"k--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve - All models (same random split)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("roc_same_split.png", dpi=300)
    plt.close()
    print("Saved ROC plot -> roc_same_split.png")
else:
    print("No probability outputs available; skipped ROC plot.")

# 11) Summary table and save (one row per model, scalar metrics only)
summary = pd.DataFrame(results).set_index("name")[["accuracy","precision","recall","f1","auc"]]
print("\nModel comparison (on test set):")
print(summary)
summary.to_csv("model_comparison_same_split.csv", index=True)
print("Saved summary -> model_comparison_same_split.csv")

# 12) Save confusion matrix for Random Forest
# results[1] is the Random Forest entry because it is the second model
# appended to `results`; keep this index in sync if that order changes.
cm_rf = results[1]["cm"]
plt.figure(figsize=(6,5))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Blues", xticklabels=["Delay(0)","On-time(1)"], yticklabels=["Delay(0)","On-time(1)"], cbar=False)
plt.title("Random Forest - Confusion Matrix (same split)")
plt.tight_layout()
plt.savefig("rf_confusion_same_split.png", dpi=300)
plt.close()
print("Saved rf_confusion_same_split.png")

# 13) Save feature importances for tree models (they are integer-coded columns)
# One importance per "<feature>_idx" column, most important first.
rf_feats = X_train_tree.columns.tolist()
imp_df = pd.DataFrame({"feature": rf_feats, "importance": rf.feature_importances_}).sort_values("importance", ascending=False)
imp_df.to_csv("rf_importances_same_split.csv", index=False)
print("Saved rf_importances_same_split.csv")

print("All done.")

