# In [None]:
import os
import math
import random
import re
import time
import hashlib
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

import torch
from joblib import Parallel, delayed


# Input tables. main() passes this list to load_and_merge_sources(), which
# reads each file and concatenates them row-wise.
FILE_PATHS = [
    "/content/Dallas_Animal_Shelter_Data_Fiscal_Year_2023_-_2026_20251125.csv",
    "/content/dogs_intake_outcome_2021_2025.xlsx",
]

# Output directory; created at import time if it does not exist yet.
OUT_DIR = "/content/tree_output"
os.makedirs(OUT_DIR, exist_ok=True)

# Tab-separated per-model summary written by main().
TREE_RESULTS_TXT = os.path.join(OUT_DIR, "survclass_tree_family_summary.txt")
# NOTE(review): not written anywhere in the visible code — confirm it is still needed.
RESULTS_CSV = os.path.join(OUT_DIR, "survclass_results_all_configs.csv")

# When True, build_features() keeps only rows whose Outcome_Type is "ADOPTION".
ADOPTION_ONLY = True
# Seed used by set_seed(), the train/val/test splits and the tree models.
SEED = 42

# NOTE(review): unused in the visible code — meaning of K is not shown here.
K_LIST = [14]

# Feature selection in build_features() only runs when X has more columns than this.
TOP_K_FS = 256
# Number of most frequent Animal_Breed values that get their own indicator column.
TOP_N_BREEDS = 20

# All CPUs but one (os.cpu_count() falling back to 4 when it returns None), never below 1.
# NOTE(review): PARALLEL_JOBS and CHUNK_SIZE_FOR_PARALLEL are unused in the visible code.
PARALLEL_JOBS = max(1, (os.cpu_count() or 4) - 1)
CHUNK_SIZE_FOR_PARALLEL = 200

# Age group label -> age in days (0.5, 5 and 10.5 years) used to build AgeDays.
AGE_DAYS_MAP = {"puppy": 0.5 * 365.0, "adult": 5.0 * 365.0, "senior": 10.5 * 365.0}


def set_seed(seed: int):
    """Seed the NumPy, Python and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    The CUDA generators are seeded too, but only when a CUDA device is
    reported as available.
    """
    # Same order as before: NumPy, stdlib random, then the torch CPU generator.
    for seeder in (np.random.seed, random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def read_any_table(path: str) -> pd.DataFrame:
    """Load a tabular file into a DataFrame, choosing the reader by extension.

    Args:
        path: Location of the file. Excel workbooks are recognised by a
            ``.xlsx``, ``.xlsm`` or ``.xls`` extension (case-insensitive);
            anything else is parsed as CSV.

    Returns:
        The parsed table.

    Raises:
        Whatever ``pandas.read_excel`` / ``pandas.read_csv`` raise for a
        missing or malformed file.
    """
    # Compare the lower-cased extension so "DATA.XLSX" is not fed to the CSV parser.
    extension = os.path.splitext(path)[1].lower()
    if extension in (".xlsx", ".xlsm", ".xls"):
        return pd.read_excel(path)
    # low_memory=False makes pandas infer column dtypes from the whole file
    # instead of chunk by chunk.
    return pd.read_csv(path, low_memory=False)


def load_and_merge_sources(paths: List[str]) -> pd.DataFrame:
    """Read every file in ``paths`` and stack the tables row-wise.

    Args:
        paths: Files to load; each one is read with ``read_any_table``.

    Returns:
        One DataFrame holding the rows of all inputs in the given order,
        re-indexed from 0.

    Raises:
        ValueError: From ``pandas.concat`` when ``paths`` is empty.
    """
    frames = [read_any_table(source) for source in paths]
    return pd.concat(frames, ignore_index=True)


# Matches "PIT" only at the start of a word ("PIT BULL", "PITBULL") so that
# breeds such as "SPITZ" are not mistaken for pit bulls.
_PIT_WORD_START = re.compile(r"\bPIT")


def map_breed_use(breed: str) -> str:
    """Map a free-text breed name to a coarse breed-use group.

    Matching is case-insensitive and keyword based; the first rule that
    matches wins, in the order herding, sporting, terrier, hound,
    nonsporting, toy.

    Args:
        breed: Breed description. Non-string values (``None``, NaN, ...) are
            accepted and treated as unknown.

    Returns:
        One of ``"herding"``, ``"sporting"``, ``"terrier"``, ``"hound"``,
        ``"nonsporting"``, ``"toy"`` or ``"unknown"``.
    """
    if not isinstance(breed, str):
        return "unknown"

    s = breed.upper()
    if "SHEPHERD" in s:
        return "herding"
    if "LABRADOR" in s or "RETRIEVER" in s:
        return "sporting"
    if "TERRIER" in s or _PIT_WORD_START.search(s):
        return "terrier"
    if "HOUND" in s:
        return "hound"
    if "POODLE" in s:
        return "nonsporting"
    if "CHIHUAHUA" in s or "POMERANIAN" in s:
        return "toy"
    return "unknown"


def compute_stay_length_days(df: pd.DataFrame) -> pd.Series:
    stay = (df["Outcome_Date"] - df["Intake_Date"]).dt.total_seconds() / 86400.0
    return stay.round()


def _top_breed_indicators(breeds: pd.Series, top_n: int) -> np.ndarray:
    """Return 0/1 columns for the ``top_n`` most frequent breeds plus a final "other" column."""
    top_breeds = breeds.value_counts().index[:top_n]
    # Columns are built from the full breed name, so two breeds sharing a long
    # prefix can never overwrite each other.
    columns = [(breeds == b).to_numpy(dtype=float) for b in top_breeds]
    columns.append((~breeds.isin(top_breeds)).to_numpy(dtype=float))
    return np.column_stack(columns)


def build_features(df: pd.DataFrame):
    """Turn the raw shelter table into a feature matrix and stay-length targets.

    Rows are restricted to dogs with a valid, non-negative stay length and,
    when ``ADOPTION_ONLY`` is set, to adoption outcomes.

    Args:
        df: Raw table with at least ``Animal_Type``, ``Intake_Date``,
            ``Outcome_Date`` and ``Animal_Breed`` columns (plus
            ``Outcome_Type`` when ``ADOPTION_ONLY`` is true). ``AgeGroup`` is
            optional; rows without it are treated as ``"adult"``.

    Returns:
        A tuple ``(X, y, stay)``:

        * ``X`` -- float array with, in order, the numeric columns
          (``AgeLog``, ``Intake_Month``, ``Intake_Weekday``, ``IsWeekend``),
          the one-hot breed-use groups, and the top-``TOP_N_BREEDS`` breed
          indicators followed by an "other breed" indicator. If that exceeds
          ``TOP_K_FS`` columns, only the ``TOP_K_FS`` columns with the
          largest L1-logistic coefficients are kept.
        * ``y`` -- integer tertile (0, 1, 2) of the stay length.
        * ``stay`` -- stay length in days for the same rows.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If no rows survive filtering, or from ``pandas.qcut``
            when the tertile edges are not unique.
    """
    df = df[df["Animal_Type"].astype(str).str.upper() == "DOG"].copy()
    df["Intake_Date"] = pd.to_datetime(df["Intake_Date"], errors="coerce")
    df["Outcome_Date"] = pd.to_datetime(df["Outcome_Date"], errors="coerce")
    df["StayLength"] = compute_stay_length_days(df)

    keep = df["StayLength"].notna() & (df["StayLength"] >= 0)
    if ADOPTION_ONLY:
        keep &= df["Outcome_Type"].astype(str).str.upper() == "ADOPTION"
    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the filtered one.
    df = df[keep].copy()
    if df.empty:
        raise ValueError("no rows left after filtering; cannot build features")

    df["Breed_Use"] = df["Animal_Breed"].astype(str).apply(map_breed_use)

    # A missing AgeGroup column means every animal is treated as an adult.
    if "AgeGroup" in df.columns:
        df["AgeGroup"] = df["AgeGroup"].fillna("adult").astype(str).str.lower()
    else:
        df["AgeGroup"] = "adult"
    # Labels outside AGE_DAYS_MAP also fall back to the adult age.
    df["AgeDays"] = df["AgeGroup"].map(AGE_DAYS_MAP).fillna(AGE_DAYS_MAP["adult"])
    df["AgeLog"] = np.log1p(df["AgeDays"])

    df["Intake_Month"] = df["Intake_Date"].dt.month.fillna(0).astype(int)
    df["Intake_Weekday"] = df["Intake_Date"].dt.weekday.fillna(0).astype(int)
    df["IsWeekend"] = (df["Intake_Weekday"] >= 5).astype(int)

    y = pd.qcut(df["StayLength"], q=3, labels=False).astype(int).to_numpy()

    num_cols = ["AgeLog", "Intake_Month", "Intake_Weekday", "IsWeekend"]
    X_num = df[num_cols].fillna(0).to_numpy(dtype=float)
    X_cat = pd.get_dummies(df["Breed_Use"], dummy_na=True, dtype=float).to_numpy()
    # The breed indicators used to be computed and then dropped; they are now
    # part of the feature matrix.
    X_breed = _top_breed_indicators(df["Animal_Breed"], TOP_N_BREEDS)

    X = np.concatenate([X_num, X_cat, X_breed], axis=1)

    if X.shape[1] > TOP_K_FS:
        # NOTE(review): the selector is fitted on all rows, before the
        # train/val/test split made by the caller — confirm this is acceptable.
        lr = LogisticRegression(
            penalty="l1", solver="saga", max_iter=2000, random_state=SEED
        )
        lr.fit(X, y)
        # Rank each column by its largest absolute coefficient across classes.
        coef = np.abs(lr.coef_).max(axis=0)
        idx = np.argsort(-coef)[:TOP_K_FS]
        X = X[:, idx]

    return X, y, df["StayLength"].to_numpy()


def concordance_index(times, preds):
    """Compute the concordance index between observed and predicted times.

    Every pair of samples with different observed times is comparable. A
    comparable pair scores 1 when the predictions are ordered the same way
    as the observed times, 0.5 when the two predictions are tied, and 0
    otherwise. The index is the mean score over comparable pairs, so a
    constant predictor gets 0.5 and a perfect ranking gets 1.0.

    Args:
        times: 1-D array-like of observed times.
        preds: 1-D array-like of predicted times, same length as ``times``.

    Returns:
        The concordance index as a float, or ``nan`` when there is no
        comparable pair (fewer than two samples or all times equal).

    Raises:
        ValueError: If ``times`` and ``preds`` differ in length.
    """
    t = np.asarray(times, dtype=float).ravel()
    p = np.asarray(preds, dtype=float).ravel()
    if t.shape != p.shape:
        raise ValueError("times and preds must have the same number of elements")

    comparable = 0
    score = 0.0
    # Compare each sample with the ones after it: every unordered pair is
    # visited once and only O(n) temporaries are alive at any time.
    for i in range(t.size - 1):
        dt = t[i + 1:] - t[i]
        usable = dt != 0
        if not usable.any():
            continue
        dt = dt[usable]
        dp = p[i + 1:][usable] - p[i]
        comparable += int(usable.sum())
        # sign(dp) is 0 for tied predictions, so ties never count as concordant here.
        score += float(np.count_nonzero(np.sign(dt) == np.sign(dp)))
        score += 0.5 * float(np.count_nonzero(dp == 0))

    if comparable == 0:
        return float("nan")
    return score / comparable


def train_eval(estimator, Xtr, Xva, Xte, ytr, yva, yte, ttr, tva, tte):
    """Fit a classifier and score it on the validation and test splits.

    Each predicted class is converted into a time estimate — the mean
    training time of that class — so the predictions can also be scored with
    ``concordance_index``.

    Args:
        estimator: Object with ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place on the training split.
        Xtr, Xva, Xte: Feature matrices for train / validation / test.
        ytr, yva, yte: Class labels for the three splits.
        ttr, tva, tte: Observed times for the three splits.

    Returns:
        ``(val_metrics, test_metrics)``; each is a dict with ``"f1"``
        (weighted F1) and ``"cidx"`` (concordance index).
    """
    estimator.fit(Xtr, ytr)

    # Take the classes from the training labels instead of assuming exactly
    # {0, 1, 2}: every class then has at least one training sample to average.
    classes = np.unique(ytr)
    mean_t = np.array([ttr[ytr == c].mean() for c in classes])

    def eval_split(X, y, t):
        yp = estimator.predict(X)
        # `classes` is sorted, so searchsorted gives each label's position in mean_t.
        pt = mean_t[np.searchsorted(classes, yp)]
        return {
            "f1": f1_score(y, yp, average="weighted"),
            "cidx": concordance_index(t, pt),
        }

    return eval_split(Xva, yva, tva), eval_split(Xte, yte, tte)


def main():
    """Run the full pipeline and write one summary line per tree model.

    Steps: seed the RNGs, load and merge the input files, build features,
    make a stratified train/validation/test split, then fit each tree-based
    classifier and write its validation and test weighted-F1 and concordance
    index to ``TREE_RESULTS_TXT`` as a tab-separated line.
    """
    set_seed(SEED)
    raw = load_and_merge_sources(FILE_PATHS)
    X, y, t = build_features(raw)

    # Hold out 15% for testing, then 17.65% of the remaining 85% (about 15%
    # of all rows) for validation.
    every_row = np.arange(len(y))
    fit_rows, test_rows = train_test_split(
        every_row, test_size=0.15, stratify=y, random_state=SEED
    )
    train_rows, val_rows = train_test_split(
        fit_rows, test_size=0.1765, stratify=y[fit_rows], random_state=SEED
    )

    splits = (train_rows, val_rows, test_rows)
    Xtr, Xva, Xte = (X[rows] for rows in splits)
    ytr, yva, yte = (y[rows] for rows in splits)
    ttr, tva, tte = (t[rows] for rows in splits)

    models = {
        "DT": DecisionTreeClassifier(max_depth=8, random_state=SEED),
        "RF": RandomForestClassifier(n_estimators=300, random_state=SEED, n_jobs=-1),
        "ET": ExtraTreesClassifier(n_estimators=300, random_state=SEED, n_jobs=-1),
        "GB": GradientBoostingClassifier(n_estimators=300, random_state=SEED),
    }

    with open(TREE_RESULTS_TXT, "w") as report:
        for name in models:
            val_m, test_m = train_eval(
                models[name], Xtr, Xva, Xte, ytr, yva, yte, ttr, tva, tte
            )
            report.write(f"{name}\tval_f1={val_m['f1']:.4f}\tval_cidx={val_m['cidx']:.4f}\t"
                         f"test_f1={test_m['f1']:.4f}\ttest_cidx={test_m['cidx']:.4f}\n")


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

