Feature-Selection (97 % gain) & Re-evaluation – după split CT / NT
---------------------------------------------------------------------
Rulează:
    python feature_gain_reanalysis.py

Pentru fiecare sistem (CT, NT):
1. Încarcă dataset-ul procesat dedicat (Processed_Database_CT / NT).
2. Încarcă modelul XGBoost salvat.
3. Calculează importanțele (gain) și păstrează coloanele care acoperă 97 % din gain-ul cumulativ.
4. Re-antrenează modelul pe subsetul redus și compară performanțele (R², RMSE, MAE).


In [3]:
"""Train & tune XGBoost on 97 %‑gain subset (CT & NT) + metric summary
------------------------------------------------------------------------
Run:
    python train_reduced_xgb_cv.py

Key outputs:
• tuned models saved as models/best_model_Yield_of_<CT|NT>_XGBoost97.pkl
• JSON file with the retained features: outputs/reduced_feature_lists.json
• summary table with the FULL vs REDUCED metrics + percentage differences.
"""
from __future__ import annotations
import json, math, pickle
from pathlib import Path

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ---------------- CONFIG ----------------
# Reproducibility seed, hold-out fraction, and the cumulative-gain share
# used to pick the reduced feature subset.
SEED = 42
TEST_SIZE = 0.2
GAIN_THRESHOLD = 0.97

# Output locations; both directories are created as soon as this runs.
OUT_MODELS = Path("models")
OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_FEATURES = Path("outputs/reduced_feature_lists.json")
OUT_FEATURES.parent.mkdir(parents=True, exist_ok=True)

# One entry per system: processed dataset, target column, the pickled
# full-feature model to read, and where the reduced model is written.
DATA_CFG = [
    dict(
        label="Yield of CT",
        data_path=Path("datasets/Processed_Database_CT.csv"),
        target="Yield of CT",
        model_full=Path("best_model_Yield_of_CT_XGBoost.pkl"),
        model_out=OUT_MODELS / "best_model_Yield_of_CT_XGBoost97.pkl",
    ),
    dict(
        label="Yield of NT",
        data_path=Path("datasets/Processed_Database_NT.csv"),
        target="Yield of NT",
        model_full=Path("best_model_Yield_of_NT_XGBoost.pkl"),
        model_out=OUT_MODELS / "best_model_Yield_of_NT_XGBoost97.pkl",
    ),
]

# Hyper-parameter search space handed to GridSearchCV.
PARAM_GRID = dict(
    n_estimators=[200, 400, 600],
    max_depth=[4, 6, 8],
    learning_rate=[0.05, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.7, 1.0],
    gamma=[0, 1],
)


# -------------- Helpers --------------

def get_gain_importance(model: xgb.XGBRegressor) -> pd.DataFrame:
    """Rank a fitted model's features by gain and add the running gain share.

    Args:
        model: Object exposing ``get_booster().get_score(importance_type=...)``.

    Returns:
        DataFrame with columns ``feature``, ``gain`` and ``cum_gain``, sorted
        by descending gain and re-indexed from 0. ``cum_gain`` is the
        cumulative sum of ``gain`` divided by the total gain.
    """
    # NOTE(review): only features reported by get_score() appear here; xgboost
    # is documented to leave out features never used in a split - confirm.
    gain_by_feature = model.get_booster().get_score(importance_type="gain")
    ranked = pd.DataFrame(gain_by_feature.items(), columns=["feature", "gain"])
    ranked = ranked.sort_values("gain", ascending=False)
    total_gain = ranked["gain"].sum()
    ranked["cum_gain"] = ranked["gain"].cumsum() / total_gain
    return ranked.reset_index(drop=True)


def evaluate(model: xgb.XGBRegressor, X, y):
    """Score a fitted regressor on ``(X, y)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True target values for ``X``.

    Returns:
        Dict with keys ``"R2"``, ``"RMSE"`` and ``"MAE"``.
    """
    pred = model.predict(X)
    return {
        "R2": r2_score(y, pred),
        # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6, so it raises TypeError on
        # current releases. sqrt(MSE) gives the same value for the single
        # target column used in this script and works on every version.
        "RMSE": math.sqrt(mean_squared_error(y, pred)),
        "MAE": mean_absolute_error(y, pred),
    }


# -------------- Main loop --------------
# Selected feature names per system label, and one metrics row per
# (label, Full/Reduced) pair for the final summary table.
feature_dict: dict[str, list[str]] = {}
results = []


def pct(a, b):
    """Return the percentage change of ``a`` relative to baseline ``b``.

    Divides by ``abs(b)`` so the sign reflects the direction of the change
    even when the baseline is negative (R² can be). Returns 0 for ``b == 0``.
    """
    return (a - b) / abs(b) * 100 if b != 0 else 0


for cfg in DATA_CFG:
    label = cfg["label"]
    print(f"\n===== {label} =====")

    # Fail early, naming the resolved path and the working directory: the
    # configured paths are relative, so a wrong cwd is the usual cause.
    for required in (cfg["data_path"], cfg["model_full"]):
        if not required.is_file():
            raise FileNotFoundError(
                f"Required input not found: {required.resolve()} "
                f"(working directory: {Path.cwd()})"
            )

    # 1. Data split
    df = pd.read_csv(cfg["data_path"])
    X = df.drop(columns=[cfg["target"]])
    y = df[cfg["target"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)

    # 2. Feature subset via gain from the full model
    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(cfg["model_full"], "rb") as f:
        model_full: xgb.XGBRegressor = pickle.load(f)
    imp_df = get_gain_importance(model_full)
    # Keep the top-ranked features up to AND including the one that makes the
    # cumulative gain reach the threshold. A plain `cum_gain <= threshold`
    # filter drops that crossing feature, so the subset covers less than the
    # threshold and is empty when the top feature alone exceeds it.
    n_keep = int((imp_df["cum_gain"] < GAIN_THRESHOLD).sum()) + 1
    sel_feats = imp_df["feature"].iloc[:n_keep].tolist()
    if not sel_feats:
        raise ValueError(f"Full model for {label!r} reports no gain importances; nothing to select.")
    missing = [name for name in sel_feats if name not in X.columns]
    if missing:
        raise KeyError(f"Features reported by the full model are absent from {cfg['data_path']}: {missing}")
    feature_dict[label] = sel_feats
    print(f"Păstrăm {len(sel_feats)} / {X.shape[1]} feature‑uri (≥ {GAIN_THRESHOLD * 100:.0f}% gain)")
    X_train_sel, X_test_sel = X_train[sel_feats], X_test[sel_feats]

    # 3. Hyper-parameter tuning on the reduced subset
    # NOTE(review): n_jobs=-1 on both the estimator and the grid search nests
    # two levels of parallelism; consider n_jobs=1 on the estimator.
    base = xgb.XGBRegressor(objective="reg:squarederror", random_state=SEED, n_jobs=-1)
    grid = GridSearchCV(base, PARAM_GRID, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, verbose=0)
    grid.fit(X_train_sel, y_train)
    best_model: xgb.XGBRegressor = grid.best_estimator_
    print("Best params:", grid.best_params_)

    # 4. Metrics
    # NOTE(review): the FULL score is only a fair hold-out score if the pickled
    # model was trained with this same split (SEED / TEST_SIZE) - confirm.
    m_full = evaluate(model_full, X_test, y_test)
    m_red = evaluate(best_model, X_test_sel, y_test)
    print("Metrics FULL   —  R²={R2:.3f}  RMSE={RMSE:.1f}  MAE={MAE:.1f}".format(**m_full))
    print("Metrics REDUCED —  R²={R2:.3f}  RMSE={RMSE:.1f}  MAE={MAE:.1f}".format(**m_red))
    results.append({"label": label, "set": "Full", **m_full})
    results.append({"label": label, "set": "Reduced", **m_red})

    # 5. Save tuned model
    with open(cfg["model_out"], "wb") as f:
        pickle.dump(best_model, f)
    print("Model salvat la:", cfg["model_out"].resolve())

# 6. Save feature lists
with open(OUT_FEATURES, "w", encoding="utf-8") as fp:
    json.dump(feature_dict, fp, indent=2)
print("\nListele de feature‑uri salvate în:", OUT_FEATURES.resolve())

# 7. Summary table
res_df = pd.DataFrame(results)
print("\n===== Rezumat =====")
print(res_df.pivot(index="label", columns="set"))

print("\nDiferențe procentuale (Reduced vs Full):")
for lbl in res_df["label"].unique():
    # Bracket access for the "set" column, not attribute access.
    rows = res_df[res_df["label"] == lbl]
    full = rows[rows["set"] == "Full"].iloc[0]
    red = rows[rows["set"] == "Reduced"].iloc[0]
    print(
        f"{lbl}: ΔR²={pct(red.R2, full.R2):+.2f}%   ΔRMSE={pct(red.RMSE, full.RMSE):+.2f}%   ΔMAE={pct(red.MAE, full.MAE):+.2f}%")



===== Yield of CT =====


FileNotFoundError: [Errno 2] No such file or directory: 'datasets/Processed_Database_CT.csv'