In [3]:
# -*- coding: utf-8 -*-
import pandas as pd, numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor, MLPClassifier

# XGBoost (opcjonalnie)
HAS_XGB = True
try:
    from xgboost import XGBRegressor, XGBClassifier
except Exception:
    HAS_XGB = False

# ---------- Pomocnicze ----------
def find_col(cols, keywords):
    for c in cols:
        cl = str(c).lower()
        for k in keywords:
            if k in cl:
                return c
    return None

def safe_mape(y_true, y_pred, eps=1e-8):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)   # zabezpieczenie przy ~0
    return float(np.mean(np.abs((y_true - y_pred) / denom)) * 100.0)

def regression_metrics(y_true, y_pred):
    mse = float(mean_squared_error(y_true, y_pred))
    rmse = float(np.sqrt(mse))
    mae = float(mean_absolute_error(y_true, y_pred))
    mape = safe_mape(y_true, y_pred)
    r2 = float(r2_score(y_true, y_pred))
    return {"MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE%": mape, "R2": r2}

def build_preprocessor(df, drop_cols):
    features = [c for c in df.columns if c not in drop_cols]
    num = [c for c in features if pd.api.types.is_numeric_dtype(df[c])]
    cat = [c for c in features if not pd.api.types.is_numeric_dtype(df[c])]
    pre = ColumnTransformer(
        transformers=[
            ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num),
            ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                              ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat)
        ]
    )
    return pre

def split_70_15_15(X, y, stratify=None, seed=42):
    X_tr, X_tmp, y_tr, y_tmp = train_test_split(X, y, train_size=0.70, random_state=seed,
                                                shuffle=True, stratify=stratify)
    X_va, X_te, y_va, y_te = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=seed,
                                              shuffle=True, stratify=(y_tmp if stratify is not None else None))
    return X_tr, X_va, X_te, y_tr, y_va, y_te

# ---------- Wczytanie ----------
path = Path("daneMed.xlsx")             # <- plik obok skryptu
df = pd.read_excel(path)
df.columns = [str(c).strip() for c in df.columns]

target = find_col(df.columns, ["kreatyn", "creat"])        # "kreatynina" / "creatinine"
sexcol = find_col(df.columns, ["płe", "plec", "płec", "pleć", "sex", "gender"])
if target is None:
    raise SystemExit("Nie znaleziono kolumny z kreatyniną (np. 'kreatynina').")

if sexcol is not None:
    df[sexcol] = (df[sexcol].astype(str).str.strip().str.lower()
                  .replace({"k":"K","kobieta":"K","female":"K","f":"K",
                            "m":"M","mężczyzna":"M","mezczyzna":"M","male":"M"})
                  .str.upper().str[0])

print("INFO:", {"rows": len(df), "cols": len(df.columns), "target": target, "sex": sexcol})

# =========================================
# CZĘŚĆ A: Regressja 80:20 (train/test)
# =========================================
X = df.drop(columns=[target]); y = df[target].astype(float)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
pre_all = build_preprocessor(df, [target])

reg_models = [
    ("LinearRegression",
     Pipeline([("prep", pre_all), ("model", LinearRegression())])),
    ("KNeighborsRegressor",
     Pipeline([("prep", pre_all), ("scaler", StandardScaler(with_mean=False)),
               ("model", KNeighborsRegressor(n_neighbors=7))])),
    # Uwaga: RF i XGB z "konserwatywnymi" ograniczeniami, by zmniejszyć przeuczenie
    ("RandomForestRegressor",
     Pipeline([("prep", pre_all),
               ("model", RandomForestRegressor(
                    n_estimators=300, max_depth=10, min_samples_leaf=2,
                    random_state=42, n_jobs=-1))]))
]
if HAS_XGB:
    reg_models.append(
        ("XGBRegressor",
         Pipeline([("prep", pre_all),
                   ("model", XGBRegressor(
                       objective="reg:squarederror",
                       n_estimators=400, learning_rate=0.1, max_depth=6,
                       subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
                       tree_method="hist", random_state=42, n_jobs=-1))]))
    )

rows = []
for name, pipe in reg_models:
    pipe.fit(X_tr, y_tr)
    y1 = pipe.predict(X_tr)
    y2 = pipe.predict(X_te)
    mtr = regression_metrics(y_tr, y1)
    mte = regression_metrics(y_te, y2)
    rec = {"Model": name, **{f"train_{k}": v for k, v in mtr.items()},
                     **{f"test_{k}": v for k, v in mte.items()}}
    rows.append(rec)
reg80 = pd.DataFrame(rows).sort_values("test_R2", ascending=False)
reg80.to_csv("out_regression_80_20.csv", index=False)
print("Zapisano: out_regression_80_20.csv")

# =========================================
# CZĘŚĆ B: Regressja osobno (K/M), y<=10, 70:15:15
# + „krótki” dobór parametrów (konserwatywnie)
# =========================================
def eval_reg_block(df_sub, label):
    df_sub = df_sub[df_sub[target].astype(float) <= 10.0].copy()
    if len(df_sub) < 40:
        print(f"[{label}] Za mało próbek po filtrze (<=10): N={len(df_sub)}")
        return pd.DataFrame()
    X = df_sub.drop(columns=[target]); y = df_sub[target].astype(float)
    X_tr, X_va, X_te, y_tr, y_va, y_te = split_70_15_15(X, y)
    pre = build_preprocessor(df_sub, [target])

    models = [
        ("RandomForestRegressor",
         Pipeline([("prep", pre),
                   ("model", RandomForestRegressor(
                       n_estimators=400, max_depth=10, min_samples_leaf=2,
                       random_state=42, n_jobs=-1))])),
        ("MLPRegressor",
         Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                   ("model", MLPRegressor(hidden_layer_sizes=(128,64),
                                          max_iter=500, random_state=42,
                                          early_stopping=True))]))
    ]
    if HAS_XGB:
        models.append(
            ("XGBRegressor",
             Pipeline([("prep", pre),
                       ("model", XGBRegressor(
                           objective="reg:squarederror",
                           n_estimators=500, learning_rate=0.07, max_depth=6,
                           subsample=0.9, colsample_bytree=0.9, reg_lambda=1.5,
                           tree_method="hist", random_state=42, n_jobs=-1))]))
        )

    recs = []
    for name, pipe in models:
        pipe.fit(X_tr, y_tr)
        for split, (X_, y_) in [("Train",(X_tr,y_tr)), ("Validation",(X_va,y_va)), ("Test",(X_te,y_te))]:
            yp = pipe.predict(X_)
            recs.append({"Model": name, "Split": split, **regression_metrics(y_, yp)})
    out = pd.DataFrame(recs)
    out.to_csv(f"out_regression_70_15_15_{label}.csv", index=False)
    print(f"Zapisano: out_regression_70_15_15_{label}.csv")
    return out

if sexcol is not None:
    for lab in ["K","M"]:
        eval_reg_block(df[df[sexcol]==lab], lab)

# =========================================
# CZĘŚĆ C: Klasyfikacja wg płci (TOP3)
# =========================================
if sexcol is not None:
    y = df[sexcol].astype('category').cat.codes # Convert to numeric
    X = df.drop(columns=[sexcol])
    X_tr, X_va, X_te, y_tr, y_va, y_te = split_70_15_15(X, y, stratify=y)

    pre = build_preprocessor(df, [sexcol])
    clfs = [
        ("LogisticRegression", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                                        ("model", LogisticRegression(max_iter=600, class_weight="balanced"))])),
        ("RandomForestClassifier", Pipeline([("prep", pre),
                                            ("model", RandomForestClassifier(n_estimators=300, random_state=42,
                                                                             class_weight="balanced", n_jobs=-1))])),
        ("SVC", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                          ("model", SVC(probability=True, class_weight="balanced"))])),
        ("KNeighborsClassifier", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                                           ("model", KNeighborsClassifier(n_neighbors=7))])),
        ("MLPClassifier", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                                    ("model", MLPClassifier(hidden_layer_sizes=(128,64),
                                                            random_state=42, max_iter=400, early_stopping=True))])),
    ]
    if HAS_XGB:
        clfs.append(("XGBClassifier", Pipeline([("prep", pre),
                                                ("model", XGBClassifier(
                                                    n_estimators=400, learning_rate=0.1, max_depth=6,
                                                    subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
                                                    random_state=42, n_jobs=-1, tree_method="hist",
                                                    eval_metric="logloss"))])))

    rows, details = [], []
    for name, clf in clfs:
        clf.fit(X_tr, y_tr)
        yv = clf.predict(X_va)
        try:
            yv_prob = clf.predict_proba(X_va)
            auc_v = float(roc_auc_score(y_va, yv_prob[:,1])) if yv_prob.shape[1]==2 else np.nan
        except Exception:
            auc_v = np.nan
        rows.append({"Classifier": name,
                     "val_Accuracy": float(accuracy_score(y_va, yv)),
                     "val_F1_weighted": float(f1_score(y_va, yv, average="weighted")),
                     "val_AUC": auc_v})
        for split,(X_,y_) in [("Train",(X_tr,y_tr)),("Validation",(X_va,y_va)),("Test",(X_te,y_te))]:
            yp = clf.predict(X_)
            try:
                yp_prob = clf.predict_proba(X_)
                auc = float(roc_auc_score(y_, yp_prob[:,1])) if yp_prob.shape[1]==2 else np.nan
            except Exception:
                auc = np.nan
            details.append({"Classifier": name, "Split": split,
                            "Accuracy": float(accuracy_score(y_, yp)),
                            "F1_weighted": float(f1_score(y_, yp, average="weighted")),
                            "AUC": auc})
    rank = pd.DataFrame(rows).sort_values(by=["val_F1_weighted","val_Accuracy"], ascending=False)
    det  = pd.DataFrame(details)
    top3 = rank.Classifier.head(3).tolist()
    det_top3 = det[det.Classifier.isin(top3)]
    rank.to_csv("out_class_sex_ranking.csv", index=False)
    det_top3.to_csv("out_class_sex_top3.csv", index=False)
    print("Zapisano: out_class_sex_ranking.csv, out_class_sex_top3.csv")

# =========================================
# CZĘŚĆ D: Klasyfikacja wg kreatyniny (<=5 vs >5), tylko rekordy y<=10
# (bez przecieku — kreatynina NIE w cechach!)
# =========================================
df_c = df[df[target].astype(float) <= 10.0].copy()
df_c["grp"] = (df_c[target].astype(float) > 5.0).astype(int)   # 0: <=5, 1: >5
y = df_c["grp"].astype(int) # Convert to numeric
X = df_c.drop(columns=[target, "grp"])
X_tr, X_va, X_te, y_tr, y_va, y_te = split_70_15_15(X, y, stratify=y)
pre = build_preprocessor(df_c, [target, "grp"])

clfs2 = [
    ("LogisticRegression", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                                    ("model", LogisticRegression(max_iter=600, class_weight="balanced"))])),
    ("RandomForestClassifier", Pipeline([("prep", pre),
                                        ("model", RandomForestClassifier(n_estimators=300, random_state=42,
                                                                         class_weight="balanced", n_jobs=-1))])),
    ("SVC", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                      ("model", SVC(probability=True, class_weight="balanced"))])),
    ("KNeighborsClassifier", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                                       ("model", KNeighborsClassifier(n_neighbors=7))])),
    ("MLPClassifier", Pipeline([("prep", pre), ("scaler", StandardScaler(with_mean=False)),
                                ("model", MLPClassifier(hidden_layer_sizes=(128,64),
                                                        random_state=42, max_iter=400, early_stopping=True))])),
]
if HAS_XGB:
    clfs2.append(("XGBClassifier", Pipeline([("prep", pre),
                                            ("model", XGBClassifier(
                                                n_estimators=400, learning_rate=0.1, max_depth=6,
                                                subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
                                                random_state=42, n_jobs=-1, tree_method="hist",
                                                eval_metric="logloss"))])))

rows2, details2 = [], []
for name, clf in clfs2:
    clf.fit(X_tr, y_tr)
    yv = clf.predict(X_va)
    try:
        yv_prob = clf.predict_proba(X_va)
        auc_v = float(roc_auc_score(y_va, yv_prob[:,1]))
    except Exception:
        auc_v = np.nan
    rows2.append({"Classifier": name,
                  "val_Accuracy": float(accuracy_score(y_va, yv)),
                  "val_F1_weighted": float(f1_score(y_va, yv, average="weighted")),
                  "val_AUC": auc_v})
    for split,(X_,y_) in [("Train",(X_tr,y_tr)),("Validation",(X_va,y_va)),("Test",(X_te,y_te))]:
        yp = clf.predict(X_)
        try:
            yp_prob = clf.predict_proba(X_)
            auc = float(roc_auc_score(y_, yp_prob[:,1]))
        except Exception:
            auc = np.nan
        details2.append({"Classifier": name, "Split": split,
                         "Accuracy": float(accuracy_score(y_, yp)),
                         "F1_weighted": float(f1_score(y_, yp, average="weighted")),
                         "AUC": auc})
rank2 = pd.DataFrame(rows2).sort_values(by=["val_F1_weighted","val_Accuracy"], ascending=False)
det2  = pd.DataFrame(details2)
top3b = rank2.Classifier.head(3).tolist()
det2_top3 = det2[det2.Classifier.isin(top3b)]
rank2.to_csv("out_class_creat_ranking.csv", index=False)
det2_top3.to_csv("out_class_creat_top3.csv", index=False)
print("Zapisano: out_class_creat_ranking.csv, out_class_creat_top3.csv")

print("\nGotowe. Pliki CSV znajdziesz obok skryptu.")

INFO: {'rows': 1150, 'cols': 12, 'target': 'KREATYNINA', 'sex': 'PLEĆ'}




Zapisano: out_regression_80_20.csv
[K] Za mało próbek po filtrze (<=10): N=0
[M] Za mało próbek po filtrze (<=10): N=0




Zapisano: out_class_sex_ranking.csv, out_class_sex_top3.csv




Zapisano: out_class_creat_ranking.csv, out_class_creat_top3.csv

Gotowe. Pliki CSV znajdziesz obok skryptu.




In [None]:
from google.colab import drive
drive.mount('/content/drive')