In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
df_train = pd.read_csv("04_data/train_data.csv")
df_test = pd.read_csv("04_data/test_data.csv")

In [3]:
df_train.head()

Unnamed: 0,ID,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,...,BMI,Heart Rate,Serum Creatinine,Uric Acid,Hemoglobin,GFR,Serum Sodium,Serum Potassium,Serum Albumin,Lactate
0,1,62.300654,White,Widowed,T1,N1,,Moderately differentiated,2,Regional,...,25.228195,98.865449,4.343779,3.781804,13.704187,112.038394,136.560377,-1.506035,4.699045,-3.200633
1,2,37.268422,White,Married,T2,N1,IIB,Well differentiated,1,Regional,...,31.027525,81.547091,-5.053593,1.990754,20.685675,109.605432,147.569841,-0.033068,1.676842,2.013738
2,3,55.864953,White,Single,T1,N1,IIA,Well differentiated,1,Regional,...,20.009729,77.214648,-0.683623,11.299137,10.565341,112.964603,147.176105,-7.076607,6.776799,-7.266369
3,4,60.586799,White,Divorced,T1,N1,IIA,Poorly differentiated,3,Regional,...,33.217567,86.513469,0.317514,0.23822,7.512031,63.477023,135.443017,3.189928,4.519103,-1.6371
4,5,48.197741,White,Separated,T2,N1,IIB,Moderately differentiated,2,Regional,...,33.909838,90.401178,-1.838835,10.500072,19.806571,98.437718,136.071561,8.390279,7.027023,11.230639


In [4]:
df_train.info()
df_train.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3219 entries, 0 to 3218
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      3219 non-null   int64  
 1   Age                     3219 non-null   float64
 2   Race                    3219 non-null   object 
 3   Marital Status          3219 non-null   object 
 4   T Stage                 3219 non-null   object 
 5   N Stage                 3219 non-null   object 
 6   6th Stage               3116 non-null   object 
 7   differentiate           3219 non-null   object 
 8   Grade                   3219 non-null   object 
 9   A Stage                 3219 non-null   object 
 10  Tumor Size              2256 non-null   float64
 11  Estrogen Status         3219 non-null   object 
 12  Progesterone Status     3219 non-null   object 
 13  Regional Node Examined  3219 non-null   float64
 14  Reginol Node Positive   3219 non-null   

Unnamed: 0,ID,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,...,BMI,Heart Rate,Serum Creatinine,Uric Acid,Hemoglobin,GFR,Serum Sodium,Serum Potassium,Serum Albumin,Lactate
count,3219.0,3219.0,3219,3219,3219,3219,3116,3219,3219.0,3219,...,3219.0,3219.0,3219.0,3219.0,3219.0,3219.0,3219.0,3219.0,3219.0,3219.0
unique,,,3,5,4,3,5,4,4.0,2,...,,,,,,,,,,
top,,,White,Married,T2,N1,IIA,Moderately differentiated,2.0,Regional,...,,,,,,,,,,
freq,,,2709,2103,1430,2176,927,1867,1867.0,3145,...,,,,,,,,,,
mean,1610.0,53.901473,,,,,,,,,...,29.114799,79.638509,0.917828,5.3216,15.163777,90.243284,140.053107,4.24846,4.272023,1.069535
std,929.389585,10.294228,,,,,,,,,...,7.942761,12.760401,5.006427,5.100319,5.25543,17.720652,5.728874,5.039367,4.829481,5.01694
min,1.0,19.751908,,,,,,,,,...,5.612847,47.926433,-15.806928,-12.841881,-2.859851,45.64546,121.729688,-13.244634,-16.834811,-14.882563
25%,805.5,46.604853,,,,,,,,,...,23.261407,69.429444,-2.454253,1.677006,11.688549,75.794487,136.254325,0.809133,1.01648,-2.44236
50%,1610.0,54.251208,,,,,,,,,...,29.199006,79.280944,0.873854,5.269914,15.14794,90.674069,140.048988,4.223347,4.27504,1.144759
75%,2414.5,61.676225,,,,,,,,,...,34.858202,89.917913,4.396527,8.791919,18.711793,104.821243,144.011247,7.66834,7.460993,4.45303


In [5]:
# Subtask 1
def renal_fn(gfr):
    if gfr >= 90:
        return "Normal"
    elif 60 <= gfr < 90:
        return "Mildly Decreased"
    else:
        return "Unknown"


df_test["Renal_Function"] = df_test["GFR"].apply(renal_fn)
df_test["Renal_Function"]

0                Normal
1                Normal
2                Normal
3      Mildly Decreased
4      Mildly Decreased
             ...       
800              Normal
801              Normal
802              Normal
803    Mildly Decreased
804              Normal
Name: Renal_Function, Length: 805, dtype: object

In [6]:
# Subtask 2
quartiles = df_train["Serum Creatinine"].quantile([0.25, 0.5, 0.75])
Q1, Q2, Q3 = quartiles[0.25], quartiles[0.5], quartiles[0.75]


def creat_risk(creat):
    if creat <= Q1:
        return "Very Low"
    elif creat <= Q2:
        return "Low"
    elif creat <= Q3:
        return "High"
    else:
        return "Very High"


df_test["Creatinine_Risk"] = df_test["Serum Creatinine"].apply(creat_risk)
df_test["Creatinine_Risk"]

0      High
1      High
2       Low
3      High
4       Low
       ... 
800    High
801    High
802    High
803     Low
804    High
Name: Creatinine_Risk, Length: 805, dtype: object

In [7]:
# Subtask 3
bmi_median = df_train["BMI"].median()
df_test["BMI_gt_median"] = (df_test["BMI"] > bmi_median).astype(int)
df_test["BMI_gt_median"]

0      0
1      1
2      0
3      1
4      1
      ..
800    1
801    0
802    1
803    0
804    0
Name: BMI_gt_median, Length: 805, dtype: int64

In [8]:
# Subtask 4
train_t_counts = df_train["T Stage"].value_counts()
df_test["TStage_count_train"] = (
    df_test["T Stage"].map(train_t_counts).fillna(0).astype(int)
)

df_test["TStage_count_train"]

0      1281
1      1281
2      1430
3       418
4      1281
       ... 
800    1281
801    1430
802    1430
803    1281
804    1281
Name: TStage_count_train, Length: 805, dtype: int64

In [9]:
target = "Status"
id_col = "ID"


def engineer_features(df):
    df = df.copy()

    df["T_stage_num"] = df["T Stage"].str.extract(r"(\d+)").astype(float)
    df["N_stage_num"] = df["N Stage"].str.extract(r"(\d+)").astype(float)

    df["Age_group"] = pd.cut(
        df["Age"], bins=[0, 40, 60, 80, 150], labels=["<40", "40-60", "60-80", ">80"]
    )

    df["BMI_cat"] = pd.cut(
        df["BMI"],
        bins=[0, 18.5, 25, 30, 100],
        labels=["Under", "Normal", "Over", "Obese"],
    )

    df["Creat_GFR_ratio"] = df["Serum Creatinine"] / df["GFR"].replace(0, np.nan)

    df["Hyponatremia"] = (df["Serum Sodium"] < 135).astype(int)
    df["Hypernatremia"] = (df["Serum Sodium"] > 145).astype(int)
    df["Hypoalbumin"] = (df["Serum Albumin"] < 3.5).astype(int)
    df["Anemia"] = (df["Hemoglobin"] < 12).astype(int)
    df["Hyperkalemia"] = (df["Serum Potassium"] > 5.0).astype(int)

    size_median = df_train["Tumor Size"].median()
    df["Large_Tumor"] = (df["Tumor Size"] > size_median).astype(int)

    return df


df_train_fe = engineer_features(df_train)
df_test_fe = engineer_features(df_test)

In [10]:
dead_df = df_train_fe[df_train_fe[target] == "Dead"]
alive_df = df_train_fe[df_train_fe[target] == "Alive"]

desired_dead = int(0.8 * len(alive_df))
dead_oversampled = dead_df.sample(
    n=max(0, desired_dead - len(dead_df)), replace=True, random_state=42
)
df_train_balanced = pd.concat([alive_df, dead_df, dead_oversampled]).sample(
    frac=1, random_state=42
)

In [11]:
feature_cols = [c for c in df_train_balanced.columns if c not in [target, id_col]]

X = df_train_balanced[feature_cols]
y = df_train_balanced[target]

X_test = df_test_fe[feature_cols]

numeric_feats = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_feats = X.select_dtypes(include=["object", "category"]).columns.tolist()

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_feats),
        ("cat", categorical_transformer, categorical_feats),
    ]
)

model = GradientBoostingClassifier(random_state=42)

pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
pipeline.fit(X_train, y_train)
y_val_pred = pipeline.predict(X_val)

prec_dead = precision_score(y_val, y_val_pred, pos_label="Dead")
print(f"Precision (Dead): {prec_dead:.3f}")
print(classification_report(y_val, y_val_pred))

Precision (Dead): 0.790
              precision    recall  f1-score   support

       Alive       0.78      0.85      0.82       545
        Dead       0.79      0.71      0.75       436

    accuracy                           0.79       981
   macro avg       0.79      0.78      0.78       981
weighted avg       0.79      0.79      0.78       981



In [13]:
y_test_pred = pipeline.predict(X_test)
df_test["Status_Prediction"] = y_test_pred

rows = []
for _, r in df_test.iterrows():
    idx = r["ID"]
    rows.append((1, idx, r["Renal_Function"]))
    rows.append((2, idx, r["Creatinine_Risk"]))
    rows.append((3, idx, int(r["BMI_gt_median"])))
    rows.append((4, idx, int(r["TStage_count_train"])))
    rows.append((5, idx, r["Status_Prediction"]))

submission = pd.DataFrame(rows, columns=["subtaskID", "datapointID", "answer"])

# Salvăm în formatul cerut
submission.to_csv("04_data/submission.csv", index=False)