In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s6e2/train.csv")
train.head(1)

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence


In [3]:
FEATURES = train.drop(columns=["Heart Disease", "id"], axis=1)
TARGET = train["Heart Disease"]

**exploratory data analysis**

In [4]:
print(f"features Size: {FEATURES.shape}\n\ncolumns: {FEATURES.columns}.shape")

features Size: (630000, 13)

columns: Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='object').shape


**feature engineer**

In [5]:
import numpy as np
import pandas as pd

FEATURES = FEATURES.copy()

# --- Demográficas ---
FEATURES["male"] = FEATURES["Sex"]
FEATURES["age_group"] = pd.cut(
    FEATURES["Age"], [0, 40, 60, 120], labels=[0, 1, 2]
).astype(int)

# --- Dor torácica ---
FEATURES["typical_angina"] = (FEATURES["Chest pain type"] == 4).astype(int)
FEATURES["angina_score"] = FEATURES["Chest pain type"]

# --- Pressão e metabolismo ---
FEATURES["hypertension"] = (FEATURES["BP"] >= 140).astype(int)
FEATURES["hyperchol"] = (FEATURES["Cholesterol"] >= 240).astype(int)
FEATURES["metabolic_risk"] = (
    FEATURES["hypertension"] + FEATURES["hyperchol"]
)
FEATURES["diabetes"] = FEATURES["FBS over 120"]

# --- ECG e esforço ---
FEATURES["ekg_abnormal"] = (FEATURES["EKG results"] != 0).astype(int)

FEATURES["pred_max_hr"] = 220 - FEATURES["Age"]
FEATURES["chronotropic_reserve"] = (
    FEATURES["Max HR"] / FEATURES["pred_max_hr"]
)

FEATURES["exercise_ischemia"] = (
    (FEATURES["Exercise angina"] == 1) |
    (FEATURES["ST depression"] > 0)
).astype(int)

FEATURES["st_depression_severe"] = (
    FEATURES["ST depression"] >= 2
).astype(int)

FEATURES["st_slope_pathologic"] = (
    FEATURES["Slope of ST"].isin([2, 3])
).astype(int)

# --- Capacidade funcional ---
FEATURES["low_exercise_capacity"] = (
    FEATURES["chronotropic_reserve"] < 0.8
).astype(int)

# --- Anatomia coronária ---
FEATURES["multi_vessel_disease"] = (
    FEATURES["Number of vessels fluro"] >= 2
).astype(int)

FEATURES["thallium_abnormal"] = (
    FEATURES["Thallium"] != 3
).astype(int)

# --- Score clínico final ---
FEATURES["global_cad_risk"] = (
    FEATURES["typical_angina"] +
    FEATURES["hypertension"] +
    FEATURES["diabetes"] +
    FEATURES["ekg_abnormal"] +
    FEATURES["exercise_ischemia"] +
    FEATURES["multi_vessel_disease"]
)


In [6]:
numeric_features = FEATURES.select_dtypes(include=["int", "float"])
numeric_features.head(1)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,...,ekg_abnormal,pred_max_hr,chronotropic_reserve,exercise_ischemia,st_depression_severe,st_slope_pathologic,low_exercise_capacity,multi_vessel_disease,thallium_abnormal,global_cad_risk
0,58,1,4,152,239,0,0,158,1,3.6,...,0,162,0.975309,1,1,1,0,1,1,4


**baseline**

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [8]:
scaler = StandardScaler()
pca = PCA(n_components=0.90)
model = LogisticRegression()

In [9]:
pipe = Pipeline([
    ("scaler", scaler), 
    # ("pca", pca),
    ("stimator", model)
])

baseline = cross_val_score(
    pipe, 
    FEATURES,
    TARGET,
    scoring="roc_auc",
    verbose=3,
    cv=10, 
)

[CV] END ................................ score: (test=0.953) total time=   6.7s
[CV] END ................................ score: (test=0.954) total time=   5.2s
[CV] END ................................ score: (test=0.952) total time=   6.7s
[CV] END ................................ score: (test=0.953) total time=   6.4s
[CV] END ................................ score: (test=0.954) total time=   6.6s
[CV] END ................................ score: (test=0.954) total time=   6.6s
[CV] END ................................ score: (test=0.953) total time=   6.5s
[CV] END ................................ score: (test=0.952) total time=   6.6s
[CV] END ................................ score: (test=0.953) total time=   6.6s
[CV] END ................................ score: (test=0.953) total time=   6.5s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.1min finished


In [10]:
baseline_score = baseline.mean()
round(baseline_score, 4)

np.float64(0.9529)

train

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    FEATURES, TARGET, test_size=0.2, random_state=42)

In [13]:
parameters = {"stimator__solver":["lbfgs", "liblinear", "newton-cg"]}

In [14]:
clf = GridSearchCV(pipe, parameters, cv=10, verbose=2, scoring="roc_auc")
clf

In [15]:
clf.fit(X_train, y_train)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] END .............................stimator__solver=lbfgs; total time=   5.6s
[CV] END .............................stimator__solver=lbfgs; total time=   5.0s
[CV] END .............................stimator__solver=lbfgs; total time=   5.0s
[CV] END .............................stimator__solver=lbfgs; total time=   4.2s
[CV] END .............................stimator__solver=lbfgs; total time=   5.4s
[CV] END .............................stimator__solver=lbfgs; total time=   5.6s
[CV] END .............................stimator__solver=lbfgs; total time=   5.6s
[CV] END .............................stimator__solver=lbfgs; total time=   5.7s
[CV] END .............................stimator__solver=lbfgs; total time=   5.7s
[CV] END .............................stimator__solver=lbfgs; total time=   5.5s
[CV] END .........................stimator__solver=liblinear; total time=   9.4s
[CV] END .........................stimator__solv

In [16]:
clf.best_score_

np.float64(0.9529983052356477)

In [17]:
test = pd.read_csv("/kaggle/input/playground-series-s6e2/test.csv")
test.columns

Index(['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol',
       'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina',
       'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='object')

In [18]:
FEATURES_T = test.drop(columns=["id"], axis=1)

In [19]:
print(f"features Size: {FEATURES_T.shape}\n\ncolumns: {FEATURES_T.columns}.shape")

features Size: (270000, 13)

columns: Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='object').shape


In [20]:
# --- Demográficas ---
FEATURES_T["male"] = FEATURES_T["Sex"]
FEATURES_T["age_group"] = pd.cut(
    FEATURES_T["Age"], [0, 40, 60, 120], labels=[0, 1, 2]
).astype(int)

# --- Dor torácica ---
FEATURES_T["typical_angina"] = (FEATURES_T["Chest pain type"] == 4).astype(int)
FEATURES_T["angina_score"] = FEATURES_T["Chest pain type"]

# --- Pressão e metabolismo ---
FEATURES_T["hypertension"] = (FEATURES_T["BP"] >= 140).astype(int)
FEATURES_T["hyperchol"] = (FEATURES_T["Cholesterol"] >= 240).astype(int)
FEATURES_T["metabolic_risk"] = (
    FEATURES_T["hypertension"] + FEATURES_T["hyperchol"]
)
FEATURES_T["diabetes"] = FEATURES_T["FBS over 120"]

# --- ECG e esforço ---
FEATURES_T["ekg_abnormal"] = (FEATURES_T["EKG results"] != 0).astype(int)

FEATURES_T["pred_max_hr"] = 220 - FEATURES_T["Age"]
FEATURES_T["chronotropic_reserve"] = (
    FEATURES_T["Max HR"] / FEATURES_T["pred_max_hr"]
)

FEATURES_T["exercise_ischemia"] = (
    (FEATURES_T["Exercise angina"] == 1) |
    (FEATURES_T["ST depression"] > 0)
).astype(int)

FEATURES_T["st_depression_severe"] = (
    FEATURES_T["ST depression"] >= 2
).astype(int)

FEATURES_T["st_slope_pathologic"] = (
    FEATURES_T["Slope of ST"].isin([2, 3])
).astype(int)

# --- Capacidade funcional ---
FEATURES_T["low_exercise_capacity"] = (
    FEATURES_T["chronotropic_reserve"] < 0.8
).astype(int)

# --- Anatomia coronária ---
FEATURES_T["multi_vessel_disease"] = (
    FEATURES_T["Number of vessels fluro"] >= 2
).astype(int)

FEATURES_T["thallium_abnormal"] = (
    FEATURES_T["Thallium"] != 3
).astype(int)

# --- Score clínico final ---
FEATURES_T["global_cad_risk"] = (
    FEATURES_T["typical_angina"] +
    FEATURES_T["hypertension"] +
    FEATURES_T["diabetes"] +
    FEATURES_T["ekg_abnormal"] +
    FEATURES_T["exercise_ischemia"] +
    FEATURES_T["multi_vessel_disease"]
)

In [21]:
numeric_features = FEATURES_T.select_dtypes(include=["int", "float"])
numeric_features.head(1)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,...,ekg_abnormal,pred_max_hr,chronotropic_reserve,exercise_ischemia,st_depression_severe,st_slope_pathologic,low_exercise_capacity,multi_vessel_disease,thallium_abnormal,global_cad_risk
0,58,1,3,120,288,0,2,145,1,0.8,...,1,162,0.895062,1,0,1,0,1,0,3


In [22]:
numeric_features.columns

Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium', 'male',
       'age_group', 'typical_angina', 'angina_score', 'hypertension',
       'hyperchol', 'metabolic_risk', 'diabetes', 'ekg_abnormal',
       'pred_max_hr', 'chronotropic_reserve', 'exercise_ischemia',
       'st_depression_severe', 'st_slope_pathologic', 'low_exercise_capacity',
       'multi_vessel_disease', 'thallium_abnormal', 'global_cad_risk'],
      dtype='object')

In [23]:
predictions = clf.predict_proba(FEATURES_T)[:, 1]

In [24]:
sub = pd.read_csv("/kaggle/input/playground-series-s6e2/sample_submission.csv")
sub["Heart Disease"] = predictions.round(2)
sub

Unnamed: 0,id,Heart Disease
0,630000,0.95
1,630001,0.00
2,630002,1.00
3,630003,0.01
4,630004,0.26
...,...,...
269995,899995,0.13
269996,899996,0.55
269997,899997,0.05
269998,899998,0.26


In [25]:
from sklearn.metrics import roc_auc_score, roc_curve

yhat = clf.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, yhat)
print("ROC AUC:", auc)

ROC AUC: 0.9527586277908351


In [26]:
sub.to_csv("baseline22:06feb10.csv", index=False)