In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Si NO estás en Jupyter, descomenta esto:
# from IPython.display import display

# =========================
# CONFIG
# =========================
DATA_PATH = "data/heart.csv"  # relative path of the CSV that is loaded below
TARGET = "HeartDisease"       # name of the label column
RANDOM_STATE = 42             # seed passed to train_test_split for a repeatable split

# =========================
# STEP 1) LOAD + HOLD-OUT SPLIT
# =========================
df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
print("\nColumnas:")
print(df.columns.tolist())

print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(): doing so appended a stray "None" line.
df.info()

print("\nDistribución del target:")
print(df[TARGET].value_counts(normalize=True))

# Separate the features from the label before splitting.
X = df.drop(columns=[TARGET])
y = df[TARGET]

# 80/20 hold-out split, stratified on the label so that both partitions keep
# the same class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE
)

print("\nSplit final:")
print("Train:", X_train.shape, " Test:", X_test.shape)
# Sanity check on the stratification: the two label means should be nearly equal.
print("Target train mean:", y_train.mean())
print("Target test mean :", y_test.mean())

# ============================================================
# STEP 2 — TECHNICAL EDA (minimal, defensible)
# ============================================================

# 1) Variable types
categorical_cols = [
    "Sex",
    "ChestPainType",
    "RestingECG",
    "ExerciseAngina",
    "ST_Slope",
]
binary_cols = ["FastingBS"]
numeric_cols = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]

print("\n=== Tipología de variables ===")
for group_label, group_cols in (
    ("Categóricas:", categorical_cols),
    ("Binarias:", binary_cols),
    ("Numéricas:", numeric_cols),
):
    print(group_label, group_cols)

# 2) Basic statistics, computed on the training partition only
print("\n=== Estadísticas básicas (X_train) ===")
display(X_train.loc[:, numeric_cols + binary_cols].describe().T)

# 3) Check for suspicious values (non-physiological zeros)
cols_with_zero_issue = ["RestingBP", "Cholesterol", "MaxHR"]

print("\n=== Chequeo de ceros sospechosos (X_train) ===")
for zero_col in cols_with_zero_issue:
    # Count the exact zeros and report them as a share of the training rows.
    n_zero = int(X_train[zero_col].eq(0).sum())
    print(f"{zero_col}: {n_zero} valores = 0 ({n_zero / len(X_train):.2%})")

# 4) Cardinality of the categorical columns (NaN is counted as its own level)
print("\n=== Cardinalidad de variables categóricas (X_train) ===")
for cat_col in categorical_cols:
    print(f"\n{cat_col}:")
    level_counts = X_train[cat_col].value_counts(dropna=False)
    print(level_counts)

# 5) Basic signal vs target (no modelling)
print("\n=== Señal básica vs target (P(HeartDisease=1) por categoría) ===")

def rate_by_category(col, X=None, y=None, target=None):
    """Return the mean of the target within each level of a column.

    For a 0/1 target this mean is the share of positive cases per level.

    Parameters
    ----------
    col : str
        Name of the column of ``X`` to group by.
    X : pandas.DataFrame, optional
        Frame that contains ``col``. Defaults to the notebook-level ``X_train``.
    y : pandas.Series, optional
        Target values, aligned with ``X`` by index. Defaults to the
        notebook-level ``y_train``.
    target : str, optional
        Name given to the target column. Defaults to the notebook-level
        ``TARGET``.

    Returns
    -------
    pandas.Series
        Target mean per level of ``col``, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``X``.
    """
    # Fall back to the notebook globals only when the caller gave no override,
    # so the original one-argument call keeps working unchanged.
    X = X_train if X is None else X
    y = y_train if y is None else y
    target = TARGET if target is None else target

    # Pair the column with the target in one frame; rows are matched by index.
    paired = pd.DataFrame({col: X[col], target: y})
    return paired.groupby(col)[target].mean().sort_values(ascending=False)

# Positive-class rate per level for each categorical feature.
for cat_col in ["Sex", "ExerciseAngina", "ChestPainType", "ST_Slope", "RestingECG"]:
    print(f"\n{cat_col}:")
    print(rate_by_category(cat_col))

# 6) Basic numeric signal (medians per class)
print("\n=== Medianas de numéricas por clase (train) ===")
# Attach the label to the numeric features by index so they can be grouped on it.
tmp_num = X_train[numeric_cols].join(y_train.rename(TARGET))
print(tmp_num.groupby(TARGET)[numeric_cols].median())

# 7) Conclusions (text for the report)
print("\n=== Conclusiones EDA (resumen) ===")
eda_conclusions = (
    "- No hay nulos explícitos en train/test.",
    "- Puede haber 'missing codificado como 0' (especialmente Cholesterol; revisar RestingBP).",
    "- Hay mezcla de numéricas y categóricas -> necesitaremos One-Hot Encoding.",
    "- Para modelos lineales será recomendable escalar numéricas; para árboles no es necesario.",
    "- Existe señal básica: algunas categorías muestran distinta proporción de HeartDisease=1.",
)
print("\n".join(eda_conclusions))


Shape: (918, 12)

Columnas:
['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
None

Distribu

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,734.0,53.848774,9.440574,29.0,47.0,55.0,61.0,77.0
RestingBP,734.0,132.874659,18.08033,92.0,120.0,130.0,141.0,200.0
Cholesterol,734.0,203.22752,108.402067,0.0,180.0,225.0,269.75,603.0
MaxHR,734.0,136.377384,25.832297,60.0,119.0,138.0,156.0,202.0
Oldpeak,734.0,0.865123,1.056964,-2.6,0.0,0.5,1.5,5.6
FastingBS,734.0,0.228883,0.4204,0.0,0.0,0.0,0.0,1.0



=== Chequeo de ceros sospechosos (X_train) ===
RestingBP: 0 valores = 0 (0.00%)
Cholesterol: 129 valores = 0 (17.57%)
MaxHR: 0 valores = 0 (0.00%)

=== Cardinalidad de variables categóricas (X_train) ===

Sex:
Sex
M    579
F    155
Name: count, dtype: int64

ChestPainType:
ChestPainType
ASY    403
NAP    155
ATA    136
TA      40
Name: count, dtype: int64

RestingECG:
RestingECG
Normal    447
LVH       146
ST        141
Name: count, dtype: int64

ExerciseAngina:
ExerciseAngina
N    434
Y    300
Name: count, dtype: int64

ST_Slope:
ST_Slope
Flat    374
Up      313
Down     47
Name: count, dtype: int64

=== Señal básica vs target (P(HeartDisease=1) por categoría) ===

Sex:
Sex
M    0.625216
F    0.283871
Name: HeartDisease, dtype: float64

ExerciseAngina:
ExerciseAngina
Y    0.846667
N    0.350230
Name: HeartDisease, dtype: float64

ChestPainType:
ChestPainType
ASY    0.786600
TA     0.450000
NAP    0.335484
ATA    0.139706
Name: HeartDisease, dtype: float64

ST_Slope:
ST_Slope
Flat    