In [None]:
# 1_eda_cleaning.ipynb - celdas en orden
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from src.data_utils import unzip_all, load_csv_guess



ModuleNotFoundError: No module named 'src'

In [None]:

# 0) Unzip the raw archives, then load the extracted CSV into `df`.
# The extraction directory is named once so both helper calls stay in sync.
UNZIPPED_DIR = 'data/raw_unzipped'
unzip_all(raw_dir='data/raw', out_dir=UNZIPPED_DIR)
df = load_csv_guess(in_dir=UNZIPPED_DIR)
df.head(n=5)

In [None]:
# 1) Initial diagnostics: frame dimensions, per-column info, summary stats.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.info()
# Transposed so each original column becomes a row of the summary table.
df.describe(include='all').transpose()

In [None]:

# 2) Normalise column names: trim whitespace, lowercase, spaces -> underscores.
cols_map = dict(
    (name, name.strip().lower().replace(' ', '_'))
    for name in df.columns
)
# Renames the existing frame in place (no new DataFrame is bound).
df.rename(columns=cols_map, inplace=True)

In [None]:

# 3) Warn about every expected column that is absent from the frame.
expected = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'thalach', 'target']
absent = [name for name in expected if name not in df.columns]
for name in absent:
    print("Advertencia: columna", name, "no encontrada")

In [None]:

# 4) Replace zeros with NaN in columns where 0 stands for a missing value
# (in some datasets 0 marks a missing reading, e.g. chol or trestbps).
zero_as_missing = ['trestbps', 'chol', 'thalach']
for col in (c for c in zero_as_missing if c in df.columns):
    # Report how many zeros are about to be turned into NaN.
    zero_count = df[col].eq(0).sum()
    print(col, "zeros:", zero_count)
    df[col] = df[col].replace(0, np.nan)

In [None]:

# 5) Simple imputation is deferred to the later pipeline; this cell only
#    counts the missing values in each column.
missing = df.isnull().sum()
print("Missing per column:\n", missing)

In [None]:

# 6) Univariate and bivariate analysis

# Age distribution. Guarded like the other column accesses in this notebook,
# so a dataset without 'age' yields a warning instead of a KeyError.
if 'age' in df.columns:
    sns.histplot(df['age'].dropna(), kde=True)
    plt.title('Distribución de edad')
    plt.show()
else:
    print("Advertencia: columna", 'age', "no encontrada")

# Pairwise correlations across every numeric column.
num_cols = df.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(10, 8))
sns.heatmap(df[num_cols].corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Matriz de correlación')
plt.show()

In [None]:

# 7) Collapse a multiclass target (e.g. 0-4) into a binary label:
#    values > 0 become 1, everything else becomes 0.
if 'target' in df.columns:
    unique = df['target'].unique()
    print("target unique:", unique)
    # Vectorised comparison instead of a per-row Python lambda.
    # NaN > 0 evaluates to False, so missing targets map to 0 exactly as
    # the previous lambda did.
    df['target'] = (df['target'] > 0).astype(int)

In [None]:

# 8) Save the cleaned dataset (raw processed) as CSV.
# `os` comes from the notebook's import cell; without that import this cell
# fails with NameError before anything is written.
processed_path = 'data/processed/heart_raw_processed.csv'
# Derive the directory from the output path so the two cannot drift apart.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
df.to_csv(processed_path, index=False)
print("Saved processed CSV")