In [17]:
import os, glob, pandas as pd, numpy as np, joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [18]:
# Column names assigned positionally to the header-less data files.
# 'num' is the raw label column from which the binary 'target' is derived.
DEFAULT_COLS = [
    "age", "sex", "cp", "trestbps", "chol",
    "fbs", "restecg", "thalach", "exang", "oldpeak",
    "slope", "ca", "thal",
    "num",
]

In [19]:
def _try_read_csv(path):
    for sep in [',', r'\s+', ';', '\t']:
        try:
            df = pd.read_csv(path, header=None, sep=sep, engine='python', na_values=['?',''])
            df = df.dropna(how='all')
            if df.shape[1] >= 5:
                return df
        except Exception:
            continue
    with open(path, 'r', errors='ignore') as f:
        lines = [l.strip() for l in f if l.strip()]
    rows = [l.split() for l in lines]
    maxcols = max(len(r) for r in rows)
    rows = [r + [np.nan]*(maxcols-len(r)) for r in rows]
    return pd.DataFrame(rows)

In [20]:
def find_data_files(data_dir='data', exclude=('heart_disease_clean.csv',)):
    """List the raw data files found directly inside ``data_dir``.

    Files matching ``processed.*.data``, ``*.data`` or ``*.csv`` are collected;
    a file matching several patterns is reported once.

    Parameters
    ----------
    data_dir : str
        Directory to search (not recursive). Glob metacharacters in the
        directory name are treated literally.
    exclude : iterable of str, optional
        Base names to leave out. Defaults to the cleaned CSV that this
        notebook itself writes into ``data/``; without this, re-running the
        notebook would ingest its own output as raw input (duplicating rows
        and feeding the previously derived target back in as a feature).
        Pass ``()`` to disable the filter.

    Returns
    -------
    list of str
        Sorted, de-duplicated paths. Empty if nothing matches or the
        directory does not exist.
    """
    skipped = set(exclude or ())
    base = glob.escape(data_dir)
    matches = {
        path
        for pattern in ('processed.*.data', '*.data', '*.csv')
        for path in glob.glob(os.path.join(base, pattern))
        # Only regular files can be parsed; ignore directories with matching names.
        if os.path.isfile(path) and os.path.basename(path) not in skipped
    }
    return sorted(matches)

In [21]:
def load_and_combine_data(data_dir='data'):
    """Load every data file in ``data_dir`` into one numeric DataFrame.

    The files are located with ``find_data_files`` and parsed with
    ``_try_read_csv``, then concatenated. Columns are named positionally from
    ``DEFAULT_COLS``; any surplus columns are named ``col_<position>``. All
    values are coerced to numeric (unparseable cells become NaN) and a binary
    ``target`` column is derived from ``num`` (1 when ``num > 0``, else 0).

    Parameters
    ----------
    data_dir : str
        Directory holding the data files. If it yields no data files,
        ``../<data_dir>`` is tried as well, so the notebook works both from
        the project root and from a sub-directory.

    Returns
    -------
    dict
        ``'original'`` (full frame), ``'features'`` (everything except id
        columns, ``num`` and ``target``), ``'targets'`` (one-column frame
        ``target``), ``'ids'`` (columns whose name contains "id", or an empty
        frame on the same index) and ``'headers'`` (list of column names).

    Raises
    ------
    FileNotFoundError
        If no data files are found in either location.
    ValueError
        If none of the files contains any rows, or if neither a ``num`` nor a
        ``target`` column is present after naming.
    """
    # Look for files rather than just an existing directory: a 'data' folder
    # that exists but holds no usable input must not mask '../data'.
    files = []
    for candidate in (data_dir, os.path.join("..", data_dir)):
        files = find_data_files(candidate)
        if files:
            data_dir = candidate
            break
    if not files:
        raise FileNotFoundError(f"No data files found in {data_dir}")

    # Skip files that parsed to nothing so they cannot disturb the concat.
    parts = [part for part in (_try_read_csv(f) for f in files) if not part.empty]
    if not parts:
        raise ValueError(f"Data files in {data_dir} contain no rows.")
    df = pd.concat(parts, ignore_index=True, sort=False)

    # Positional naming; when the width equals len(DEFAULT_COLS) the extras
    # list is empty and this reduces to DEFAULT_COLS itself.
    n = min(len(DEFAULT_COLS), df.shape[1])
    df.columns = DEFAULT_COLS[:n] + [f'col_{i}' for i in range(n, df.shape[1])]

    df = df.replace(['?', '\x00', '\x00\x00', 'NA', 'na', 'null'], np.nan)
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    # Files are read with header=None, so a header line in a CSV turns into a
    # row of NaN after coercion. Such rows carry no data and would otherwise
    # be labelled target=0 below.
    df = df.dropna(how='all').reset_index(drop=True)

    if 'target' not in df.columns:
        if 'num' in df.columns:
            # NaN > 0 is False, so rows with a missing 'num' are labelled 0
            # (same as before) — NOTE(review): confirm that is intended.
            df['target'] = df['num'].gt(0).astype(int)
        else:
            raise ValueError("No 'num' or 'target' column found.")

    id_cols = [c for c in df.columns if 'id' in c.lower()]
    excluded = set(id_cols) | {'target', 'num'}
    feature_cols = [c for c in df.columns if c not in excluded]

    return {
        'original': df.copy(),
        'features': df[feature_cols].copy(),
        'targets': df[['target']].copy(),
        'ids': df[id_cols].copy() if id_cols else pd.DataFrame(index=df.index),
        'headers': list(df.columns)
    }

In [25]:
# Load the raw files once, then pull out the pieces used by later cells:
# the full frame, the feature matrix and the binary label vector.
dataset = load_and_combine_data("data")
df, X = dataset["original"], dataset["features"]
y = dataset["targets"]["target"]

In [26]:
# A feature is numeric when it is not object-typed and has more than six
# distinct values; every other feature is treated as categorical.
num_cols = [c for c in X.columns if X[c].dtype != object and X[c].nunique() > 6]
cat_cols = [c for c in X.columns if c not in num_cols]

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Numeric branch: median imputation followed by standardisation.
numeric_pipeline = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_pipeline = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh", ohe),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols),
])

In [None]:
# Make sure both output folders exist, then persist the combined frame and
# the preprocessor object defined above.
for out_dir in ("data", "models"):
    os.makedirs(out_dir, exist_ok=True)

df.to_csv("data/heart_disease_clean.csv", index=False)
joblib.dump(preprocessor, "models/preprocessor.pkl")

print("Cleaned dataset and preprocessor saved.")


Cleaned dataset and preprocessor saved.
