In [26]:
import joblib, os, glob, numpy as np, pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [27]:
DEFAULT_COLS = [
    'age','sex','cp','trestbps','chol','fbs','restecg',
    'thalach','exang','oldpeak','slope','ca','thal','num'
]

In [28]:
def _try_read_csv(path):
    for sep in [',', r'\s+', ';', '\t']:
        try:
            df = pd.read_csv(path, header=None, sep=sep, engine='python', na_values=['?',''])
            df = df.dropna(how='all')
            if df.shape[1] >= 5:
                return df
        except Exception:
            continue
    with open(path, 'r', errors='ignore') as f:
        lines = [l.strip() for l in f if l.strip()]
    rows = [l.split() for l in lines]
    maxcols = max(len(r) for r in rows)
    rows = [r + [np.nan]*(maxcols-len(r)) for r in rows]
    return pd.DataFrame(rows)

In [29]:
def find_data_files(data_dir='data'):
    patterns = [os.path.join(data_dir, 'processed.*.data'),
                os.path.join(data_dir, '*.data'),
                os.path.join(data_dir, '*.csv')]
    files = []
    for p in patterns:
        files.extend(glob.glob(p))
    return sorted(list(dict.fromkeys(files)))

In [30]:
def load_and_combine_data(data_dir='data'):
    if not os.path.exists(data_dir):
        alt_path = os.path.join("..", data_dir)
        if os.path.exists(alt_path):
            data_dir = alt_path

    files = find_data_files(data_dir)
    if not files:
        raise FileNotFoundError(f"No data files found in {data_dir}")

    parts = [_try_read_csv(f) for f in files]
    df = pd.concat(parts, ignore_index=True, sort=False)

    if df.shape[1] == len(DEFAULT_COLS):
        df.columns = DEFAULT_COLS
    else:
        n = min(len(DEFAULT_COLS), df.shape[1])
        df.columns = DEFAULT_COLS[:n] + [f'col_{i}' for i in range(n, df.shape[1])]

    df.replace(['?', '\x00', '\x00\x00', 'NA', 'na', 'null'], np.nan, inplace=True)
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    if 'target' not in df.columns:
        if 'num' in df.columns:
            df['target'] = df['num'].apply(lambda x: 1 if (pd.notna(x) and x > 0) else 0)
        else:
            raise ValueError("No 'num' or 'target' column found.")

    id_cols = [c for c in df.columns if 'id' in c.lower()]
    feature_cols = [c for c in df.columns if c not in (id_cols + ['target','num'])]

    return {
        'original': df.copy(),
        'features': df[feature_cols].copy(),
        'targets': df[['target']].copy(),
        'ids': df[id_cols].copy() if id_cols else pd.DataFrame(index=df.index),
        'headers': list(df.columns)
    }

In [31]:
dataset = load_and_combine_data("data")
X = dataset["features"]
y = dataset["targets"]["target"]

In [32]:
preproc = joblib.load("models/preprocessor.pkl")
X_proc = preproc.fit_transform(X)
feat_names = preproc.get_feature_names_out()

In [33]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_proc, y)
importances = rf.feature_importances_
imp_sorted = sorted(zip(feat_names, importances), key=lambda x: -x[1])
print("Top 10 by RF:", imp_sorted[:10])

Top 10 by RF: [('num__oldpeak', 0.14456521432200378), ('num__exang', 0.13757259036510322), ('num__thalach', 0.12684037013391664), ('cat__col_28_1.0', 0.08792498380569119), ('cat__col_28_0.0', 0.07594651029997225), ('num__trestbps', 0.07361953888988268), ('num__age', 0.06897736841394743), ('num__cp', 0.06789046405551255), ('num__chol', 0.06014723258968327), ('num__thal', 0.05214064597499033)]


In [34]:
logreg = LogisticRegression(max_iter=2000)
rfe = RFE(logreg, n_features_to_select=10).fit(X_proc, y)
selected = [f for f, s in zip(feat_names, rfe.support_) if s]
print("RFE selected:", selected)

RFE selected: ['num__trestbps', 'cat__col_14_-9.0', 'cat__col_14_1.0', 'cat__col_19_0.0', 'cat__col_21_2116.0', 'cat__col_22_3.0', 'cat__col_24_-9.0', 'cat__col_24_1.0', 'cat__col_28_0.0', 'cat__col_28_1.0']
