In [2]:
import joblib, os, glob, pandas as pd, numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [3]:
DEFAULT_COLS = [
    'age','sex','cp','trestbps','chol','fbs','restecg',
    'thalach','exang','oldpeak','slope','ca','thal','num'
]

In [4]:
def _try_read_csv(path):
    for sep in [',', r'\s+', ';', '\t']:
        try:
            df = pd.read_csv(path, header=None, sep=sep, engine='python', na_values=['?',''])
            df = df.dropna(how='all')
            if df.shape[1] >= 5:
                return df
        except Exception:
            continue
    with open(path, 'r', errors='ignore') as f:
        lines = [l.strip() for l in f if l.strip()]
    rows = [l.split() for l in lines]
    maxcols = max(len(r) for r in rows)
    rows = [r + [np.nan]*(maxcols-len(r)) for r in rows]
    return pd.DataFrame(rows)

In [5]:
def find_data_files(data_dir='data'):
    patterns = [os.path.join(data_dir, 'processed.*.data'),
                os.path.join(data_dir, '*.data'),
                os.path.join(data_dir, '*.csv')]
    files = []
    for p in patterns:
        files.extend(glob.glob(p))
    return sorted(list(dict.fromkeys(files)))

In [6]:
def load_and_combine_data(data_dir='data'):
    if not os.path.exists(data_dir):
        alt_path = os.path.join("..", data_dir)
        if os.path.exists(alt_path):
            data_dir = alt_path

    files = find_data_files(data_dir)
    if not files:
        raise FileNotFoundError(f"No data files found in {data_dir}")

    parts = [_try_read_csv(f) for f in files]
    df = pd.concat(parts, ignore_index=True, sort=False)

    if df.shape[1] == len(DEFAULT_COLS):
        df.columns = DEFAULT_COLS
    else:
        n = min(len(DEFAULT_COLS), df.shape[1])
        df.columns = DEFAULT_COLS[:n] + [f'col_{i}' for i in range(n, df.shape[1])]

    df.replace(['?', '\x00', '\x00\x00', 'NA', 'na', 'null'], np.nan, inplace=True)
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    if 'target' not in df.columns:
        if 'num' in df.columns:
            df['target'] = df['num'].apply(lambda x: 1 if (pd.notna(x) and x > 0) else 0)
        else:
            raise ValueError("No 'num' or 'target' column found.")

    id_cols = [c for c in df.columns if 'id' in c.lower()]
    feature_cols = [c for c in df.columns if c not in (id_cols + ['target','num'])]

    return {
        'original': df.copy(),
        'features': df[feature_cols].copy(),
        'targets': df[['target']].copy(),
        'ids': df[id_cols].copy() if id_cols else pd.DataFrame(index=df.index),
        'headers': list(df.columns)
    }

In [7]:
dataset = load_and_combine_data("data")
X = dataset["features"]
y = dataset["targets"]["target"]

In [8]:
preproc = joblib.load("models/preprocessor.pkl")

In [9]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVC": SVC(probability=True, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42)
}

In [11]:
for name, clf in models.items():
    pipe = Pipeline([("preproc", preproc), ("clf", clf)])
    pipe.fit(X, y)
    y_pred = pipe.predict(X)
    y_proba = pipe.predict_proba(X)[:,1] if hasattr(pipe, "predict_proba") else None
    auc = roc_auc_score(y, y_proba) if y_proba is not None else None
    print(f"\n{name}")
    print(classification_report(y, y_pred))
    print("AUC:", auc)


LogisticRegression
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     84686
           1       0.88      0.40      0.55      1890

    accuracy                           0.99     86576
   macro avg       0.93      0.70      0.77     86576
weighted avg       0.98      0.99      0.98     86576

AUC: 0.9805452498223441

RandomForest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84686
           1       1.00      1.00      1.00      1890

    accuracy                           1.00     86576
   macro avg       1.00      1.00      1.00     86576
weighted avg       1.00      1.00      1.00     86576

AUC: 1.0

SVC
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     84686
           1       1.00      0.34      0.51      1890

    accuracy                           0.99     86576
   macro avg       0.99      0.67      0.75     86576
we

In [12]:
models = {
    "SVC": SVC(probability=True, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42)
}

In [13]:
for name, clf in models.items():
    pipe = Pipeline([("preproc", preproc), ("clf", clf)])
    pipe.fit(X, y)
    y_pred = pipe.predict(X)
    y_proba = pipe.predict_proba(X)[:,1] if hasattr(pipe, "predict_proba") else None
    auc = roc_auc_score(y, y_proba) if y_proba is not None else None
    print(f"\n{name}")
    print(classification_report(y, y_pred))
    print("AUC:", auc)


SVC
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     84686
           1       1.00      0.34      0.51      1890

    accuracy                           0.99     86576
   macro avg       0.99      0.67      0.75     86576
weighted avg       0.99      0.99      0.98     86576

AUC: 0.9748705676131697

DecisionTree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84686
           1       1.00      1.00      1.00      1890

    accuracy                           1.00     86576
   macro avg       1.00      1.00      1.00     86576
weighted avg       1.00      1.00      1.00     86576

AUC: 1.0
