In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

In [2]:
# Read the two source tables from CSV files located in the working directory.
# The file order below fixes which frame is bound to which name.
uci_clean, fram_clean = (
    pd.read_csv(csv_name) for csv_name in ("UCI_combined.csv", "framingham.csv")
)

In [3]:
# Replace missing values in every numeric UCI column with that column's mean.
# The frame is updated in place, one column at a time.
# NOTE(review): the column list is taken from the whole frame, so a numeric
# 'target' column would be mean-imputed as well — confirm the label has no NaNs.
numeric_cols_uci = uci_clean.select_dtypes(include="number").columns.tolist()
for name in numeric_cols_uci:
    column = uci_clean[name]
    uci_clean[name] = column.fillna(column.mean())

In [4]:
# Replace missing values in every numeric Framingham column with that column's
# mean. The frame is updated in place, one column at a time.
# NOTE(review): the column list is taken from the whole frame, so a numeric
# 'TenYearCHD' column would be mean-imputed as well — confirm the label has no NaNs.
numeric_cols_fram = fram_clean.select_dtypes(include="number").columns.tolist()
for name in numeric_cols_fram:
    column = fram_clean[name]
    fram_clean[name] = column.fillna(column.mean())

In [5]:
# Separate the UCI label column ('target') from the remaining columns,
# which are used as model inputs.
y_uci = uci_clean['target']
X_uci = uci_clean.drop('target', axis=1)

In [6]:
# Separate the Framingham label column ('TenYearCHD') from the remaining
# columns, which are used as model inputs.
y_fram = fram_clean['TenYearCHD']
X_fram = fram_clean.drop('TenYearCHD', axis=1)

In [7]:
# Restore the two persisted models (one per data set) from ../model/.
# joblib.load unpickles the file contents, so these paths must only ever point
# at model files from a trusted source.
rf_uci, rf_fram = map(
    joblib.load,
    ("../model/model_uci.pkl", "../model/model_fram.pkl"),
)

In [16]:
# Hold out 20% of each data set as a validation split, using a fixed seed so
# the partition is reproducible.
# NOTE(review): presumably these settings mirror the split used when the
# loaded models were trained; if not, validation rows may have been seen
# during training — confirm against the training notebook.
split_options = {"test_size": 0.2, "random_state": 42}

X_train_uci, X_val_uci, y_train_uci, y_val_uci = train_test_split(
    X_uci, y_uci, **split_options
)
X_train_fram, X_val_fram, y_train_fram, y_val_fram = train_test_split(
    X_fram, y_fram, **split_options
)

In [17]:
# Score each validation split with its own model and keep the second column
# of the predict_proba output.
# NOTE(review): column index 1 is presumably the positive class — verify
# against each model's classes_ attribute.
proba_uci_val = rf_uci.predict_proba(X_val_uci)
p_uci_val = proba_uci_val[:, 1]

proba_fram_val = rf_fram.predict_proba(X_val_fram)
p_fram_val = proba_fram_val[:, 1]

In [18]:
# Meta-level rows for the UCI validation split: the UCI model's score, a zero
# placeholder in the Framingham score column, and the true label.
# All three columns are plain arrays paired row by row; the frame keeps the
# validation labels' index, exactly as the Series-based construction did.
meta_uci = pd.DataFrame(
    {
        'p_uci': p_uci_val,
        'p_fram': np.zeros_like(p_uci_val),
        'y': y_val_uci.to_numpy(),
    },
    index=y_val_uci.index,
)

In [19]:
# Meta-level rows for the Framingham validation split: a zero placeholder in
# the UCI score column, the Framingham model's score, and the true label.
# All three columns are plain arrays paired row by row; the frame keeps the
# validation labels' index, exactly as the Series-based construction did.
meta_fram = pd.DataFrame(
    {
        'p_uci': np.zeros_like(p_fram_val),
        'p_fram': p_fram_val,
        'y': y_val_fram.to_numpy(),
    },
    index=y_val_fram.index,
)

In [20]:
# Stack the UCI rows on top of the Framingham rows and renumber the result
# 0..n-1 so the per-split index labels are discarded.
meta_dataset = pd.concat([meta_uci, meta_fram], ignore_index=True)

In [21]:
# Shuffle the stacked meta rows with a fixed seed and renumber them 0..n-1.
# The frame is rebuilt from meta_uci / meta_fram instead of from the previous
# value of meta_dataset: sampling is positional, so a fresh run produces the
# same order as before, but re-running this cell no longer reshuffles
# already-shuffled data (the cell is now idempotent).
meta_dataset = (
    pd.concat([meta_uci, meta_fram], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [23]:
# Persist the shuffled meta data set to the working directory; the row index
# is left out of the file.
meta_dataset.to_csv(path_or_buf="meta_dataset.csv", index=False)