Load the raw UCI Cleveland heart-disease data (processed.cleveland.data), clean it, and save it as a CSV file (heart_disease.csv).

In [7]:
import pandas as pd

# File locations. These are absolute paths on the author's machine; edit the
# two constants below to run the notebook elsewhere.
RAW_DATA_PATH = r"C:\Users\pc\Documents\Basel BME\Programming\Python\SPRINTS Heart Disease Project\processed.cleveland.data"
HEART_CSV_PATH = r"C:\Users\pc\Documents\Basel BME\Programming\Python\SPRINTS Heart Disease Project\python\data\heart_disease.csv"

# The raw file has no header row, so the 14 column names are supplied here.
COLUMN_NAMES = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# Read with the default comma separator. Missing entries are written as "?" in
# the raw file; declaring that at parse time turns them into NaN in *every*
# column, so the columns are parsed as numbers instead of strings.
df = pd.read_csv(RAW_DATA_PATH, header=None, na_values="?")

# Assigning the names after reading raises a ValueError if the file does not
# have exactly len(COLUMN_NAMES) columns, instead of silently mis-labelling.
df.columns = COLUMN_NAMES

# "ca" and "thal" are the columns that needed explicit conversion; keep
# coercing any other non-numeric token in them to NaN so such rows are dropped.
df[["ca", "thal"]] = df[["ca", "thal"]].apply(pd.to_numeric, errors="coerce")

# Drop every row that has a missing value, then binarize the target
# (0 = no disease, anything greater than 0 = disease).
df = df.dropna().assign(target=lambda frame: (frame["target"] > 0).astype(int))

# Persist the cleaned table without the index column.
df.to_csv(HEART_CSV_PATH, index=False)


print(df.shape)
print(df.head())


(297, 14)
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  
1    2.0  3.0   3.0       1  
2    2.0  2.0   7.0       1  
3    3.0  0.0   3.0       0  
4    1.0  0.0   3.0       0  


In [11]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# File locations. These are absolute paths on the author's machine; edit the
# two constants below to run the notebook elsewhere.
HEART_CSV_PATH = r'C:\Users\pc\Documents\Basel BME\Programming\Python\SPRINTS Heart Disease Project\python\data\heart_disease.csv'
CLEAN_CSV_PATH = r'C:\Users\pc\Documents\Basel BME\Programming\Python\SPRINTS Heart Disease Project\python\data\heart_disease_clean.csv'

# Columns that hold numeric codes but are treated as categories (one-hot encoded).
CODED_CATEGORICAL_COLS = ['cp', 'restecg', 'slope', 'thal', 'ca']

# Load
df = pd.read_csv(HEART_CSV_PATH)

# If target is 0..4 (some UCI variations), binarize: 0 -> 0, 1-4 -> 1
if df['target'].nunique() > 2:
    df['target'] = (df['target'] > 0).astype(int)

# Initial column typing by dtype. The target is excluded by name so that it is
# never treated as a feature, whatever its dtype is.
numeric_cols = [
    c for c in df.select_dtypes(include=['int64', 'float64']).columns
    if c != 'target'
]
cat_cols = [
    c for c in df.select_dtypes(include=['object', 'category']).columns
    if c != 'target'
]

# Move the numeric-coded categorical columns from the numeric list to the
# categorical list and convert their values to strings for one-hot encoding.
for c in CODED_CATEGORICAL_COLS:
    if c in df.columns:
        # astype(str) alone would turn NaN into the literal string "nan", which
        # the categorical imputer would then treat as a real category instead
        # of a missing value. Restore NaN wherever the original value was missing.
        # Non-missing values are stringified as-is (a float 1.0 becomes "1.0").
        df[c] = df[c].astype(str).where(df[c].notna())
        if c not in cat_cols:
            cat_cols.append(c)
        if c in numeric_cols:
            numeric_cols.remove(c)

# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen at fit time are ignored.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
])

# Split features from the label.
X = df.drop('target', axis=1)
y = df['target']

# NOTE(review): the preprocessor is fit on all rows here. If the saved CSV is
# later split into train/test sets, the imputation and scaling statistics will
# already include the test rows -- confirm this is acceptable for the analysis.
X_transformed = preprocessor.fit_transform(X)

# Output columns follow the ColumnTransformer order: numeric columns first,
# then the one-hot columns generated from the categorical ones.
onehot_encoder = preprocessor.named_transformers_['cat']['onehot']
feature_names = numeric_cols + list(onehot_encoder.get_feature_names_out(cat_cols))
X_clean = pd.DataFrame(X_transformed, columns=feature_names)

# Save cleaned dataset. X_clean has a fresh 0..n-1 index, so the label index is
# reset to match before concatenating column-wise.
cleaned = pd.concat([X_clean, y.reset_index(drop=True)], axis=1)
cleaned.to_csv(CLEAN_CSV_PATH, index=False)
print("Saved cleaned CSV.")


Saved cleaned CSV.
