In [2]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub.
dataset_handle = "oktayrdeki/heart-disease"
path = kagglehub.dataset_download(dataset_handle)

# Report the local directory that now holds the dataset files.
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Francisco\.cache\kagglehub\datasets\oktayrdeki\heart-disease\versions\1


In [37]:
#df = pd.read_csv("./data/heart_disease.csv")

# Display first few rows
#print(df.head())


In [3]:
# All imports needed
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from enum import Enum

In [4]:
import pandas as pd
def load_data(file_path: str):
    """
    Read a CSV file from disk into a pandas DataFrame.

    :param file_path: Location of the CSV file, given as a string

    :return: DataFrame holding the parsed contents of the file

    :raises AssertionError: If file_path is not a string
    """
    assert isinstance(file_path, str), "File path must be a valid path"
    return pd.read_csv(file_path)

In [5]:
def get_data_info(data_frame):
    """
    Print the structure of the data frame and its per-column missing counts.

    Two reports are written to standard output: the ``DataFrame.info()``
    summary, followed by the number of missing entries in each column.

    :param data_frame: The data frame to get the structure of

    :return: None, the reports are printed rather than returned

    :raises AssertionError: If data_frame is not a DataFrame
    """
    assert(isinstance(data_frame, pd.DataFrame)), "The input must be DataFrame object"
    print("Summary of Dataset:")
    data_frame.info()
    print("Get missing count")
    # A bare expression is only echoed when it is the last line of a notebook
    # cell; inside a function the counts have to be printed explicitly.
    print(data_frame.isnull().sum())

In [6]:
def get_num_rows(data_frame: pd.DataFrame):
    """
    Count the rows of a data frame.

    :param data_frame: The data frame whose rows are counted

    :return: Number of rows as an integer

    :raises AssertionError: If data_frame is not a DataFrame
    """
    assert isinstance(data_frame, pd.DataFrame), "The input must be DataFrame object"
    row_count = len(data_frame.index)
    return row_count

In [7]:
def get_num_cols(data_frame: pd.DataFrame):
    """
    Count the columns of a data frame.

    :param data_frame: The data frame whose columns are counted

    :return: Number of columns as an integer

    :raises AssertionError: If data_frame is not a DataFrame
    """
    assert isinstance(data_frame, pd.DataFrame), "The input must be DataFrame object"
    column_count = len(data_frame.columns)
    return column_count

In [8]:
def classify_non_numerical_columns(data_frame):
    """
    Label-encode every non-numerical column of the data frame.

    Each non-numerical column is replaced by the integer codes produced by
    a fresh LabelEncoder fitted on that column. Numerical columns are left
    as they are. The encoding is done on a copy, so the frame passed in
    keeps its original labels.

    NOTE(review): missing entries are not imputed here; they are handed to
    LabelEncoder as-is. Run clean_data first if the frame may contain them.

    :param data_frame: The data frame to encode.

    :return: A new data frame with all non-numerical columns encoded.

    :raises AssertionError: If data_frame is not a DataFrame
    """
    assert(isinstance(data_frame, pd.DataFrame)), "The input must be DataFrame object"
    # Work on a copy: encoding discards the original labels, so overwriting
    # the caller's frame would be irreversible.
    encoded = data_frame.copy()
    non_numeric_cols = encoded.select_dtypes(exclude=['number']).columns
    for col in non_numeric_cols:
        # One encoder per column so the codes of different columns are independent.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [10]:
class ImputerMethod(Enum):
    """Strategies clean_data can apply to rows with missing entries."""
    # KNNImputer on numerical columns, most-frequent value on the others.
    KNN = "KNN"
    # SimpleImputer everywhere: mean on numerical columns, most-frequent on the others.
    SIMPLE = "Simple"
    # Remove every row that has at least one missing entry.
    DROP = "Drop"


def _impute_columns(data_frame, columns, imputer):
    """Fill `columns` of `data_frame` in place with `imputer`; skip an empty selection."""
    # The imputers reject input with zero features, so a frame that has no
    # columns of a given kind must not reach fit_transform.
    if len(columns) == 0:
        return
    data_frame[columns] = imputer.fit_transform(data_frame[columns])


def clean_data(data_frame: pd.DataFrame, method: ImputerMethod):
    """
    Clean the data up from any missing values (if any).

    Depending on ``method`` the rows with missing entries are dropped, or the
    missing entries are imputed. Imputation is done on a copy, so the frame
    passed in is never modified. When the frame has no missing entries it is
    returned unchanged.

    :param data_frame: The data frame to clean up

    :param method: The method of cleaning the data, of type ImputerMethod enum.
    DROP removes all rows with missing entries. KNN uses KNN Imputer on
    numerical columns and Simple Imputer (most frequent value) on
    non-numerical columns. SIMPLE uses Simple Imputer on every column: the
    mean for numerical columns and the most frequent value for the others.

    :return: A data frame without missing entries

    :raises AssertionError: If data_frame is not a DataFrame or method is
    not an ImputerMethod
    """
    assert(isinstance(data_frame, pd.DataFrame)), "The input must be DataFrame object"
    assert(isinstance(method, ImputerMethod)), "The input must be an imputer method either KNN, SIMPLE or DROP"

    # Nothing to clean: hand the frame back untouched.
    if not data_frame.isnull().values.any():
        return data_frame

    if method == ImputerMethod.DROP:
        return data_frame.dropna()

    cleaned = data_frame.copy()
    numerical_cols = cleaned.select_dtypes(include=['number']).columns
    non_numerical_cols = cleaned.select_dtypes(exclude=['number']).columns

    # KNNImputer only works on numerical data, so it is limited to those columns.
    if method == ImputerMethod.KNN:
        numerical_imputer = KNNImputer(n_neighbors=5)
    else:
        numerical_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    _impute_columns(cleaned, numerical_cols, numerical_imputer)

    # Non-numerical columns get the most frequent value under both imputing methods.
    mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    _impute_columns(cleaned, non_numerical_cols, mode_imputer)
    return cleaned

In [11]:
def main(file_path: str = "./data/heart_disease.csv"):
    """
    Main function to run the data pipeline on one CSV file.

    Loads the file, reports the row and column counts, drops the rows with
    missing entries, reports the counts again, and finally reports how many
    duplicate rows remain. Everything is printed; nothing is returned.

    :param file_path: Path of the CSV file to process. Defaults to the
    heart disease data set under ./data.
    """
    df = load_data(file_path)
    print(f"Num rows before cleaning: {get_num_rows(df)}")
    print(f"Num of cols before clearning: {get_num_cols(df)}")

    # Rows with any missing entry are dropped rather than imputed.
    df_cleaned = clean_data(df, ImputerMethod.DROP)
    print(f"Num of rows after cleaning: {get_num_rows(df_cleaned)}")
    print(f"Num of cols after clearning: {get_num_cols(df_cleaned)}")

    duplicate_counts = df_cleaned.duplicated().sum()
    print(f"Duplicate rows: {duplicate_counts}")
# Run the pipeline when this cell is executed.
main()

Num rows before cleaning: 10000
Num of cols before clearning: 21
Num of rows after cleaning: 7067
Num of cols after clearning: 21
Duplicate rows: 0
