In [1]:
import kagglehub

# Download the latest version of the "oktayrdeki/heart-disease" dataset via
# kagglehub; the returned value is the local path to the dataset files.
path = kagglehub.dataset_download("oktayrdeki/heart-disease")

# Report where the dataset files ended up on this machine.
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Francisco\.cache\kagglehub\datasets\oktayrdeki\heart-disease\versions\1


In [37]:
#df = pd.read_csv("./data/heart_disease.csv")

# Display first few rows
#print(df.head())


In [41]:
# All imports needed
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
import pandas as pd
def load_data(file_path: str):
    """
    Returns a DataFrame object of the csv file passed in.

    :param file_path: Location of the csv file to load, given either as a
        string or as a path-like object (e.g. ``pathlib.Path``)

    :return: A DataFrame object of the csv data

    :raises AssertionError: If ``file_path`` is neither a string nor path-like
    """
    # pd.read_csv accepts path-like objects as well as strings, so allow both
    # instead of rejecting pathlib.Path callers.
    assert isinstance(file_path, (str, os.PathLike)), "File path must be a valid path"
    return pd.read_csv(file_path)

In [3]:
def get_data_info(data_frame):
    """
    Print the structure of the data frame followed by the number of
    missing values in each column.

    :param data_frame: The data frame to get the structure of

    :raises AssertionError: If ``data_frame`` is not a DataFrame
    """
    assert isinstance(data_frame, pd.DataFrame), "The input must be DataFrame object"
    print("Summary of Dataset:")
    data_frame.info()
    print("Get missing count")
    # A bare expression inside a function is evaluated and discarded, so the
    # per-column missing counts must be printed explicitly to be visible.
    print(data_frame.isnull().sum())

In [4]:
def get_num_rows(data_frame):
    """
    Count the rows held by a data frame.

    :param data_frame: The data frame whose rows are counted

    :return: The number of rows in the data frame
    """
    assert isinstance(data_frame, pd.DataFrame), "The input must be DataFrame object"
    # shape is (rows, columns); only the row count is of interest here.
    row_count, _ = data_frame.shape
    return row_count

In [47]:
def classify_non_numerical_columns(data_frame):
    """
    Label-encode every non-numerical column of the data frame, replacing
    each distinct value with an integer code.

    The input frame is left untouched; the encoding is applied to a copy.

    NOTE(review): missing entries are handed to LabelEncoder unchanged; how
    they are encoded (or whether they are rejected) should be confirmed
    against the installed scikit-learn version.

    :param data_frame: The data frame.

    :return: A new data frame with all of the non-numerical columns encoded.

    :raises AssertionError: If ``data_frame`` is not a DataFrame
    """
    assert isinstance(data_frame, pd.DataFrame), "The input must be DataFrame object"
    # Work on a copy so the caller's frame keeps its original values, which
    # is what the "new data frame" contract promises.
    encoded_frame = data_frame.copy()
    non_numeric_cols = encoded_frame.select_dtypes(exclude=['number']).columns
    for col in non_numeric_cols:
        # A fresh encoder per column keeps the integer codes independent.
        label_encoder = LabelEncoder()
        encoded_frame[col] = label_encoder.fit_transform(encoded_frame[col])
    return encoded_frame

In [48]:
# Load the heart disease CSV, then print the frame returned after
# label-encoding its non-numerical columns.
df = load_data("./data/heart_disease.csv")
print(classify_non_numerical_columns(df))

       Age  Gender  Blood Pressure  Cholesterol Level  Exercise Habits  \
0     56.0       1           153.0              155.0                0   
1     69.0       0           146.0              286.0                0   
2     46.0       1           126.0              216.0                1   
3     32.0       0           122.0              293.0                0   
4     60.0       1           166.0              242.0                1   
...    ...     ...             ...                ...              ...   
9995  25.0       0           136.0              243.0                2   
9996  38.0       1           172.0              154.0                2   
9997  73.0       1           152.0              201.0                0   
9998  23.0       1           142.0              299.0                1   
9999  38.0       0           128.0              193.0                2   

      Smoking  Family Heart Disease  Diabetes        BMI  High Blood Pressure  \
0           1                 

In [38]:
def clean_data(data_frame):
    """
    Clean the data up from any missing values using KNN Imputer on
    numerical columns and Simple Imputer (most frequent value) on
    non-numerical columns.

    The input frame is not modified; imputation is applied to a copy.
    A column group that is empty (no numerical or no non-numerical
    columns) is skipped rather than passed to its imputer.

    :param data_frame: The data frame to clean up

    :return: The cleaned data frame

    :raises AssertionError: If ``data_frame`` is not a DataFrame
    """
    assert isinstance(data_frame, pd.DataFrame), "The input must be DataFrame object"

    # Copy first so the caller's raw frame stays available for comparison.
    cleaned_frame = data_frame.copy()

    # KNNImputer only works on numerical data
    # Apply to numerical columns with missing values
    numerical_cols = cleaned_frame.select_dtypes(include=['number']).columns
    if len(numerical_cols) > 0:
        knn_imputer = KNNImputer(n_neighbors=5)
        cleaned_frame[numerical_cols] = knn_imputer.fit_transform(cleaned_frame[numerical_cols])

    # Apply SimpleImputer for non-numerical columns
    non_numerical_cols = cleaned_frame.select_dtypes(exclude=['number']).columns
    if len(non_numerical_cols) > 0:
        mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        cleaned_frame[non_numerical_cols] = mode_imputer.fit_transform(cleaned_frame[non_numerical_cols])

    # TODO: Create a classifier pipeline for the non-numerical columns
    return cleaned_frame

In [39]:
def main(file_path="./data/heart_disease.csv"):
    """
    Main function to run the data pipeline: load the csv, clean it, and
    print the row counts before and after cleaning plus the number of
    duplicate rows in the cleaned frame.

    Visualization and prediction are not performed here.

    :param file_path: Path of the csv file to process; defaults to the
        heart disease dataset location used by this notebook
    """
    df = load_data(file_path)
    print(f"Before cleaning: {get_num_rows(df)}")

    df_cleaned = clean_data(df)
    print(f"After cleaning: {get_num_rows(df_cleaned)}")

    # duplicated() flags rows that repeat an earlier row; summing counts them.
    duplicate_counts = df_cleaned.duplicated().sum()
    print(f"Duplicate rows: {duplicate_counts}")
main()

Before cleaning: 10000
After cleaning: 10000
Duplicate rows: 0
