# Heart disease data cleaning

This notebook demonstrates:

- the ability to extract dataset from a CSV
- the ability to inspect the data
- the ability to clean the data. Since this data was large, we simply dropped rows with incomplete data as the focus is on data exploration.

In [11]:
# import pandas and define path for heart failure data
import pandas as pd

heart_failure_csv_path = '../data/heart_disease_uci.csv'

In [12]:
# Define target column

target_column = 'num'

In [13]:
# load csv and split features and target 

try:
    #read the CSV file
    heart_failure_df = pd.read_csv(heart_failure_csv_path, sep=',')

    print(f"Successfully loaded CSV file: {heart_failure_csv_path}")
    print(f"Loaded CSV shape: {heart_failure_df.shape}")
    print(f"CSV columns found: {heart_failure_df.columns.tolist()}")

except FileNotFoundError:
    print(f"ERROR: CSV file not found at path: {heart_failure_csv_path}")
    print("Please double check the file path")
except Exception as e:
    print(f"An error occured loading or processing the CSV: {e}")
    print("Please check file path, format (is it truly CSV?), separator, header row, and column name.") 

Successfully loaded CSV file: ../data/heart_disease_uci.csv
Loaded CSV shape: (920, 16)
CSV columns found: ['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']


In [14]:
# initial data analysis using descriptive statistics

print(heart_failure_df.info())
print(heart_failure_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB
None
               id         age    trestbps        chol      thalch     oldpeak  \
count  920.000000  920.000000  861.000000  

In [15]:
# explore missing values for cleaning

def print_unique_values(df):
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    print("Numerical Columns")
    for col in numerical_cols:
        unique_vals = sorted(df[col].unique())
        print(f"\n{col} ({len(unique_vals)} unique values)")
        print(unique_vals)

    print("Categorical Columns")
    for col in categorical_cols:
        print(f"\n{col} - Value Counts")
        print(df[col].value_counts(dropna=False))

print_unique_values(heart_failure_df)

Numerical Columns

id (920 unique values)
[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int64(67), np.int64(68), np.int64(69), 

### Notes from analysis:

##### 1) trestbps and chol have 2 forms of missing values: nan and 0.0. It is physiologically impossible for a living patient to have a resting blood pressure or cholesterol level of 0.
##### 2) all other missing value types are nan
##### 3) presence of heart disease is labeled 0 (no presence) or 1-4 (presence), need to create simple binary output, 0 for no presence and 1 for presence

#### As the focus of this is visualization of data and statistical analysis, rows with data missing will just be dropped.

In [16]:
# drop rows with any missing values
rows_before = heart_failure_df.shape[0]
heart_failure_df.dropna(inplace=True)
heart_failure_df = heart_failure_df[(heart_failure_df['trestbps'] != 0.0) & (heart_failure_df['chol'] != 0.0)]
rows_after = heart_failure_df.shape[0]
print(f"Removed {rows_before - rows_after} rows with missing values.")
print(f"Dataframe shape is: {heart_failure_df.shape}")

Removed 621 rows with missing values.
Dataframe shape is: (299, 16)


In [17]:
# check that there are no missing values
print("Missing values after cleaning:")
print(heart_failure_df.isnull().sum())
# check for duplicates
duplicates = heart_failure_df.duplicated().sum()
if duplicates > 0:
    print(f"Found {duplicates} duplicate rows.")
    heart_failure_df.drop_duplicates(inplace=True)
    print(f"Removed {duplicates} duplicate rows.")
else:
    print("No duplicate rows found.")

Missing values after cleaning:
id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64
No duplicate rows found.


In [18]:
# check to see if the target column is binary
if heart_failure_df[target_column].nunique() > 2:
    print(f"Target column '{target_column}' is not binary. It has {heart_failure_df[target_column].nunique()} unique values.")
else:
    print(f"Target column '{target_column}' is binary. It has {heart_failure_df[target_column].nunique()} unique values.")

Target column 'num' is not binary. It has 5 unique values.


In [19]:
# Convert "num" column values to binary (0 or 1)
heart_failure_df['num'] = heart_failure_df['num'].apply(lambda x: 0 if x == 0 else 1)
print("Converted 'num' column to binary values.")
print(heart_failure_df['num'].value_counts())

Converted 'num' column to binary values.
num
0    160
1    139
Name: count, dtype: int64


In [20]:
# final check for cleaned dataset
print("Final cleaned dataset info:")
print(heart_failure_df.info())
print("Final cleaned dataset shape:")
print(heart_failure_df.shape)
print("heart_failure_df.describe():")
# save cleaned dataset to CSV
cleaned_heart_failure_csv_path = '../data/heart_disease_uci_cleaned.csv'
try:
    heart_failure_df.to_csv(cleaned_heart_failure_csv_path, index=False)
    print(f"Cleaned dataset saved to: {cleaned_heart_failure_csv_path}")
except Exception as e:
    print(f"An error occurred while saving the cleaned dataset: {e}")
    print("Please check file path and permissions.")

Final cleaned dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 299 entries, 0 to 748
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        299 non-null    int64  
 1   age       299 non-null    int64  
 2   sex       299 non-null    object 
 3   dataset   299 non-null    object 
 4   cp        299 non-null    object 
 5   trestbps  299 non-null    float64
 6   chol      299 non-null    float64
 7   fbs       299 non-null    object 
 8   restecg   299 non-null    object 
 9   thalch    299 non-null    float64
 10  exang     299 non-null    object 
 11  oldpeak   299 non-null    float64
 12  slope     299 non-null    object 
 13  ca        299 non-null    float64
 14  thal      299 non-null    object 
 15  num       299 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 39.7+ KB
None
Final cleaned dataset shape:
(299, 16)
heart_failure_df.describe():
Cleaned dataset saved to: ../data/