In [15]:
import pandas as pd
import numpy as np

# --- 1. Load the Dataset ---

In [16]:
# Load the contents of the CSV file into a DataFrame.
heart_csv_path = "./Health_Data/heart.csv"
df = pd.read_csv(heart_csv_path)
# A bare trailing expression makes the notebook render the frame.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [17]:
# Display the column labels of the loaded frame.
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [18]:
# Columns the dataset is expected to contain; the verification step compares
# the loaded frame's columns against this list.
EXPECTED_COLUMNS = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'target',
]

# --- 2. Verify Columns ---

In [19]:
# --- 2. Verify Columns ---
# Gather every expected column the loaded frame lacks, in the order that
# EXPECTED_COLUMNS lists them.
missing_cols = [name for name in EXPECTED_COLUMNS if name not in df.columns]
if not missing_cols:
    print("\nAll expected columns are present.")
else:
    # Only a warning is emitted: nothing is stopped or repaired here, so the
    # following cells run against whatever columns are actually available.
    print(f"\nWarning: The following expected columns are missing from the dataset: {missing_cols}")


All expected columns are present.


# --- 3. Handle Missing Values ---

In [20]:
print("\n--- Handling Missing Values ---")

# Textual placeholders such as '?' are not NaN, so isnull() cannot see them.
# Normalize them BEFORE counting; otherwise a dataset whose only gaps are
# placeholders is reported as complete and none of the imputation below runs.
# (Extend the list if the data uses other missing-value markers.)
df = df.replace(['?', 'N/A'], np.nan)

print("Checking for missing values (NaNs):")
missing_values_summary = df.isnull().sum()
print(missing_values_summary[missing_values_summary > 0])

if missing_values_summary.sum() == 0:
    print("No missing values found in the dataset.")
else:
    print("\nMissing values detected. Here are some common strategies:")
    # Strategy 1 (not applied): drop rows with any missing values.
    # Use with caution, it can discard a lot of data.
    # df_cleaned = df.dropna()
    # print(f"Shape after dropping rows with NaNs: {df_cleaned.shape}")

    # Strategy 2: impute continuous measurements with the column median.
    numerical_cols_to_impute = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    for col in numerical_cols_to_impute:
        if col in df.columns and df[col].isnull().any():
            # A column that contained placeholders holds text; make it numeric
            # so that a median is defined. Unparseable entries become NaN and
            # are imputed together with the other gaps.
            df[col] = pd.to_numeric(df[col], errors='coerce')
            median_val = df[col].median()
            # Assign the result back rather than calling fillna(inplace=True)
            # on df[col]: that form is chained assignment and does not update
            # df when pandas Copy-on-Write is active.
            df[col] = df[col].fillna(median_val)
            print(f"Filled missing values in '{col}' with its median ({median_val}).")

    # Strategy 3: impute categorical / coded columns with the most frequent value.
    categorical_cols_to_impute = ['cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'sex']
    for col in categorical_cols_to_impute:
        if col in df.columns and df[col].isnull().any():
            # Adopt the numeric conversion only when it loses nothing. Coercing
            # a genuinely textual column would turn every label into NaN and
            # destroy the data before a mode could be computed.
            numeric_candidate = pd.to_numeric(df[col], errors='coerce')
            converted_to_numeric = numeric_candidate.isnull().sum() == df[col].isnull().sum()
            if converted_to_numeric:
                df[col] = numeric_candidate

            modes = df[col].mode()  # NaN is ignored; empty when the column has no values
            if modes.empty:
                print(f"Column '{col}' has no non-missing values; left unchanged.")
                continue

            mode_val = modes.iloc[0]  # ties: mode() returns them sorted, take the first
            df[col] = df[col].fillna(mode_val)
            if converted_to_numeric:
                print(f"Filled missing values in '{col}' with its mode ({mode_val}).")
            else:
                print(f"Filled missing string values in '{col}' with its mode ({mode_val}).")


print("\nMissing values after imputation:")
print(df.isnull().sum())


--- Handling Missing Values ---
Checking for missing values (NaNs):
Series([], dtype: int64)
No missing values found in the dataset.

Missing values after imputation:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


# --- 4. Handle Duplicate Rows ---

In [21]:
print("\n--- Handling Duplicate Rows ---")
# Remember the row count up front so the number of dropped rows can be reported.
initial_rows = len(df)
# Rows that are identical in every column are collapsed to their first occurrence.
df.drop_duplicates(inplace=True)
duplicates_removed = initial_rows - len(df)
print(
    f"Removed {duplicates_removed} duplicate rows."
    if duplicates_removed > 0
    else "No duplicate rows found."
)
print(f"Shape after removing duplicates: {df.shape}")


--- Handling Duplicate Rows ---
Removed 723 duplicate rows.
Shape after removing duplicates: (302, 14)


# --- 5. Check and Correct Data Types ---

In [22]:
print("\n--- Checking Data Types ---")
print("Current data types:")
print(df.dtypes)

# Target dtype per column ('int' -> nullable Int64, 'float' -> float64).
# Adjust these based on your understanding of the dataset.
expected_data_types = {
    'age': 'int',
    'sex': 'int', # Assuming 0/1 for male/female
    'cp': 'int', # Chest Pain Type (ordinal/categorical)
    'trestbps': 'int', # Resting Blood Pressure
    'chol': 'int', # Serum Cholesterol
    'fbs': 'int', # Fasting Blood Sugar (> 120 mg/dl)
    'restecg': 'int', # Resting Electrocardiographic Results
    'thalach': 'int', # Maximum Heart Rate Achieved
    'exang': 'int', # Exercise Induced Angina
    'oldpeak': 'float', # ST depression induced by exercise relative to rest
    'slope': 'int', # The slope of the peak exercise ST segment
    'ca': 'int', # Number of major vessels (0-3) colored by fluoroscopy
    # Thalassemia code. Author's legend: 3 = normal; 6 = fixed defect;
    # 7 = reversible defect. NOTE(review): rows displayed in this notebook
    # contain the value 2, so confirm which coding this CSV actually uses.
    'thal': 'int',
    'target': 'int' # Target variable (0 = no disease, 1 = disease)
}

for col, dtype in expected_data_types.items():
    if col in df.columns:
        try:
            # errors='coerce' turns unparseable values into NaN instead of
            # raising, which matters if non-numeric values slipped through.
            if dtype == 'int':
                # Int64 (capital I) is the nullable integer dtype, so coerced
                # NaNs can coexist with integers. astype raises if a value has
                # a fractional part; that is reported by the except below.
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
            elif dtype == 'float':
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('float')
            # Add more type conversions if needed (e.g., 'category', 'bool')
            print(f"Converted '{col}' to {dtype}.")
        except Exception as e:
            # Best effort: report the failure and leave the column as it was.
            print(f"Could not convert '{col}' to {dtype}: {e}")

# Re-check for NaNs introduced by type coercion (computed once, reused for the filter)
print("\nMissing values after type coercion (if any):")
nulls_after_coercion = df.isnull().sum()
print(nulls_after_coercion[nulls_after_coercion > 0])

# Impute any NaNs the coercion introduced: median for numeric columns, mode otherwise.
# NOTE(review): a median is a crude fill for coded categorical columns and for
# 'target'; consider mode imputation or dropping those rows instead.
for col in df.columns:
    if df[col].isnull().any():
        if pd.api.types.is_numeric_dtype(df[col]):
            median_val = df[col].median()
            if pd.isna(median_val):
                print(f"Column '{col}' has no non-missing values; left unchanged.")
                continue
            if pd.api.types.is_integer_dtype(df[col]):
                # The median of an even number of integers can end in .5,
                # which an Int64 column cannot store; round it to an integer.
                median_val = int(round(median_val))
            # Assign back instead of fillna(inplace=True) on df[col]: that is
            # chained assignment and does not update df under Copy-on-Write.
            df[col] = df[col].fillna(median_val)
            print(f"Filled new NaNs in '{col}' with median ({median_val}) after coercion.")
        else:
            modes = df[col].mode()  # empty when the column has no values at all
            if modes.empty:
                print(f"Column '{col}' has no non-missing values; left unchanged.")
                continue
            mode_val = modes.iloc[0]
            df[col] = df[col].fillna(mode_val)
            print(f"Filled new NaNs in '{col}' with mode ({mode_val}) after coercion.")


print("\nData types after correction:")
print(df.dtypes)


--- Checking Data Types ---
Current data types:
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object
Converted 'age' to int.
Converted 'sex' to int.
Converted 'cp' to int.
Converted 'trestbps' to int.
Converted 'chol' to int.
Converted 'fbs' to int.
Converted 'restecg' to int.
Converted 'thalach' to int.
Converted 'exang' to int.
Converted 'oldpeak' to float.
Converted 'slope' to int.
Converted 'ca' to int.
Converted 'thal' to int.
Converted 'target' to int.

Missing values after type coercion (if any):
Series([], dtype: int64)

Data types after correction:
age           Int64
sex           Int64
cp            Int64
trestbps      Int64
chol          Int64
fbs           Int64
restecg       Int64
thalach       Int64
exang         Int64
oldpeak     fl

# --- 6. Outlier Detection (Basic Example) ---

In [23]:
print("\n--- Basic Outlier Detection ---")
# IQR (interquartile range) rule: a value counts as a potential outlier when it
# lies more than 1.5 * IQR below the first quartile or above the third quartile.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

for col in numerical_cols:
    if col not in df.columns:
        continue

    values = df[col]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows strictly outside the [lower_bound, upper_bound] interval.
    outside_fences = values.lt(lower_bound) | values.gt(upper_bound)
    outliers = df[outside_fences]

    if outliers.empty:
        print(f"No significant outliers detected in '{col}' using IQR method.")
        continue

    # Detection only: the frame is not modified. Possible follow-ups are
    #   1. capping values at the bounds, e.g.
    #      df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    #      df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
    #   2. removing the rows (use with caution), e.g.
    #      df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    # Use print(outliers[[col]]) to inspect the flagged rows.
    print(f"Potential outliers detected in '{col}': {len(outliers)} rows.")


--- Basic Outlier Detection ---
No significant outliers detected in 'age' using IQR method.
Potential outliers detected in 'trestbps': 9 rows.
Potential outliers detected in 'chol': 5 rows.
Potential outliers detected in 'thalach': 1 rows.
Potential outliers detected in 'oldpeak': 5 rows.


# --- 7. Categorical Data Encoding (one-hot encoding of `cp`) ---

In [24]:
print("\n--- Categorical Data Encoding (Conceptual) ---")
# The columns 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target'
# are likely already encoded as numerical (0/1 or other integer categories).
# If they were strings (e.g., 'Male', 'Female'), you would need to encode them.

# Example if 'sex' was 'Male'/'Female': runs only when the column holds strings.
if 'sex' in df.columns and df['sex'].dtype == 'object':
    sex_codes = df['sex'].map({'Female': 0, 'Male': 1})
    # .map() silently yields NaN for any label missing from the dict; surface
    # that instead of letting new gaps appear unnoticed.
    unmapped = sex_codes.isnull() & df['sex'].notnull()
    if unmapped.any():
        print(f"Warning: {int(unmapped.sum())} 'sex' value(s) were not 'Female'/'Male' and became NaN.")
    df['sex'] = sex_codes
    print("Encoded 'sex' column.")

# One-hot encode the multi-category nominal feature 'cp' into cp_<code> columns.
# The guard keeps the cell re-runnable: after the first run 'cp' no longer
# exists and get_dummies(columns=['cp']) would raise a KeyError.
if 'cp' in df.columns:
    # Pin the dummy dtype so the indicators are 0/1 integers on every pandas
    # version (the default changed from uint8 to bool in pandas 2.0, which
    # would otherwise write True/False into the saved CSV).
    df = pd.get_dummies(df, columns=['cp'], prefix='cp', dtype='uint8')
    print("Applied One-Hot Encoding to 'cp'.")
else:
    print("Skipped One-Hot Encoding: 'cp' column not found (already encoded?).")

print("Assuming categorical columns are already numerically encoded based on typical public health datasets.")
print("If any categorical columns were strings, One-Hot Encoding or Label Encoding would be applied here.")



--- Categorical Data Encoding (Conceptual) ---
Applied One-Hot Encoding to 'cp'.
Assuming categorical columns are already numerically encoded based on typical public health datasets.
If any categorical columns were strings, One-Hot Encoding or Label Encoding would be applied here.


# --- Final Summary ---

In [25]:
# --- Final Summary ---
# Print the shape of the cleaned frame, then three headed reports:
# dtypes, per-column missing-value counts, and the first rows.
print("\n--- Data Cleaning Complete ---")
print(f"Final dataset shape: {df.shape}")
for heading, report in (
    ("\nFinal data types:", df.dtypes),
    ("\nFinal missing values check:", df.isnull().sum()),
    ("\nCleaned data head:", df.head()),
):
    print(heading)
    print(report)


--- Data Cleaning Complete ---
Final dataset shape: (302, 17)

Final data types:
age           Int64
sex           Int64
trestbps      Int64
chol          Int64
fbs           Int64
restecg       Int64
thalach       Int64
exang         Int64
oldpeak     float64
slope         Int64
ca            Int64
thal          Int64
target        Int64
cp_0          uint8
cp_1          uint8
cp_2          uint8
cp_3          uint8
dtype: object

Final missing values check:
age         0
sex         0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
cp_0        0
cp_1        0
cp_2        0
cp_3        0
dtype: int64

Cleaned data head:
   age  sex  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  \
0   52    1       125   212    0        1      168      0      1.0      2   2   
1   53    1       140   203    1        0      155      1      3.1      0   0   
2   70    1       145

In [26]:
# Render the cleaned frame; the row labels are no longer consecutive because
# duplicate rows were dropped without resetting the index.
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3
0,52,1,125,212,0,1,168,0,1.0,2,2,3,0,1,0,0,0
1,53,1,140,203,1,0,155,1,3.1,0,0,3,0,1,0,0,0
2,70,1,145,174,0,1,125,1,2.6,0,0,3,0,1,0,0,0
3,61,1,148,203,0,1,161,0,0.0,2,1,3,0,1,0,0,0
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,68,0,120,211,0,0,115,0,1.5,1,0,2,1,0,0,1,0
733,44,0,108,141,0,1,175,0,0.6,1,0,2,1,0,0,1,0
739,52,1,128,255,0,1,161,1,0.0,2,1,3,0,1,0,0,0
843,59,1,160,273,0,0,125,0,0.0,2,0,2,0,0,0,0,1


In [27]:
# Display the column labels after cleaning ('cp' has been replaced by cp_0..cp_3).
df.columns

Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'ca', 'thal', 'target', 'cp_0', 'cp_1', 'cp_2',
       'cp_3'],
      dtype='object')

In [28]:
# Optionally persist the cleaned frame; index=False keeps the row labels out of the file.
cleaned_csv_path = './Health_Data/cleaned_health.csv'
df.to_csv(cleaned_csv_path, index=False)