# Data Exploration & Cleaning Notebook

**Project**: Heart Disease Prediction

**Date**: 2026-01-20

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [21]:
# Load the datasets
data_path = Path('./data')
files = data_path.glob('*.data')

# Defining the column names 
cols = ["Age", "Sex", "Chest Pain", "Rest BP", "Chol", "FBS", "Rest ECG", "Max HR", "Ex Angina", "Oldpeak", "Slope", "Ca", "Thal", "CVD Class"]

dfs = [pd.read_csv(f, sep=",") for f in files]

# Assign the same column name to each dataframe
for df in dfs:
    df.columns = cols

df = pd.concat(dfs, ignore_index=True)


In [22]:
df.head()

Unnamed: 0,Age,Sex,Chest Pain,Rest BP,Chol,FBS,Rest ECG,Max HR,Ex Angina,Oldpeak,Slope,Ca,Thal,CVD Class
0,29.0,1.0,2.0,120,243,0,0,160,0,0.0,?,?,?,0
1,29.0,1.0,2.0,140,?,0,0,170,0,0.0,?,?,?,0
2,30.0,0.0,1.0,170,237,0,1,170,0,0.0,?,?,6,0
3,31.0,0.0,2.0,100,219,0,1,150,0,0.0,?,?,?,0
4,32.0,0.0,2.0,105,198,0,0,165,0,0.0,?,?,?,0


In [23]:
print(f"Dataset shape: {df.shape}")
print(f"Rows: {df.shape[0]:,}, Columns: {df.shape[1]}")

Dataset shape: (916, 14)
Rows: 916, Columns: 14


In [26]:
# Data types
print("\n=== Data Types ===")
print(df.dtypes.value_counts())
print("\nDetailed data types:")
print(df.dtypes)

# Basic info
print("\n=== Dataset Info ===")
df.info()


=== Data Types ===
object     10
float64     3
int64       1
Name: count, dtype: int64

Detailed data types:
Age           float64
Sex           float64
Chest Pain    float64
Rest BP        object
Chol           object
FBS            object
Rest ECG       object
Max HR         object
Ex Angina      object
Oldpeak        object
Slope          object
Ca             object
Thal           object
CVD Class       int64
dtype: object

=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         916 non-null    float64
 1   Sex         916 non-null    float64
 2   Chest Pain  916 non-null    float64
 3   Rest BP     916 non-null    object 
 4   Chol        916 non-null    object 
 5   FBS         916 non-null    object 
 6   Rest ECG    916 non-null    object 
 7   Max HR      916 non-null    object 
 8   Ex Angina   916 non-null  

In [30]:
# Replacing all the question marks with NaN values
df = df.replace("?", np.nan)

missing_data = pd.DataFrame({
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df) * 100).round(2)
})

missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)

missing_data

Unnamed: 0,Missing_Count,Missing_Percentage
Ca,608,66.38
Thal,483,52.73
Slope,308,33.62
FBS,89,9.72
Oldpeak,62,6.77
Rest BP,59,6.44
Ex Angina,55,6.0
Max HR,55,6.0
Chol,30,3.28
Rest ECG,2,0.22
