In [1]:
# Ignoring Warnings
import warnings
warnings.simplefilter("ignore")

In [2]:
import pandas as pd

# Load the combined CSV; cells holding "?" are parsed as missing values (NaN)
merged_file_path = "../data/heart_disease_combined.csv"
df_merged = pd.read_csv(
    filepath_or_buffer=merged_file_path,
    na_values=["?"],
)
# Preview the first five rows
df_merged.head(n=5)

Unnamed: 0,Age,Sex,Chest Pain,Rest BP,Cholesterol,Fasting Sugar,Rest ECG,Max HR,Ex Angina,ST Depression,ST Slope,Fluoroscopy,Thalassemia,Heart Disease
0,28.0,1.0,2.0,130.0,132.0,0.0,2.0,185.0,0.0,0.0,,,,0
1,29.0,1.0,2.0,120.0,243.0,0.0,0.0,160.0,0.0,0.0,,,,0
2,29.0,1.0,2.0,140.0,,0.0,0.0,170.0,0.0,0.0,,,,0
3,30.0,0.0,1.0,170.0,237.0,0.0,1.0,170.0,0.0,0.0,,,6.0,0
4,31.0,0.0,2.0,100.0,219.0,0.0,1.0,150.0,0.0,0.0,,,,0


In [3]:
# Print each column's dtype and non-null count to see where values are missing
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            720 non-null    float64
 1   Sex            720 non-null    float64
 2   Chest Pain     720 non-null    float64
 3   Rest BP        717 non-null    float64
 4   Cholesterol    697 non-null    float64
 5   Fasting Sugar  637 non-null    float64
 6   Rest ECG       718 non-null    float64
 7   Max HR         718 non-null    float64
 8   Ex Angina      718 non-null    float64
 9   ST Depression  714 non-null    float64
 10  ST Slope       513 non-null    float64
 11  Fluoroscopy    307 non-null    float64
 12  Thalassemia    400 non-null    float64
 13  Heart Disease  720 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 78.9 KB


In [4]:
# Flag every row that repeats an earlier row exactly, then count the flags
is_duplicate = df_merged.duplicated(keep="first")
duplicate_rows = is_duplicate.sum()
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 1


In [5]:
# Dropping duplicate rows (the first occurrence of each row is kept).
# The index is rebuilt afterwards: drop_duplicates() alone leaves a gap where the
# removed row was, so label-based (.loc) and positional (.iloc) access would
# no longer agree in later cells.
df_merged = df_merged.drop_duplicates().reset_index(drop=True)

In [6]:
# Count the NaN entries in every column
missing_values = df_merged.isna().sum()

# Express each count as a percentage of the total number of rows
total_rows = len(df_merged)
missing_percentage = missing_values.div(total_rows).mul(100)

# Assemble the summary table, ordered from most to fewest missing values
missing_values_table = pd.DataFrame(
    {
        'Column Name': df_merged.columns,
        'Missing Values': missing_values,
        'Percentage of Missing Values': missing_percentage,
    }
).sort_values(by='Missing Values', ascending=False)

print("Missing Values Table:")
missing_values_table

Missing Values Table:


Unnamed: 0,Column Name,Missing Values,Percentage of Missing Values
Fluoroscopy,Fluoroscopy,412,57.301808
Thalassemia,Thalassemia,319,44.367177
ST Slope,ST Slope,206,28.650904
Fasting Sugar,Fasting Sugar,83,11.543811
Cholesterol,Cholesterol,22,3.059805
ST Depression,ST Depression,6,0.834492
Rest BP,Rest BP,3,0.417246
Rest ECG,Rest ECG,2,0.278164
Max HR,Max HR,2,0.278164
Ex Angina,Ex Angina,2,0.278164


In [7]:
# Columns to impute, split by how the replacement value is chosen:
# numeric columns get a mean, categorical (coded) columns get a mode.
numerical_columns = ['Rest BP', 'Cholesterol', 'Max HR', 'ST Depression']
categorical_columns = ['Fluoroscopy', 'Thalassemia', 'ST Slope', 'Fasting Sugar', 'Rest ECG', 'Ex Angina']


def _fill_with_group_mean(values: pd.Series) -> pd.Series:
    """Return ``values`` with NaNs replaced by the mean of its observed entries.

    If every entry is NaN the mean is NaN as well, so the group comes back unchanged.
    """
    return values.fillna(values.mean())


def _fill_with_group_mode(values: pd.Series) -> pd.Series:
    """Return ``values`` with NaNs replaced by its most frequent observed entry.

    A group with no observed entries has no mode and is returned unchanged.
    """
    modes = values.mode()  # computed once per group
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# Grouping the data
# NOTE(review): 'Heart Disease' looks like the prediction target; using it as an
# imputation key would leak label information into the features - confirm this is intended.
grouped = df_merged.groupby(['Sex', 'Heart Disease', 'Chest Pain'])

# Filling with mean for Numeric
for column in numerical_columns:
    # Column-wide mean of the observed values, taken before any imputation.
    overall_mean = df_merged[column].mean()
    group_filled = grouped[column].transform(_fill_with_group_mean)
    # Fallback: a group with no observed value for this column is still NaN here.
    df_merged[column] = group_filled.fillna(overall_mean)

# Filling with mode for Categorical
for column in categorical_columns:
    # Column-wide mode of the observed values, taken before any imputation.
    overall_modes = df_merged[column].mode()
    group_filled = grouped[column].transform(_fill_with_group_mode)
    # Fallback for groups without any observed value; skipped if the whole column is empty.
    if not overall_modes.empty:
        group_filled = group_filled.fillna(overall_modes.iloc[0])
    df_merged[column] = group_filled

print("Data after filling missing values:")
df_merged.head()

Data after filling missing values:


Unnamed: 0,Age,Sex,Chest Pain,Rest BP,Cholesterol,Fasting Sugar,Rest ECG,Max HR,Ex Angina,ST Depression,ST Slope,Fluoroscopy,Thalassemia,Heart Disease
0,28.0,1.0,2.0,130.0,132.0,0.0,2.0,185.0,0.0,0.0,1.0,0.0,3.0,0
1,29.0,1.0,2.0,120.0,243.0,0.0,0.0,160.0,0.0,0.0,1.0,0.0,3.0,0
2,29.0,1.0,2.0,140.0,230.753086,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0
3,30.0,0.0,1.0,170.0,237.0,0.0,1.0,170.0,0.0,0.0,1.0,0.0,6.0,0
4,31.0,0.0,2.0,100.0,219.0,0.0,1.0,150.0,0.0,0.0,1.0,0.0,3.0,0


In [8]:
# Creating a copy of the data so the label encoding below leaves df_merged numeric
df_encoded = df_merged.copy()

In [9]:
# Human-readable label for every numeric code, keyed by column name.
mappings = {
    'Sex': {1: 'male', 0: 'female'},
    'Chest Pain': {1: 'typical angina', 2: 'atypical angina', 3: 'non-anginal pain', 4: 'asymptomatic'},
    'Fasting Sugar': {1: 'true', 0: 'false'},
    'Rest ECG': {0: 'normal', 1: 'ST-T wave abnormality', 2: 'left ventricular hypertrophy'},
    'Ex Angina': {1: 'yes', 0: 'no'},
    'ST Slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'Fluoroscopy': {0: '0 vessels', 1: '1 vessel', 2: '2 vessels', 3: '3 vessels'},
    'Thalassemia': {3: 'normal', 6: 'fixed defect', 7: 'reversible defect'},
    'Heart Disease': {0: '< 50% diameter narrowing', 1: '> 50% diameter narrowing'},
}


# Encode the columns using the defined mappings
for column, mapping in mappings.items():
    if column not in df_merged.columns:
        continue

    # Read the numeric codes from df_merged (df_encoded is a copy of it) rather than
    # from df_encoded itself: re-running this cell then re-encodes from the codes
    # instead of mapping already-encoded strings, which would turn the column into NaN.
    codes = df_merged[column]
    labels = codes.map(mapping)

    # Series.map() converts any code missing from the mapping into NaN without
    # complaint. Fail loudly instead of silently losing those values.
    unmapped = codes[labels.isna() & codes.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no label in `mappings`: "
            f"{sorted(unmapped.tolist())}"
        )

    df_encoded[column] = labels

print("Data after encoding categorical columns:")
df_encoded.head()

Data after encoding categorical columns:


Unnamed: 0,Age,Sex,Chest Pain,Rest BP,Cholesterol,Fasting Sugar,Rest ECG,Max HR,Ex Angina,ST Depression,ST Slope,Fluoroscopy,Thalassemia,Heart Disease
0,28.0,male,atypical angina,130.0,132.0,False,left ventricular hypertrophy,185.0,no,0.0,upsloping,0 vessels,normal,< 50% diameter narrowing
1,29.0,male,atypical angina,120.0,243.0,False,normal,160.0,no,0.0,upsloping,0 vessels,normal,< 50% diameter narrowing
2,29.0,male,atypical angina,140.0,230.753086,False,normal,170.0,no,0.0,upsloping,0 vessels,normal,< 50% diameter narrowing
3,30.0,female,typical angina,170.0,237.0,False,ST-T wave abnormality,170.0,no,0.0,upsloping,0 vessels,fixed defect,< 50% diameter narrowing
4,31.0,female,atypical angina,100.0,219.0,False,ST-T wave abnormality,150.0,no,0.0,upsloping,0 vessels,normal,< 50% diameter narrowing


In [10]:
# Write both frames to CSV without the index column:
# the numeric (imputed) frame and its label-encoded copy.
for frame, destination in (
    (df_merged, "../data/preprocessed_data.csv"),
    (df_encoded, "../data/encoded_data.csv"),
):
    frame.to_csv(destination, index=False)