In [25]:
# Main
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

# Outliers
from scipy import stats

# Encoding
from sklearn.preprocessing import LabelEncoder

# For the Model
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier

# If needed
import itertools
from itertools import combinations
import math

import warnings
warnings.filterwarnings("ignore")

# Dataframe: load the raw survey export (the file name indicates NaNs are kept).
df = pd.read_csv("./data/heart_2022_with_nans.csv")

# df.head()

# Encoding is done on a copy so the raw frame `df` stays untouched.
df_numeric = df.copy()

# Target variable: HadHeartAttack 'No'/'Yes' -> 0/1.
# Series.map leaves NaN for any value that is not a key of the mapping.
target_codes = {'No': 0, 'Yes': 1}
df_numeric['HadHeartAttack'] = df_numeric['HadHeartAttack'].map(target_codes)

# Ordinal columns: the code of each label is its position in the ordered list.
general_health_levels = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
age_brackets = [
    'Age 18 to 24', 'Age 25 to 29', 'Age 30 to 34', 'Age 35 to 39',
    'Age 40 to 44', 'Age 45 to 49', 'Age 50 to 54', 'Age 55 to 59',
    'Age 60 to 64', 'Age 65 to 69', 'Age 70 to 74', 'Age 75 to 79',
    'Age 80 or older',
]
ordinal_mappings = {
    'GeneralHealth': {label: rank for rank, label in enumerate(general_health_levels)},
    'AgeCategory': {label: rank for rank, label in enumerate(age_brackets)},
}

# Apply the ordinal codes column by column.
for ordinal_col, codes in ordinal_mappings.items():
    df_numeric[ordinal_col] = df_numeric[ordinal_col].map(codes)

# Nominal (unordered) categorical columns, encoded with LabelEncoder below.
nominal_cols = [
    'State', 'Sex', 'LastCheckupTime', 'PhysicalActivities',
    'RemovedTeeth', 'HadAngina', 'HadStroke', 'HadAsthma',
    'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease',
    'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
    'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing',
    'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage', 'ChestScan',
    'RaceEthnicityCategory', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12',
    'PneumoVaxEver', 'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos',
]

# One encoder instance is refit for every column, so after the loop `le`
# only holds the classes of the last column in `nominal_cols`.
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan', so they get a category code of their own - confirm this is intended.
le = LabelEncoder()
for nominal_col in nominal_cols:
    as_text = df_numeric[nominal_col].astype(str)
    df_numeric[nominal_col] = le.fit_transform(as_text)

# Check for Duplicates

# Columns left out of the analysis; they are removed before duplicates are
# counted, so rows that differ only in these columns count as duplicates.
non_useful_cols = [
    'State', 'HeightInMeters', 'WeightInKilograms', 'TetanusLast10Tdap',
    'CovidPos', 'ECigaretteUsage', 'HighRiskLastYear', 'HIVTesting',
    'FluVaxLast12', 'BlindOrVisionDifficulty', 'DifficultyErrands', 'PneumoVaxEver'
]

# Built from the raw `df` (string labels, NaNs intact), not from `df_numeric`.
df_filtered = df.drop(columns=non_useful_cols)

# A bare expression is only echoed when it is the last statement of a
# notebook cell, so the checks are printed explicitly to make them visible.
print("Duplicate rows before removal:", df_filtered.duplicated().sum())

# Handling Duplicates
# Reassign instead of inplace=True so the data flow stays explicit.
df_filtered = df_filtered.drop_duplicates()
print("Duplicate rows after removal:", df_filtered.duplicated().sum())
print("Shape after removing duplicates:", df_filtered.shape)

# Check for Missing Values
# Compute the NaN mask once and reuse it for the per-column and per-row counts.
missing_mask = df_filtered.isna()

# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print("Missing values per column:\n", missing_mask.sum())

# Count missing values in each row
missing_counts = missing_mask.sum(axis=1)

# Identify rows with 2 or more missing values and count them.
# NOTE(review): despite the variable name, this block only counts these
# rows; nothing is deleted here.
has_multiple_missing = missing_counts >= 2
num_rows_to_delete = has_multiple_missing.sum()
print(f"\nTotal number of rows with 2 or more missing values: {num_rows_to_delete}")

# Count occurrences of each missing value count (only rows with 2 or more)
missing_distribution = missing_counts[has_multiple_missing].value_counts().sort_index()
print("Distribution of missing values per row:")
print(missing_distribution)

# Calculate missing value percentages (share of NaN per column, 0-100)
missing_percentages = df_filtered.isnull().mean() * 100

# Categorize columns based on missing value percentage ranges.
# The labels are generated from the bucket width instead of typed by hand, so
# every range is guaranteed to have a key ("0-10%", "10-20%", ..., "90-100%").
bucket_width = 10
ranges = {
    f"{lower}-{lower + bucket_width}%": []
    for lower in range(0, 100, bucket_width)
}
range_labels = list(ranges)

for col, perc in missing_percentages.items():
    # Values outside 0-100 (including NaN, e.g. from an empty frame) belong
    # to no range and are skipped.
    if not 0 <= perc <= 100:
        continue
    # Buckets are half-open [lower, lower + width); exactly 100% is folded
    # into the last bucket.
    bucket = min(int(perc // bucket_width), len(range_labels) - 1)
    ranges[range_labels[bucket]].append(col)

# Print results
for range_label, cols in ranges.items():
    print(f"Columns with missing values in range {range_label}: {cols}")

# Handling missing values

# Step 1: keep only rows where the target HadHeartAttack is present.
has_target = df_filtered['HadHeartAttack'].notna()
df_filtered = df_filtered[has_target]
print("Rows after dropping missing HadHeartAttack:", len(df_filtered))

# Step 2: keep only rows with fewer than 10 missing values.
missing_per_row = df_filtered.isna().sum(axis=1)
keep_row = missing_per_row < 10
df_filtered = df_filtered.loc[keep_row]
print("Rows after dropping rows with 10+ missing values:", len(df_filtered))

# Step 3: impute what is left.
# NOTE(review): 'Sex' appears in none of the three lists below, so it is not
# imputed here - presumably it has no missing values; confirm.

# Numeric columns are filled with their per-column median.
numeric_cols = ['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'BMI']
numeric_fill = df_filtered[numeric_cols].median()
df_filtered[numeric_cols] = df_filtered[numeric_cols].fillna(numeric_fill)

# Ordinal and categorical columns are both filled with the per-column mode.
ordinal_cols = ['GeneralHealth', 'AgeCategory']
categorical_cols = [
    'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing',
    'LastCheckupTime', 'PhysicalActivities', 'RemovedTeeth',
    'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
    'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes',
    'DeafOrHardOfHearing', 'SmokerStatus', 'ChestScan',
    'RaceEthnicityCategory', 'AlcoholDrinkers',
]
for mode_cols in (ordinal_cols, categorical_cols):
    # mode() can return several rows when values tie; the first row is used.
    most_frequent = df_filtered[mode_cols].mode().iloc[0]
    df_filtered[mode_cols] = df_filtered[mode_cols].fillna(most_frequent)

# Verify missing values are handled
print("Missing values after imputation:\n", df_filtered.isna().sum())
print("Final shape:", df_filtered.shape)


Total number of rows with 2 or more missing values: 63944
Distribution of missing values per row:
2     22907
3     12580
4      4854
5      2397
6      2436
7      1866
8     12420
9      2944
10      878
11      282
12      119
13       45
14       35
15       22
16       18
17        9
18       20
19       17
20       18
21       15
22       11
23        6
24       23
25        8
26       12
27        2
Name: count, dtype: int64
Columns with missing values in range 0-10%: ['Sex', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities', 'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'SmokerStatus', 'RaceEthnicityCategory', 'AgeCategory']
Columns with missing values in range 10-20%: ['ChestScan', 'BMI'