In [5]:
import pandas as pd

df = pd.read_csv('dataset/heart_2020_uncleaned.csv')

df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,no,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,no,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,no,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [2]:

df.isna().sum()

HeartDisease          0
BMI                 150
Smoking               0
AlcoholDrinking       0
Stroke                0
PhysicalHealth      120
MentalHealth          0
DiffWalking           0
Sex                   0
AgeCategory           0
Race                  0
Diabetic              0
PhysicalActivity      0
GenHealth             0
SleepTime           100
Asthma                0
KidneyDisease         0
SkinCancer            0
dtype: int64

In [37]:
df['BMI'].fillna(df['BMI'].median(), inplace=True)
df['BMI'] = df['BMI'].astype(float)

df['SleepTime'].fillna(df['SleepTime'].median(), inplace=True)
df['SleepTime'] = df['SleepTime'].astype(float)

physHealth = df['PhysicalHealth'].mode()[0]
df['PhysicalHealth'] = df['PhysicalHealth'].fillna(physHealth)

rows_with_empty = df[df.isna().any(axis=1)].head(5)
print(rows_with_empty)

Empty DataFrame
Columns: [HeartDisease, BMI, Smoking, AlcoholDrinking, Stroke, PhysicalHealth, MentalHealth, DiffWalking, Sex, AgeCategory, Race, Diabetic, PhysicalActivity, GenHealth, SleepTime, Asthma, KidneyDisease, SkinCancer]
Index: []


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BMI'].fillna(df['BMI'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['SleepTime'].fillna(df['SleepTime'].median(), inplace=True)


In [3]:
from sklearn.preprocessing import RobustScaler 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

featureCols = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']  # numeric
categoricalCols = ['AgeCategory', 'Race', 'Diabetic', 'GenHealth']    # categorical

binaryCols = [
    'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 
    'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer'
]

for col in binaryCols:
    df[col] = df[col].astype(str).str.lower()

binary_map = {
    'yes': 1,
    'no': 0,
    'male': 1,
    'female': 0
}

df[binaryCols] = df[binaryCols].applymap(lambda x: binary_map.get(x, x))

# Split features and target
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Preprocessing pipelines:
numeric_transformer = RobustScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# ColumnTransformer applies the right transformer to each column set
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, featureCols),
        ('cat', categorical_transformer, categoricalCols),
        # binary columns are already numeric 0/1, so we passthrough
    ],
    remainder='passthrough'  # binary columns and any others left as-is
)

# Create pipeline (add model later)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Train/test split (do this before fitting to avoid leakage)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit pipeline on training data and transform
X_train_processed = pipeline.fit_transform(X_train)

# Transform test data
X_test_processed = pipeline.transform(X_test)

print(f"Processed train shape: {X_train_processed.shape}")
print(f"Processed test shape: {X_test_processed.shape}")


  df[binaryCols] = df[binaryCols].applymap(lambda x: binary_map.get(x, x))


Processed train shape: (255836, 41)
Processed test shape: (63959, 41)


In [4]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# Train LightGBM model
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train_processed, y_train)

# Predict
y_pred_lgb = lgb_model.predict(X_test_processed)

# Evaluate
print("LightGBM Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("Classification Report:\n", classification_report(y_test, y_pred_lgb))


[LightGBM] [Info] Number of positive: 21781, number of negative: 234055
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 416
[LightGBM] [Info] Number of data points in the train set: 255836, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085137 -> initscore=-2.374518
[LightGBM] [Info] Start training from score -2.374518




LightGBM Accuracy: 0.9142575712565862
Classification Report:
               precision    recall  f1-score   support

          No       0.92      0.99      0.95     58367
         Yes       0.57      0.08      0.13      5592

    accuracy                           0.91     63959
   macro avg       0.75      0.54      0.54     63959
weighted avg       0.89      0.91      0.88     63959

