In [13]:
#########################################
# 1) IMPORTS
#########################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

# Splitting & Cross-validation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder

# Metrics
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    classification_report
)

# Scaling
from sklearn.preprocessing import StandardScaler

In [8]:
# Load the merged dataset from disk; the first CSV column is used as the row index.
original_dataset = pd.read_csv(
    '/home/jovyan/work/data/merged_dataset.csv',
    index_col=0,
)

In [4]:
# Preview the first rows of the raw dataset as a sanity check on the load.
display(original_dataset.head())

Unnamed: 0,sex,weight,height,resting.bp.s,cholesterol,fasting.blood.sugar,physical_activity,age,ST.slope,chest.pain.type,exercise.angina,max.heart.rate,oldpeak,resting.ecg,target
0,Male,49.9,174.0,140,289,0,Low,41,1.0,2.0,0.0,172.0,0.0,0.0,normal
1,Female,65.3,180.0,160,180,0,Low,50,2.0,3.0,0.0,156.0,1.0,0.0,heart_disease
2,Male,65.5,174.0,130,283,0,Intermediate,38,1.0,2.0,0.0,98.0,0.0,1.0,normal
3,Female,77.9,160.0,138,214,0,Low,49,2.0,4.0,1.0,108.0,1.5,0.0,heart_disease
4,Male,98.3,175.0,150,195,0,Low,55,1.0,3.0,0.0,122.0,0.0,0.0,normal


In [9]:
%run utilities.ipynb

In [10]:
# Summarise the categorical columns; the rendered output is a dict of per-category counts.
# NOTE(review): describe_categorical is not defined or imported in the visible cells --
# presumably it is provided by utilities.ipynb (run in the preceding cell); confirm.
describe_categorical(original_dataset)

{'sex': {'Male': 909, 'Female': 281},
 'physical_activity': {'Low': 834,
  'Intermediate': 238,
  'High': 113,
  'Average': 5},
 'target': {'heart_disease': 629, 'normal': 561}}

# Variable encoding

In [18]:
# Binary-encode 'sex' (Male -> 1, Female -> 0). Values outside this mapping become
# NaN, because Series.map does not raise on unmapped keys.
# NOTE: the new column is added to `original_dataset` itself, as in the original cell.
original_dataset["sex_encoded"] = original_dataset["sex"].map({"Male": 1, "Female": 0})

# One-hot encode 'physical_activity'; drop='first' removes one level to avoid
# multicollinearity among the dummy columns.
encoder = OneHotEncoder(drop='first', sparse_output=False)
physical_activity_encoded = pd.DataFrame(
    encoder.fit_transform(original_dataset[['physical_activity']]),
    columns=encoder.get_feature_names_out(['physical_activity']),
    # Reuse the source index: pd.concat(axis=1) aligns rows on index labels, and the
    # dataset's index is read from the CSV (index_col=0). A default RangeIndex would
    # misalign rows and inject NaNs whenever those labels are not exactly 0..n-1.
    index=original_dataset.index,
)

# Append the dummy columns, then drop the raw categorical columns (no in-place mutation,
# so re-running the cell always yields the same frame).
original_dataset_encoded = pd.concat(
    [original_dataset, physical_activity_encoded], axis=1
).drop(columns=['sex', 'physical_activity'])

In [19]:
# Preview the encoded frame: 'sex' and 'physical_activity' are replaced by numeric columns.
display(original_dataset_encoded.head())

Unnamed: 0,weight,height,resting.bp.s,cholesterol,fasting.blood.sugar,age,ST.slope,chest.pain.type,exercise.angina,max.heart.rate,oldpeak,resting.ecg,target,sex_encoded,physical_activity_High,physical_activity_Intermediate,physical_activity_Low
0,49.9,174.0,140,289,0,41,1.0,2.0,0.0,172.0,0.0,0.0,normal,1,0.0,0.0,1.0
1,65.3,180.0,160,180,0,50,2.0,3.0,0.0,156.0,1.0,0.0,heart_disease,0,0.0,0.0,1.0
2,65.5,174.0,130,283,0,38,1.0,2.0,0.0,98.0,0.0,1.0,normal,1,0.0,1.0,0.0
3,77.9,160.0,138,214,0,49,2.0,4.0,1.0,108.0,1.5,0.0,heart_disease,0,0.0,0.0,1.0
4,98.3,175.0,150,195,0,55,1.0,3.0,0.0,122.0,0.0,0.0,normal,1,0.0,0.0,1.0


In [20]:
# Save the prepared dataset for training.
# `index` must be the boolean False: the string 'False' is a non-empty (truthy) value,
# so pandas still wrote the index as an extra leading column.
# NOTE(review): any reader that loads this file with index_col=0 must be updated.
original_dataset_encoded.to_csv('/home/jovyan/work/data/encoded_dataset.csv', index=False)