# Advanced model: Random Forest

In [24]:
# Import statements

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import pickle


## Data Loading

In [34]:
# Read the cleaned dataset: abnormal values/outliers are already treated, but no
# feature scaling has been applied (Random Forest does not need scaled features).

data_path = '../data/processed/data_abnormal_values_treated.csv'
data = pd.read_csv(data_path)

# Preview the first rows
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,40.0,M,ATA,140.0,289.0,0,Normal,172.0,N,0.0,Up,0
1,1,49.0,F,NAP,160.0,180.0,0,Normal,156.0,N,1.0,Flat,1
2,2,37.0,M,ATA,130.0,283.0,0,ST,98.0,N,0.0,Up,0
3,3,48.0,F,ASY,138.0,214.0,0,Normal,108.0,Y,1.5,Flat,1
4,4,54.0,M,NAP,150.0,195.0,0,Normal,122.0,N,0.0,Up,0


In [35]:
# Count the missing values in each column
data.isna().sum()

Unnamed: 0        0
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [36]:
# Summarise the column dtypes and non-null counts
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 746 entries, 0 to 745
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      746 non-null    int64  
 1   Age             746 non-null    float64
 2   Sex             746 non-null    object 
 3   ChestPainType   746 non-null    object 
 4   RestingBP       746 non-null    float64
 5   Cholesterol     746 non-null    float64
 6   FastingBS       746 non-null    int64  
 7   RestingECG      746 non-null    object 
 8   MaxHR           746 non-null    float64
 9   ExerciseAngina  746 non-null    object 
 10  Oldpeak         746 non-null    float64
 11  ST_Slope        746 non-null    object 
 12  HeartDisease    746 non-null    int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 75.9+ KB


## Get X and y

In [37]:
# Separate the feature matrix X from the target vector y.
# Columns are selected by name rather than by position (`data.columns[1:-1]`),
# so a change in the CSV column order cannot silently drop a real feature or
# let the saved index column / the target slip into X.

target_column = 'HeartDisease'
# 'Unnamed: 0' looks like the row index that was written to the CSV; it carries no signal
non_feature_columns = ['Unnamed: 0', target_column]

# errors='ignore' keeps this working if the CSV is re-exported without the index column
feature_columns = data.columns.drop(non_feature_columns, errors='ignore')

X = data[feature_columns]
y = data[target_column]

In [38]:
# Check the class balance of the target
y.value_counts()

HeartDisease
0    390
1    356
Name: count, dtype: int64

## Feature Engineering

### Label Encoding

Use `OrdinalEncoder` (the feature-column counterpart of `LabelEncoder`, which is intended for target labels) to assign an integer to each category 
(This works well for Random Forest as trees split based on feature values, not on distance or magnitude.)

Note: 

- Label encoding should be performed after train/test split to avoid data leakage.
- Feature scaling and one-hot encoding are generally not required for tree-based algorithms.
- For categorical columns, label encoding (assigning integers to categories) is typically sufficient for tree-based models like Decision Trees and Random Forests. One-hot encoding can lead to unnecessary splits and higher dimensionality, which may reduce model efficiency or result in suboptimal splits.
- Feature scaling (e.g., normalization or standardization) is not needed because tree-based models are not sensitive to feature magnitude.

**Categorical features:** `Sex`, `ChestPainType`, `FastingBS`, `RestingECG`, `ExerciseAngina`, `ST_Slope`


In [39]:
# Preview the dataframe before label encoding (the categorical columns still hold strings)
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,40.0,M,ATA,140.0,289.0,0,Normal,172.0,N,0.0,Up,0
1,1,49.0,F,NAP,160.0,180.0,0,Normal,156.0,N,1.0,Flat,1
2,2,37.0,M,ATA,130.0,283.0,0,ST,98.0,N,0.0,Up,0
3,3,48.0,F,ASY,138.0,214.0,0,Normal,108.0,Y,1.5,Flat,1
4,4,54.0,M,NAP,150.0,195.0,0,Normal,122.0,N,0.0,Up,0


In [40]:
# Define the categorical columns

cat_cols = ['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Integer-encode the categorical features with OrdinalEncoder; every other column
# is passed through unchanged.
# handle_unknown='use_encoded_value' maps a category that was not seen during fit
# to -1 instead of raising, so a rare level missing from a training fold (or new
# data fed to the saved model) does not crash transform/predict.

preproc = ColumnTransformer(
    transformers=[
        ('LabelEncoding',
         OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         cat_cols),
    ],
    remainder='passthrough',
)

preproc

Examine the dataframe after it goes through `preproc` for label encoding

This is just for visually examining how label encoding works; it is not required for model building.

In [41]:
# Fit the preprocessor on X and transform it in one step
X_transformed = preproc.fit_transform(X)

# The transformer outputs the encoded categorical columns first, followed by the
# passthrough columns in their original order, so rebuild the labels the same way
passthrough_cols = list(X.columns[~X.columns.isin(cat_cols)])
new_columns = cat_cols + passthrough_cols
X_transformed_df = pd.DataFrame(data=X_transformed, columns=new_columns)

X_transformed_df.head()


Unnamed: 0,Sex,ChestPainType,FastingBS,RestingECG,ExerciseAngina,ST_Slope,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,1.0,1.0,0.0,1.0,0.0,2.0,40.0,140.0,289.0,172.0,0.0
1,0.0,2.0,0.0,1.0,0.0,1.0,49.0,160.0,180.0,156.0,1.0
2,1.0,1.0,0.0,2.0,0.0,2.0,37.0,130.0,283.0,98.0,0.0
3,0.0,0.0,0.0,1.0,1.0,1.0,48.0,138.0,214.0,108.0,1.5
4,1.0,2.0,0.0,1.0,0.0,2.0,54.0,150.0,195.0,122.0,0.0


## Model Pipeline

In [42]:

# Build the modelling pipeline: categorical encoding followed by a Random Forest.
#
# NOTE: this cell rebinds the name `Pipeline` (imported at the top as the
# scikit-learn class) to the pipeline *instance*, and the cells below use the
# instance under that name. The class is therefore imported here under an alias,
# so that re-running this cell does not fail with
# "'Pipeline' object is not callable".
from sklearn.pipeline import Pipeline as SkPipeline

Pipeline = SkPipeline([
    ('Feature Engineering', preproc),
    # fixed seed so the cross-validation and grid-search results are reproducible
    ('classifier', RandomForestClassifier(random_state=42))
])

Pipeline

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



## Train/Test Splitting

In [43]:
# Hold out part of the data as a test set

split_options = dict(
    train_size=0.8,    # 80% of the rows go to the training set
    shuffle=True,      # shuffle before splitting to avoid ordering effects
    stratify=y,        # keep the class distribution of y the same in both sets
    random_state=42,   # fixed seed for a repeatable split
)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Cross-validation

Check the cross-validated performance of the untuned model (before hyper-parameter tuning)

In [44]:
# Metrics computed on each validation fold
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# 5-fold cross-validation of the untuned pipeline, using the training set only
result_dict = cross_validate(
    estimator=Pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
)

# One row per fold, one column per timing/metric
result = pd.DataFrame(result_dict)
result

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,0.109075,0.023029,0.883333,0.877193,0.877193,0.877193,0.953216
1,0.109265,0.014408,0.87395,0.836066,0.910714,0.871795,0.920351
2,0.109431,0.020169,0.840336,0.806452,0.877193,0.840336,0.935767
3,0.0996,0.028018,0.831933,0.813559,0.842105,0.827586,0.895161
4,0.107133,0.025729,0.87395,0.83871,0.912281,0.87395,0.941143


In [45]:
# Average every column over the folds (covers the fit/score timings as well as the metrics)
result.mean()

fit_time          0.106901
score_time        0.022271
test_accuracy     0.860700
test_precision    0.834396
test_recall       0.883897
test_f1           0.858172
test_roc_auc      0.929128
dtype: float64

## Hyper-parameter Tuning & Cross-validation

### Hyper-parameter tuning using GridSearchCV with pre-defined param_grid and Pipeline 

In [46]:

# Hyper-parameter grid; keys follow the pipeline's '<step name>__<parameter>' convention

param_grid = {
    # number of trees in the forest
    'classifier__n_estimators': [50, 100, 200, 300, 400],
    # limit on tree depth (None allows full growth; 10 and 20 can help prevent overfitting)
    'classifier__max_depth': [None, 10, 20],
    # controls how sensitive the tree is to splitting (lower values allow more splits)
    'classifier__min_samples_split': [2, 5],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=Pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

grid_search.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [47]:
# Hyper-parameter combination with the highest mean cross-validated accuracy
print(grid_search.best_params_)

{'classifier__max_depth': 20, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}


In [48]:
# Mean cross-validated accuracy of the best hyper-parameter combination
print(grid_search.best_score_)

0.8691176470588236


## Final Model
View the best-performing Random Forest model and save it as a pickle file.

In [49]:
# Display the best pipeline found by the grid search
grid_search.best_estimator_

In [50]:
# Persist the best pipeline found by the grid search to disk with pickle

model_path = 'Random_Forest_best_model.pkl'

# 'wb' = write-binary mode: the file is created, or overwritten if it already exists
with open(model_path, mode='wb') as model_file:
    # serialize the best estimator and write it to the open file
    pickle.dump(grid_search.best_estimator_, model_file)

# Model Testing

In [17]:
# import Statements

In [18]:
# Load the saved pipeline


# Make predictions on test data


# Evaluate performance



## Confusion Matrix Visualization

In [19]:
# Confusion matrix


# Plot
