**Import dependencies**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from google.colab import drive

**Mount data frame**

In [None]:
# Mount data
drive.mount('/content/drive')
pthname = '/content/drive/My Drive/Colab Notebooks/Heart Disease Classifier/heart.csv'

# Create the DataFrame
df = pd.read_csv(pthname)
df

Mounted at /content/drive


Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [None]:
# Print relevant statistics for each column
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [None]:
df.isnull().sum()

In [None]:
# Count occurences of heart disease
df["HeartDisease"].value_counts()

Split training set and testing set

In [None]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
y.shape
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)

Train with RandomForestClassifier

In [None]:
# Create an instance of OneHotEncoder to handle categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit the encoder on the training data and transform the categorical columns for both training and testing data
X_train_encoded = encoder.fit_transform(X_train.select_dtypes(include='object'))
X_test_encoded = encoder.transform(X_test.select_dtypes(include='object'))

# Get the names of the encoded columns (categories) for both training and testing data
encoded_columns = encoder.get_feature_names_out(input_features=X_train.select_dtypes(include='object').columns)

# Convert the transformed arrays back to DataFrames with the appropriate column names
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=X_test.index)

# Concatenate the encoded DataFrames with the original numeric columns for both training and testing data
X_train_encoded_full = pd.concat([X_train.select_dtypes(exclude='object'), X_train_encoded_df], axis=1)
X_test_encoded_full = pd.concat([X_test.select_dtypes(exclude='object'), X_test_encoded_df], axis=1)

# Now, you can create the RandomForestClassifier and fit it on the preprocessed training data
clf = RandomForestClassifier(criterion="gini", max_depth=8, min_samples_split=10, random_state=5)
clf.fit(X_train_encoded_full, y_train)

In [None]:
clf.feature_importances_

In [None]:
df.columns

**Making predictions on the test set**

In [None]:
# Make predictions on the test set
y_pred = clf.predict(X_test_encoded_full)
y_pred

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1])

In [None]:
confusion_matrix(y_test,y_pred)

array([[ 93,  24],
       [  8, 105]])

In [None]:
# Accuracy of the model
accuracy_score(y_test,y_pred)

0.8608695652173913

In [None]:
# Cross validate
cross_val_score(clf,X_train_encoded_full,y_train,cv=10)

array([0.89855072, 0.82608696, 0.89855072, 0.82608696, 0.84057971,
       0.89855072, 0.85507246, 0.91304348, 0.88235294, 0.88235294])

In [None]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       101
           1       0.93      0.81      0.87       129

    accuracy                           0.86       230
   macro avg       0.86      0.87      0.86       230
weighted avg       0.87      0.86      0.86       230



In [None]:
# Function to take user input and preprocess it for prediction
def preprocess_user_input(input_data):
    # Create a DataFrame from the user input
    user_df = pd.DataFrame([input_data])

    # Use the same encoder used during training to transform the categorical columns
    user_encoded = encoder.transform(user_df.select_dtypes(include='object'))

    # Convert the transformed array back to a DataFrame with the appropriate column names
    user_encoded_df = pd.DataFrame(user_encoded, columns=encoded_columns, index=user_df.index)

    # Concatenate the encoded DataFrame with the original numeric columns
    user_encoded_full = pd.concat([user_df.select_dtypes(exclude='object'), user_encoded_df], axis=1)

    return user_encoded_full

# Sample user input
user_input = {
    'Age': 58,
    'Gender': 'M',
    'ChestPainType': 'ATA',
    'RestingBP': 136,
    'Cholesterol': 164,
    'FastingBS': 0,
    'RestingECG': 'ST',
    'MaxHR': 99,
    'ExerciseAngina': 'Y',
    'Oldpeak': 2.0,
    'ST_Slope': 'Flat'
}

# Sample user input #2
# user_input = {
#     'Age': 37,
#     'Gender': 'M',
#     'ChestPainType': 'ATA',
#     'RestingBP': 210,
#     'Cholesterol': 389,
#     'FastingBS': 1,
#     'RestingECG': 'Normal',
#     'MaxHR': 172,
#     'ExerciseAngina': 'N',
#     'Oldpeak': 0.0,
#     'ST_Slope': 'Up'
# }

In [39]:
# Preprocess the user input
user_input_encoded = preprocess_user_input(user_input)

# Use the trained RandomForestClassifier to predict the output
user_prediction = clf.predict(user_input_encoded)

print(user_prediction)

# Display the prediction result to the user
if user_prediction[0] == 1:
    print("Based on the provided information, you may have heart disease.")
else:
    print("Based on the provided information, you may not have heart disease.")

[1]
Based on the provided information, you may have heart disease.
