In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the patient records from the local csv/ directory into a DataFrame.
# TODO: if possible, read this from SQL instead of a flat file.
patient_csv = Path('csv/Patient_data.csv')
df = pd.read_csv(patient_csv)

# Preview the first few rows to sanity-check the load
df.head()

Unnamed: 0,PatientID,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,...,AgeCategory,HeightInMeters,WeightInKilograms,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,0,0,1,3,4.0,0.0,0,1,9.0,0,...,0,1.6,71.67,0,0,1,1,0,0,0
1,1,0,0,3,0.0,0.0,0,1,6.0,0,...,1,1.78,95.25,0,0,1,1,1,0,0
2,2,0,0,3,0.0,0.0,0,0,8.0,1,...,2,1.85,108.86,1,0,0,1,2,0,1
3,3,0,1,1,5.0,0.0,0,1,9.0,0,...,3,1.7,90.72,0,0,1,1,2,0,1
4,4,0,1,2,3.0,15.0,0,1,5.0,2,...,3,1.55,79.38,0,0,1,1,2,0,0


In [3]:
# A patient is classified as having heart disease when they reported a heart
# attack or angina, i.e. when the two indicator columns sum to more than zero.
# The comparison is vectorized over whole columns instead of looping with
# iterrows(), which builds a Series per row and is very slow on large frames.
has_heart_disease = (df['HadHeartAttack'] + df['HadAngina']) > 0

# Kept as a plain list of 0/1 ints so any later use of `heartdisease` still works
heartdisease = has_heart_disease.astype('int64').tolist()

# Use these results as the y variable for building the model: an int64 column
# vector of shape (n_rows, 1)
y = has_heart_disease.astype('int64').to_numpy().reshape(-1, 1)
y

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]], dtype=int64)

In [4]:
# Build the feature matrix X from the original frame.
# HadHeartAttack and HadAngina are removed because the target y is derived from
# them; PatientID is removed as well (presumably a row identifier - confirm).
non_feature_columns = ['HadHeartAttack', 'HadAngina', 'PatientID']
X = df.drop(non_feature_columns, axis='columns')
X


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadStroke,...,AgeCategory,HeightInMeters,WeightInKilograms,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,0,1,3,4.0,0.0,0,1,9.0,0,0,...,0,1.60,71.67,0,0,1,1,0,0,0
1,0,0,3,0.0,0.0,0,1,6.0,0,0,...,1,1.78,95.25,0,0,1,1,1,0,0
2,0,0,3,0.0,0.0,0,0,8.0,1,0,...,2,1.85,108.86,1,0,0,1,2,0,1
3,0,1,1,5.0,0.0,0,1,9.0,0,0,...,3,1.70,90.72,0,0,1,1,2,0,1
4,0,1,2,3.0,15.0,0,1,5.0,2,0,...,3,1.55,79.38,0,0,1,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,53,0,3,0.0,0.0,2,1,6.0,0,0,...,6,1.78,102.06,1,0,0,0,1,0,0
246018,53,1,1,0.0,7.0,0,1,7.0,0,0,...,10,1.93,90.72,0,0,0,0,2,0,1
246019,53,0,2,0.0,15.0,0,1,7.0,2,1,...,0,1.68,83.91,1,1,1,1,1,0,1
246020,53,1,4,2.0,2.0,0,1,7.0,0,0,...,4,1.70,83.01,0,1,1,0,1,0,0


In [5]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (see the support counts in the classification report) - consider stratify=y.
split_parts = train_test_split(X, y, test_size=0.3, random_state=7)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Standardize the features. The scaler statistics are learned from the training
# split only, and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# fit() returns the scaler itself, so X_scaler is simply an alias for the fitted scaler
X_scaler = scaler
X_test_scaled = X_scaler.transform(X_test)


In [7]:
# Train a 100-tree random forest on the scaled training data.
# class_weight='balanced' reweights classes inversely to their frequency, and
# y_train is flattened to 1-D because it was built as a column vector.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=7,
).fit(X_train_scaled, y_train.ravel())

In [8]:
# Predict a class label for every row of the scaled test split
predictions = rf_model.predict(X=X_test_scaled)

In [9]:
# Confusion matrix: rows are the actual class, columns are the predicted class
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=col_labels)

# Accuracy: fraction of test rows whose predicted label matches the actual label
acc_score = accuracy_score(y_test, predictions)

In [10]:
# Build the per-class precision/recall/F1 report first, then show all results in order
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,67170,174
Actual 1,6202,261


Accuracy Score : 0.9136125299768315
Classification Report
              precision    recall  f1-score   support

           0       0.92      1.00      0.95     67344
           1       0.60      0.04      0.08      6463

    accuracy                           0.91     73807
   macro avg       0.76      0.52      0.52     73807
weighted avg       0.89      0.91      0.88     73807



In [11]:
# Importance score of each feature, taken from the fitted forest
importances = rf_model.feature_importances_
# Pair every score with its column name and rank the pairs from highest to lowest
importances_sorted = list(zip(importances, X.columns))
importances_sorted.sort(reverse=True)
# Show the ten highest-ranked features
importances_sorted[:10]

[(0.11260732026402126, 'AgeCategory'),
 (0.07949139588131614, 'WeightInKilograms'),
 (0.07498903382581262, 'State'),
 (0.07419005879054633, 'ChestScan'),
 (0.06245550177313818, 'GeneralHealth'),
 (0.05969948725689556, 'HeightInMeters'),
 (0.041251054844223686, 'SleepHours'),
 (0.03861558899375614, 'PhysicalHealthDays'),
 (0.0362562330762242, 'RemovedTeeth'),
 (0.030569812536475137, 'PneumoVaxEver')]