In [3]:
pip install xlrd openpyxl

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

data = pd.read_csv("data/heart.csv")

In [7]:
data.head(5)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,Male,Typical angina,125,212,Lower than 120 mg/ml,ST-T wave abnormality,168,No,1.0,Downsloping,Two,Reversable Defect,0
1,53,Male,Typical angina,140,203,Greater than 120 mg/ml,Normal,155,Yes,3.1,Upsloping,Zero,Reversable Defect,0
2,70,Male,Typical angina,145,174,Lower than 120 mg/ml,ST-T wave abnormality,125,Yes,2.6,Upsloping,Zero,Reversable Defect,0
3,61,Male,Typical angina,148,203,Lower than 120 mg/ml,ST-T wave abnormality,161,No,0.0,Downsloping,One,Reversable Defect,0
4,62,Female,Typical angina,138,294,Greater than 120 mg/ml,ST-T wave abnormality,106,No,1.9,Flat,Three,Fixed Defect,0


In [9]:
data.dtypes

age                                int64
sex                               object
chest_pain_type                   object
resting_blood_pressure             int64
cholestoral                        int64
fasting_blood_sugar               object
rest_ecg                          object
Max_heart_rate                     int64
exercise_induced_angina           object
oldpeak                          float64
slope                             object
vessels_colored_by_flourosopy     object
thalassemia                       object
target                             int64
dtype: object

In [11]:
from sklearn.preprocessing import LabelEncoder

# Check categorical columns
categorical_cols = data.select_dtypes(include='object').columns
print("Categorical columns:", categorical_cols)

# Create dictionary to store encoders (important for deployment)
label_encoders = {}

# Encode each categorical column
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Verify encoding
print("\nAfter Encoding:\n")
print(data.head())
print(data.dtypes)


Categorical columns: Index(['sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg',
       'exercise_induced_angina', 'slope', 'vessels_colored_by_flourosopy',
       'thalassemia'],
      dtype='object')

After Encoding:

   age  sex  chest_pain_type  resting_blood_pressure  cholestoral  \
0   52    1                3                     125          212   
1   53    1                3                     140          203   
2   70    1                3                     145          174   
3   61    1                3                     148          203   
4   62    0                3                     138          294   

   fasting_blood_sugar  rest_ecg  Max_heart_rate  exercise_induced_angina  \
0                    1         2             168                        0   
1                    0         1             155                        1   
2                    1         2             125                        1   
3                    1         2             161 

In [53]:
data.to_csv("data/encoded_heart.csv", index=False)

In [13]:
# Separate features and target

X = data.drop("target", axis=1)
y = data["target"]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

print("\nFirst 5 rows of X:")
print(X.head())

print("\nFirst 5 values of y:")
print(y.head())


Shape of X: (1025, 13)
Shape of y: (1025,)

First 5 rows of X:
   age  sex  chest_pain_type  resting_blood_pressure  cholestoral  \
0   52    1                3                     125          212   
1   53    1                3                     140          203   
2   70    1                3                     145          174   
3   61    1                3                     148          203   
4   62    0                3                     138          294   

   fasting_blood_sugar  rest_ecg  Max_heart_rate  exercise_induced_angina  \
0                    1         2             168                        0   
1                    0         1             155                        1   
2                    1         2             125                        1   
3                    1         2             161                        0   
4                    0         2             106                        0   

   oldpeak  slope  vessels_colored_by_flourosopy  thalassem

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,      # 20% test data
    random_state=42     # ensures reproducibility
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (820, 13)
Testing set shape: (205, 13)


In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit scaler only on training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data using same scaler
X_test_scaled = scaler.transform(X_test)

print("Scaled training shape:", X_train_scaled.shape)
print("Scaled testing shape:", X_test_scaled.shape)


Scaled training shape: (820, 13)
Scaled testing shape: (205, 13)


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# Initialize model
lr_model = LogisticRegression(max_iter=1000)

# Train model
lr_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_lr = lr_model.predict(X_test_scaled)
y_prob_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Calculate metrics
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
auc_lr = roc_auc_score(y_test, y_prob_lr)
mcc_lr = matthews_corrcoef(y_test, y_pred_lr)

print("Logistic Regression Performance:")
print("Accuracy:", accuracy_lr)
print("AUC:", auc_lr)
print("Precision:", precision_lr)
print("Recall:", recall_lr)
print("F1 Score:", f1_lr)
print("MCC:", mcc_lr)


Logistic Regression Performance:
Accuracy: 0.7951219512195122
AUC: 0.8656006091757091
Precision: 0.7699115044247787
Recall: 0.8446601941747572
F1 Score: 0.8055555555555556
MCC: 0.5928701370715527


In [25]:
from sklearn.tree import DecisionTreeClassifier

# Initialize model
dt_model = DecisionTreeClassifier(random_state=42)

# Train model
dt_model.fit(X_train, y_train)

# Predictions
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
auc_dt = roc_auc_score(y_test, y_prob_dt)
mcc_dt = matthews_corrcoef(y_test, y_pred_dt)

print("Decision Tree Performance:")
print("Accuracy:", accuracy_dt)
print("AUC:", auc_dt)
print("Precision:", precision_dt)
print("Recall:", recall_dt)
print("F1 Score:", f1_dt)
print("MCC:", mcc_dt)


Decision Tree Performance:
Accuracy: 0.9853658536585366
AUC: 0.9854368932038835
Precision: 1.0
Recall: 0.970873786407767
F1 Score: 0.9852216748768473
MCC: 0.9711511393019859


In [27]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train model
knn_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_knn = knn_model.predict(X_test_scaled)
y_prob_knn = knn_model.predict_proba(X_test_scaled)[:, 1]

# Calculate metrics
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
auc_knn = roc_auc_score(y_test, y_prob_knn)
mcc_knn = matthews_corrcoef(y_test, y_pred_knn)

print("KNN Performance:")
print("Accuracy:", accuracy_knn)
print("AUC:", auc_knn)
print("Precision:", precision_knn)
print("Recall:", recall_knn)
print("F1 Score:", f1_knn)
print("MCC:", mcc_knn)


KNN Performance:
Accuracy: 0.824390243902439
AUC: 0.9493146773272416
Precision: 0.7964601769911505
Recall: 0.8737864077669902
F1 Score: 0.8333333333333334
MCC: 0.651716995415485


In [29]:
from sklearn.naive_bayes import GaussianNB

# Initialize model
nb_model = GaussianNB()

# Train model
nb_model.fit(X_train, y_train)

# Predictions
y_pred_nb = nb_model.predict(X_test)
y_prob_nb = nb_model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb)
recall_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)
auc_nb = roc_auc_score(y_test, y_prob_nb)
mcc_nb = matthews_corrcoef(y_test, y_pred_nb)

print("Naive Bayes Performance:")
print("Accuracy:", accuracy_nb)
print("AUC:", auc_nb)
print("Precision:", precision_nb)
print("Recall:", recall_nb)
print("F1 Score:", f1_nb)
print("MCC:", mcc_nb)


Naive Bayes Performance:
Accuracy: 0.775609756097561
AUC: 0.8596040357890729
Precision: 0.7567567567567568
Recall: 0.8155339805825242
F1 Score: 0.7850467289719626
MCC: 0.5527253303619355


In [31]:
from sklearn.ensemble import RandomForestClassifier

# Initialize model
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

# Train model
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_prob_rf)
mcc_rf = matthews_corrcoef(y_test, y_pred_rf)

print("Random Forest Performance:")
print("Accuracy:", accuracy_rf)
print("AUC:", auc_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)
print("MCC:", mcc_rf)


Random Forest Performance:
Accuracy: 0.9853658536585366
AUC: 1.0
Precision: 1.0
Recall: 0.970873786407767
F1 Score: 0.9852216748768473
MCC: 0.9711511393019859


In [33]:
from xgboost import XGBClassifier

# Initialize model
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Train model
xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test, y_prob_xgb)
mcc_xgb = matthews_corrcoef(y_test, y_pred_xgb)

print("XGBoost Performance:")
print("Accuracy:", accuracy_xgb)
print("AUC:", auc_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1 Score:", f1_xgb)
print("MCC:", mcc_xgb)


XGBoost Performance:
Accuracy: 0.9853658536585366
AUC: 0.9834380354083381
Precision: 1.0
Recall: 0.970873786407767
F1 Score: 0.9852216748768473
MCC: 0.9711511393019859


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [35]:
import pandas as pd

results = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost"
    ],
    "Accuracy": [
        accuracy_lr,
        accuracy_dt,
        accuracy_knn,
        accuracy_nb,
        accuracy_rf,
        accuracy_xgb
    ],
    "AUC": [
        auc_lr,
        auc_dt,
        auc_knn,
        auc_nb,
        auc_rf,
        auc_xgb
    ],
    "Precision": [
        precision_lr,
        precision_dt,
        precision_knn,
        precision_nb,
        precision_rf,
        precision_xgb
    ],
    "Recall": [
        recall_lr,
        recall_dt,
        recall_knn,
        recall_nb,
        recall_rf,
        recall_xgb
    ],
    "F1 Score": [
        f1_lr,
        f1_dt,
        f1_knn,
        f1_nb,
        f1_rf,
        f1_xgb
    ],
    "MCC": [
        mcc_lr,
        mcc_dt,
        mcc_knn,
        mcc_nb,
        mcc_rf,
        mcc_xgb
    ]
})

print("\nFinal Model Comparison Table:\n")
print(results)



Final Model Comparison Table:

                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.795122  0.865601   0.769912  0.844660  0.805556   
1        Decision Tree  0.985366  0.985437   1.000000  0.970874  0.985222   
2                  KNN  0.824390  0.949315   0.796460  0.873786  0.833333   
3          Naive Bayes  0.775610  0.859604   0.756757  0.815534  0.785047   
4        Random Forest  0.985366  1.000000   1.000000  0.970874  0.985222   
5              XGBoost  0.985366  0.983438   1.000000  0.970874  0.985222   

        MCC  
0  0.592870  
1  0.971151  
2  0.651717  
3  0.552725  
4  0.971151  
5  0.971151  


In [37]:
results = results.round(3)
print(results)


                 Model  Accuracy    AUC  Precision  Recall  F1 Score    MCC
0  Logistic Regression     0.795  0.866      0.770   0.845     0.806  0.593
1        Decision Tree     0.985  0.985      1.000   0.971     0.985  0.971
2                  KNN     0.824  0.949      0.796   0.874     0.833  0.652
3          Naive Bayes     0.776  0.860      0.757   0.816     0.785  0.553
4        Random Forest     0.985  1.000      1.000   0.971     0.985  0.971
5              XGBoost     0.985  0.983      1.000   0.971     0.985  0.971


In [43]:
import joblib
import os

os.chdir("/home/cloud/Desktop/ML_Assignment_2")
print("Now Working Directory:", os.getcwd())

# Create model folder if not exists
if not os.path.exists("model"):
    os.makedirs("model")

# Save models
joblib.dump(lr_model, "model/logistic_regression.pkl")
joblib.dump(dt_model, "model/decision_tree.pkl")
joblib.dump(knn_model, "model/knn.pkl")
joblib.dump(nb_model, "model/naive_bayes.pkl")
joblib.dump(rf_model, "model/random_forest.pkl")
joblib.dump(xgb_model, "model/xgboost.pkl")

# Save scaler (important for LR & KNN)
joblib.dump(scaler, "model/scaler.pkl")

print("All models saved successfully!")


Now Working Directory: /home/cloud/Desktop/ML_Assignment_2
All models saved successfully!


In [45]:
os.listdir("model")

['logistic_regression.pkl',
 'decision_tree.pkl',
 'knn.pkl',
 'naive_bayes.pkl',
 'random_forest.pkl',
 'xgboost.pkl',
 'scaler.pkl']