> Baseline model

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import cross_validate

In [5]:
data = pd.read_csv('../data/raw/heart.csv') 
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
# seperate X and y dataframes

feature_columns = data.columns[:-1]

X = data[feature_columns]
y = data['HeartDisease'] 

In [7]:
y.value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

In [8]:
# define categorical columns

cat_cols = ['Sex','ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope']

# Apply label encoding on categorical features using OrdinalEncoder()

preproc = ColumnTransformer([
    ('LabelEncoding', OrdinalEncoder(), cat_cols)],
    remainder = 'passthrough') 

preproc

In [9]:
# Apply the transformation
X_transformed = preproc.fit_transform(X)

# Reconstruct a DataFrame
new_columns = cat_cols + [col for col in X.columns if col not in cat_cols]
X_transformed_df = pd.DataFrame(X_transformed, columns=new_columns)

X_transformed_df.head()

Unnamed: 0,Sex,ChestPainType,FastingBS,RestingECG,ExerciseAngina,ST_Slope,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,1.0,1.0,0.0,1.0,0.0,2.0,40.0,140.0,289.0,172.0,0.0
1,0.0,2.0,0.0,1.0,0.0,1.0,49.0,160.0,180.0,156.0,1.0
2,1.0,1.0,0.0,2.0,0.0,2.0,37.0,130.0,283.0,98.0,0.0
3,0.0,0.0,0.0,1.0,1.0,1.0,48.0,138.0,214.0,108.0,1.5
4,1.0,2.0,0.0,1.0,0.0,2.0,54.0,150.0,195.0,122.0,0.0


In [16]:
model_pipeline = Pipeline([
    ('Feature Engineering', preproc),
    ('classifier', LogisticRegression())
])

model_pipeline

In [17]:
# train/test splitting

X_train, X_test, y_train, y_test = train_test_split(
    X, y, # dataset
    train_size = 0.8, # the size of training set = 80%
    shuffle = True, #to avoid ordering effect
    stratify = y, # maintain the distribution of y classes in both training and test sets
    random_state = 42
)

In [None]:
from sklearn.model_selection import cross_validate

# Define the scoring metrics
scoring = [
    'accuracy',
    'precision',
    'recall',
    'f1',
    'roc_auc'
]

result_dict = cross_validate(model_pipeline, X_train, y_train, cv=5, scoring=scoring)

result = pd.DataFrame(result_dict)
print("\nCross-Validation Results:")
print(result)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Cross-Validation Results:
   fit_time  score_time  test_accuracy  test_precision  test_recall   test_f1  \
0  0.052717    0.048423       0.891156        0.892857     0.914634  0.903614   
1  0.042141    0.031425       0.870748        0.887500     0.876543  0.881988   
2  0.031057    0.038983       0.836735        0.890411     0.802469  0.844156   
3  0.177803    0.030389       0.836735        0.851852     0.851852  0.851852   
4  0.037808    0.035290       0.780822        0.775281     0.851852  0.811765   

   test_roc_auc  
0      0.954784  
1      0.937897  
2      0.919566  
3      0.895623  
4      0.846344  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
print(f"\nTraining Logistic Regression model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

# --- 6. Evaluate Baseline Model ---
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1] # Probability for the positive class (1)

# Calculate key metrics
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)

print("\n" + "="*40)
print("BASELINE MODEL PERFORMANCE (Logistic Regression)")
print("="*40)
print(f"Accuracy:        {accuracy:.4f}")
print(f"AUC Score:       {auc:.4f}")
print(f"F1 Score:        {f1:.4f}")
print("-" * 40)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("-" * 40)




Training Logistic Regression model...
Model training complete.

BASELINE MODEL PERFORMANCE (Logistic Regression)
Accuracy:        0.8696
AUC Score:       0.9011
F1 Score:        0.8857
----------------------------------------

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.82      0.85        82
           1       0.86      0.91      0.89       102

    accuracy                           0.87       184
   macro avg       0.87      0.86      0.87       184
weighted avg       0.87      0.87      0.87       184

----------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
