Importing data and libraries

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,train_test_split,RandomizedSearchCV
import numpy as np 
import pandas as pd
import os
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#recover data
input_data_path = '../data_through_notes/' # Relative path to data folder from notebooks folder
reduced_df = pd.read_csv(os.path.join(input_data_path, 'reduced_data.csv'))
y = pd.read_csv(os.path.join(input_data_path, 'y_processed.csv'))
y=y.iloc[:,0]
print("Loaded reduced_data shape:", reduced_df.shape)
print("Loaded y shape:", y.shape)
print('first 5 rows in reduced data:')
print(reduced_df.head())

x_train,x_test,y_train,y_test=train_test_split(reduced_df,y,test_size=0.2,random_state=42)
print(f'x_train shape: {x_train.shape}')
print(f'y_train shaoe: {y_train.shape}')

Loaded reduced_data shape: (297, 12)
Loaded y shape: (297,)
first 5 rows in reduced data:
    thalach   oldpeak  thal_7.0       age  cp_4  trestbps      chol  exang_1  \
0  0.603053  0.370968         0  0.708333     0  0.481132  0.244292        0   
1  0.282443  0.241935         0  0.791667     1  0.622642  0.365297        1   
2  0.442748  0.419355         1  0.791667     1  0.245283  0.235160        1   
3  0.885496  0.564516         0  0.166667     0  0.339623  0.283105        0   
4  0.770992  0.225806         0  0.250000     0  0.339623  0.178082        0   

   sex_1  slope_2  ca_1.0  cp_3  
0      1        0       0     0  
1      1        1       0     0  
2      1        1       0     0  
3      1        0       0     1  
4      0        0       0     0  
x_train shape: (237, 12)
y_train shaoe: (237,)


Choosing Hyperparameters using GridsearchCV

In [23]:
print('------Gridsearch------\n \n \n')
#------Logestic Regression------
c_space =np.logspace(-5,8,15)
param_grid={
    'C' :c_space
}
LogisticRegression_model=LogisticRegression()
LogisticRegression_CV=GridSearchCV(LogisticRegression_model,param_grid,cv=5)
LogisticRegression_CV.fit(x_train,y_train)
print(f'------Logestic Regression------')
print(f"Tuned Logistic Regression Parameters: {LogisticRegression_CV.best_params_}")
print("Best score is {}".format(LogisticRegression_CV.best_score_))

#Using hyper parameter
logreg_optimal=LogisticRegression(C=0.4393970560760795,max_iter=100)
logreg_optimal.fit(x_train,y_train) 
y_logestic_predict=logreg_optimal.predict(x_test)
print(f'Accuracy Score with optimal parameter(Logestic Regression): {accuracy_score(y_test,y_logestic_predict):.4f}')

#------Random Forst------
RF_params={
    'n_estimators': [100, 200, 300], #https://numpy.org/doc/stable/reference/random/generated/numpy.random.randint.html
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
RandomForestClassifier_model=RandomForestClassifier(random_state=42)
RandomForestClassifier_CV=GridSearchCV(RandomForestClassifier_model,RF_params,cv=5)
RandomForestClassifier_CV.fit(x_train,y_train)
print('\n \n ------Random Forest------')
print(f'Tuned Random forest parameters: {RandomForestClassifier_CV.best_params_}')
print(f'Best score is {RandomForestClassifier_CV.best_score_}')

#using hyper parameter
best_RF_model=RandomForestClassifier_CV.best_estimator_
y_RF_predict=best_RF_model.predict(x_test)
accuracy_grid=accuracy_score(y_test,y_RF_predict)
percision_grid=precision_score(y_test,y_RF_predict)
F1_grid=f1_score(y_test,y_RF_predict)
Recall_grid=recall_score(y_test,y_RF_predict)
print(f'Accuracy Score:{accuracy_grid:.4f}')
print(f'Precision Score: {percision_grid:.4f}')
print(f'F1-score: {F1_grid:.4f}')
print(f'Recall Score: {Recall_grid:.4f}')

------Gridsearch------
 
 

------Logestic Regression------
Tuned Logistic Regression Parameters: {'C': np.float64(0.4393970560760795)}
Best score is 0.8141843971631205
Accuracy Score with optimal parameter(Logestic Regression): 0.8833

 
 ------Random Forest------
Tuned Random forest parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best score is 0.8011524822695038
Accuracy Score:0.8333
Precision Score: 0.8182
F1-score: 0.7826
Recall Score: 0.7500


Chossing hyperparameter using randomsearch

In [21]:
print('------Randomized serch------\n \n \n')

#------logestic Regression------
c_space =np.logspace(-5,8,15)
param_grid={
    'C' :c_space
}
LogisticRegression_model=LogisticRegression()
LogisticRegression_CV=RandomizedSearchCV(LogisticRegression_model,param_grid,cv=5)
LogisticRegression_CV.fit(reduced_df,y)
print(f'------Logestic Regression------')
print(f"Tuned Logistic Regression Parameters: {LogisticRegression_CV.best_params_}")
print("Best score is {}".format(LogisticRegression_CV.best_score_))

#Using hyper parameter
logreg_optimal=LogisticRegression(C=0.4393970560760795,max_iter=100)
logreg_optimal.fit(x_train,y_train) 
y_logestic_predict=logreg_optimal.predict(x_test)
print(f'Accuracy Score with optimal parameter(Logestic Regression): {accuracy_score(y_test,y_logestic_predict):.4f}')


#------Random Forst------
RF_params={
    'n_estimators': [100, 200, 300], #https://numpy.org/doc/stable/reference/random/generated/numpy.random.randint.html
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
RandomForestClassifier_model=RandomForestClassifier(random_state=42)
RandomForestClassifier_CV_random=RandomizedSearchCV(RandomForestClassifier_model,RF_params,cv=5)
RandomForestClassifier_CV_random.fit(x_train,y_train)
print('\n \n ------Random Forest------')
print(f'Tuned Random forest parameters: {RandomForestClassifier_CV_random.best_params_}')
print(f'Best score is {RandomForestClassifier_CV_random.best_score_}')

#using hyper parameter
best_RF_model_random=RandomForestClassifier_CV_random.best_estimator_
y_RF_predict_random=best_RF_model_random.predict(x_test)
accuracy_random=accuracy_score(y_test,y_RF_predict_random)
percision_random=precision_score(y_test,y_RF_predict_random)
F1_random=f1_score(y_test,y_RF_predict_random)
Recall_random=recall_score(y_test,y_RF_predict_random)
print(f'Accuracy Score:{accuracy_random:.4f}')
print(f'Precision Score: {percision_random:.4f}')
print(f'F1-score: {F1_random:.4f}')
print(f'Recall Score: {Recall_random:.4f}')

------Randomized serch------
 
 

------Logestic Regression------
Tuned Logistic Regression Parameters: {'C': np.float64(0.4393970560760795)}
Best score is 0.8416949152542372
Accuracy Score with optimal parameter(Logestic Regression): 0.8833

 
 ------Random Forest------
Tuned Random forest parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': None, 'criterion': 'gini'}
Best score is 0.8011524822695038
Accuracy Score:0.8333
Precision Score: 0.8182
F1-score: 0.7826
Recall Score: 0.7500


Choosing the best performance model 

In [29]:
if RandomForestClassifier_CV.best_score_ >= RandomForestClassifier_CV_random.best_score_:
    final_optimized_model = best_RF_model
    optimized_model_performance = {
        'Method': 'GridSearchCV',
        'Accuracy': accuracy_grid,
        'Precision': percision_grid,
        'Recall': Recall_grid,
        'F1-Score': F1_grid,
    }
else:
    final_optimized_model = best_RF_model_random
    optimized_model_performance = {
        'Method': 'RandomizedSearchCV',
        'Accuracy': accuracy_random,
        'Precision': percision_grid,
        'Recall': Recall_random,
        'F1-Score': F1_random,
    }

print("\n--- Summary of Optimized Model Performance ---")
print(f"Best Optimization Method: {optimized_model_performance['Method']}")
print(f"Accuracy: {optimized_model_performance['Accuracy']:.4f}")
print(f"Precision: {optimized_model_performance['Precision']:.4f}")
print(f"Recall: {optimized_model_performance['Recall']:.4f}")
print(f"F1-Score: {optimized_model_performance['F1-Score']:.4f}")



--- Summary of Optimized Model Performance ---
Best Optimization Method: GridSearchCV
Accuracy: 0.8333
Precision: 0.8182
Recall: 0.7500
F1-Score: 0.7826


Model Export

In [30]:
# --- Model Export ---
print("\n--- Model Export ---")

import joblib

# Define the path to save the model
model_save_path = '../models/'
os.makedirs(model_save_path, exist_ok=True) # Create the directory if it doesn't exist

# The final_optimized_model was chosen based on best test AUC
# This will save the best model (which was RandomForest_Grid in your last run)
model_filename = os.path.join(model_save_path, 'best_RF_heart_disease_model.joblib')

joblib.dump(final_optimized_model, model_filename)

print(f"Final optimized model saved to: {model_filename}")


--- Model Export ---
Final optimized model saved to: ../models/best_RF_heart_disease_model.joblib
