In [21]:

from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


In [23]:
# Read the preprocessed dataset from the data folder (relative path)
data_path = '../data/copd_preprocessed.csv'
df_encoded = pd.read_csv(data_path)

# The 'COPD_Diagnosis' column is the target; every remaining column is a feature
y = df_encoded['COPD_Diagnosis']
X = df_encoded.drop(columns='COPD_Diagnosis')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sanity check: report the shape of the training and test feature sets
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")


Training set shape: (1260, 19), Test set shape: (540, 19)


In [25]:
# Candidate classifiers to compare, keyed by a human-readable display name.
# The two tree ensembles are seeded (random_state=42) so their fits are repeatable;
# the other estimators use their library defaults.
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC()),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])



In [27]:
# Train, evaluate, and save all models.
# Each classifier in `models` is fitted on the training split, scored on the
# held-out test split, and then serialized with joblib under ../models/.
for model_name, model in models.items():
    print(f"\nTraining and evaluating model: {model_name}")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the held-out test features
    y_pred = model.predict(X_test)

    # Evaluate the model: overall accuracy first, then the per-class breakdowns
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

    print(f"{model_name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the model to a .pkl file named after the model
    # (spaces replaced by underscores, lower-cased)
    model_filename = f"../models/{model_name.replace(' ', '_').lower()}_model.pkl"
    # joblib.dump does not create missing directories; make sure the target
    # folder exists so a fresh checkout does not fail after training.
    Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_filename)
    print(f"{model_name} model saved as {model_filename}")




Training and evaluating model: Logistic Regression
Logistic Regression Accuracy: 0.9814814814814815
Logistic Regression Confusion Matrix:
[[355   4]
 [  6 175]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       359
           1       0.98      0.97      0.97       181

    accuracy                           0.98       540
   macro avg       0.98      0.98      0.98       540
weighted avg       0.98      0.98      0.98       540

Logistic Regression model saved as ../models/logistic_regression_model.pkl

Training and evaluating model: Random Forest
Random Forest Accuracy: 0.9925925925925926
Random Forest Confusion Matrix:
[[359   0]
 [  4 177]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       359
           1       1.00      0.98      0.99       181

    accuracy                           0.99     