In [None]:
from model import run_model
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the penguins CSV into a pandas DataFrame and display its column labels
# (the bare last expression is what the cell renders).
# NOTE(review): `model` is a DataFrame here, not a fitted model — consider a
# clearer name if nothing else depends on it.
model = pd.read_csv('/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv')
model.columns

In [None]:

# Run the project's `run_model` helper on the penguins CSV, using the 'sex'
# column as the prediction target.
# NOTE(review): presumably run_model prints its metrics to stdout — confirm in model.py.
run_model('/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/penguins.csv', target_col='sex')



In [None]:
import glob
import os
import sys
from contextlib import redirect_stdout
from io import StringIO

import pandas as pd

from model import run_model

# Folder that holds the fitness CSV files to evaluate.
data_dir = '/home/sahau24/csc790project/Fall2025_DCAI/CSC_790_Synthetic_Data/data/fitness/independent'

# Collect the path of every *.csv file directly inside that folder.
csv_pattern = os.path.join(data_dir, '*.csv')
csv_files = glob.glob(csv_pattern)
print(f"Found {len(csv_files)} CSV files in {data_dir}")

# Accumulators for the ROC AUC of each of the three models, plus the name of
# the file each set of values came from.
logistic_roc_values, rf_roc_values, hgb_roc_values = [], [], []
file_names = []

# Function to capture ROC AUC from model output
def extract_roc_auc_from_output(output_text):
    """Collect every ROC AUC value found in a block of text.

    The text is scanned line by line. For each line containing the marker
    ``'ROC AUC:'``, the text after the first marker (up to the next marker,
    if the line has several) is stripped and parsed with ``float``. Lines
    whose remainder is not a valid float are skipped.

    Args:
        output_text (str): Text to scan, e.g. captured stdout.

    Returns:
        list[float]: Parsed values in order of appearance; empty when no
        line yields a value.
    """
    marker = 'ROC AUC:'
    roc_aucs = []
    for line in output_text.split('\n'):
        if marker not in line:
            continue
        try:
            roc_aucs.append(float(line.split(marker)[1].strip()))
        except ValueError:
            # float() raises ValueError on a malformed number; catching only
            # that keeps KeyboardInterrupt/SystemExit from being swallowed,
            # which the previous bare `except:` did.
            continue
    return roc_aucs

# Process each CSV file: run the model, capture what it prints, and record the
# first three ROC AUC values found (Logistic, Random Forest, HistGradientBoosting).
for csv_file in sorted(csv_files):
    file_name = os.path.basename(csv_file)
    print(f"\nProcessing: {file_name}")

    # Check if file has required columns. Only the header row is read
    # (nrows=0); the column check does not need the data itself.
    try:
        header_df = pd.read_csv(csv_file, nrows=0)
    except Exception as e:
        print(f"  ❌ Error reading file: {str(e)}")
        continue

    if 'is_fit' not in header_df.columns:
        print(f"  Skipping {file_name}: 'is_fit' column not found")
        continue

    # Capture stdout ONLY while run_model executes. Previously stdout stayed
    # redirected for all of the reporting below (including the error message),
    # so those prints went into the capture buffer and were never displayed.
    captured_output = StringIO()
    try:
        with redirect_stdout(captured_output):
            run_model(csv_file, target_col='is_fit')
    except Exception as e:
        # redirect_stdout has already restored stdout, so this is visible.
        print(f"  ❌ Error running model: {str(e)}")
        continue

    # Extract ROC AUC values from whatever the model printed
    output = captured_output.getvalue()
    roc_aucs = extract_roc_auc_from_output(output)
    print(f"  Debug: Found {len(roc_aucs)} ROC AUC values: {roc_aucs}")

    if len(roc_aucs) < 3:
        print(f"  ⚠️ Expected 3 ROC AUC values, found {len(roc_aucs)}")
        # Print part of the output for debugging
        print(f"  Debug output snippet: {output[:500]}...")
        continue

    # Check if all values are the same (indicating a bug)
    if roc_aucs[0] == roc_aucs[1] == roc_aucs[2]:
        print(f"  ⚠️ WARNING: All models have identical ROC AUC - check model.py for bugs!")

    # Store ROC AUC for each model (Logistic, Random Forest, HistGradientBoosting)
    logistic_roc_values.append(roc_aucs[0])
    rf_roc_values.append(roc_aucs[1])
    hgb_roc_values.append(roc_aucs[2])
    file_names.append(file_name)
    print(f"  ✅ Logistic: {roc_aucs[0]:.4f}, RF: {roc_aucs[1]:.4f}, HGB: {roc_aucs[2]:.4f}")

# Print the summary: a per-file table, then average/min/max per model, then
# the raw value lists.
banner = '=' * 60
print(f"\n{banner}")
print("SUMMARY OF ROC AUC VALUES")
print(f"{banner}")

if not file_names:
    print("No ROC AUC values were collected.")
else:
    results_df = pd.DataFrame({
        'File': file_names,
        'Logistic_ROC': logistic_roc_values,
        'RandomForest_ROC': rf_roc_values,
        'HistGradBoost_ROC': hgb_roc_values
    })

    print(results_df.to_string(index=False))

    print(f"\nStatistics for each model:")
    print(f"Total files processed: {len(file_names)}")

    # (display label, collected scores) for each model, in reporting order.
    per_model_scores = [
        ("Logistic Regression", logistic_roc_values),
        ("Random Forest", rf_roc_values),
        ("HistGradientBoosting", hgb_roc_values),
    ]

    for model_label, scores in per_model_scores:
        print(f"\n{model_label}:")
        print(f"  Average: {sum(scores)/len(scores):.4f}")
        print(f"  Min: {min(scores):.4f}")
        print(f"  Max: {max(scores):.4f}")

    print(f"\nROC AUC values lists:")
    for model_label, scores in per_model_scores:
        print(f"{model_label}: {scores}")
