In [None]:
from pathlib import Path
from model_evaluation_pipeline import run_three_models_training, test_all_models

In [None]:
# Experiment configuration.
TARGET = 'is_fit'  # target name handed to both the training and the testing helper
N_RUNS = 1         # how many train/test repetitions are performed per training file

# Static test file location
test_file = 'data/fitness/original/test.csv'

# Collect every CSV below data/fitness/, skipping any path that has a component
# named 'original'. The list is sorted because glob results come back in
# filesystem order, which would otherwise make the processing order (and the
# ordering of `results`) differ between machines.
# NOTE(review): this matches *all* .csv files, not only files named train.csv —
# confirm that no other CSVs live in these folders.
data_dir = Path('data/fitness')
train_files = sorted(f for f in data_dir.rglob('*.csv') if 'original' not in f.parts)

# Maps str(train_file) -> list with one test-result object per run,
# or None when processing that file raised.
results = {}

for train_file in train_files:
    try:
        # Repeat train + test N_RUNS times so that downstream code can compute
        # a mean and a spread per file.
        all_runs = []
        for _ in range(N_RUNS):
            # NOTE(review): random_state=None is forwarded unchanged; presumably
            # this leaves each run unseeded, so numbers will vary between
            # executions — confirm against run_three_models_training.
            training_results = run_three_models_training(str(train_file), TARGET, random_state=None)
            test_results = test_all_models(training_results, test_file, TARGET)
            all_runs.append(test_results)

        results[str(train_file)] = all_runs
        print(f"Processed {train_file.parent.name} ({N_RUNS} runs)")
    except Exception as e:
        # Best effort: report the failure, mark the file as failed, and carry on
        # with the remaining files. Partial runs for this file are discarded.
        print(f"Error processing {train_file}: {e}")
        results[str(train_file)] = None

In [None]:
import re
import matplotlib.pyplot as plt
import numpy as np

# Aggregate the per-run test ROC AUC scores into mean/std series, keyed first by
# model type and then by method (the folder name directly under data/fitness/).
data_by_model = {
    'logistic': {},
    'random_forest': {},
    'hist_gradient': {}
}

# Expected layout: data/fitness/{method}/minority_{percentage}pct...
path_pattern = re.compile(r'data/fitness/([^/]+)/minority_(\d+)pct')

for filename, all_runs in results.items():
    # Files whose processing failed are stored as None; an empty run list
    # carries no scores either.
    if not all_runs:
        continue

    # The keys were produced with str(Path), which uses backslashes on Windows.
    # Normalise to forward slashes so the pattern matches on every platform.
    match = path_pattern.search(Path(filename).as_posix())
    if match is None:
        # Make dropped files visible instead of silently omitting them from the plots.
        print(f"Skipping {filename}: path does not match data/fitness/<method>/minority_<N>pct")
        continue

    method = match.group(1)
    percentage = int(match.group(2))

    # Iterating the dict keeps the model list defined in exactly one place.
    for model_name, per_method in data_by_model.items():
        series = per_method.setdefault(
            method, {'percentages': [], 'roc_means': [], 'roc_stds': []}
        )

        # One ROC AUC value per run for this model.
        roc_scores = [run[model_name]['test_roc'] for run in all_runs]

        series['percentages'].append(percentage)
        series['roc_means'].append(np.mean(roc_scores))
        # np.std defaults to the population standard deviation (ddof=0),
        # so a single run gives a spread of exactly 0.
        series['roc_stds'].append(np.std(roc_scores))

# Re-order every (model, method) series so its points run in ascending
# percentage. The three parallel sequences are zipped into rows, sorted as
# tuples (percentage first, ties broken by mean and then std), and unzipped
# again; each sequence ends up stored as a tuple.
for per_method in data_by_model.values():
    for series in per_method.values():
        rows = list(zip(series['percentages'], series['roc_means'], series['roc_stds']))
        rows.sort()
        ordered_pcts, ordered_means, ordered_stds = zip(*rows)
        series['percentages'] = ordered_pcts
        series['roc_means'] = ordered_means
        series['roc_stds'] = ordered_stds

# Draw one panel per model type: mean test ROC AUC against minority percentage,
# with one error-bar line per method.
model_titles = {
    'logistic': 'Logistic Regression',
    'random_forest': 'Random Forest',
    'hist_gradient': 'Histogram Gradient Boosting'
}

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# model_titles doubles as the panel order (dicts preserve insertion order),
# so the list of model names is not repeated here.
for (model_name, title), ax in zip(model_titles.items(), axes):
    method_series = data_by_model[model_name]

    # Plot methods in name order so colours and legend entries do not depend on
    # the order in which the training files happened to be processed.
    for method, data in sorted(method_series.items()):
        percentages = np.array(data['percentages'])
        means = np.array(data['roc_means'])
        stds = np.array(data['roc_stds'])

        ax.errorbar(percentages, means, yerr=stds, marker='o', capsize=5, label=method)

    ax.set_xlabel('Minority Percentage (%)')
    ax.set_ylabel('ROC AUC')
    ax.set_title(title)
    # An empty panel has no labelled artists; skip the legend there to avoid
    # matplotlib's "no artists with labels" warning.
    if method_series:
        ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()