In [None]:
from Data_Creation.ablation_dataset_initialization import get_splits_small, get_splits_mid, get_splits_large
from Experimentation.feature_selection import select_features, select_model
from Experimentation.mm_analysis import analyze_downstream_imputations
import pandas as pd

from missingness_functions import sparse_remove_data_until_percentage_complete

from sklearn.exceptions import ConvergenceWarning
from joblib import Parallel, delayed
import warnings

def set_warning_filters():
    """Install an "ignore" filter for ConvergenceWarning in the process that calls this."""
    warnings.simplefilter("ignore", ConvergenceWarning)

# Worker count handed to joblib for the warm-up below.
n_jobs = 4

# Silence ConvergenceWarning in this (parent) process.
set_warning_filters()

# Run the same filter setup as 2 * n_jobs joblib tasks.
# NOTE(review): presumably meant to install the filter in every worker process;
# whether each worker actually receives a task is not visible from here - confirm.
Parallel(n_jobs=n_jobs)(
    delayed(set_warning_filters)()
    for _ in range(2 * n_jobs)
)

In [20]:
# Load the result of get_splits_small (called with cols=None); the next cell reads it as `small`.
small = get_splits_small(cols=None)

In [None]:
# Dataset under analysis: `small` is loaded in an earlier cell. To analyse a
# different split, load it with get_splits_mid / get_splits_large and use it here.
# Every column is coerced to numeric; values that cannot be parsed become NaN.
data = small.apply(pd.to_numeric, errors='coerce')

# Step 1: feature selection against the 'Recurrence90days' target.
features = select_features(
    dataframe=data,
    target_column='Recurrence90days',
    R1=5,
    R2=10,
    n_splits=5,
    top_n_features=30,
)

# Step 2: keep only the selected features plus the target, then pick the
# classifier / imputer pair on that reduced frame.
reduced_data = data[features + ['Recurrence90days']]
clf, imp = select_model(
    dataframe=reduced_data,
    target_column='Recurrence90days',
    R1=1,
    n_splits=5,
)

In [None]:
# Completeness rates passed to the analysis below (and reused by a later cell).
RATE_RANGE = [0.95, 0.90, 0.85, 0.8, 0.75]

# Reload the small split and coerce every column to numeric
# (values that cannot be parsed become NaN).
small = get_splits_small(cols=None)
data = small.apply(pd.to_numeric, errors='coerce')

# Hard-coded classifier / imputer identifiers and feature subset; these replace
# whatever the selection cell above assigned to the same names.
clf = 'RF'
imp = 'mice-lr'
features = ['age_at_ablation', '12(13)-EpOME', 'Longest_duration_of_Afib', 'Lat-pre-p1', 'Lat-post-p2']
reduced_data = data[[*features, 'Recurrence90days']]

# Run the downstream-imputation analysis over every rate in RATE_RANGE,
# 40 iterations, with the fixed classifier.
RQ_1_2_query = analyze_downstream_imputations(
    dataframe=reduced_data,
    target_column='Recurrence90days',
    clf_model=clf,
    iterations=40,
    complete_rates=RATE_RANGE,
)

In [None]:
# Metrics summarised for each imputer.
metrics = ['imputation_accuracy', 'kld', 'classifier_f1', 'classifier_auroc', 'classifier_auprc']

# Mean and standard deviation of every metric, one row per imputer.
imputer_stats = RQ_1_2_query.groupby('imputer')[metrics].agg(['mean', 'std'])

# Collapse the (metric, statistic) column pairs into "<metric>_<statistic>" names
# and turn the 'imputer' index back into a regular column.
imputer_stats.columns = [f"{metric}_{stat}" for metric, stat in imputer_stats.columns]
imputer_stats = imputer_stats.reset_index()

print(imputer_stats)

In [None]:
import matplotlib.pyplot as plt

# Metrics summarised for each imputer.
metrics = ['imputation_accuracy', 'kld', 'classifier_f1', 'classifier_auroc', 'classifier_auprc']

# Mean and standard deviation of every metric, one row per imputer.
imputer_stats = RQ_1_2_query.groupby('imputer')[metrics].agg(['mean', 'std'])

# Flatten the (metric, statistic) MultiIndex into "<metric>_<statistic>" column
# names, e.g. 'classifier_auroc_mean', then restore 'imputer' as a column.
# (The former rename step mapped column names to themselves and was dropped.)
imputer_stats.columns = [
    f"{metric}_{agg}" for metric, agg in imputer_stats.columns
]
imputer_stats = imputer_stats.reset_index()

print(imputer_stats.head())


# Mean and standard deviation of AUROC / AUPRC for every (imputer, complete_rate) pair.
aggregated_varied_clf_df = RQ_1_2_query.groupby(['imputer', 'complete_rate'])[
    ['classifier_auroc', 'classifier_auprc']
].agg(['mean', 'std']).reset_index()

# Flatten the two-level columns: ('classifier_auroc', 'mean') -> 'classifier_auroc_mean'.
# The group keys have an empty second level and keep their plain names.
aggregated_varied_clf_df.columns = [
    f"{col[0]}_{col[1]}" if col[1] else col[0]
    for col in aggregated_varied_clf_df.columns.values
]

# Columns to plot (one figure each) and the y-axis label for each of them.
clf_vals = ['classifier_auroc_mean', 'classifier_auprc_mean']
y_labels = {
    'classifier_auroc_mean': "Average Classifier AUROC Score",
    'classifier_auprc_mean': "Average Classifier AUPRC Score",
}

# Imputers to draw, in legend order.
imputers_to_plot = ['mean', 'median', 'knn', 'cart', 'mice-lr', 'mice-dt', 'random', 'zero']

for clf_val in clf_vals:

    fig, ax = plt.subplots(figsize=(10, 6))

    for imputer in imputers_to_plot:
        # Rows belonging to the current imputer.
        subset = aggregated_varied_clf_df[aggregated_varied_clf_df['imputer'] == imputer]
        if subset.empty:
            # No results for this imputer: skip it so the legend only lists drawn lines.
            continue

        # One value per completeness rate, ordered by rate.
        means = subset.groupby('complete_rate')[clf_val].mean()

        # Only the mean is drawn; the '*_std' columns are computed above but not plotted.
        ax.plot(means.index, means.values, label=f"{imputer}", marker='o')

    # Labels, title, legend and grid.
    ax.set_xlabel("Completeness Rate")
    ax.set_ylabel(y_labels[clf_val])
    ax.set_title("Imputation's Average Effect on Classifiers vs Completeness")
    ax.legend(title="Imputers")
    ax.grid(True)

    plt.show()

In [None]:
# Column groups compared in the experiments below.
# NOTE(review): both lists are empty placeholders here; fill them with column
# names from `reduced_data` before running, otherwise every group is empty.
c_cols = []
e_cols = []

target_col = 'Recurrence90days'

# RQ3: run the analysis on each column group (and on both groups combined),
# 20 iterations, restricted to the imputer(s) in `imp`.
RQ_3_query_list = []
for m_cols in [c_cols, e_cols, c_cols + e_cols]:
    # Selecting m_cols alone would drop the column named by target_column,
    # so append the target whenever the group does not already contain it.
    subset_cols = m_cols if target_col in m_cols else m_cols + [target_col]
    RQ_3_query_list.append([
        m_cols,
        analyze_downstream_imputations(dataframe=reduced_data[subset_cols],
                                       target_column=target_col,
                                       clf_model=clf,
                                       iterations=20,
                                       imputers_given=imp),
    ])

# RQ4: for each group and each rate, pass a copy of the data through
# sparse_remove_data_until_percentage_complete (percentage=rate, random_state=0),
# then analyse it once with complete_rates=None.
# NOTE(review): m_cols is only stored as a label here - it does not influence
# the data that is built, so both groups are analysed on the same inputs.
# Confirm whether the removal was meant to be limited to m_cols.
RQ_4_query_list = []
for m_cols in [c_cols, e_cols]:
    for rate in RATE_RANGE:
        temp_reduced_data = reduced_data.copy()
        temp_reduced_data = sparse_remove_data_until_percentage_complete(data=temp_reduced_data,
                                                                         percentage=rate,
                                                                         random_state=0)
        RQ_4_query_list.append([
            m_cols,
            rate,
            analyze_downstream_imputations(dataframe=temp_reduced_data,
                                           target_column=target_col,
                                           clf_model=clf,
                                           iterations=1,
                                           complete_rates=None,
                                           imputers_given=imp),
        ])

# RQ5: analyse the full reduced frame, passing each column group as nan_test_cols.
RQ_5_query_list = []
for m_cols in [c_cols, e_cols, c_cols + e_cols]:
    RQ_5_query_list.append([
        m_cols,
        analyze_downstream_imputations(dataframe=reduced_data,
                                       target_column=target_col,
                                       clf_model=clf,
                                       iterations=20,
                                       imputers_given=imp,
                                       nan_test_cols=m_cols),
    ])