# In [None]:
import os
import pickle
from typing import Any, Dict, Iterator, List

import polars as pl
from tqdm import tqdm

from auto_ml.model import AutoMLModel
from custom_data_types import AutoMLData
from data_collection.column_selection import get_dataset
from data_preprocessing.preprocessing import autoML_prep, feature_generation
from dataprofiling.data_summary import DataSummary
from model_assessment.fairness import FairnessAssessor

# Dataset page URLs (Kaggle, OpenML and Hugging Face) to benchmark. The
# processing loop further down passes each entry, one at a time, to
# get_dataset().
# NOTE(review): OpenML id=1464 is listed twice (with different query strings)
# — confirm whether the duplicate is intentional.
datasets = [
    "https://www.kaggle.com/datasets/prasad22/healthcare-dataset",
    "https://www.kaggle.com/datasets/willianoliveiragibin/healthcare-insurance",
    "https://www.kaggle.com/datasets/nanditapore/healthcare-diabetes",
    "https://www.kaggle.com/datasets/jpmiller/employee-attrition-for-healthcare",
    "https://www.kaggle.com/datasets/wajahat1064/healthcare-appointment-dataset",
    "https://www.kaggle.com/datasets/deependraverma13/diabetes-healthcare-comprehensive-dataset",
    "https://www.kaggle.com/datasets/babyoda/healthcare-investments-and-length-of-hospital-stay",
    "https://www.kaggle.com/datasets/anmolkumar/janatahack-healthcare-analytics-part-2",
    "https://www.kaggle.com/datasets/kalilurrahman/united-healthcare-stock-data",
    "https://www.kaggle.com/datasets/amandam1/breastcancerdataset",
    "https://huggingface.co/datasets/mstz/madelon",
    "https://www.kaggle.com/datasets/utkarshx27/health-services-in-metropolitan-areas",
    "https://www.kaggle.com/datasets/kanchana1990/colorado-healthcare-decoding-no-show-patterns",
    "https://www.kaggle.com/datasets/gauravduttakiit/reproductive-childhealthcare-classification",
    "https://www.kaggle.com/datasets/meeratif/global-healthcare-pricess",
    "https://www.kaggle.com/datasets/uom190346a/disease-symptoms-and-patient-profile-dataset",
    "https://www.kaggle.com/datasets/nanditapore/medical-cost-dataset",
    "https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset",
    "https://www.kaggle.com/datasets/daminitiwari/insurance",
    "https://www.kaggle.com/datasets/rivalytics/healthcare-workforce-mental-health-dataset",
    "https://www.kaggle.com/datasets/ankushpanday1/colorectal-cancer-risk-and-survival-data",
    "https://www.kaggle.com/datasets/iamsouravbanerjee/life-expectancy-at-birth-across-the-globe",
    "https://www.kaggle.com/datasets/heidarmirhajisadati/regional-cost-of-living-analysis",
    "https://www.kaggle.com/datasets/jobspikr/30000-latest-healthcare-jobs-emedcareers-europe",
    "https://www.kaggle.com/datasets/anshulmahajan14/healthcare-dataset",
    "https://www.kaggle.com/datasets/mrsimple07/obesity-prediction",
    "https://www.kaggle.com/datasets/harshsingh2209/supply-chain-analysis",
    "https://www.kaggle.com/datasets/amirmotefaker/supply-chain-dataset",
    "https://www.kaggle.com/datasets/aranyogeshm/healthcare-dataset",
    "https://www.kaggle.com/datasets/adishgolechha/ecommerce-healthcare-orders-dataset",
    "https://www.kaggle.com/datasets/simranjitkhehra/healthcare-patient-record",
    "https://www.kaggle.com/datasets/sunayanagawde/flipkart-healthcare-products-dataset",
    "https://www.kaggle.com/datasets/muhammadehsan02/healthcare-prediction-dataset",
    "https://www.kaggle.com/datasets/ashutoshswarnakar/healthcare-patient-records",
    "https://www.kaggle.com/datasets/prasad22/pmc-hospital-infrastructure",
    "https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset",
    "https://www.kaggle.com/datasets/nelgiriyewithana/countries-of-the-world-2023",
    "https://www.kaggle.com/datasets/hasibur013/diabetes-dataset",
    "https://www.kaggle.com/datasets/divyabhavana/synthetic-healthcare-dataset",
    "https://www.kaggle.com/datasets/shashwatwork/wellbeing-of-healthcare-professionals-in-india",
    "https://www.kaggle.com/datasets/simran726/healthcare-patient-monitoring-data",
    "https://www.kaggle.com/datasets/kapoorprakhar/cardio-health-risk-assessment-dataset",
    "https://www.kaggle.com/datasets/rabieelkharoua/predict-liver-disease-1700-records-dataset",
    "https://www.kaggle.com/datasets/zsinghrahulk/aids-clinical-trials-group-study-175",
    "https://www.kaggle.com/datasets/willianoliveiragibin/annual-cause-death-numbers",
    "https://www.kaggle.com/datasets/samira1992/diabetes-intermediate-dataset",
    "https://www.kaggle.com/datasets/sresthajain/medical-diagnosis-dataset",
    "https://www.kaggle.com/datasets/lingyoungloon/who-healthcare-systems",
    "https://www.kaggle.com/datasets/mayankgupta96/healthcare",
    "https://www.kaggle.com/datasets/andrewmvd/fetal-health-classification",
    "https://www.kaggle.com/datasets/zain280/diabeties-dataset",
    "https://www.kaggle.com/datasets/hasaanrana/diet-exercise-and-pcos-insights",
    "https://www.kaggle.com/datasets/ankushpanday1/pancreatic-cancer-prediction-dataset",
    "https://www.kaggle.com/datasets/ashaychoudhary/anxiety-attack-factors-symptoms-and-severity",
    "https://www.kaggle.com/datasets/godfatherfigure/healthcare-dataset-stroke-data",
    "https://www.kaggle.com/datasets/awais8765/healthcare-diabetes",
    "https://www.kaggle.com/datasets/mahad049/heart-health-stats-dataset",
    "https://www.kaggle.com/datasets/hhs/health-insurance",
    "https://www.kaggle.com/datasets/prashikmeshram37/healthcare-dataset",
    "https://www.kaggle.com/datasets/samira1992/countries-intermediate-dataset",
    "https://www.kaggle.com/datasets/hasibur013/bangladesh-hospital-dataset",
    "https://www.kaggle.com/datasets/rajagrawal7089/healthcare",
    "https://www.kaggle.com/datasets/joymarhew/medical-reccomadation-dataset",
    "https://www.kaggle.com/datasets/mattop/burger-king-menu-nutrition-data",
    "https://www.kaggle.com/datasets/ankushpanday1/diabetes-prediction-in-india-dataset",
    "https://www.kaggle.com/datasets/jatinthakur706/copd-asthma-patient-dataset",
    "https://www.kaggle.com/datasets/gyanashish/healthcare-diabetes",
    "https://www.kaggle.com/datasets/omegasaransh12/reproductive-childhealthcare",
    "https://www.kaggle.com/datasets/splcher/adverse-hospital-events-in-california",
    "https://www.kaggle.com/datasets/ulrikthygepedersen/life-expectancy",
    "https://www.kaggle.com/datasets/danevans/world-bank-wdi-212-health-systems",
    "https://www.kaggle.com/datasets/nvlkumar/healthcare-diabetes",
    "https://www.kaggle.com/datasets/waqi786/brain-tumor-dataset",
    "https://www.kaggle.com/datasets/shriyashjagtap/heart-attack-risk-assessment-dataset",
    "https://www.kaggle.com/datasets/mathurinache/invehicle-coupon-recommendation",
    "https://www.kaggle.com/datasets/rasooljader/gestational-diabetes",
    "https://www.kaggle.com/datasets/noeyislearning/framingham-heart-study",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=40883",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=42865",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=1464",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=40922",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=1509",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=1523",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=4153",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=960",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&sort=match&id=43150",
    "https://openml.org/search?type=data&status=any&tags.tag=healthcare&id=1464",
    "https://openml.org/search?type=data&status=any&sort=match&id=43781",
    "https://openml.org/search?type=data&status=any&sort=match&id=43827",
    "https://openml.org/search?type=data&status=any&sort=match&id=42738",
    "https://openml.org/search?type=data&status=any&sort=match&id=43896",
    "https://openml.org/search?type=data&status=any&sort=match&id=43897",
    "https://openml.org/search?type=data&status=any&sort=match&id=43457",
    "https://openml.org/search?type=data&status=any&sort=match&id=42878",
    "https://openml.org/search?type=data&status=any&sort=match&id=46101",
    "https://openml.org/search?type=data&status=any&sort=match&id=46076",
    "https://openml.org/search?type=data&status=any&sort=match&id=43414",
    "https://huggingface.co/datasets/naabiil/Obesity_Levels_Estimation",
    "https://huggingface.co/datasets/aai510-group1/telco-customer-churn",
    "https://huggingface.co/datasets/imodels/credit-card",
    "https://huggingface.co/datasets/mstz/heart_failure",
    "https://huggingface.co/datasets/Einstellung/demo-salaries",
    "https://huggingface.co/datasets/katossky/wine-recognition",
    "https://huggingface.co/datasets/Ozziey/poems_dataset",
    "https://huggingface.co/datasets/wwydmanski/wisconsin-breast-cancer",
    "https://huggingface.co/datasets/wwydmanski/tabular-letter-recognition",
    "https://huggingface.co/datasets/mstz/heloc",
    "https://huggingface.co/datasets/mstz/student_performance",
    "https://huggingface.co/datasets/mstz/compas",
    "https://huggingface.co/datasets/mstz/german",
    "https://huggingface.co/datasets/mstz/annealing",
    "https://huggingface.co/datasets/mstz/sonar",
]

# Data-issue statistics collected across datasets. Every leaf is a list that
# the processing loop appends one entry to per dataset it finishes.
aggregated_results = {
    "outliers": {split: [] for split in ("train", "val", "test")},
    "label_issues": {split: [] for split in ("train", "val", "test")},
    "near_duplicates": {pair: [] for pair in ("tvc", "ttc", "vtc", "genc")},
    "leakage_correlation": [],
    "data_balance": [],
}

aggregated_metrics_original: Dict[str, Dict[str, List[float | str]]] = {
    "binary": {
        "dataset_name": [],
        "count": [],
        "accuracy": [],
        "balanced_accuracy": [],
        "f1": [],
        "precision": [],
        "sensitivity/recall": [],
        "selection rate": [],
        "specificity/selectivity": [],
        "false positive rate": [],
        "false negative rate": [],
        "MCC": [],
        "demographic parity difference": [],
        "demographic parity ratio": [],
        "equalized odds difference": [],
        "equalized odds ratio": [],
        "equal opportunity difference": [],
        "equal opportunity ratio": [],
    },
    "multiclass": {
        "dataset_name": [],
        "count": [],
        "accuracy": [],
        "balanced_accuracy": [],
        "f1": [],
        "precision": [],
        "sensitivity/recall": [],
        "MCC": [],
    },
    "regression": {
        "dataset_name": [],
        "count": [],
        "MAE": [],
        "MSE": [],
        "RMSE": [],
        "MdAE": [],
        "MAPE": [],
        "R2": [],
    },
}

aggregated_metrics_cleaned: Dict[str, Dict[str, List[float | str]]] = {
    "binary": {
        "dataset_name": [],
        "count": [],
        "accuracy": [],
        "balanced_accuracy": [],
        "f1": [],
        "precision": [],
        "sensitivity/recall": [],
        "selection rate": [],
        "specificity/selectivity": [],
        "false positive rate": [],
        "false negative rate": [],
        "MCC": [],
        "demographic parity difference": [],
        "demographic parity ratio": [],
        "equalized odds difference": [],
        "equalized odds ratio": [],
        "equal opportunity difference": [],
        "equal opportunity ratio": [],
    },
    "multiclass": {
        "dataset_name": [],
        "count": [],
        "accuracy": [],
        "balanced_accuracy": [],
        "f1": [],
        "precision": [],
        "sensitivity/recall": [],
        "MCC": [],
    },
    "regression": {
        "dataset_name": [],
        "count": [],
        "MAE": [],
        "MSE": [],
        "RMSE": [],
        "MdAE": [],
        "MAPE": [],
        "R2": [],
    },
}


def update_metrics_for_type(
    problem_type: str,
    original_rows: Iterator[Dict[str, Any]],
    cleaned_rows: Iterator[Dict[str, Any]],
    dataset_path: str,
    metrics_original: Dict[str, Dict[str, List[float | str]]],
    metrics_cleaned: Dict[str, Dict[str, List[float | str]]],
) -> None:
    for original, cleaned in zip(original_rows, cleaned_rows):
        metrics_original[problem_type][original["metric"]].append(original["value"])
        metrics_cleaned[problem_type][cleaned["metric"]].append(cleaned["value"])

    metrics_original[problem_type]["dataset_name"].append(str(dataset_path))
    metrics_cleaned[problem_type]["dataset_name"].append(str(dataset_path))


def update_aggregate_metrics(
    original_metric: Dict[str, Dict[str, pl.DataFrame]],
    automl_data: AutoMLData,
    cleaned_metric: Dict[str, Dict[str, pl.DataFrame]],
) -> None:
    """Validate one dataset's metrics and add them to the module-level aggregates.

    Only the ``"overall"`` frame of the first entry of each metrics dict is
    used. The aggregates are updated only if every check passes; otherwise a
    diagnostic line is printed and the function returns without changing
    anything. The checks, in order:

    1. none of the three arguments is ``None``;
    2. neither metrics dict is empty;
    3. the ``"metric"`` columns of the original and cleaned frames are equal;
    4. every ``"value"`` in both frames is an ``int`` or ``float``;
    5. ``automl_data["problem_type"]`` is ``"binary"``, ``"multiclass"`` or
       ``"regression"``.

    Args:
        original_metric: Metrics of the model trained on the original data.
        automl_data: Supplies ``"problem_type"`` and ``"path"`` for the row.
        cleaned_metric: Metrics of the model trained on the cleaned data.
    """
    # Report missing inputs by argument name (the value itself is always None).
    named_inputs = {
        "original_metric": original_metric,
        "cleaned_metric": cleaned_metric,
        "automl_data": automl_data,
    }
    missing = [name for name, value in named_inputs.items() if value is None]
    if missing:
        for name in missing:
            print(f"🚨 calculating metrics: Input {name} is None")
        return

    # An empty dict has no first entry to read "overall" from.
    if not original_metric or not cleaned_metric:
        print("🚨 calculating metrics: Metrics dictionary is empty")
        return

    original_overall = next(iter(original_metric.values()))["overall"]
    cleaned_overall = next(iter(cleaned_metric.values()))["overall"]

    # Rows are paired positionally later on, so both frames must list the same
    # metrics in the same order.
    if not original_overall["metric"].equals(cleaned_overall["metric"]):
        print("🚨 calculating metrics: Original and cleaned metrics are not the same")
        return

    for df, name in [(original_overall, "original"), (cleaned_overall, "cleaned")]:
        for row in df.iter_rows(named=True):
            metric_name = row["metric"]
            metric_value = row["value"]
            # None is rejected here as well: it is neither float nor int.
            if not isinstance(metric_value, (float, int)):
                print(f"🚨 calculating metrics: Metric {metric_name} contains invalid values in {name} metrics")
                return

    if automl_data["problem_type"] not in ["binary", "multiclass", "regression"]:
        print("🚨 calculating metrics: Invalid problem_type")
        return

    update_metrics_for_type(
        automl_data["problem_type"],
        original_overall.iter_rows(named=True),
        cleaned_overall.iter_rows(named=True),
        str(automl_data["path"]),
        aggregated_metrics_original,
        aggregated_metrics_cleaned,
    )
    print(f"✅ Successfully updated metrics for {automl_data['path']}")


# Process every dataset end to end: load -> AutoML on the original data ->
# profile -> fairness metrics -> AutoML on the cleaned data -> fairness metrics
# -> aggregate. Each intermediate artefact is cached as a pickle inside the
# dataset's directory and reused when it already exists.
# NOTE: the pickles read here are cache files written by this script itself.
# Unpickling can execute arbitrary code, so never run this against a data
# directory that comes from an untrusted source.
for d in tqdm(datasets):
    # Reset per iteration so the error handler never sees an unbound name (first
    # dataset) or the previous dataset's directory (later datasets).
    data_path = None
    try:
        # Get Raw Data
        data = get_dataset(d)
        data_path = data["path"]
        if os.path.exists(f"{data_path}/raw_data.pkl"):
            with open(f"{data_path}/raw_data.pkl", "rb") as f:
                data = pickle.load(f)
        else:
            with open(f"{data_path}/raw_data.pkl", "wb") as f:
                pickle.dump(data, f)

        print(f"Target: {data['target']}")
        print(f"Sensitive: {data['sensitive_features']}")
        # Preprocess Data
        proccessed_data = autoML_prep(data)
        # Run AutoML 1 (model trained on the original data)
        if os.path.exists(f"{data_path}/automl_original.pkl"):
            print("Loading original AutoML")
            with open(f"{data_path}/automl_original.pkl", "rb") as f:
                original_automl = pickle.load(f)
            original_automl_data = original_automl.auto_ml_data
        else:
            print("Running original AutoML")
            original_automl = AutoMLModel(proccessed_data, time_limit=300, preset="good", load=False, verbosity=0)
            original_automl.run_auto_ml()
            original_automl_data = original_automl.auto_ml_data
            # NOTE(review): the probabilities computed here are recomputed (or
            # never read) by the profiling step below. Kept because
            # predict_proba runs before the model is pickled and may affect the
            # pickled state — confirm it is pure, then drop this duplicate.
            if original_automl_data["problem_type"] != "regression":
                original_train_pred = original_automl.predict_proba("train")
                original_test_pred = original_automl.predict_proba("test")
                combined_prob_original = pl.concat([original_train_pred, original_test_pred], how="vertical").to_numpy()
            else:
                combined_prob_original = None
            with open(f"{data_path}/automl_original.pkl", "wb") as f:
                pickle.dump(original_automl, f)

        # Profile Data
        if os.path.exists(f"{data_path}/data_summary.pkl"):
            with open(f"{data_path}/data_summary.pkl", "rb") as f:
                data_summary = pickle.load(f)
            profiled_data = data_summary.export()
        else:
            # Predicted probabilities (train followed by test) are handed to
            # the profiler for non-regression problems only.
            if original_automl_data["problem_type"] != "regression":
                original_train_pred = original_automl.predict_proba("train")
                original_test_pred = original_automl.predict_proba("test")
                combined_prob_original = pl.concat([original_train_pred, original_test_pred], how="vertical").to_numpy()
            else:
                combined_prob_original = None
            if combined_prob_original is None:
                data_summary = DataSummary(proccessed_data)
            else:
                data_summary = DataSummary(proccessed_data, pred_probs=combined_prob_original)
            data_summary.create_summary()
            profiled_data = data_summary.export()
            with open(f"{data_path}/data_summary.pkl", "wb") as f:
                pickle.dump(data_summary, f)

        # Metrics 1 (original model)
        if os.path.exists(f"{data_path}/original_metrics.pkl"):
            with open(f"{data_path}/original_metrics.pkl", "rb") as f:
                original_metrics = pickle.load(f)
        else:
            fas_original = FairnessAssessor(original_automl_data)
            fas_original.analyze_all(intersections=False, feature_type="only one")
            original_metrics = fas_original.get_all_metrics()
            with open(f"{data_path}/original_metrics.pkl", "wb") as f:
                pickle.dump(original_metrics, f)
        # Run AutoML 2 (model trained on the cleaned data)
        if os.path.exists(f"{data_path}/automl_cleaned.pkl"):
            with open(f"{data_path}/automl_cleaned.pkl", "rb") as f:
                cleaned_automl = pickle.load(f)
            cleaned_automl_data = cleaned_automl.auto_ml_data
        else:
            print("Running cleaned AutoML")
            cleaned_dict = data_summary.solve_issues(return_results=True, consider_near_duplicates=False)
            # Replace only the splits for which a cleaned version was returned.
            if cleaned_dict["train"] is not None:
                data["train"] = cleaned_dict["train"]
            if cleaned_dict["val"] is not None:
                data["val"] = cleaned_dict["val"]
            if cleaned_dict["test"] is not None:
                data["test"] = cleaned_dict["test"]
            cleaned_proccessed_data = autoML_prep(data)
            cleaned_automl = AutoMLModel(cleaned_proccessed_data, time_limit=300, preset="good", load=False, verbosity=0)
            cleaned_automl.run_auto_ml()
            cleaned_automl_data = cleaned_automl.auto_ml_data
            with open(f"{data_path}/automl_cleaned.pkl", "wb") as f:
                pickle.dump(cleaned_automl, f)
        # Metrics 2 (cleaned model)
        if os.path.exists(f"{data_path}/cleaned_metrics.pkl"):
            with open(f"{data_path}/cleaned_metrics.pkl", "rb") as f:
                cleaned_metrics = pickle.load(f)
        else:
            fas_cleaned = FairnessAssessor(cleaned_automl_data)
            fas_cleaned.analyze_all(intersections=False, feature_type="only one")
            cleaned_metrics = fas_cleaned.get_all_metrics()
            with open(f"{data_path}/cleaned_metrics.pkl", "wb") as f:
                pickle.dump(cleaned_metrics, f)
        update_aggregate_metrics(original_metrics, original_automl_data, cleaned_metrics)
        issue_summary = data_summary.quick_summary()
        print(issue_summary["string_output"])

        with open(f"{data_path}/statistics.pkl", "wb") as f:
            pickle.dump(issue_summary, f)
        # Aggregate Results
        for key in ["train", "val", "test"]:
            aggregated_results["outliers"][key].append(issue_summary["outliers"].get(key, 0))
        for key in ["train", "val", "test"]:
            aggregated_results["label_issues"][key].append(issue_summary["label_issues"].get(key, 0))
        for key in ["tvc", "ttc", "vtc", "genc"]:
            aggregated_results["near_duplicates"][key].append(issue_summary["near_duplicates"].get(key, 0))
        aggregated_results["leakage_correlation"].append(issue_summary.get("leakage_correlation", 0))
        aggregated_results["data_balance"].append(issue_summary.get("data_balance", 0))

    except Exception as e:
        # Deliberate best-effort boundary: one failing dataset must not stop
        # the whole benchmark run.
        print(f"🚨 Skipping Dataset: {e}")
        # Without a dataset directory (get_dataset itself failed) there is
        # nowhere to log; the message above is then the only record.
        if data_path is not None:
            with open(f"{data_path}/errors.txt", "a") as error_file:
                # One error per line so repeated runs stay readable.
                error_file.write(f"{e}\n")

# Persist the three aggregate structures as pickles in the current working
# directory, one file per structure.
for output_name, payload in (
    ("aggregated_results.pkl", aggregated_results),
    ("aggregated_metrics_original.pkl", aggregated_metrics_original),
    ("aggregated_metrics_cleaned.pkl", aggregated_metrics_cleaned),
):
    with open(output_name, "wb") as f:
        pickle.dump(payload, f)