In [15]:
import numpy as np
import pandas as pd

# Pair the SAPIENs validation-sample sequences with their targets and export
# them as a tab-separated file in the current working directory.
# NOTE(review): the frame is named `df_train`, but both arrays loaded here are
# the *validation sample* files — confirm the name is intentional.
validation_sequences = np.load("App/datasets/dataset51_hoellerer_dnarna/SAPIENs/data/sequences_validation_sample.npy")
validation_targets = np.load("App/datasets/dataset51_hoellerer_dnarna/SAPIENs/data/targets_validation_sample.npy")

df_train = pd.DataFrame(
    {
        "sequences": validation_sequences,
        "target": validation_targets,
    }
)
df_train.to_csv("sequences_validation_sample.tsv", sep="\t", index=False)

In [13]:
# Pair the SAPIENs test-set sequences with their targets and export them as a
# tab-separated file in the current working directory.
test_sequences = np.load("App/datasets/dataset51_hoellerer_dnarna/SAPIENs/data/sequences_test.npy")
test_targets = np.load("App/datasets/dataset51_hoellerer_dnarna/SAPIENs/data/targets_test.npy")

df_test = pd.DataFrame(
    {
        "sequences": test_sequences,
        "target": test_targets,
    }
)
df_test.to_csv("sequences_test.tsv", sep="\t", index=False)

In [35]:
import polars as pl
import pandas as pd
import os

# Render up to 50 rows when a polars table is displayed.
pl.Config(tbl_rows=50)

full_datasets_path = "App/datasets"
metrics_filename = "training_kfold(10)_metrics.csv"

# Every sub-directory of the datasets root is treated as one dataset. Sorting
# makes the traversal order independent of the filesystem.
datasets_list = sorted(
    os.path.join(full_datasets_path, item)
    for item in os.listdir(full_datasets_path)
    if os.path.isdir(os.path.join(full_datasets_path, item))
)

# Collect one pandas frame per run and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
run_frames = []
for dataset_path in datasets_list:
    experiments_folder = os.path.join(dataset_path, "runs")
    if not os.path.isdir(experiments_folder):
        continue

    for run_name in sorted(os.listdir(experiments_folder)):
        run_folder = os.path.join(experiments_folder, run_name)
        metrics_file = os.path.join(run_folder, metrics_filename)
        if not os.path.isfile(metrics_file):
            # Stray files (e.g. checkpoints) or unfinished runs have no metrics
            # CSV; report and skip them instead of aborting the whole summary.
            print(f"Skipping {run_folder}: no {metrics_filename}")
            continue

        df_metrics_run = pd.read_csv(metrics_file)
        # Tag each row with its dataset folder name (portable across OS separators).
        df_metrics_run["dataset"] = os.path.basename(dataset_path)
        run_frames.append(df_metrics_run)

if not run_frames:
    raise FileNotFoundError(
        f"No '{metrics_filename}' files found under {full_datasets_path}/*/runs/*"
    )

df_metrics_runs = pd.concat(run_frames, ignore_index=True)

# Keep only the metric value columns: drop the dataset label and any column
# whose name contains "std".
metric_columns = [
    column
    for column in df_metrics_runs.columns
    if column != "dataset" and "std" not in column
]

# Per dataset, summarise every metric across runs as the string "mean ± std"
# (both rounded to 3 decimals). A metric that is null for all runs of a dataset
# stays null.
df_metrics = (
    pl.from_pandas(df_metrics_runs[["dataset"] + metric_columns])
    .group_by("dataset")
    .agg(
        [
            (
                pl.col(col).mean().round(3).cast(pl.Utf8)
                + " ± "
                + pl.col(col).std().round(3).cast(pl.Utf8)
            ).alias(col)
            for col in metric_columns
        ]
    )
)

# Order rows by the number embedded in the folder name ("dataset<N>_...") so
# that dataset10 follows dataset9 rather than dataset1.
df_sorted = (
    df_metrics
    .with_columns(
        pl.col("dataset").str.extract(r"dataset(\d+)").cast(pl.Int64).alias("dataset_num")
    )
    .sort("dataset_num")
    .drop("dataset_num")
)

df_sorted

dataset,ACC,MCC,F1,balanced_ACC,kappa,gmean,F1_micro,F1_macro,F1_w
str,str,str,str,str,str,str,str,str,str
"""dataset1_zhang_protein""","""0.894 ± 0.008""","""0.703 ± 0.024""","""0.763 ± 0.02""","""0.835 ± 0.015""","""0.696 ± 0.025""","""0.824 ± 0.017""",,,
"""dataset2_phasit_protein""","""0.804 ± 0.011""","""0.612 ± 0.022""","""0.803 ± 0.012""","""0.804 ± 0.011""","""0.607 ± 0.023""","""0.801 ± 0.012""",,,
"""dataset3_lin_dnarna""","""0.788 ± 0.006""","""0.527 ± 0.012""","""0.683 ± 0.007""","""0.758 ± 0.005""","""0.525 ± 0.012""","""0.751 ± 0.006""",,,
"""dataset4_li_protein""","""0.918 ± 0.01""","""0.834 ± 0.021""","""0.935 ± 0.008""","""0.906 ± 0.012""","""0.825 ± 0.022""","""0.901 ± 0.014""",,,
"""dataset5_zhao_protein""","""0.909 ± 0.001""","""0.079 ± 0.01""","""0.075 ± 0.009""","""0.516 ± 0.002""","""0.053 ± 0.007""","""0.206 ± 0.014""",,,
"""dataset6_han_dnarna""","""0.967 ± 0.001""","""0.935 ± 0.002""","""0.967 ± 0.001""","""0.967 ± 0.001""","""0.935 ± 0.002""","""0.967 ± 0.001""",,,
"""dataset7_han_dnarna""","""0.914 ± 0.002""","""0.83 ± 0.004""","""0.916 ± 0.002""","""0.914 ± 0.002""","""0.828 ± 0.004""","""0.914 ± 0.002""",,,
"""dataset8_meng_dnarna""","""0.973 ± 0.0""","""0.946 ± 0.001""","""0.972 ± 0.0""","""0.973 ± 0.0""","""0.945 ± 0.001""","""0.973 ± 0.0""",,,
"""dataset9_lv_dnarna""","""0.903 ± 0.011""","""0.812 ± 0.023""","""0.899 ± 0.012""","""0.903 ± 0.011""","""0.806 ± 0.023""","""0.9 ± 0.011""",,,
"""dataset10_lv_dnarna""","""0.999 ± 0.003""","""0.998 ± 0.006""","""0.999 ± 0.004""","""0.999 ± 0.003""","""0.998 ± 0.007""","""0.999 ± 0.003""",,,


In [None]:
# Single-row table for dataset1 with the same metric columns as the run summary.
# Only ACC carries a value; every other metric is left as None.
# NOTE(review): presumably these are figures reported in the source
# publication — confirm where 0.871 comes from.
unfilled_metrics = [
    "MCC",
    "F1",
    "balanced_ACC",
    "kappa",
    "gmean",
    "F1_micro",
    "F1_macro",
    "F1_w",
]

paper_columns = {"dataset": ["dataset1_zhang_protein"], "ACC": [0.871]}
paper_columns.update({metric: [None] for metric in unfilled_metrics})

df_papers = pl.DataFrame(paper_columns)
df_papers

dataset,ACC,MCC,F1,balanced_ACC,kappa,gmean,F1_micro,F1_macro,F1_w
str,null,null,null,null,null,null,null,null,null
"""dataset1_zhang_protein""",,,,,,,,,
