In [6]:
import polars as pl
import numpy as np
import pandas as pd
import math
from typing import Optional
import re
import os

# Convert the raw feature-importance dump of runs 1-10 into a tab-separated
# table with "Feature" and "Importance" columns.
# NOTE: the table is written back over the raw file, so this loop has to be
# safe to execute more than once (see the guard before the write).
for i in range(1, 11):
    path = f"App/datasets/dataset60_valeri_dnarna_1/runs/run_{i}"
    importance_path = os.path.join(path, "feature_importance.csv")

    # Read the file
    with open(importance_path) as f:
        lines = f.readlines()

    features = []
    importances = []

    for line in lines:
        # Feature name: the text inside the parentheses that follow "Feature".
        feature_match = re.search(r'Feature\s*\((.*?)\)', line)
        # Importance: the number inside the parentheses that end the line.
        # An optional exponent is accepted so values such as 1e-05 are kept
        # instead of being silently dropped.
        importance_match = re.search(
            r'\(([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)\)\s*$', line
        )

        if feature_match and importance_match:
            features.append(feature_match.group(1))
            importances.append(float(importance_match.group(1)))

    if not features:
        # No line has the raw "Feature (...) ... (value)" layout - typically
        # because the file was already converted by a previous execution.
        # Writing now would replace it with a header-only table.
        print(f"Skipping {importance_path}: no raw feature lines found.")
        continue

    df = pd.DataFrame({
        "Feature": features,
        "Importance": importances
    })

    df.to_csv(importance_path, sep='\t', index=False)

In [None]:
def calculate_metrics_from_confusion_matrix(matrix_path):
    """
    Calculates ACC, Sn, Sp, F1, and MCC from a binary confusion matrix CSV.

    The CSV is read with its first column as the row index. Both the row index
    and the columns must contain the labels "negative" and "positive"; the
    cell at row "negative" / column "positive" is used as FP and the cell at
    row "positive" / column "negative" as FN.

    Args:
        matrix_path: Path of the confusion-matrix CSV file.

    Returns:
        dict with the keys "ACC_test", "Sn_test", "Sp_test", "F1_test" and
        "MCC_test". Sn, Sp, F1 and MCC are 0 when their denominator is zero.
        Every value is None when the file has four or more columns, which is
        treated as "not a binary matrix".

    Raises:
        KeyError: if the "negative"/"positive" labels are missing.
    """
    df = pd.read_csv(matrix_path, index_col=0)

    if len(df.columns) >= 4:
        return {
            "ACC_test": None,
            "Sn_test": None,
            "Sp_test": None,
            "F1_test": None,
            "MCC_test": None,
        }

    # Work in float64: pandas hands the counts back as np.int64, and the
    # four-way product in the MCC denominator silently overflows int64 once
    # the margins reach roughly 1e5 samples each.
    TN = np.float64(df.loc["negative", "negative"])
    FP = np.float64(df.loc["negative", "positive"])
    FN = np.float64(df.loc["positive", "negative"])
    TP = np.float64(df.loc["positive", "positive"])

    # Metrics
    ACC = (TP + TN) / (TP + TN + FP + FN)
    Sn = TP / (TP + FN) if (TP + FN) > 0 else 0  # Sensitivity (Recall)
    Sp = TN / (TN + FP) if (TN + FP) > 0 else 0  # Specificity
    F1 = (2 * TP) / (2 * TP + FP + FN) if (2 * TP + FP + FN) > 0 else 0

    margins = [(TP + FP), (TP + FN), (TN + FP), (TN + FN)]
    if all(margin > 0 for margin in margins):
        MCC = ((TP * TN) - (FP * FN)) / np.sqrt(
            margins[0] * margins[1] * margins[2] * margins[3]
        )
    else:
        # MCC is undefined when any margin is empty; report 0 in that case.
        MCC = 0

    return {
        "ACC_test": ACC,
        "Sn_test": Sn,
        "Sp_test": Sp,
        "F1_test": F1,
        "MCC_test": MCC,
    }

# ---------- Main pipeline ----------
# Collect the training metrics (plus test metrics when available) of runs
# 6-10 of every dataset and summarise them as "mean ± std" per dataset.
pl.Config(tbl_rows=60)

full_datasets_path = "App/datasets"
datasets_list = [
    os.path.join(full_datasets_path, item)
    for item in os.listdir(full_datasets_path)
    if os.path.isdir(os.path.join(full_datasets_path, item))
]

# Every test column is filled for every run so the final select() cannot fail
# on a column that no run happened to provide.
test_metric_names = ["ACC_test", "Sn_test", "Sp_test", "F1_test", "MCC_test", "AUC_test"]

# One frame per run, concatenated once after the loop (concatenating inside
# the loop re-copies the accumulated frame on every iteration).
run_frames = []

for dataset_path in datasets_list:
    experiments_folder = os.path.join(dataset_path, "runs")
    if not os.path.isdir(experiments_folder):
        continue

    for run_name in os.listdir(experiments_folder):
        run_folder = os.path.join(experiments_folder, run_name)

        # Entries without a number (hidden files, notes, ...) are not runs.
        run_match = re.search(r"\d+", run_name)
        if run_match is None:
            continue
        run_num = int(run_match.group())

        # Only runs numbered 6 to 10 are aggregated.
        if not 6 <= run_num <= 10:
            continue

        # --- Load training metrics ---
        train_path = os.path.join(run_folder, "training_kfold(10)_metrics.csv")
        if not os.path.exists(train_path):
            continue

        df_metrics_run = pd.read_csv(train_path)
        df_metrics_run["dataset"] = os.path.basename(dataset_path)

        # --- Load test confusion matrix & compute metrics ---
        test_confusion_path = os.path.join(run_folder, "test_confusion_matrix.csv")
        test_other_path = os.path.join(run_folder, "metrics_other.csv")
        if os.path.exists(test_confusion_path) and os.path.exists(test_other_path):
            test_metrics = calculate_metrics_from_confusion_matrix(test_confusion_path)
            for k, v in test_metrics.items():
                df_metrics_run[k] = v

            df_metrics_run["AUC_test"] = (
                pl.read_csv(test_other_path)
                .filter(pl.col("Metric") == "AUC")["Value"]
                .item()
            )
        else:
            # If missing, fill with NaN
            for k in test_metric_names:
                df_metrics_run[k] = np.nan

        run_frames.append(df_metrics_run)

df_metrics = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame()

# ---------- Sorting and Polars conversion ----------
# Columns whose name contains "std" are dropped; the std reported below is
# recomputed across runs.
metric_columns = [
    column for column in df_metrics.columns.drop(["dataset"]).tolist() if "std" not in column
]
df_metrics = pl.from_pandas(df_metrics[["dataset"] + metric_columns]).sort(by=["dataset"])

df_metrics = df_metrics.group_by("dataset").agg(
    [
        (pl.col(col).mean().round(3).cast(pl.Utf8) + " ± " + 
         pl.col(col).std().round(3).cast(pl.Utf8)).alias(col)
        for col in df_metrics.columns if col != "dataset"
    ]
)

# Optional sorting by dataset number if datasets are named like "dataset1", "dataset2", etc.
df_sorted = (
    df_metrics.with_columns(
        pl.col("dataset").str.extract(r"dataset(\d+)").cast(pl.Int64).alias("dataset_num")
    )
    .sort("dataset_num")
    .drop("dataset_num")
)

df_final = df_sorted.select(["dataset", "Sn", "Sp", "ACC", "MCC", "AUC", "Sn_test", "Sp_test", "ACC_test", "MCC_test", "AUC_test"])
df_final

dataset,Sn,Sp,ACC,MCC,AUC,Sn_test,Sp_test,ACC_test,MCC_test,AUC_test
str,str,str,str,str,str,str,str,str,str,str
"""dataset1_liu_protein_0""","""0.972 ± 0.0""","""0.997 ± 0.0""","""0.972 ± 0.0""","""0.945 ± 0.0""","""0.98 ± 0.0""",,,,,
"""dataset2_yu_protein_0""","""0.86 ± 0.006""","""0.855 ± 0.018""","""0.86 ± 0.006""","""0.723 ± 0.011""","""0.926 ± 0.004""","""0.839 ± 0.025""","""0.841 ± 0.04""","""0.84 ± 0.03""","""0.681 ± 0.061""","""0.922 ± 0.013"""
"""dataset3_li_protein_0""","""0.913 ± 0.024""","""0.932 ± 0.034""","""0.913 ± 0.024""","""0.829 ± 0.049""","""0.97 ± 0.009""",,,,,
"""dataset4_charoenkwan_protein_0""","""0.922 ± 0.004""","""0.938 ± 0.006""","""0.922 ± 0.004""","""0.844 ± 0.009""","""0.975 ± 0.002""","""0.909 ± 0.006""","""0.946 ± 0.014""","""0.928 ± 0.009""","""0.856 ± 0.018""","""0.973 ± 0.006"""
"""dataset5_agrawal_protein_0""","""0.768 ± 0.008""","""0.807 ± 0.003""","""0.768 ± 0.008""","""0.54 ± 0.015""","""0.85 ± 0.004""","""0.785 ± 0.007""","""0.759 ± 0.009""","""0.772 ± 0.005""","""0.544 ± 0.01""","""0.846 ± 0.003"""
"""dataset6_timmons_protein_0""","""0.92 ± 0.045""","""0.931 ± 0.064""","""0.92 ± 0.045""","""0.844 ± 0.088""","""0.974 ± 0.031""",,,,,
"""dataset7_timmons_protein_0""","""0.938 ± 0.074""","""0.949 ± 0.081""","""0.938 ± 0.073""","""0.877 ± 0.146""","""0.979 ± 0.04""",,,,,
"""dataset8_pinacho_protein_0""","""0.951 ± 0.004""","""0.969 ± 0.004""","""0.951 ± 0.004""","""0.904 ± 0.007""","""0.988 ± 0.001""","""0.939 ± 0.006""","""0.956 ± 0.009""","""0.947 ± 0.004""","""0.895 ± 0.007""","""0.988 ± 0.001"""
"""dataset9_manavalan_protein_0""","""0.805 ± 0.007""","""0.821 ± 0.008""","""0.805 ± 0.007""","""0.611 ± 0.015""","""0.879 ± 0.008""","""0.878 ± 0.007""","""0.824 ± 0.014""","""0.851 ± 0.01""","""0.703 ± 0.02""","""0.924 ± 0.006"""
"""dataset10_charoenkwan_protein_…","""0.983 ± 0.003""","""0.985 ± 0.011""","""0.983 ± 0.003""","""0.966 ± 0.005""","""0.998 ± 0.002""","""0.907 ± 0.128""","""0.991 ± 0.003""","""0.977 ± 0.024""","""0.915 ± 0.087""","""0.989 ± 0.016"""


In [6]:
import re
import math
import polars as pl
import pandas as pd
from typing import Optional, Tuple

# Order in which metric cells appear in a table row: the training metrics
# first, then the matching "*_test" metrics.
METRIC_ORDER = [
    "Sn",
    "Sp",
    "ACC",
    "MCC",
    "AUC",
    "Sn_test",
    "Sp_test",
    "ACC_test",
    "MCC_test",
    "AUC_test",
]

# One number: optional sign, then either a decimal ("0.5", ".5") or an integer.
# The alternation is grouped so the sign applies to both forms; without the
# group a negative integer such as "-5" would be read as "5".
_NUMBER_PATTERN = r"[-+]?(?:\d*\.\d+|\d+)"

float_re = re.compile(_NUMBER_PATTERN)
# Pattern to match mean ± std format (handles both ± and $\pm$);
# group 1 is the mean, group 2 the std.
mean_std_re = re.compile(
    rf"({_NUMBER_PATTERN})\s*(?:±|\$\\pm\$)\s*({_NUMBER_PATTERN})"
)

def find_df_row_for_model(df: "pl.DataFrame", model_num: int) -> Optional["pl.DataFrame"]:
    """Return the row of ``df`` whose "dataset" name belongs to ``model_num``.

    Matching is attempted in two passes over the "dataset" column:

    1. exact: the name contains ``dataset<model_num>`` (case-insensitive) and
       the number is not followed by another digit, so model 1 matches
       "dataset1_liu_protein_0" but not "dataset10_...";
    2. fallback: the name contains the digits of ``model_num`` anywhere.
       This pass is deliberately loose and can pick an unrelated dataset
       (e.g. a trailing "_1"); it is only reached when pass 1 finds nothing.

    Args:
        df: Frame with a "dataset" column.
        model_num: Model / dataset number to look up.

    Returns:
        A single-row slice of ``df`` for the first match, or None when no
        name matches in either pass.
    """
    names = [str(name) for name in df["dataset"].to_list()]

    # A negative look-ahead is used instead of \b: "_" counts as a word
    # character, so \b never matches between the number and the "_" that
    # follows it in names like "dataset1_liu_protein_0".
    exact = re.compile(rf"dataset{model_num}(?!\d)", flags=re.IGNORECASE)
    for idx, name in enumerate(names):
        if exact.search(name):
            return df[idx: idx + 1]

    # Fallback: any dataset that contains the number digits anywhere
    digits = str(model_num)
    for idx, name in enumerate(names):
        if digits in name:
            return df[idx: idx + 1]

    return None

def extract_first_number(token: str) -> Optional[float]:
    """Return the first number found in ``token`` as a float.

    Returns None when ``token`` contains no number (as recognised by
    ``float_re``) or when the matched text cannot be converted.
    """
    m = float_re.search(token)
    if m is None:
        return None
    try:
        return float(m.group(0))
    except ValueError:
        # Only a conversion failure is expected here; anything else should surface.
        return None

def extract_mean_std(token: str) -> Optional[Tuple[float, float]]:
    r"""Extract both mean and std from a string like '0.123 ± 0.045' or '0.123 $\pm$ 0.045'.

    Returns:
        ``(mean, std)`` as floats for the first "mean ± std" occurrence in
        ``token``, or None when ``token`` has no such occurrence or one of
        the two numbers cannot be converted.
    """
    m = mean_std_re.search(token)
    if m is None:
        return None
    try:
        return (float(m.group(1)), float(m.group(2)))
    except ValueError:
        # Only a conversion failure is expected here; anything else should surface.
        return None

def replace_first_number(token: str, replacement: str) -> str:
    """Substitute the first number in ``token`` with ``replacement``.

    Only the first match of ``float_re`` is touched; the rest of ``token``
    (including its spacing) is returned unchanged. ``replacement`` is handed
    to the regex substitution as a replacement template, so a literal
    backslash has to be written as two backslashes.
    """
    substituted, _count = float_re.subn(replacement, token, count=1)
    return substituted

def replace_mean_only(token: str, mean_replacement: str) -> str:
    r"""Swap the mean of the first "mean ± std" occurrence in ``token``.

    The std text is kept exactly as written and the separator is always
    emitted as `` $\pm$ ``, whichever separator the input used.
    ``mean_replacement`` is inserted verbatim (no regex-template processing).
    """
    def _with_new_mean(match):
        # group(2) is the std text captured by mean_std_re.
        return "{} $\\pm$ {}".format(mean_replacement, match.group(2))

    return mean_std_re.sub(_with_new_mean, token, count=1)

def format_value_for_latex(val) -> str:
    r"""Format a metric value as a LaTeX table cell.

    Returns:
        * "-" for None, NaN, or anything that cannot be read as a number;
        * "<mean> $\pm$ <std>" (both with 3 decimals) for a string that
          contains ± or $\pm$ and parses as "mean ± std";
        * the value with 3 decimals otherwise.
    """
    if val is None or (isinstance(val, float) and math.isnan(val)):
        return "-"

    # Handle string values with ± or $\pm$
    if isinstance(val, str) and ("±" in val or r"$\pm$" in val):
        mean_std = extract_mean_std(val)
        if not mean_std:
            return "-"
        mean_val, std_val = mean_std
        return f"{mean_val:.3f} $\\pm$ {std_val:.3f}"

    # Handle single numeric values
    try:
        return f"{float(val):.3f}"
    except (TypeError, ValueError, OverflowError):
        # Not convertible to float: show the cell as missing.
        return "-"

def bold_if_needed(original_token: str, make_bold: bool) -> str:
    r"""Wrap only the mean number in \textbf{} if make_bold is True.

    For a "mean ± std" token only the mean is bolded (reformatted to 3
    decimals) and the separator becomes $\pm$; for a token with a single
    number that number is bolded as written. The token is returned unchanged
    when make_bold is False or when it contains no number.
    """
    if not make_bold:
        return original_token

    mean_std = extract_mean_std(original_token)
    if mean_std:
        mean_val, _std_val = mean_std
        # replace_mean_only inserts its argument verbatim, so the command
        # needs exactly one backslash here; two would come out as "\\textbf",
        # i.e. a LaTeX line break followed by plain text.
        return replace_mean_only(
            original_token,
            "\\textbf{" + f"{mean_val:.3f}" + "}"
        )

    # Single number format
    m = float_re.search(original_token)
    if not m:
        return original_token
    # replace_first_number treats its argument as a regex replacement
    # template, where the doubled backslash collapses to a single one.
    return replace_first_number(original_token, r"\\textbf{" + m.group(0) + r"}")

def build_bio_token(value_str: str) -> str:
    """Build the LaTeX cell text for an already formatted BioAutoML value.

    ``value_str`` is either a formatted number such as "0.123" or the
    missing-value marker "-"; in both cases the cell is the value preceded
    by one space, with no math mode.
    """
    return " -" if value_str == "-" else f" {value_str}"

def fill_table(df_polars: Optional[pl.DataFrame]=None, df_csv_path: Optional[str]=None,
               tex_in="table.tex", tex_out="table_filled.tex"):
    r"""Fill the BioAutoML-FAST rows of a LaTeX table from a metrics dataframe.

    Every line of ``tex_in`` that contains "& BioAutoML-FAST" is rewritten.
    The closest non-blank line above it is used as the reference row; its
    cells after the first two "&"-separated fields are the reference metrics,
    assigned to METRIC_ORDER by position. The model number comes from the
    nearest "Model <n>" marker on the reference row or up to four lines above
    it, and the dataframe row is looked up with find_df_row_for_model.

    A BioAutoML-FAST mean is set in bold when both values and a BioAutoML
    std are available and either the mean is greater than the reference mean
    or the reference mean lies within one std of it. Reference cells are
    never bolded, but the reference row is re-joined with " &" separators.

    Args:
        df_polars: Metrics frame with a "dataset" column and every column of
            METRIC_ORDER. Takes precedence over ``df_csv_path``.
        df_csv_path: CSV file to load the frame from when ``df_polars`` is None.
        tex_in: Path of the LaTeX template to read.
        tex_out: Path the filled table is written to.

    Returns:
        List of ``BioAutoML mean - reference mean`` for every compared cell
        that was NOT bolded. These are plain differences, not percentages.

    Raises:
        ValueError: if neither input is given or required columns are missing.

    Side effects:
        Writes ``tex_out`` and prints "<bold>/<compared>" for the training
        metrics, then for the "*_test" metrics.
    """
    # load dataframe
    if df_polars is None:
        if df_csv_path is None:
            raise ValueError("Provide either df_polars or df_csv_path")
        df = pl.from_pandas(pd.read_csv(df_csv_path))
    else:
        df = df_polars

    # ensure columns present
    missing = [c for c in METRIC_ORDER + ["dataset"] if c not in df.columns]
    if missing:
        raise ValueError(f"Dataframe missing columns: {missing}")

    # read LaTeX
    with open(tex_in, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Counters for how many times BioAutoML-FAST is best
    best_train_count = 0
    best_test_count = 0
    train_count = 0
    test_count = 0

    pct_differences = []

    # process each BioAutoML-FAST row
    for i, line in enumerate(lines):
        if "& BioAutoML-FAST" not in line:
            continue

        # reference row: the closest non-blank line above this one
        j = i - 1
        while j >= 0 and lines[j].strip() == "":
            j -= 1
        if j < 0:
            continue
        ref_line = lines[j].rstrip("\n")
        bio_line = line.rstrip("\n")

        # tokens[0] is the indentation+possibly multirow part; tokens[1] is
        # the reference source (author); the rest are the metric columns
        ref_tokens = ref_line.split("&")
        metric_tokens = ref_tokens[2:]
        # remove trailing '\\' from last token
        if metric_tokens:
            metric_tokens[-1] = metric_tokens[-1].rstrip().rstrip("\\").rstrip()

        # Use as many metric names as the reference row has cells (at most
        # len(METRIC_ORDER)).
        metric_names = METRIC_ORDER[:len(metric_tokens)]

        # Model number: scan from the reference row upwards so the NEAREST
        # "Model <n>" marker wins. Scanning top-down would pick up the
        # previous model's marker whenever it is still inside the window.
        model_num = None
        for idx in range(j, max(0, j - 4) - 1, -1):
            m = re.search(r"Model\s+(\d+)", lines[idx])
            if m:
                model_num = int(m.group(1))
                break

        if model_num is None:
            print(f"Couldn't determine model number for BioAutoML row near line {i}. Skipping.")
            continue

        # locate df row for this model
        df_row = find_df_row_for_model(df, model_num)
        if df_row is None or df_row.height == 0:
            # no data: build Bio row with '-'s and leave ref unchanged
            bio_cells = [" -"] * len(metric_names)
            filled_row = (
                "& BioAutoML-FAST"
                + "".join(" &" + c for c in bio_cells)
                + " \\\\"
            )
            # A callable replacement is inserted verbatim. A replacement
            # *string* is a regex template in which the two backslashes of
            # the LaTeX row terminator would collapse into one.
            new_bio_line = re.sub(
                r"&\s*BioAutoML-FAST.*\\\\", lambda _match: filled_row, bio_line
            )
            lines[i] = new_bio_line + "\n"
            continue

        # extract numeric values from reference tokens (mean only)
        ref_nums = []
        for tok in metric_tokens[:len(metric_names)]:
            mean_std = extract_mean_std(tok)
            if mean_std:
                ref_nums.append(mean_std[0])
            else:
                ref_nums.append(extract_first_number(tok))

        # get Bio values from df_row
        bio_nums = []
        bio_strs = []
        for mn in metric_names:
            v = df_row[mn].to_list()[0]
            if v is None or (isinstance(v, float) and math.isnan(v)):
                bio_nums.append(None)
                bio_strs.append("-")
            elif isinstance(v, str) and ("±" in v or r"$\pm$" in v):
                mean_std = extract_mean_std(v)
                if mean_std:
                    bio_nums.append(mean_std[0])  # Use mean for comparison
                    bio_strs.append(format_value_for_latex(v))
                else:
                    bio_nums.append(None)
                    bio_strs.append("-")
            else:
                try:
                    fv = float(v)
                except (TypeError, ValueError):
                    bio_nums.append(None)
                    bio_strs.append("-")
                else:
                    bio_nums.append(fv)
                    bio_strs.append(format_value_for_latex(fv))

        # Decide bolding: for each column compare ref_nums[idx] vs bio_nums[idx]
        ref_tokens_new = metric_tokens.copy()
        bio_cells = []
        for idx_col, metric_name in enumerate(metric_names):
            rnum = ref_nums[idx_col]
            bnum = bio_nums[idx_col]
            both_present = rnum is not None and bnum is not None

            # Bio std, if the formatted Bio cell has one
            mean_std = extract_mean_std(bio_strs[idx_col])
            bio_std = mean_std[1] if mean_std else None

            bold_bio = False
            bold_ref = False  # std-based bolding is not applied to the reference
            if both_present and bio_std is not None:
                lower = bnum - bio_std
                upper = bnum + bio_std
                if bnum > rnum or lower <= rnum <= upper:
                    bold_bio = True
                else:
                    pct_differences.append(bnum - rnum)

            if both_present:
                if metric_name.endswith("_test"):
                    test_count += 1
                    if bold_bio:
                        best_test_count += 1
                else:
                    train_count += 1
                    if bold_bio:
                        best_train_count += 1

            # update reference token (preserve $\pm$ etc.)
            orig_ref_token = metric_tokens[idx_col]
            if bold_ref:
                ref_tokens_new[idx_col] = bold_if_needed(orig_ref_token, True)
            else:
                ref_tokens_new[idx_col] = orig_ref_token

            # build BioAutoML cell
            bio_token_raw = bio_strs[idx_col]
            if bio_token_raw == "-":
                bio_cell = " -"
            elif not bold_bio:
                bio_cell = " " + bio_token_raw
            else:
                # Handle ± format for bio values - only bold the mean
                mean_std = extract_mean_std(bio_token_raw) if "$\\pm$" in bio_token_raw else None
                if mean_std:
                    mean_val, std_val = mean_std
                    bio_cell = f" \\textbf{{{mean_val:.3f}}} $\\pm$ {std_val:.3f}"
                else:
                    bio_cell = " \\textbf{" + bio_token_raw + "}"
            bio_cells.append(bio_cell)

        # reconstruct the reference line preserving format:
        # ref_tokens are: [indent/multirow, author, metric1, metric2, ...]
        new_ref_parts = [p.rstrip() for p in ref_tokens[:2] + ref_tokens_new]
        new_ref_line = " &".join(new_ref_parts).rstrip()
        # put back trailing \\
        if not new_ref_line.endswith("\\\\"):
            new_ref_line = new_ref_line + " \\\\"
        lines[j] = new_ref_line + "\n"

        # reconstruct bio line: keep the prefix up to "& BioAutoML-FAST",
        # then each cell preceded by ' &'
        before, _sep, _after = bio_line.partition("& BioAutoML-FAST")
        new_bio_cells = "".join(" &" + c for c in bio_cells)
        lines[i] = before + "& BioAutoML-FAST" + new_bio_cells + " \\\\" + "\n"

    # write output
    with open(tex_out, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print(f"{best_train_count}/{train_count}")
    print(f"{best_test_count}/{test_count}")

    print(f"Finished. Output written to {tex_out}")
    return pct_differences

# Fill table.tex from the aggregated metrics; the return value holds the
# (BioAutoML - reference) differences of the cells that were not bolded.
pct_differences = fill_table(
    df_polars=df_final,
    tex_in="table.tex",
    tex_out="table_filled.tex",
)

81/137
66/158
Finished. Output written to table_filled.tex


In [12]:
# Average of the differences returned by fill_table.
np.asarray(pct_differences).mean()

-0.08984459459459457

In [None]:
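# Scratch notes: "<bold>/<compared>" counts in the format printed by fill_table
# (first line: training metrics, second line: test metrics).
# 69/128
# 62/150
#
# 50 TRIALS
#
# 79/137
# 62/158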

In [8]:
import polars as pl
import numpy as np
import pandas as pd
import math
from typing import Optional
import re
import os


# ---------- Main pipeline ----------
# Same aggregation as the classification summary, restricted to runs whose
# training metrics contain a "mean_absolute_error" column.
pl.Config(tbl_rows=60)

full_datasets_path = "App/datasets"
datasets_list = [
    os.path.join(full_datasets_path, item)
    for item in os.listdir(full_datasets_path)
    if os.path.isdir(os.path.join(full_datasets_path, item))
]

# One frame per run, concatenated once after the loop (concatenating inside
# the loop re-copies the accumulated frame on every iteration).
run_frames = []

for dataset_path in datasets_list:
    experiments_folder = os.path.join(dataset_path, "runs")
    if not os.path.isdir(experiments_folder):
        continue

    for run_name in os.listdir(experiments_folder):
        run_folder = os.path.join(experiments_folder, run_name)

        # Entries without a number (hidden files, notes, ...) are not runs.
        run_match = re.search(r"\d+", run_name)
        if run_match is None:
            continue
        run_num = int(run_match.group())

        # Only runs numbered 6 to 10 are aggregated.
        if not 6 <= run_num <= 10:
            continue

        # --- Load training metrics ---
        train_path = os.path.join(run_folder, "training_kfold(10)_metrics.csv")
        if not os.path.exists(train_path):
            continue

        df_metrics_run = pd.read_csv(train_path)
        df_metrics_run["dataset"] = os.path.basename(dataset_path)

        if "mean_absolute_error" in df_metrics_run.columns:
            run_frames.append(df_metrics_run)

df_metrics = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame()

# ---------- Sorting and Polars conversion ----------
# Columns whose name contains "std" are dropped; the std reported below is
# recomputed across runs.
metric_columns = [
    column for column in df_metrics.columns.drop(["dataset"]).tolist() if "std" not in column
]
df_metrics = pl.from_pandas(df_metrics[["dataset"] + metric_columns]).sort(by=["dataset"])

df_metrics = df_metrics.group_by("dataset").agg(
    [
        (pl.col(col).mean().round(3).cast(pl.Utf8) + " ± " + 
         pl.col(col).std().round(3).cast(pl.Utf8)).alias(col)
        for col in df_metrics.columns if col != "dataset"
    ]
)

# Optional sorting by dataset number if datasets are named like "dataset1", "dataset2", etc.
df_sorted = (
    df_metrics.with_columns(
        pl.col("dataset").str.extract(r"dataset(\d+)").cast(pl.Int64).alias("dataset_num")
    )
    .sort("dataset_num")
    .drop("dataset_num")
)
df_sorted

dataset,mean_absolute_error,mean_squared_error,root_mean_squared_error,r2
str,str,str,str,str
"""dataset58_valeri_dnarna_1""","""0.182 ± 0.0""","""0.052 ± 0.0""","""0.228 ± 0.0""","""0.475 ± 0.002"""
"""dataset59_hoellerer_dnarna_1""","""0.078 ± 0.0""","""0.013 ± 0.0""","""0.114 ± 0.0""","""0.849 ± 0.001"""


In [None]:
# Single-row frame for dataset1_zhang_protein: ACC is set, every other metric
# column holds a null placeholder.
df_papers = pl.DataFrame(
    {
        "dataset": ["dataset1_zhang_protein"],
        "ACC": [0.871],
        **{
            metric: [None]
            for metric in (
                "MCC",
                "F1",
                "balanced_ACC",
                "kappa",
                "gmean",
                "F1_micro",
                "F1_macro",
                "F1_w",
            )
        },
    }
)
df_papers

dataset,ACC,MCC,F1,balanced_ACC,kappa,gmean,F1_micro,F1_macro,F1_w
str,null,null,null,null,null,null,null,null,null
"""dataset1_zhang_protein""",,,,,,,,,
