In [27]:
import json
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from tqdm import tqdm

from local_python.feature_evaluation import (
    evaluate_with_knn, evaluate_with_lr, calculate_scores
)
from local_python.general_utils import (
    set_seed,
    load_pd_from_json,
)

In [28]:
csv_source_directory = "../datasets/intermediate-features/"
metric_target_directory = "../runs/"
master_file_name = "master-metrics-remaining.txt"
# previous_file_name = "master-metrics-vit-derma.txt"
# previous_file_name = "master-metrics-vit-plant.txt"
# previous_file_name = "master-metrics-resnet-derma.txt"
# previous_file_name = "master-metrics-resnet-plant.txt"
previous_file_name = "empty.txt"

seed_list = range(100)
max_neighbors = 200
max_iter = 10_000
knn_metric = "cosine"

detail_selection_config = {  # number of samples and seeds
    1: seed_list,
    3: seed_list,
    10: seed_list,
    30: seed_list,
    100: seed_list,
    None: [None],
}

master_selection_config = {
    None: range(10),
}

dataset_prefixes = [
    "DDI-ViT_T16-student",
    # "PAD_UFES_20-ViT_T16-student",
    # "HAM10000-ViT_T16-student",
    # "Fitzpatrick17k-ViT_T16-student",
    # "PlantDoc-ViT_T16-student",
    # "PlantDataset-ViT_T16-student",
    # "Cassava-ViT_T16-student",
    # "PlantVillage-ViT_T16-student",

    "DDI-ResNet50",
    # "PAD_UFES_20-ResNet50",
    # "HAM10000-ResNet50",
    # "Fitzpatrick17k-ResNet50",
    # "PlantDoc-ResNet50",
    # "PlantDataset-ResNet50",
    # "Cassava-ResNet50",
    # "PlantVillage-ResNet50",

    # "DDI-ViT_T16-teacher",
    # "PAD_UFES_20-ViT_T16-teacher",
    # "HAM10000-ViT_T16-teacher",
    # "Fitzpatrick17k-ViT_T16-teacher",
    # "PlantDoc-ViT_T16-teacher",
    # "PlantDataset-ViT_T16-teacher",
    # "Cassava-ViT_T16-teacher",
    # "PlantVillage-ViT_T16-teacher",
]

In [29]:
df_previous = load_pd_from_json(os.path.join(metric_target_directory, previous_file_name)).astype(str)

Read 1 entries from empty.txt


In [30]:
def calculate_scores_of_dataframe(
    df_full, number_of_samples, selection_seed, csv_path, evaluations
):
    df_training = df_full[df_full["set"] == "train"]
    df_devel = df_full[(df_full["set"] == "train") | (df_full["set"] == "valid")]
    df_train, df_valid = train_test_split(
        df_devel, train_size=len(df_training), stratify=df_devel["target_code"], random_state=selection_seed
    )
    df_test = df_full[df_full["set"] == "test"]
    if number_of_samples is not None:
        target_group = df_train.groupby("target_code")
        max_samples_possible = target_group["set"].count().min()
        if max_samples_possible < number_of_samples:
            # abort because this is impossible
            return
        df_train = target_group.sample(
            number_of_samples, random_state=selection_seed, replace=False
        )
    features = (
        df_train.loc[:, ~df_train.columns.isin(["target_code", "set"])],
        df_valid.loc[:, ~df_valid.columns.isin(["target_code", "set"])],
        df_test.loc[:, ~df_test.columns.isin(["target_code", "set"])],
    )
    targets = (
        df_train["target_code"].to_numpy(),
        df_valid["target_code"].to_numpy(),
        df_test["target_code"].to_numpy(),
    )

    best_k_f1_score_average = "macro" if 2 < len(df_full["target_code"].unique()) else "binary"
    entries = []
    for evaluation in evaluations:
        entry = {}
        entry["feature_identifier"] = csv_path
        entry["selection_seed"] = selection_seed
        entry["number_of_samples"] = number_of_samples
        entry["model_name"] = evaluation

        if evaluation == "lr":
            entry = evaluate_with_lr(
                features,
                targets,
                entry,
                seed=selection_seed,
                max_iter=max_iter,
            )
        elif evaluation == "knn":
            entry = evaluate_with_knn(
                features,
                targets,
                entry,
                number_of_samples,
                knn_metric=knn_metric,
                max_neighbors=max_neighbors,
                best_k_f1_score_average=best_k_f1_score_average,
            )
        else:
            print(f"Unknown evaluation method {evaluation}")
        entries.append(entry)
    return entries

def calculate_scores_of_file(csv_path, metric_file_path, sample_selection_config):
    evaluations = ["lr", "knn"]
    df_full = None
    for number_of_samples in sample_selection_config:
        selection_seeds = sample_selection_config[number_of_samples]
        number_of_existing_entries = sum((df_previous["feature_identifier"]==csv_path) & (df_previous["number_of_samples"]==str(number_of_samples)))
        if len(selection_seeds) * len(evaluations) <= number_of_existing_entries:
            continue
        if df_full is None:
            print(f"- Processing: {csv_file}")
            df_full = pd.read_csv(csv_path, index_col=0)
        for selection_seed in tqdm(selection_seeds):
            set_seed(selection_seed, verbose=False)
            entries = calculate_scores_of_dataframe(
                df_full, number_of_samples, selection_seed, csv_path, evaluations
            )
            with open(metric_file_path, "a") as detaillog:
                for entry in entries:
                    json.dump(entry, detaillog, indent=2)


In [31]:
csv_files = set(os.listdir(path=csv_source_directory))
metric_files = set(os.listdir(path=metric_target_directory))

# configuration check
covered_csv_files = set()
covered_metric_files = set()

for dataset_prefix in dataset_prefixes:
    matches_source = [x for x in csv_files if x.startswith(dataset_prefix)]
    if 0 == len(matches_source):
        print(f"Warn: No source file with prefix '{dataset_prefix}' found")
        continue
    else:
        covered_csv_files.update(matches_source)

    matches_target = [x for x in metric_files if x.startswith(dataset_prefix)]
    if 1 < len(matches_target):
        print(
            f"Warn: {len(matches_target)} target files found with prefix '{dataset_prefix}'"
        )
        print(matches_target)

    print(
        f"'{dataset_prefix}' has {len(matches_source)} source files and {len(matches_target)} target files"
    )
    covered_metric_files.update(matches_target)


print(f"Source files {len(csv_files)}")
print(f"Target files {len(metric_files)}")
# uncovered_csv_files = csv_files - covered_csv_files
# uncovered_metric_files = metric_files - covered_metric_files

Warn: No source file with prefix 'DDI-ViT_T16-student' found
'DDI-ResNet50' has 5 source files and 1 target files
Source files 112
Target files 40


In [32]:
metric_file_path = os.path.join(metric_target_directory, master_file_name)
if not os.path.exists(metric_file_path):
    for dataset_prefix in dataset_prefixes: # [:1]
        if "-teacher" in dataset_prefix:
            continue
        dataset_prefix = dataset_prefix.replace(
            "-student", ""
        )
        print(f"Calculating '{dataset_prefix}' to '{metric_file_path}'")
        for i, csv_file in enumerate(csv_files):
            if "-teacher" in csv_file:
                continue
            if csv_file.startswith(dataset_prefix):
                csv_path = os.path.join(csv_source_directory, csv_file)
                calculate_scores_of_file(csv_path, metric_file_path, master_selection_config)

Calculating 'DDI-ViT_T16' to '../runs/master-metrics-remaining.txt'
- Processing: DDI-ViT_T16-Random.csv


  0%|          | 0/10 [00:00<?, ?it/s]

[ True False]
binary





TypeError: 'NoneType' object is not iterable

In [None]:
for dataset_prefix in dataset_prefixes:
    metric_file_path = os.path.join(
        metric_target_directory, f"{dataset_prefix}-metrics.txt"
    )
    if os.path.exists(metric_file_path):
        print(f"Metric file already exists: {metric_file_path}")
        continue

    dataset_prefix = dataset_prefix.replace("-student", "")  # NOTE: dirty workaround
    print(f"Calculating '{dataset_prefix}' to '{metric_file_path}'")
    for i, csv_file in enumerate(csv_files):
        if ("teacher" in csv_file) and (
            "teacher" not in dataset_prefix
        ):  # NOTE: dirty workaround
            continue
        if csv_file.startswith(dataset_prefix):
            csv_path = os.path.join(csv_source_directory, csv_file)
            calculate_scores_of_file(csv_path, metric_file_path, detail_selection_config)

Metric file already exists: ../runs/PlantDoc-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/PlantDataset-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/Cassava-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/PlantVillage-ViT_T16-student-metrics.txt
