In [28]:
import json
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from tqdm import tqdm

from local_python.feature_evaluation import (
    evaluate_with_knn, evaluate_with_lr, calculate_scores
)
from local_python.general_utils import (
    set_seed,
)

In [29]:
csv_source_directory = "../datasets/intermediate-features/"
metric_target_directory = "../runs/"

seed_list = range(100)
max_neighbors = 200
max_iter = 10_000
best_k_f1_score_average = "macro"  # "weighted"
knn_metric = "cosine"

detail_selection_config = {  # number of samples and seeds
    1: seed_list,
    3: seed_list,
    10: seed_list,
    30: seed_list,
    100: seed_list,
    None: [None],
}

master_selection_config = {
    None: range(10),
}

dataset_prefixes = [
    "DDI-ViT_T16-student",
    "PAD_UFES_20-ViT_T16-student",
    "HAM10000-ViT_T16-student",
    "Fitzpatrick17k-ViT_T16-student",
    "PlantDoc-ViT_T16-student",
    "PlantDataset-ViT_T16-student",
    "Cassava-ViT_T16-student",
    "PlantVillage-ViT_T16-student",
    "DDI-ResNet50",
    "PAD_UFES_20-ResNet50",
    "HAM10000-ResNet50",
    "Fitzpatrick17k-ResNet50",
    "PlantDoc-ResNet50",
    "PlantDataset-ResNet50",
    "Cassava-ResNet50",
    "PlantVillage-ResNet50",
    "DDI-ViT_T16-teacher",
    "PAD_UFES_20-ViT_T16-teacher",
    "HAM10000-ViT_T16-teacher",
    "Fitzpatrick17k-ViT_T16-teacher",
    "PlantDoc-ViT_T16-teacher",
    "PlantDataset-ViT_T16-teacher",
    "Cassava-ViT_T16-teacher",
    "PlantVillage-ViT_T16-teacher",
]

In [30]:
def calculate_scores_of_dataframe(
    df_full, number_of_samples, selection_seed, csv_path, evaluations
):
    df_training = df_full[df_full["set"] == "train"]
    df_devel = df_full[(df_full["set"] == "train") | (df_full["set"] == "valid")]
    df_train, df_valid = train_test_split(
        df_devel, train_size=len(df_training), stratify=df_devel["target_code"], random_state=selection_seed
    )
    df_test = df_full[df_full["set"] == "test"]

    if number_of_samples is not None:
        target_group = df_train.groupby("target_code")
        max_samples_possible = target_group["set"].count().min()
        if max_samples_possible < number_of_samples:
            # abort because this is impossible
            return
        df_train = target_group.sample(
            number_of_samples, random_state=selection_seed, replace=False
        )
    features = (
        df_train.loc[:, ~df_train.columns.isin(["target_code", "set"])],
        df_valid.loc[:, ~df_valid.columns.isin(["target_code", "set"])],
        df_test.loc[:, ~df_test.columns.isin(["target_code", "set"])],
    )
    targets = (
        df_train["target_code"].to_numpy(),
        df_valid["target_code"].to_numpy(),
        df_test["target_code"].to_numpy(),
    )

    entries = []
    for evaluation in evaluations:
        entry = {}
        entry["feature_identifier"] = csv_path
        entry["selection_seed"] = selection_seed
        entry["number_of_samples"] = number_of_samples
        entry["model_name"] = evaluation

        if evaluation == "lr":
            entry = evaluate_with_lr(
                features,
                targets,
                entry,
                seed=selection_seed,
                max_iter=max_iter,
            )
        elif evaluation == "knn":
            entry = evaluate_with_knn(
                features,
                targets,
                entry,
                number_of_samples,
                knn_metric=knn_metric,
                max_neighbors=max_neighbors,
                best_k_f1_score_average=best_k_f1_score_average,
            )
        else:
            print(f"Unknown evaluation method {evaluation}")
        entries.append(entry)
    return entries

def calculate_scores_of_file(csv_path, metric_file_path, sample_selection_config):
    df_full = pd.read_csv(csv_path, index_col=0)
    for number_of_samples in sample_selection_config:
        # print(f"number_of_samples: {number_of_samples}")
        selection_seeds = sample_selection_config[number_of_samples]
        for selection_seed in tqdm(selection_seeds):
            set_seed(selection_seed, verbose=False)
            entries = calculate_scores_of_dataframe(
                df_full, number_of_samples, selection_seed, csv_path, ["lr", "knn"]
            )
            with open(metric_file_path, "a") as detaillog:
                for entry in entries:
                    # print(entry)
                    json.dump(entry, detaillog, indent=2)


In [31]:
csv_files = set(os.listdir(path=csv_source_directory))
metric_files = set(os.listdir(path=metric_target_directory))

# configuration check
covered_csv_files = set()
covered_metric_files = set()

for dataset_prefix in dataset_prefixes:
    matches_source = [x for x in csv_files if x.startswith(dataset_prefix)]
    if 0 == len(matches_source):
        print(f"Warn: No source file with prefix '{dataset_prefix}' found")
        continue
    else:
        covered_csv_files.update(matches_source)

    matches_target = [x for x in metric_files if x.startswith(dataset_prefix)]
    if 1 < len(matches_target):
        print(
            f"Warn: {len(matches_target)} target files found with prefix '{dataset_prefix}'"
        )
        print(matches_target)

    print(
        f"'{dataset_prefix}' has {len(matches_source)} source files and {len(matches_target)} target files"
    )
    covered_metric_files.update(matches_target)


print(f"Source files {len(csv_files)}")
print(f"Target files {len(metric_files)}")
# uncovered_csv_files = csv_files - covered_csv_files
# uncovered_metric_files = metric_files - covered_metric_files

Warn: No source file with prefix 'DDI-ViT_T16-student' found
Warn: No source file with prefix 'PAD_UFES_20-ViT_T16-student' found
Warn: No source file with prefix 'HAM10000-ViT_T16-student' found
Warn: No source file with prefix 'Fitzpatrick17k-ViT_T16-student' found
Warn: No source file with prefix 'PlantDoc-ViT_T16-student' found
Warn: No source file with prefix 'PlantDataset-ViT_T16-student' found
Warn: No source file with prefix 'Cassava-ViT_T16-student' found
Warn: No source file with prefix 'PlantVillage-ViT_T16-student' found
'DDI-ResNet50' has 5 source files and 1 target files
'PAD_UFES_20-ResNet50' has 5 source files and 1 target files
'HAM10000-ResNet50' has 5 source files and 1 target files
'Fitzpatrick17k-ResNet50' has 5 source files and 1 target files
'PlantDoc-ResNet50' has 5 source files and 1 target files
'PlantDataset-ResNet50' has 5 source files and 1 target files
'Cassava-ResNet50' has 5 source files and 1 target files
'PlantVillage-ResNet50' has 5 source files and 1

In [32]:
metric_file_path = os.path.join(metric_target_directory, f"master-metrics.txt")
if not os.path.exists(metric_file_path):
    for dataset_prefix in dataset_prefixes: # [:1]
        if "-teacher" in dataset_prefix:
            continue
        dataset_prefix = dataset_prefix.replace(
            "-student", ""
        )
        print(f"Calculating '{dataset_prefix}' to '{metric_file_path}'")
        for i, csv_file in enumerate(csv_files):
            if "-teacher" in csv_file:
                continue
            if csv_file.startswith(dataset_prefix):
                print(f"- Processing: {csv_file}")
                csv_path = os.path.join(csv_source_directory, csv_file)
                calculate_scores_of_file(csv_path, metric_file_path, master_selection_config)

Calculating 'DDI-ViT_T16' to '../runs/master-metrics.txt'
- Processing: DDI-ViT_T16-Plant.csv


100%|██████████| 10/10 [00:44<00:00,  4.47s/it]


- Processing: DDI-ViT_T16-ImageNet_AugReg.csv


100%|██████████| 10/10 [00:46<00:00,  4.66s/it]


- Processing: DDI-ViT_T16-Derma.csv


100%|██████████| 10/10 [00:44<00:00,  4.43s/it]


- Processing: DDI-ViT_T16-ImageNet_1k_SL_WinKawaks.csv


100%|██████████| 10/10 [00:47<00:00,  4.74s/it]


- Processing: DDI-ViT_T16-ImageNet_1k_SSL_Dino.csv


100%|██████████| 10/10 [00:43<00:00,  4.35s/it]


- Processing: DDI-ViT_T16-Random.csv


100%|██████████| 10/10 [00:39<00:00,  3.96s/it]


Calculating 'PAD_UFES_20-ViT_T16' to '../runs/master-metrics.txt'
- Processing: PAD_UFES_20-ViT_T16-ImageNet_AugReg.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

- Processing: PAD_UFES_20-ViT_T16-Derma.csv


100%|██████████| 10/10 [04:02<00:00, 24.23s/it]


- Processing: PAD_UFES_20-ViT_T16-Random.csv


100%|██████████| 10/10 [03:27<00:00, 20.73s/it]


- Processing: PAD_UFES_20-ViT_T16-ImageNet_1k_SL_WinKawaks.csv


100%|██████████| 10/10 [03:14<00:00, 19.49s/it]


- Processing: PAD_UFES_20-ViT_T16-ImageNet_1k_SSL_Dino.csv


100%|██████████| 10/10 [03:21<00:00, 20.10s/it]


- Processing: PAD_UFES_20-ViT_T16-Plant.csv


100%|██████████| 10/10 [03:57<00:00, 23.78s/it]


Calculating 'HAM10000-ViT_T16' to '../runs/master-metrics.txt'
- Processing: HAM10000-ViT_T16-Derma.csv


100%|██████████| 10/10 [36:31<00:00, 219.17s/it]


- Processing: HAM10000-ViT_T16-Plant.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

- Processing: HAM10000-ViT_T16-ImageNet_AugReg.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

- Processing: HAM10000-ViT_T16-ImageNet_1k_SL_WinKawaks.csv


100%|██████████| 10/10 [21:28<00:00, 128.82s/it]


- Processing: HAM10000-ViT_T16-ImageNet_1k_SSL_Dino.csv


100%|██████████| 10/10 [27:08<00:00, 162.84s/it]


- Processing: HAM10000-ViT_T16-Random.csv


100%|██████████| 10/10 [33:34<00:00, 201.43s/it]


Calculating 'Fitzpatrick17k-ViT_T16' to '../runs/master-metrics.txt'
- Processing: Fitzpatrick17k-ViT_T16-Random.csv


100%|██████████| 10/10 [43:55<00:00, 263.57s/it]


- Processing: Fitzpatrick17k-ViT_T16-Derma.csv


100%|██████████| 10/10 [45:33<00:00, 273.37s/it]


- Processing: Fitzpatrick17k-ViT_T16-Plant.csv


100%|██████████| 10/10 [50:19<00:00, 301.95s/it]


- Processing: Fitzpatrick17k-ViT_T16-ImageNet_AugReg.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

- Processing: Fitzpatrick17k-ViT_T16-ImageNet_1k_SSL_Dino.csv


100%|██████████| 10/10 [44:28<00:00, 266.88s/it]


- Processing: Fitzpatrick17k-ViT_T16-ImageNet_1k_SL_WinKawaks.csv


100%|██████████| 10/10 [42:00<00:00, 252.03s/it]


Calculating 'PlantDoc-ViT_T16' to '../runs/master-metrics.txt'
- Processing: PlantDoc-ViT_T16-ImageNet_1k_SL_WinKawaks.csv


100%|██████████| 10/10 [04:14<00:00, 25.43s/it]


- Processing: PlantDoc-ViT_T16-Plant.csv


100%|██████████| 10/10 [03:59<00:00, 23.91s/it]


- Processing: PlantDoc-ViT_T16-Random.csv


100%|██████████| 10/10 [04:22<00:00, 26.23s/it]


- Processing: PlantDoc-ViT_T16-ImageNet_AugReg.csv


100%|██████████| 10/10 [11:44<00:00, 70.47s/it]


- Processing: PlantDoc-ViT_T16-Derma.csv


100%|██████████| 10/10 [06:01<00:00, 36.11s/it]


- Processing: PlantDoc-ViT_T16-ImageNet_1k_SSL_Dino.csv


100%|██████████| 10/10 [03:43<00:00, 22.30s/it]


Calculating 'PlantDataset-ViT_T16' to '../runs/master-metrics.txt'
- Processing: PlantDataset-ViT_T16-Plant.csv


100%|██████████| 10/10 [08:40<00:00, 52.03s/it]


- Processing: PlantDataset-ViT_T16-ImageNet_AugReg.csv


100%|██████████| 10/10 [21:42<00:00, 130.28s/it]


- Processing: PlantDataset-ViT_T16-ImageNet_1k_SSL_Dino.csv


100%|██████████| 10/10 [07:15<00:00, 43.56s/it]


- Processing: PlantDataset-ViT_T16-Derma.csv


100%|██████████| 10/10 [13:23<00:00, 80.37s/it]


- Processing: PlantDataset-ViT_T16-Random.csv


100%|██████████| 10/10 [14:33<00:00, 87.31s/it]


- Processing: PlantDataset-ViT_T16-ImageNet_1k_SL_WinKawaks.csv


100%|██████████| 10/10 [07:36<00:00, 45.61s/it]


Calculating 'Cassava-ViT_T16' to '../runs/master-metrics.txt'
- Processing: Cassava-ViT_T16-Plant.csv


100%|██████████| 10/10 [1:12:34<00:00, 435.45s/it]


- Processing: Cassava-ViT_T16-ImageNet_1k_SL_WinKawaks.csv


100%|██████████| 10/10 [1:08:06<00:00, 408.65s/it]


- Processing: Cassava-ViT_T16-ImageNet_AugReg.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

- Processing: Cassava-ViT_T16-ImageNet_1k_SSL_Dino.csv


100%|██████████| 10/10 [1:36:45<00:00, 580.51s/it]


- Processing: Cassava-ViT_T16-Random.csv


100%|██████████| 10/10 [1:21:49<00:00, 490.94s/it]


- Processing: Cassava-ViT_T16-Derma.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Calculating 'PlantVillage-ViT_T16' to '../runs/master-metrics.txt'
- Processing: PlantVillage-ViT_T16-Derma.csv


100%|██████████| 10/10 [7:48:08<00:00, 2808.89s/it] 


- Processing: PlantVillage-ViT_T16-Plant.csv


100%|██████████| 10/10 [5:38:40<00:00, 2032.04s/it] 


- Processing: PlantVillage-ViT_T16-Random.csv


 50%|█████     | 5/10 [4:35:07<4:34:23, 3292.66s/it]

In [None]:
for dataset_prefix in dataset_prefixes:
    metric_file_path = os.path.join(
        metric_target_directory, f"{dataset_prefix}-metrics.txt"
    )
    if os.path.exists(metric_file_path):
        print(f"Metric file already exists: {metric_file_path}")
        continue

    dataset_prefix = dataset_prefix.replace("-student", "")  # NOTE: dirty workaround
    print(f"Calculating '{dataset_prefix}' to '{metric_file_path}'")
    for i, csv_file in enumerate(csv_files):
        if ("teacher" in csv_file) and (
            "teacher" not in dataset_prefix
        ):  # NOTE: dirty workaround
            continue
        if csv_file.startswith(dataset_prefix):
            print(f"- Processing: {csv_file}")
            csv_path = os.path.join(csv_source_directory, csv_file)
            calculate_scores_of_file(csv_path, metric_file_path, detail_selection_config)

Metric file already exists: ../runs/DDI-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/PAD_UFES_20-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/HAM10000-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/Fitzpatrick17k-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/PlantDoc-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/PlantDataset-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/Cassava-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/PlantVillage-ViT_T16-student-metrics.txt
Metric file already exists: ../runs/DDI-ResNet50-metrics.txt
Metric file already exists: ../runs/PAD_UFES_20-ResNet50-metrics.txt
Metric file already exists: ../runs/HAM10000-ResNet50-metrics.txt
Metric file already exists: ../runs/Fitzpatrick17k-ResNet50-metrics.txt
Metric file already exists: ../runs/PlantDoc-ResNet50-metrics.txt
Metric file already exists: ../runs/PlantDataset-ResNet50-metrics.txt
M