In [10]:
import json
import os
import pandas as pd

from local_python.feature_evaluation import (
    evaluate_with_knn, evaluate_with_lr, calculate_scores
)

In [7]:
csv_source_directory = "../datasets/intermediate-features/"
metric_target_directory = "../runs/"

seed_list = range(100)
max_neighbors = 200
max_iter = 10_000
best_k_f1_score_average = "macro"  # "weighted"
knn_metric = "cosine"

sample_selection_config = {  # number of samples and seeds
    1: seed_list,
    3: seed_list,
    10: seed_list,
    30: seed_list,
    100: seed_list,
    None: [None],
}

dataset_prefixes = [
    # "Cassava_Mini-",
    # "DDI-ViT_T16-student",
    # "PAD_UFES_20-ViT_T16-student",
    # "HAM10000-ViT_T16-student",
    # "Fitzpatrick17k-ViT_T16-student",
    "PlantDoc-ViT_T16-student",
    # "PlantDataset-ViT_T16-student",
    # "Cassava-ViT_T16-student",
    # "PlantVillage-ViT_T16-student",
    # "DDI-ResNet50",
    # "PAD_UFES_20-ResNet50",
    # "HAM10000-ResNet50",
    # "Fitzpatrick17k-ResNet50",
    "PlantDoc-ResNet50",
    # "PlantDataset-ResNet50",
    # "Cassava-ResNet50",
    # "PlantVillage-ResNet50",
    # "DDI-ViT_T16-teacher",
    # "PAD_UFES_20-ViT_T16-teacher",
    # "HAM10000-ViT_T16-teacher",
    # "Fitzpatrick17k-ViT_T16-teacher",
    "PlantDoc-ViT_T16-teacher",
    # "PlantDataset-ViT_T16-teacher",
    # "Cassava-ViT_T16-teacher",
    # "PlantVillage-ViT_T16-teacher",
    # "DDI-",
    # "PAD_UFES_20-",
    # "HAM10000-",
    # "Fitzpatrick17k-",
    # "PlantDoc-",
    # "PlantDataset-",
    # "Cassava-",
    # "PlantVillage-",
]

In [42]:
def calculate_scores_of_dataframe(
    df_full, number_of_samples, selection_seed, csv_path, evaluations
):
    df_train = df_full[df_full["set"] == "train"]
    df_valid = df_full[df_full["set"] == "valid"]
    df_test = df_full[df_full["set"] == "test"]
    if number_of_samples is not None:
        target_group = df_train.groupby("target_code")
        max_samples_possible = target_group["set"].count().min()
        if max_samples_possible < number_of_samples:
            # abort because this is impossible
            return

        df_train = target_group.sample(
            number_of_samples, random_state=selection_seed, replace=False
        )
    features = (
        df_train.loc[:, ~df_train.columns.isin(["target_code", "set"])],
        df_valid.loc[:, ~df_valid.columns.isin(["target_code", "set"])],
        df_test.loc[:, ~df_test.columns.isin(["target_code", "set"])],
    )
    targets = (
        df_train["target_code"].to_numpy(),
        df_valid["target_code"].to_numpy(),
        df_test["target_code"].to_numpy(),
    )

    entries = []
    for evaluation in evaluations:
        entry = {}
        entry["feature_identifier"] = csv_path
        entry["selection_seed"] = selection_seed
        entry["number_of_samples"] = number_of_samples
        entry["model_name"] = evaluation

        if evaluation == "lr":
            entry = evaluate_with_lr(
                features,
                targets,
                entry,
                max_iter=max_iter,
            )
        elif evaluation == "knn":
            entry = evaluate_with_knn(
                features,
                targets,
                entry,
                number_of_samples,
                knn_metric=knn_metric,
                max_neighbors=max_neighbors,
                best_k_f1_score_average=best_k_f1_score_average,
            )
        elif evaluation == "lr_full":
            entry = evaluate_with_lr_full(
                features,
                targets,
                entry,
                max_iter=max_iter,
            )
            return entry, model_lr
        else:
            print(f"Unknown evaluation method {evaluation}")
        entries.append(entry)
    return entries

def calculate_scores_of_file(csv_path, metric_file_path):
    df_full = pd.read_csv(csv_path, index_col=0)
    for number_of_samples in sample_selection_config:
        print(f"number_of_samples: {number_of_samples}")
        selection_seeds = sample_selection_config[number_of_samples]
        for selection_seed in selection_seeds:
            entries = calculate_scores_of_dataframe(
                df_full, number_of_samples, selection_seed, csv_path, ["lr", "knn"]
            )
            for entry in entries:
                with open(metric_file_path, "a") as detaillog:
                    json.dump(entry, detaillog, indent=2)


In [10]:
csv_files = set(os.listdir(path=csv_source_directory))
metric_files = set(os.listdir(path=metric_target_directory))

# configuration check
covered_csv_files = set()
covered_metric_files = set()

for dataset_prefix in dataset_prefixes:
    matches_source = [x for x in csv_files if x.startswith(dataset_prefix)]
    if 0 == len(matches_source):
        print(f"Warn: No source file with prefix '{dataset_prefix}' found")
        continue
    else:
        covered_csv_files.update(matches_source)

    matches_target = [x for x in metric_files if x.startswith(dataset_prefix)]
    if 1 < len(matches_target):
        print(
            f"Warn: {len(matches_target)} target files found with prefix '{dataset_prefix}'"
        )
        print(matches_target)

    print(
        f"'{dataset_prefix}' has {len(matches_source)} source files and {len(matches_target)} target files"
    )
    covered_metric_files.update(matches_target)


print(f"Source files {len(csv_files)}")
print(f"Target files {len(metric_files)}")
# uncovered_csv_files = csv_files - covered_csv_files
# uncovered_metric_files = metric_files - covered_metric_files

Warn: No source file with prefix 'PlantDoc-ViT_T16-student' found
'PlantDoc-ResNet50' has 5 source files and 0 target files
'PlantDoc-ViT_T16-teacher' has 3 source files and 0 target files
Source files 112
Target files 26


In [27]:
for dataset_prefix in dataset_prefixes:
    metric_file_path = os.path.join(
        metric_target_directory, f"{dataset_prefix}-metrics.txt"
    )
    if os.path.exists(metric_file_path):
        print(f"Metric file already exists: {metric_file_path}")
        continue

    dataset_prefix = dataset_prefix.replace("-student", "")  # NOTE: dirty workaround
    print(f"Calculating '{dataset_prefix}' to '{metric_file_path}'")
    for i, csv_file in enumerate(csv_files):
        if ("teacher" in csv_file) and (
            "teacher" not in dataset_prefix
        ):  # NOTE: dirty workaround
            continue
        if csv_file.startswith(dataset_prefix):
            print(f"- Processing: {csv_file}")
            csv_path = os.path.join(csv_source_directory, csv_file)
            calculate_scores_of_file(csv_path, metric_file_path)

Calculating 'PlantDoc-ViT_T16' to '../runs/PlantDoc-ViT_T16-student-metrics.txt'
- Processing: PlantDoc-ViT_T16-Random.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ViT_T16-Plant.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ViT_T16-ImageNet_1k_SSL_Dino.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ViT_T16-ImageNet_1k_SL_WinKawaks.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ViT_T16-ImageNet_AugReg.csv
number_of_samples: 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ViT_T16-Derma.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
Calculating 'PlantDoc-ResNet50' to '../runs/PlantDoc-ResNet50-metrics.txt'
- Processing: PlantDoc-ResNet50-ImageNet_1k_SSL_SimCLR.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ResNet50-ImageNet_1k_SL_V1.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ResNet50-Derma_SSL_SimCLR.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ResNet50-PDDD.csv
number_of_samples: 1
number_of_sa

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

number_of_samples: 3
number_of_samples: 10
number_of_samples: 30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

number_of_samples: 100
number_of_samples: None
Calculating 'PlantDoc-ViT_T16-teacher' to '../runs/PlantDoc-ViT_T16-teacher-metrics.txt'
- Processing: PlantDoc-ViT_T16-teacher-Plant.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ViT_T16-teacher-ImageNet_1k_SSL_Dino.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
- Processing: PlantDoc-ViT_T16-teacher-Derma.csv
number_of_samples: 1
number_of_samples: 3
number_of_samples: 10
number_of_samples: 30
number_of_samples: 100
number_of_samples: None
