In [6]:
import json
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.preprocessing import StandardScaler

In [7]:
# Directory holding one feature CSV per (dataset, model) combination.
csv_root_path = "../datasets/intermediate-features/"
# Selection seeds shared by every sub-sampled configuration below.
seed_list = range(100)
# Exclusive upper bound of the k values tried during the k-NN search.
max_neighbors = 200

# Maps "samples drawn per target class" -> seeds to evaluate with.
# The `None` key (with its single `None` seed) means: no sub-sampling.
sample_selection_config = {
    **{samples_per_class: seed_list for samples_per_class in (1, 3, 10, 30, 100)},
    None: [None],
}

# Averaging mode of the F1 score used to pick the best k ("macro" or "weighted").
best_k_f1_score_average = "macro"

# Only CSV files whose name starts with one of these prefixes are processed.
# Currently disabled: "Cassava_Mini-", "DDI_malignant-", "PAD_UFES_20-",
#                     "HAM10000-", "Fitzpatrick_17k_3p-", "PlantVillage-"
dataset_prefixes = ["PlantDoc-", "PlantDataset-", "Cassava-"]

In [8]:
def calculate_scores(
    targets,
    predictions,
    model_name=None,
    feature_identifier=None,
    number_of_samples=None,
    selection_seed=None,
    metric_file_path=None,
    best_k=None,
):
    """Compute classification metrics and print them or append them to a file.

    Accuracy, balanced accuracy and the weighted/micro/macro F1 scores are
    computed from ``targets`` and ``predictions``.

    * If ``model_name`` or ``feature_identifier`` is ``None`` the five
      metrics are printed, one per line.
    * Otherwise one JSON object (``indent=2``) holding the run description,
      the metrics and ``best_k`` is appended to ``metric_file_path``.

    Args:
        targets: Ground-truth labels.
        predictions: Predicted labels, aligned with ``targets``.
        model_name: Label of the evaluated model, stored in the entry.
        feature_identifier: Label of the feature source, stored in the entry.
        number_of_samples: Stored as-is in the entry.
        selection_seed: Stored as-is in the entry.
        metric_file_path: File opened in append mode for the JSON entry.
        best_k: Stored as-is in the entry.
    """
    metrics = {
        "accuracy": accuracy_score(targets, predictions),
        "balanced_accuracy": balanced_accuracy_score(targets, predictions),
        "f1_weighted": f1_score(targets, predictions, average="weighted"),
        "f1_micro": f1_score(targets, predictions, average="micro"),
        "f1_macro": f1_score(targets, predictions, average="macro"),
    }

    if model_name is None or feature_identifier is None:
        # No run description available: report on stdout instead of logging.
        print(f"Accuracy: {metrics['accuracy']}")
        print(f"Balanced accuracy: {metrics['balanced_accuracy']}")
        print(f"F1-weighted: {metrics['f1_weighted']}")
        print(f"F1-micro: {metrics['f1_micro']}")
        print(f"F1-macro: {metrics['f1_macro']}")
        return

    record = {
        "model_name": model_name,
        "feature_identifier": feature_identifier,
        "number_of_samples": number_of_samples,
        "selection_seed": selection_seed,
        **metrics,
        "best_k": best_k,
    }
    # Entries are written back to back, exactly as json.dump emits them
    # (no separator is added between consecutive objects).
    with open(metric_file_path, "a") as metric_log:
        json.dump(record, metric_log, indent=2)


def calculate_scores_of_file(csv_path, metric_file_path):
    """Evaluate one feature CSV for every configured sample count and seed.

    Paths that do not contain "ViT_T16", or that contain "Derma", are
    skipped with a message. Otherwise the CSV is read once (first column as
    index) and ``calculate_scores_of_dataframe`` is called for each
    ``(number_of_samples, selection_seed)`` pair of
    ``sample_selection_config``.

    Args:
        csv_path: Path of the feature CSV. It is also handed on as the
            feature identifier recorded in every metric entry.
        metric_file_path: File the metric entries are appended to.
    """
    # TODO: remove these filters after ViT_T16 are added
    if "ViT_T16" not in csv_path:
        print(f"Skipping: {csv_path}")
        return
    if "Derma" in csv_path:
        print(f"Also skipping: {csv_path}")
        return

    print(f"loading: {csv_path}")
    df_full = pd.read_csv(csv_path, index_col=0)
    for number_of_samples, selection_seeds in sample_selection_config.items():
        print(f"number_of_samples: {number_of_samples}")
        for selection_seed in selection_seeds:
            calculate_scores_of_dataframe(
                df_full,
                number_of_samples,
                selection_seed,
                metric_file_path,
                feature_identifier=csv_path,
            )


def calculate_scores_of_dataframe(
    df_full,
    number_of_samples,
    selection_seed,
    metric_file_path,
    feature_identifier=None,
):
    """Fit logistic regression and k-NN on the train split and log test scores.

    The frame is split by its "set" column ("train" / "valid" / "test");
    every column other than "target" and "set" is used as a feature.
    A logistic regression is fitted and scored on the test split. For k-NN,
    each candidate k is scored on the validation split with
    ``f1_score(average=best_k_f1_score_average)``; the best k is then
    refitted and scored on the test split. Both results are passed to
    ``calculate_scores`` under the model names "lr" and "knn".

    Args:
        df_full: Frame with feature columns plus "target" and "set".
        number_of_samples: Rows drawn (with replacement) per target class
            from the train split, or ``None`` to use the whole train split.
        selection_seed: ``random_state`` used for that draw.
        metric_file_path: File the metric entries are appended to.
        feature_identifier: Value recorded as ``feature_identifier`` in the
            metric entries. When omitted, the module-level ``csv_path`` is
            used, which is what this function relied on before the
            parameter existed.

    Raises:
        ValueError: If no feature identifier is given and no module-level
            ``csv_path`` is available.
    """
    if feature_identifier is None:
        # Backward compatibility: earlier versions silently read the global
        # `csv_path` left behind by the driver loop.
        feature_identifier = globals().get("csv_path")
        if feature_identifier is None:
            raise ValueError(
                "feature_identifier was not provided and no global csv_path is defined"
            )

    def feature_columns(df):
        # Everything except the label and the split marker is a feature.
        return df.loc[:, ~df.columns.isin(["target", "set"])]

    df_train = df_full[df_full["set"] == "train"]
    df_valid = df_full[df_full["set"] == "valid"]
    df_test = df_full[df_full["set"] == "test"]
    if number_of_samples is not None:
        # NOTE: sampling with replacement is used because the alternative,
        # min(len(target_group), number_of_samples), still caused problems
        # with the plantdoc dataset.
        target_group = df_train.groupby("target")
        df_train = target_group.sample(
            number_of_samples, random_state=selection_seed, replace=True
        )

    # Features are used unscaled (the StandardScaler variant is disabled).
    train_features = feature_columns(df_train)
    valid_features = feature_columns(df_valid)
    test_features = feature_columns(df_test)

    train_targets = df_train["target"].to_numpy()
    valid_targets = df_valid["target"].to_numpy()
    test_targets = df_test["target"].to_numpy()

    # --- logistic regression -------------------------------------------
    model_lr = LogisticRegression(max_iter=10000)
    model_lr.fit(train_features, train_targets)
    test_pred = model_lr.predict(test_features)
    calculate_scores(
        test_targets,
        test_pred,
        "lr",
        feature_identifier,
        number_of_samples,
        selection_seed,
        metric_file_path,
    )

    # --- k-NN: choose k on the validation split ------------------------
    # range(1, neighbors_limit) below tries k = 1 .. neighbors_limit - 1.
    neighbors_limit = max_neighbors
    if number_of_samples is not None:
        neighbors_limit = min(neighbors_limit, 1 + number_of_samples)
    # k must not exceed the number of fitted rows, otherwise the k-NN
    # neighbour query raises a ValueError.
    neighbors_limit = min(neighbors_limit, 1 + len(train_features))

    scores = {}
    for k in range(1, neighbors_limit):
        model_knn = KNeighborsClassifier(n_neighbors=k)
        model_knn.fit(train_features, train_targets)
        valid_pred = model_knn.predict(valid_features)
        scores[k] = f1_score(
            valid_targets, valid_pred, average=best_k_f1_score_average
        )

    # On ties the smallest k wins (first maximum in insertion order).
    best_k = max(scores, key=scores.get)
    model_knn = KNeighborsClassifier(n_neighbors=best_k)
    model_knn.fit(train_features, train_targets)
    test_pred = model_knn.predict(test_features)
    calculate_scores(
        test_targets,
        test_pred,
        "knn",
        feature_identifier,
        number_of_samples,
        selection_seed,
        metric_file_path,
        best_k=best_k,
    )

In [9]:
# os.listdir returns entries in arbitrary order; sort them so the processing
# order (and the order of entries in the metric files) is reproducible.
csv_paths = sorted(os.listdir(path=csv_root_path))

# for dataset_prefix in dataset_prefixes:
#     print(f"Check {dataset_prefix}")
#     df_comparison = pd.DataFrame()
#     for i, path in enumerate(csv_paths):
#         if path.startswith(dataset_prefix):
#             df_full = pd.read_csv(os.path.join(csv_root_path, path), index_col=0)
#             df_comparison[f"target_{i}"] = df_full["target"]
#             df_comparison[f"set_{i}"] = df_full["set"]
#     assert (
#         1 == (~df_comparison[df_comparison.columns[0::2]].T.duplicated()).sum()
#     )  # verify if split is the same
#     assert (
#         1 == (~df_comparison[df_comparison.columns[1::2]].T.duplicated()).sum()
#     )  # verify if split is the same

In [10]:
# 6101m 30.9s for all 4 Plant sets on ResNet50

# c:\workspace\thesis\.venv\lib\site-packages\sklearn\metrics\_classification.py:2399: UserWarning: y_pred contains classes not in y_true
#   warnings.warn("y_pred contains classes not in y_true")

# c:\workspace\thesis\.venv\lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
# STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
# Increase the number of iterations (max_iter) or scale the data as shown in:
#     https://scikit-learn.org/stable/modules/preprocessing.html
# Please also refer to the documentation for alternative solver options:
#     https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
#   n_iter_i = _check_optimize_result(


# Evaluate every feature CSV of each enabled dataset; all results of one
# dataset are appended to a single metric file under ../runs/.
for dataset_prefix in dataset_prefixes:
    print(f"Processing {dataset_prefix}")
    metric_file_path = f"../runs/{dataset_prefix}v1-ViT_T16-metrics.txt"
    if os.path.exists(metric_file_path):
        print(f"Metric file already exists: {metric_file_path}")
        # TODO: skip existing tasks again after ViT_T16 runs are added
        continue
    # Make sure the output directory exists before any model is fitted;
    # otherwise the first append fails only after the first fit has finished.
    os.makedirs(os.path.dirname(metric_file_path), exist_ok=True)
    for path in csv_paths:
        if path.startswith(dataset_prefix):
            # NOTE: keep the name `csv_path` - calculate_scores_of_dataframe
            # may read this module-level name as the feature identifier.
            csv_path = os.path.join(csv_root_path, path)
            calculate_scores_of_file(csv_path, metric_file_path)


Processing PlantDoc-
Skipping: ../datasets/intermediate-features/PlantDoc-ResNet50-ImageNet_SSL.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ResNet50-ImageNet_v1.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ResNet50-ImageNet_v2.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ResNet50-PDDD.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ResNet50-Random_19.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ResNet50-Random_20.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ResNet50-Random_21.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ViT_B16-ImageNet_SSL.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ViT_B16-ImageNet_v1.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ViT_B16-ImageNet_v2.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ViT_B16-ImageNet_v3.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ViT_B16-PDDD_197.csv
Skipping: ../datasets/intermediate-features/PlantDoc-ViT_B16-P



number_of_samples: 3




number_of_samples: 10




number_of_samples: 30
number_of_samples: 100
