In [None]:
import os.path as osp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split


In [None]:
# --- Configuration ---
DATA_ROOT_PATH = "../metadata"
SPLIT_RATIO = 0.9  # passed as train_size to train_test_split in the split helpers
SEED = 777
np.random.seed(SEED)  # seeds NumPy's global RNG (used by np.random.choice later on)

# --- Load the FungiCLEF 2023 training metadata ---
train_data_2023 = pd.read_csv(
    osp.join(DATA_ROOT_PATH, "FungiCLEF2023_train_metadata_PRODUCTION.csv")
)
# Merging in the validation metadata is currently disabled:
# val_data = pd.read_csv(osp.join(DATA_ROOT_PATH, "FungiCLEF2023_val_metadata_PRODUCTION.csv"))
# data_df = pd.concat([train_data, val_data])
data_df = train_data_2023
len(data_df)


In [None]:
def make_df() -> pd.DataFrame:
    """Load the DF20 train and public-test metadata and stack them into one frame.

    Both CSV files are read from ``DATA_ROOT_PATH``; the concatenated result
    gets a fresh 0..n-1 index.
    """
    csv_names = (
        "DF20-train_metadata_PROD-2.csv",
        "DF20-public_test_metadata_PROD-2.csv",
    )
    parts = [pd.read_csv(osp.join(DATA_ROOT_PATH, csv_name)) for csv_name in csv_names]
    return pd.concat(parts).reset_index(drop=True)
    

In [None]:
def make_mini() -> pd.DataFrame:
    """Load the DF20-Mini train and public-test metadata and stack them into one frame.

    The files are resolved relative to ``DATA_ROOT_PATH`` (the same way
    ``make_df`` does) instead of a hard-coded directory, so changing the
    constant redirects both loaders.

    Returns:
        The concatenated metadata with a fresh 0..n-1 index.
    """
    train_mini = pd.read_csv(osp.join(DATA_ROOT_PATH, "DF20M-train_metadata_PROD.csv"))
    test_mini = pd.read_csv(osp.join(DATA_ROOT_PATH, "DF20M-public_test_metadata_PROD.csv"))

    # A single reset_index is enough; the original repeated it a second time.
    return pd.concat([train_mini, test_mini]).reset_index(drop=True)
    

In [None]:
MAKE_MINI = False

# Pick the reference dataset; only the loader differs between the two modes.
if MAKE_MINI:
    df20m = make_mini()
    reference_df = df20m
else:
    df20 = make_df()
    reference_df = df20

# Keep only the rows whose genus occurs in the reference dataset.
unique_genus = reference_df["genus"].unique()
data_df = data_df[data_df["genus"].isin(unique_genus)].reset_index(drop=True)
assert len(reference_df) == len(data_df), "Not same length!"

len(data_df)


In [None]:
# Preview the first rows of the genus-filtered metadata.
data_df.head()

In [None]:
# Derive an integer class_id from scientificName when the metadata has none.
if "class_id" not in data_df.columns:
    from sklearn import preprocessing

    le = preprocessing.LabelEncoder()

    # LabelEncoder assigns codes by the sorted unique labels, independent of
    # row order, so no sort_values / sort_index round-trip is required.
    data_df = data_df.assign(
        class_id=le.fit_transform(data_df["scientificName"]).astype(np.int64)
    )

# Kept at top level: Jupyter only renders a trailing expression when it is a
# top-level statement, so a head() nested inside the `if` was never displayed.
data_df.head()

In [None]:
# Number of distinct classes; NaN is counted too, matching len(unique()).
num_classes = data_df["class_id"].nunique(dropna=False)
print(num_classes)

# Non-null observationID rows per class, ascending, shown as a histogram
# with one bin per class.
class_counts = data_df.groupby("class_id")["observationID"].count().sort_values()
class_counts.hist(bins=num_classes)
plt.show()

In [None]:

# Unique observation IDs per class_id (the grouping that initial_train_val_split iterates over).
data_df.groupby(by="class_id")["observationID"].unique()

In [None]:
def initial_train_val_split(data_df):
    """Split ``data_df`` into train/val per class while keeping observations intact.

    For every ``class_id`` the unique ``observationID`` values are split with
    ``train_test_split`` (``train_size=SPLIT_RATIO``, ``random_state=SEED``).
    Every row whose observation landed on one side is assigned to that side,
    so all rows of an observation end up in the same subset.

    Args:
        data_df: frame with ``class_id`` and ``observationID`` columns.

    Returns:
        ``(train_df, val_df)``: row subsets of ``data_df`` that keep its
        index labels.
    """
    in_class_unique_observation_ids = data_df.groupby(by="class_id")["observationID"].unique()

    train_indexes, val_indexes = [], []
    for single_class_obs_ids in in_class_unique_observation_ids:  # Observation IDs in one class
        # Split the observations in the ratio
        train_ids, val_ids = train_test_split(single_class_obs_ids, train_size=SPLIT_RATIO, random_state=SEED)
        # Collect the index labels of all rows belonging to the chosen observations
        train_indexes.extend(data_df.index[data_df["observationID"].isin(train_ids)])
        val_indexes.extend(data_df.index[data_df["observationID"].isin(val_ids)])

    # The collected values are index *labels*, so they must be looked up with
    # .loc. Positional .iloc only gives the same rows while the index happens
    # to be exactly 0..n-1, and silently picks wrong rows otherwise.
    train_df = data_df.loc[train_indexes]
    val_df = data_df.loc[val_indexes]
    return train_df, val_df


In [None]:
def initial_train_val_split_class_only(data_df):
    """Split ``data_df`` into train/val row-wise within each class.

    Rows of every ``class_id`` are split independently with
    ``train_test_split`` (``train_size=SPLIT_RATIO``, ``random_state=SEED``).
    Observations are not taken into account, so rows of one observation may
    end up on both sides.

    Args:
        data_df: frame with a ``class_id`` column.

    Returns:
        ``(train_df, val_df)``: row subsets of ``data_df`` that keep its
        index labels.
    """
    train_indexes, val_indexes = [], []
    # Iterate over the class ids that actually occur (ascending) instead of
    # range(num_classes), which is only valid when ids are exactly 0..n-1.
    for class_id in sorted(data_df["class_id"].unique()):
        single_class_data = data_df[data_df["class_id"] == class_id]
        train_single_class, val_single_class = train_test_split(
            single_class_data, train_size=SPLIT_RATIO, random_state=SEED
        )
        # Collect index labels
        train_indexes.extend(train_single_class.index)
        val_indexes.extend(val_single_class.index)

    # Index labels -> label-based selection (.loc), not positional (.iloc).
    train_df = data_df.loc[train_indexes]
    val_df = data_df.loc[val_indexes]
    return train_df, val_df
    

In [None]:
USE_OBS_SPLIT = True

# Observation-aware split when enabled, plain per-class row split otherwise.
split_fn = initial_train_val_split if USE_OBS_SPLIT else initial_train_val_split_class_only
train_df, val_df = split_fn(data_df)

train_df.head()

In [None]:
def plot_ratio_barplot(original_df, target_df, iteration=None, save=False):
    """Draw a horizontal bar plot of the per-class share of ``target_df`` in ``original_df``.

    The share of a class is its number of non-null ``observationID`` rows in
    ``target_df`` divided by the same count in ``original_df``.

    Args:
        original_df: the full dataset (denominator).
        target_df: the subset whose share is plotted (numerator).
        iteration: when truthy, shown in the plot title.
        save: when true, the figure is also written to
            ``../metadata/final_distribution[_mini].png`` before being shown.
    """
    subset_counts = target_df.groupby("class_id")["observationID"].count()
    total_counts = original_df.groupby("class_id")["observationID"].count()
    in_class_ratios = (subset_counts / total_counts).sort_values()

    title = f"Iteration: {iteration}" if iteration else ""
    ax = sns.barplot(
        x=in_class_ratios.values,
        y=in_class_ratios.index,
        orient="h",
        order=in_class_ratios.index[::-1],
    )
    ax.set(xlabel='Train Ratio', ylabel='Class ID', title=title)

    if save:
        suffix = "_mini" if MAKE_MINI else ""
        plt.savefig(f"../metadata/final_distribution{suffix}.png")
    plt.show()

# Per-class share of the train subset, then of the val subset.
for subset_df in (train_df, val_df):
    plot_ratio_barplot(data_df, subset_df)
    

In [None]:
def move_data_by_threshold(
        source_df: pd.DataFrame,
        target_df: pd.DataFrame,
        cls_threshold_sequence: pd.Series,
) -> (pd.DataFrame, pd.DataFrame):
    """Move one randomly chosen observation per listed class between two frames.

    For each class id in ``cls_threshold_sequence.index``:

    * if ``source_df`` holds more than one distinct observation of the class,
      one of them is picked with ``np.random.choice`` and all of its rows are
      moved from ``source_df`` to ``target_df``;
    * otherwise, if ``target_df`` holds more than one distinct observation of
      the class, one is picked and moved the other way (target -> source);
    * otherwise nothing is moved for that class.

    Only the index of ``cls_threshold_sequence`` is used; its values are ignored.

    Returns:
        The updated ``(source_df, target_df)`` pair.
    """
    outgoing_frames = []  # rows going source -> target
    incoming_frames = []  # rows going target -> source (fallback)

    for class_id in cls_threshold_sequence.index:
        source_obs_ids = source_df.loc[source_df["class_id"] == class_id, "observationID"].unique()
        if len(source_obs_ids) > 1:
            picked = np.random.choice(source_obs_ids)
            outgoing_frames.append(source_df[source_df["observationID"] == picked])
            continue

        # At most one observation left in source -> try the opposite direction.
        target_obs_ids = target_df.loc[target_df["class_id"] == class_id, "observationID"].unique()
        if len(target_obs_ids) > 1:
            picked = np.random.choice(target_obs_ids)
            incoming_frames.append(target_df[target_df["observationID"] == picked])

    if outgoing_frames:
        outgoing = pd.concat(outgoing_frames)
        target_df = pd.concat([target_df, outgoing])
        source_df = source_df.drop(outgoing.index)
    if incoming_frames:
        incoming = pd.concat(incoming_frames)
        source_df = pd.concat([source_df, incoming])
        target_df = target_df.drop(incoming.index)

    return source_df, target_df
    

In [None]:
TARGET_INTERVAL_SIZE = 0.015  #2
NUM_ITERATIONS = 100

# data_df is not modified inside the loop, so its per-class totals are computed once.
total_per_class = data_df.groupby("class_id")["observationID"].count()
lower_bound = SPLIT_RATIO - TARGET_INTERVAL_SIZE
upper_bound = SPLIT_RATIO + TARGET_INTERVAL_SIZE

for i in range(1, NUM_ITERATIONS + 1):
    # Per-class train share; stop once every class lies inside the target interval.
    in_class_ratios = train_df.groupby("class_id")["observationID"].count() / total_per_class
    down_threshold = in_class_ratios[in_class_ratios < lower_bound]
    upper_threshold = in_class_ratios[in_class_ratios > upper_bound]
    if upper_threshold.empty and down_threshold.empty:
        break

    # Classes with too much train data give an observation to val, and vice versa.
    train_df, val_df = move_data_by_threshold(train_df, val_df, upper_threshold)
    val_df, train_df = move_data_by_threshold(val_df, train_df, down_threshold)

    if i % 10 == 0:
        plot_ratio_barplot(data_df, train_df, iteration=i)
        

In [None]:
# Sanity checks: no observation on both sides, and no rows lost or duplicated.
assert not train_df["observationID"].isin(val_df["observationID"]).any(), "Mixed observations!"
assert len(train_df) + len(val_df) == len(data_df), "Start and end amount of data does not correspond!"

plot_ratio_barplot(data_df, train_df, save=False)

# Restore the original column order and sort the rows by index.
original_col_order = data_df.columns.values
train_df = train_df[original_col_order].sort_index()
val_df = val_df[original_col_order].sort_index()

len(train_df) / len(data_df)

In [None]:
# Write the final train/val metadata. The non-mini branch previously wrote
# BOTH frames to "...-val_metadata_FIX.csv", so the train split was
# overwritten by the val split and never reached disk.
if MAKE_MINI:
    train_csv_path = "../metadata/DanishFungi2020M-train_metadata_FIX.csv"
    val_csv_path = "../metadata/DanishFungi2020M-val_mini-BY-CLASS.csv"
else:
    train_csv_path = "../metadata/DanishFungi2020-train_metadata_FIX.csv"
    val_csv_path = "../metadata/DanishFungi2020-val_metadata_FIX.csv"

# Guard against the two outputs ever pointing at the same file again.
assert train_csv_path != val_csv_path, "Train and val output paths must differ!"

train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)