# PASSION Dataset Loader

In [None]:
!pip install -r ../requirements.txt -q

In [None]:
import re
import pandas as pd
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from torchvision import transforms

import sys

sys.path.append("../")

from ssl_library.src.datasets.downstream_tasks.passion_dataset import PASSIONLabel
from ssl_library.src.datasets.helper import DatasetName, get_dataset

In [None]:
# Directory into which figures produced by this notebook are saved.
save_fig_path = Path("../assets/notebook_outputs/")
# Dataset identifier passed to `get_dataset` when loading the data.
dataset_name = DatasetName.PASSION

In [None]:
# List the contents of the PASSION data directory (shell command).
!ls ../data/PASSION_collection_2020_2023

In [None]:
# Load the PASSION dataset through the project's `get_dataset` helper, which
# returns a pair unpacked here into `dataset` and `torch_dataset`.
# NOTE(review): `split_file` points at "PASSION_split.csv", the same file name
# that the last cells of this notebook write -- presumably the file must already
# exist before this cell can run; confirm against `get_dataset`.
dataset, torch_dataset = get_dataset(
    dataset_name=dataset_name,
    dataset_path=Path("../data/PASSION_collection_2020_2023"),
    split_file="PASSION_split.csv",
    batch_size=16,
    # label_col=PASSIONLabel.IMPETIGO,
    return_fitzpatrick=False,
    # Glob patterns for the image files; both lower- and upper-case suffixes.
    image_extensions=("*.jpeg", "*.jpg", "*.JPG", "*.JPEG", "*.PNG", "*.png"),
)

# Display the number of items in the dataset.
len(dataset)

In [None]:
# Display the item at position 0 to inspect what the dataset returns.
dataset[0]

In [None]:
# Preview the first rows of the dataset's metadata table.
dataset.meta_data.head()

In [None]:
# Dump the metadata table (without the index column) to a CSV file in the
# current working directory.
dataset.meta_data.to_csv("actual_merged_passion.csv", index=False)

In [None]:
# Number of metadata rows per subject, most frequent first.
dataset.meta_data["subject_id"].value_counts()

In [None]:
# Load the case-level table, discarding the unnamed index column and the
# `diagnosis` column.
_case_level_csv = (
    "../data/PASSION_collection_2020_2023/passion_cleaned_final_case_level.csv"
)
df = pd.read_csv(_case_level_csv).drop(columns=["Unnamed: 0", "diagnosis"])

# Missing `impetig` entries are replaced by 0.0.
df["impetig"] = df["impetig"].fillna(0.0)

# Preview the cleaned table.
df.head()

In [None]:
# Number of rows per subject in the case-level table.
df["subject_id"].value_counts()

In [None]:
# Number of metadata rows per country.
dataset.meta_data["country"].value_counts()

In [None]:
# Count the distinct subjects whose metadata rows are labelled "Tanzania".
_is_tanzania = dataset.meta_data["country"] == "Tanzania"
_tanzanian_subjects = dataset.meta_data.loc[_is_tanzania, "subject_id"]
len(_tanzanian_subjects.unique())

In [None]:
# Keep only the rows whose `body_loc` text contains at least one of the allowed
# localization names (substring match); rows without a `body_loc` are dropped.
allowed_localizations = ["arm", "back", "foot", "hair", "hand", "leg", "torso"]
_body_loc = dataset.meta_data["body_loc"].fillna("")
_is_allowed = _body_loc.apply(
    lambda loc: any(name in loc for name in allowed_localizations)
)
dataset.meta_data = dataset.meta_data[_is_allowed]

In [None]:
# Shuffle the rows of the metadata table and renumber them 0..n-1.
# The shuffle is seeded: later cells address rows by hard-coded position
# (montage indices, single-image lookups) and build the train/test split from
# this row order, so an unseeded shuffle made every run produce different
# figures and a different split.
dataset.meta_data = dataset.meta_data.sample(frac=1.0, random_state=42).reset_index(
    drop=True
)

In [None]:
# Build the 54 dataset positions shown in the 9 x 6 montage.
_n_tiles = 9 * 6
remove_idx = [2, 4, 23, 30, 37, 34, 31, 32, 44, 45, 48, 52]

# Start from positions 0..53, drop the rejected ones and backfill with the
# next `len(remove_idx)` positions so the grid stays full.
_kept = np.delete(np.arange(_n_tiles), remove_idx)
_backfill = np.arange(_n_tiles, _n_tiles + len(remove_idx))
l_indices = np.concatenate([_kept, _backfill])

# Second pass: replace the entry in slot 48 by dataset position 67 (appended at
# the end of the list).
l_indices = list(np.delete(l_indices, 48)) + [67]

In [None]:
# Draw a 9 x 6 montage of dataset images (one image per axis, row by row, in
# the order given by `l_indices`) and save it as a PDF.
resize = transforms.Resize((256, 256))  # built once, reused for every tile

fig, axes = plt.subplots(9, 6, figsize=(8.27, 11.69))
for position, ax in enumerate(axes.flat):
    i = l_indices[position]
    ax.imshow(resize(dataset[i][0]))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")
    ax.axis("off")
    # ax.set_title(i)  # uncomment to label tiles while choosing indices

fig.patch.set_visible(False)
fig.subplots_adjust(wspace=0.025, hspace=0.025)

# `savefig` does not create missing directories, so make sure the output
# directory exists before writing.
save_fig_path.mkdir(parents=True, exist_ok=True)
plt.savefig(
    save_fig_path / "passion_montage.pdf",
    bbox_inches="tight",
)
plt.show()

In [None]:
# Display the metadata row with index label 14.
# NOTE(review): which record this is depends on the row order produced by the
# shuffle cell above -- confirm before relying on it.
dataset.meta_data.loc[14]

In [None]:
# Among the rows with Fitzpatrick value 4 and condition "Others", display the
# one at position 15.
_is_fitzpatrick_4 = dataset.meta_data["fitzpatrick"] == 4
_is_others = dataset.meta_data["conditions_PASSION"] == "Others"
dataset.meta_data[_is_fitzpatrick_4 & _is_others].iloc[15]

In [None]:
# Show the image of dataset item 658, resized to 256 x 256, with the axis
# ticks removed.
_image = dataset[658][0]
_resized = transforms.Resize((256, 256))(_image)
plt.imshow(_resized)
plt.xticks([])
plt.yticks([])
plt.show()

## Train Splitting

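The split is made at the subject level (`subject_id`), so all images of one subject end up in the same split, and it is stratified by condition class (`conditions_PASSION`) and country.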

In [None]:
# For each attribute, count how many distinct values occur per subject and show
# the distribution of those counts (a count of 1 means the attribute is
# constant within a subject).
_columns_to_check = [
    "conditions_PASSION",
    "country",
    "sex",
    "age",
    "fitzpatrick",
    "body_loc",
]
_df = dataset.meta_data.groupby("subject_id").agg(list)
for _c in _columns_to_check:
    _distinct_per_subject = _df[_c].apply(lambda values: len(set(values)))
    display(_distinct_per_subject.value_counts())

In [None]:
# One row per subject: the first row of each `subject_id` in the current row
# order represents that subject.
df_splitting = dataset.meta_data.drop_duplicates(subset=["subject_id"])

# Stratification key: condition and country concatenated as strings.
_condition = df_splitting["conditions_PASSION"].astype(str)
_country = df_splitting["country"].astype(str)
stratified_label = _condition + _country

# 80 % of the subjects go to train, the rest to test, with a fixed seed.
subjects_train, subjects_test = train_test_split(
    df_splitting["subject_id"].values,
    train_size=0.80,
    stratify=stratified_label,
    random_state=42,
)

In [None]:
# Shapes of the train and test subject arrays, i.e. the subjects per split.
subjects_train.shape, subjects_test.shape

In [None]:
# Select the metadata rows that belong to the train and test subjects.
_subject_ids = dataset.meta_data["subject_id"]
df_train = dataset.meta_data.loc[_subject_ids.isin(subjects_train)]
df_test = dataset.meta_data.loc[_subject_ids.isin(subjects_test)]

In [None]:
# Write the split name into the `Split` column of every row, according to the
# split its subject was assigned to (TRAIN first, then TEST).
_subject_ids = dataset.meta_data["subject_id"]
_sel_train = _subject_ids.isin(subjects_train)
_sel_test = _subject_ids.isin(subjects_test)
for _mask, _split_name in ((_sel_train, "TRAIN"), (_sel_test, "TEST")):
    dataset.meta_data.loc[_mask, "Split"] = _split_name

In [None]:
# Number of metadata rows per value of the `Split` column.
dataset.meta_data["Split"].value_counts()

In [None]:
# Sanity check: subjects that appear in both the train and the test frame.
_train_subjects = set(df_train["subject_id"].unique())
_test_subjects = set(df_test["subject_id"].unique())
_train_subjects & _test_subjects

In [None]:
# Build one (subject_id, Split) table per split and write both, train rows
# first, to the split file inside the data directory.
df_subjects_train = pd.DataFrame({"subject_id": subjects_train}).assign(Split="TRAIN")
df_subjects_test = pd.DataFrame({"subject_id": subjects_test}).assign(Split="TEST")

_split_table = pd.concat([df_subjects_train, df_subjects_test])
_split_table.to_csv(
    "../data/PASSION_collection_2020_2023/PASSION_split.csv", index=False
)

In [None]:
# Bar charts of the condition and country distributions: for each attribute,
# the train frame is plotted first, then the test frame.
for _c in ("conditions_PASSION", "country"):
    for _df in (df_train, df_test):
        _counts = _df[_c].value_counts()
        _counts.plot(kind="bar", figsize=(5, 3))
        plt.show()