# DATASET SPLIT

Connecting to drive folder

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive; the CSV paths
# configured later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')


Importing packages

In [None]:
import pandas as pd
from PIL import Image, ImageChops
from torchvision import transforms
import os
import matplotlib.pyplot as plt
import random
from torchvision.transforms import functional as F
import numpy as np
import seaborn as sns
import os, math, numpy as np, pandas as pd
from collections import Counter, defaultdict
from IPython.display import display

Train-test split

In [None]:
# ======================
# Configuration
# ======================
# Input CSV with the full dataset, and the two CSVs the split is written to /
# reloaded from.
ORIG_CSV  = "/content/drive/MyDrive/Skin_project/original_skin_dataset.csv"
TRAIN_CSV = "/content/drive/MyDrive/Skin_project/train_original.csv"
TEST_CSV  = "/content/drive/MyDrive/Skin_project/test_original.csv"

# Target share of *patients* (not rows) placed in the test set.
TEST_FRAC = 0.20
# Seed for the NumPy RandomState that shuffles patients during the split.
SEED = 42
# True  => always rebuild the split and overwrite TRAIN_CSV / TEST_CSV.
# False => reuse the saved CSVs when both exist (keeps the test set stable).
OVERWRITE = True         # set True to recreate the split
CHECK_FILES = False        # True => only count rows whose image file exists
MIN_SUPPORT_TO_REQUIRE = 3 # require ≥1 test example for a (feature,label) if total support ≥ this


# Features taken into account when binning labels for the split and when
# building the per-class count tables. Each feature's label column is
# "<feature>_score".
FEATURES = ["texture","hyperpigmentation","oiliness","moisture","elasticity"]
# Feature name -> column holding that feature's image path.
# NOTE: "redness" is mapped here but is not listed in FEATURES, so it is ignored
# by the split logic while its image column still counts towards the image
# totals printed at the end of this cell.
feature2imgcol = {
    "moisture": "moisture_img",
    "oiliness": "oiliness_img",
    "elasticity": "elasticity_img",
    "texture": "texture_img",
    "redness": "redness_img",
    "hyperpigmentation": "hyperpigmentation_img",
}

# ======================
# Load full dataset
# ======================
df_orig = pd.read_csv(ORIG_CSV)
# The split is grouped by patient, so the id column is mandatory.
if "patient_id" not in df_orig.columns:
    raise ValueError(" The dataset must have a 'patient_id' column!")

def _counts_table(dfX):
    """Build a feature x class table of labelled-sample counts.

    For every feature in ``FEATURES`` whose image column and
    ``<feature>_score`` column are both present in ``dfX``, rows with a
    missing image or label are dropped, the remaining labels are converted
    to integer classes and counted.

    Labels are *rounded* to the nearest integer (not truncated) and
    non-finite values are discarded, so the classes shown here are derived
    the same way ``_make_label_aware_group_split`` derives its bins.

    Args:
        dfX: DataFrame holding the image-path and ``*_score`` columns.

    Returns:
        DataFrame indexed by feature name with one column per observed
        class value plus a ``"total"`` column; all counts are ints.
    """
    rows = []
    for feat in FEATURES:
        img_col, lbl_col = feature2imgcol[feat], f"{feat}_score"
        if img_col not in dfX or lbl_col not in dfX:
            continue

        tmp = dfX[[img_col, lbl_col]].dropna()
        if CHECK_FILES:
            # Force a boolean mask: on an empty frame ``apply`` may not yield
            # a bool dtype, and a non-bool key would be read as column names.
            on_disk = tmp[img_col].apply(
                lambda p: isinstance(p, str) and os.path.exists(p)
            ).astype(bool)
            tmp = tmp[on_disk]

        labels = pd.to_numeric(tmp[lbl_col], errors="coerce")
        # +/-inf cannot be cast to int; treat it like any other unusable label.
        labels = labels.replace([np.inf, -np.inf], np.nan).dropna()
        labels = labels.round().astype(int)

        counts = labels.value_counts().sort_index()
        counts.name = feat
        rows.append(counts)

    out = pd.DataFrame(rows).fillna(0).astype(int)
    out["total"] = out.sum(axis=1)
    return out

def _make_label_aware_group_split(df, test_frac=TEST_FRAC, seed=SEED):
    """Split ``df`` into train/test by patient, covering label bins in test.

    Every row of a patient lands on the same side of the split. The test
    side is built in two phases:

    1. Greedy cover: repeatedly move to test the patient that contributes
       the most still-uncovered ``(feature, label)`` bins, where a bin must
       be covered only if its total support is at least
       ``MIN_SUPPORT_TO_REQUIRE``.
    2. Fill: add randomly shuffled remaining patients until the test side
       holds ``max(1, ceil(n_patients * test_frac))`` patients. The greedy
       phase may already exceed that target; no patients are removed then.

    Only patients with at least one usable label take part in the split;
    rows of all other patients stay in train.

    Args:
        df: DataFrame with a ``patient_id`` column plus the image-path and
            ``<feature>_score`` columns of the features in ``FEATURES``.
        test_frac: Target fraction of patients to place in test.
        seed: Seed for the ``numpy.random.RandomState`` used for shuffling.

    Returns:
        Tuple ``(df_train, df_test, test_patients)`` where the frames have
        a fresh index and ``test_patients`` is the set of patient ids (as
        strings) assigned to test.
    """
    rng = np.random.RandomState(seed)

    # Column availability does not change per row: resolve it once.
    active = []
    for feat in FEATURES:
        img_col, lbl_col = feature2imgcol[feat], f"{feat}_score"
        if img_col in df and lbl_col in df:
            active.append((feat, df[img_col].tolist(), df[lbl_col].tolist()))

    # Derive patient ids with the exact conversion used for the membership
    # test below. Reading them through ``iterrows()`` can upcast an integer
    # id to float (``12`` -> ``"12.0"``), after which no row would match.
    pid_strings = df["patient_id"].astype(str)

    # Build patient -> Counter((feature,label) -> count), and global bin counts
    patient_bins = defaultdict(Counter)
    global_bins = Counter()
    for row_idx, pid in enumerate(pid_strings.tolist()):
        for feat, img_paths, labels in active:
            y = labels[row_idx]
            if pd.isna(y):
                continue
            p = img_paths[row_idx]
            if CHECK_FILES and not (isinstance(p, str) and os.path.exists(p)):
                continue
            try:
                y = int(round(float(y)))
            except (TypeError, ValueError, OverflowError):
                # Non-numeric or infinite label: unusable, skip it.
                continue
            patient_bins[pid][(feat, y)] += 1
            global_bins[(feat, y)] += 1

    # Row-major iteration above keeps first-seen patient order, so the
    # shuffle yields the same permutation for a given seed.
    patients = list(patient_bins)
    rng.shuffle(patients)

    # Requirement: at least one test sample per bin if there is enough support
    req = {k: 1 if n >= MIN_SUPPORT_TO_REQUIRE else 0 for k, n in global_bins.items()}

    covered = Counter()
    test_patients = set()

    def unmet_bins():
        """Return the required bins that test does not cover yet."""
        return [k for k in req if covered[k] < req[k]]

    def additional_gain(pid):
        """Count the still-unmet bins this patient would cover."""
        return sum(
            1
            for k, c in patient_bins[pid].items()
            if c > 0 and covered[k] < req.get(k, 0)
        )

    # Greedy cover: pick patients that add the most unmet bins
    remaining = patients.copy()
    while unmet_bins():
        # ``max`` returns the first maximal element, i.e. ties go to the
        # patient that comes first in the shuffled order.
        best = max(remaining, key=additional_gain, default=None)
        if best is None or additional_gain(best) <= 0:
            print(f" Could not guarantee these bins in test (insufficient/overlapping support): {unmet_bins()}")
            break
        test_patients.add(best)
        for k, c in patient_bins[best].items():
            if c > 0 and covered[k] < req.get(k, 0):
                covered[k] += 1
        remaining.remove(best)

    # Fill to target #patients for test
    target_n = max(1, math.ceil(len(patients) * test_frac))
    rng.shuffle(remaining)
    still_needed = max(0, target_n - len(test_patients))
    test_patients.update(remaining[:still_needed])

    is_test = pid_strings.isin(test_patients)
    df_test = df[is_test].reset_index(drop=True)
    df_train = df[~is_test].reset_index(drop=True)

    # Guard the ratio: with no usable labels there are no patients to split.
    test_share = len(test_patients) / len(patients) if patients else 0.0
    print(f"Selected {len(test_patients)} test patients "
          f"({test_share:.1%} of patients). "
          f"Train rows: {len(df_train)}, Test rows: {len(df_test)}")

    # Report counts
    print("\nTrain counts per feature/class:")
    display(_counts_table(df_train))
    print("\nTest counts per feature/class:")
    display(_counts_table(df_test))

    return df_train, df_test, test_patients

# ======================
# Create or reuse split
# ======================
# Reuse the CSVs on disk only when OVERWRITE is off and both files exist;
# in every other case build a fresh patient-level split and persist it.
if not OVERWRITE and os.path.exists(TRAIN_CSV) and os.path.exists(TEST_CSV):
    df_train = pd.read_csv(TRAIN_CSV)
    df_test  = pd.read_csv(TEST_CSV)
    print("Using existing split (to keep test set consistent).")
else:
    df_train, df_test, test_pids = _make_label_aware_group_split(df_orig, TEST_FRAC, SEED)

    # Safety: a patient must never appear on both sides of the split.
    overlap = set(df_train.patient_id.astype(str)).intersection(
        df_test.patient_id.astype(str)
    )
    assert not overlap, f"Unexpected overlap in patient split: {overlap}"

    df_train.to_csv(TRAIN_CSV, index=False)
    df_test.to_csv(TEST_CSV, index=False)
    print(f"\nSaved split to:\n  {TRAIN_CSV}\n  {TEST_CSV}")


# Image totals: non-null entries summed over every mapped image column that
# exists in the train frame.
features_present = [col for col in feature2imgcol.values() if col in df_train.columns]
train_images_count = int(df_train[features_present].notna().sum().sum())
test_images_count  = int(df_test[features_present].notna().sum().sum())
print(f"\n Train images: {train_images_count}, Test images: {test_images_count}")


In the model implementation section, we will merge the smallest (minority) classes for oiliness and moisture, making those two labels binary so that every class keeps some support.