# Preprocessing and Data Loading

Raman COVID data available at: https://figshare.com/articles/dataset/Data_and_code_on_serum_Raman_spectroscopy_as_an_efficient_primary_screening_of_coronavirus_disease_in_2019_COVID-19_/12159924 \
Raman Bacteria data available at: https://www.dropbox.com/scl/fo/fb29ihfnvishuxlnpgvhg/AJToUtts-vjYdwZGeqK4k-Y?rlkey=r4p070nsuei6qj3pjp13nwf6l&e=1&dl=0 \
DRS Tissue data available at: https://springernature.figshare.com/collections/Extended-wavelength_diffuse_reflectance_spectroscopy_dataset_of_animal_tissues_for_bone-related_biomedical_applications/6894172/1

# GLACIER

# Covid

In [None]:
from __future__ import annotations
import json, hashlib, zipfile, shutil
from pathlib import Path
import numpy as np
import pandas as pd
import ramanspy
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Configuration for the COVID-19 serum Raman dataset build (wildboar repository variant).
NAME = "RamanCOVID19_ramanspy_preprocessed"  # dataset name used for the .npz files and the manifest entry
POS_IDS = {"COVID-19"}  # Positive class (encoded as 1)
NEG_IDS = {"Healthy"}   # Negative class (encoded as 0)
TEST_SIZE = 0.20  # fraction of the binary subset held out as the test split
RANDOM_SEED = 4  # random_state passed to train_test_split

# Data paths
ROOT = Path.home() / "local-datasets" / "covid19"
OUT_CSV = ROOT / "covid19_serum_raman_preprocessed.csv"

# Class label -> raw text file under ROOT. All three classes are preprocessed
# and exported to OUT_CSV; only POS_IDS/NEG_IDS go into the train/test split.
# NOTE(review): "raw_Helthy.txt" presumably mirrors the spelling of the
# downloaded file -- confirm before "fixing" it.
CLASS_FILES = {
    "COVID-19":   "raw_COVID.txt",
    "Healthy":    "raw_Helthy.txt",
    "Suspected":  "raw_Suspected.txt",
}

# wildboar repository settings
REPO_ROOT = Path.home() / "local-datasets"  # directory holding the bundle zip, its .sha1 file and the manifest
HTTP_BASE = "http://127.0.0.1:8765"  # base URL written into the manifest's bundle_url
MANIFEST_NAME = "repo2.json"
REPO_NAME = "repotwo"
BUNDLE_VERSION = "1.2"
BUNDLE_TAG = "default"

# Bundle file names are derived from the version and tag above.
ZIP_NAME_NO_EXT = f"ucr-v{BUNDLE_VERSION}-{BUNDLE_TAG}"
ZIP_NAME = f"{ZIP_NAME_NO_EXT}.zip"
SHA_NAME = f"{ZIP_NAME_NO_EXT}.sha1"


def _sha1(path: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    The file is hashed in fixed-size chunks so that a large bundle archive
    never has to be held in memory in one piece.
    """
    digest = hashlib.sha1()
    with path.open("rb") as fh:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()


def read_matrix(p: Path) -> np.ndarray:
    """Load a headerless text table from *p* as a 2-D array.

    Fields may be separated by any run of whitespace, commas or semicolons.
    """
    table = pd.read_csv(
        p,
        header=None,
        sep=r"[\s,;]+",
        engine="python",  # a regex separator needs the Python parser
    )
    return table.values


def get_wavenumbers_from_mat(mat_path: Path) -> np.ndarray:
    """Extract the wavenumber vector from a MAT file.

    Every variable in the file is inspected. A variable qualifies when, after
    squeezing, it is a one-dimensional real numeric array with more than 200
    and fewer than 5000 entries, all within [350, 4000]. If several variables
    qualify, the one with the most entries is returned.

    Args:
        mat_path: Path of the MAT file to inspect.

    Returns:
        The selected vector as a float array.

    Raises:
        RuntimeError: If no variable qualifies.
    """
    candidates = []
    for value in loadmat(mat_path).values():
        if not isinstance(value, np.ndarray):
            continue
        arr = value.squeeze()
        # Cell, struct and char variables load with non-numeric dtypes; the
        # range comparison below is only meaningful for real numbers.
        if arr.ndim != 1 or arr.dtype.kind not in "iuf":
            continue
        if 200 < arr.size < 5000 and np.all((arr >= 350) & (arr <= 4000)):
            candidates.append(arr.astype(float))
    if not candidates:
        raise RuntimeError(f"No suitable wavenumber vector found in {mat_path}")
    return max(candidates, key=lambda a: a.size)


def main():
    """Preprocess the COVID-19 serum Raman spectra and add them to the local wildboar repository.

    Steps:
      1. Read the wavenumber axis from ``data.mat`` and the class matrices
         listed in ``CLASS_FILES`` (transposed if stored bands-by-samples).
      2. Apply a ramanspy pipeline: Whitaker-Hayes despiking, ASLS baseline
         correction and min-max normalisation.
      3. Write all preprocessed spectra (every class) to ``OUT_CSV``.
      4. Keep only the ``POS_IDS``/``NEG_IDS`` samples, encode labels as
         1 (positive) / 0 (negative) and make a stratified train/test split.
      5. Merge the two ``.npz`` files into the ``ucr`` bundle zip, rewrite the
         bundle's SHA-1 file and register the dataset in the manifest.

    Raises:
        ValueError: If neither dimension of a class matrix matches the number
            of wavenumbers.
    """
    print("Loading raw data...")

    # Get wavenumbers from MAT file
    wn = get_wavenumbers_from_mat(ROOT / "data.mat")
    B = wn.size
    print(f"Found {B} wavenumbers from {wn.min():.1f} to {wn.max():.1f} cm⁻¹")

    # Read class matrices
    X_list, y_list = [], []
    for label, fname in CLASS_FILES.items():
        mat = read_matrix(ROOT / fname)
        # fix orientation: rows must be samples, columns wavenumbers
        if mat.shape[1] != B and mat.shape[0] == B:
            mat = mat.T
        elif mat.shape[1] != B and mat.shape[0] != B:
            raise ValueError(f"{fname}: shape {mat.shape} doesn't match wavenumber length {B}")
        Xc = mat.astype(np.float32)
        X_list.append(Xc)
        y_list.append(np.full(Xc.shape[0], label, dtype=object))
        print(f"{label:10s}: {Xc.shape[0]} samples, {Xc.shape[1]} wavenumbers")

    X_all_raw = np.vstack(X_list)
    y_all = np.concatenate(y_list)

    raman_spectra = ramanspy.Spectrum(X_all_raw, wn)

    print("Applying ramanspy preprocessing...")

    pipeline = ramanspy.preprocessing.Pipeline([
        ramanspy.preprocessing.despike.WhitakerHayes(),
        ramanspy.preprocessing.baseline.ASLS(),
        ramanspy.preprocessing.normalise.MinMax(),
    ])

    X_all_preprocessed = pipeline.apply(raman_spectra)

    print("Subsetting and splitting preprocessed data...")
    mask = np.isin(y_all, list(POS_IDS | NEG_IDS))
    X = X_all_preprocessed.spectral_data[mask].astype(np.float32)
    y = y_all[mask]
    y_bin = np.where(np.isin(y, list(POS_IDS)), 1, 0).astype(np.int64)

    # Save full preprocessed CSV (all classes, one column per wavenumber)
    headers = [f"{int(v)}" if float(v).is_integer() else f"{v:g}" for v in wn]
    df = pd.DataFrame(X_all_preprocessed.spectral_data, columns=headers)
    df["diagnostic"] = y_all
    df.to_csv(OUT_CSV, index=False)
    print("Wrote preprocessed CSV ->", OUT_CSV)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y_bin, test_size=TEST_SIZE, stratify=y_bin, random_state=RANDOM_SEED
    )

    tmp = REPO_ROOT / "tmp_ucr_build"
    staging = tmp / "ucr"
    zip_path = REPO_ROOT / ZIP_NAME

    # Start from an empty staging area so leftovers of an aborted run are
    # never packed into the bundle.
    shutil.rmtree(tmp, ignore_errors=True)
    staging.mkdir(parents=True, exist_ok=True)
    try:
        # Unpack the existing bundle BEFORE writing the new arrays: extracting
        # afterwards would overwrite the freshly built files with any older
        # copy of this dataset already stored in the bundle.
        if zip_path.exists():
            with zipfile.ZipFile(zip_path, "r") as z:
                z.extractall(tmp)

        np.savez_compressed(staging / f"{NAME}_TRAIN.npz", x=X_tr, y=y_tr)
        np.savez_compressed(staging / f"{NAME}_TEST.npz", x=X_te, y=y_te)

        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
            for p in sorted(staging.glob("*")):
                if p.is_file():
                    z.write(p, arcname=f"ucr/{p.name}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)

    (REPO_ROOT / SHA_NAME).write_text(_sha1(zip_path))

    manifest_path = REPO_ROOT / MANIFEST_NAME
    if manifest_path.exists():
        manifest = json.loads(manifest_path.read_text())
    else:
        manifest = {
            "name": REPO_NAME,
            "version": "1.0",
            "wildboar_requires": "1.0",
            "bundle_url": f"{HTTP_BASE}/{{bundle}}-v{{version}}-{{tag}}",
            "bundles": []
        }

    # Always point the manifest at the current HTTP_BASE.
    manifest["bundle_url"] = f"{HTTP_BASE}/{{bundle}}-v{{version}}-{{tag}}"
    bundles = manifest.setdefault("bundles", [])
    ucr = next(
        (
            b for b in bundles
            if b.get("key") == "ucr"
            and b.get("version") == BUNDLE_VERSION
            and b.get("tag", "default") == BUNDLE_TAG
        ),
        None,
    )

    if ucr is None:
        ucr = {"key": "ucr", "name": "ucr", "version": BUNDLE_VERSION, "tag": BUNDLE_TAG, "datasets": []}
        bundles.append(ucr)

    # Register each part once; re-running the build must not duplicate entries.
    datasets = ucr.setdefault("datasets", [])
    for part, suffix in (("train", "TRAIN"), ("test", "TEST")):
        if not any(d.get("name") == NAME and d.get("part") == part for d in datasets):
            datasets.append({"name": NAME, "file": f"{NAME}_{suffix}.npz", "part": part})

    manifest_path.write_text(json.dumps(manifest, indent=2))

    print("\n Built and added new preprocessed dataset to the repository.")
    print(f"   Dataset name: '{NAME}'")

if __name__ == "__main__":
    main()

# DRS

In [None]:
from __future__ import annotations
import json, hashlib, zipfile, shutil
from pathlib import Path
import numpy as np
import pandas as pd
import ramanspy
from sklearn.model_selection import train_test_split

# Configuration for the DRS tissue dataset build (wildboar repository variant).
NAME = "DRS_TissueClassification"  # dataset name used for the .npz files and the manifest entry
POS_IDS = {"cortBone"}  # Positive class (encoded as 1)
NEG_IDS = {"muscle"}  # Negative class (encoded as 0)
TEST_SIZE = 0.20  # fraction of the binary subset held out as the test split
RANDOM_SEED = 4  # random_state passed to train_test_split

# Data paths
CSV_PATH = Path.home() / "local-datasets" / "DRS.csv"  # input: one column per wavelength plus 'target_y'
OUT_CSV = Path.home() / "local-datasets" / "drs_tissue.csv"  # output: normalised spectra of all classes

# wildboar repository settings
REPO_ROOT = Path.home() / "local-datasets"  # directory holding the bundle zip, its .sha1 file and the manifest
HTTP_BASE = "http://127.0.0.1:8765"  # base URL written into the manifest's bundle_url
MANIFEST_NAME = "repo2.json"
REPO_NAME = "repotwo"
BUNDLE_VERSION = "1.2"
BUNDLE_TAG = "default"

# Bundle file names are derived from the version and tag above.
ZIP_NAME_NO_EXT = f"ucr-v{BUNDLE_VERSION}-{BUNDLE_TAG}"
ZIP_NAME = f"{ZIP_NAME_NO_EXT}.zip"
SHA_NAME = f"{ZIP_NAME_NO_EXT}.sha1"


def _sha1(path: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    The file is read in 1 MiB chunks, so memory use stays constant regardless
    of how large the bundle archive grows.
    """
    hasher = hashlib.sha1()
    with path.open("rb") as stream:
        while True:
            block_bytes = stream.read(1024 * 1024)
            if not block_bytes:  # empty read marks end of file
                break
            hasher.update(block_bytes)
    return hasher.hexdigest()


def main():
    """Normalise the DRS tissue spectra and add them to the local wildboar repository.

    Steps:
      1. Load ``CSV_PATH``; every column except ``target_y`` (and a stray
         ``Unnamed: 0`` index column) is treated as a wavelength feature.
      2. Min-max normalise each spectrum (row) to [0, 1].
      3. Write all normalised spectra (every tissue type) to ``OUT_CSV``.
      4. Keep only the ``POS_IDS``/``NEG_IDS`` samples, encode labels as
         1 (positive) / 0 (negative) and make a stratified train/test split.
      5. Merge the two ``.npz`` files into the ``ucr`` bundle zip, rewrite the
         bundle's SHA-1 file and register the dataset in the manifest.
    """
    print("Loading DRS data...")

    # Load DRS CSV
    df = pd.read_csv(CSV_PATH)

    # Remove index column if present
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])

    # Extract features (wavelength columns) and labels
    feature_cols = [col for col in df.columns if col != 'target_y']
    wavelengths = np.array([float(col) for col in feature_cols])

    X_all_raw = df[feature_cols].to_numpy(np.float32)
    y_all = df['target_y'].values

    print(f"Loaded DRS data: {X_all_raw.shape[0]} samples, {X_all_raw.shape[1]} wavelengths")
    print(f"Wavelength range: {wavelengths.min():.1f} - {wavelengths.max():.1f} nm")
    print(f"Class distribution: {dict(zip(*np.unique(y_all, return_counts=True)))}")

    print("Applying DRS-appropriate preprocessing...")

    # Per-spectrum min-max normalisation. A perfectly flat spectrum has zero
    # range; dividing by 1 instead maps it to all zeros rather than NaN.
    row_min = X_all_raw.min(axis=1, keepdims=True)
    row_span = X_all_raw.max(axis=1, keepdims=True) - row_min
    row_span[row_span == 0] = 1
    X_all_preprocessed = ((X_all_raw - row_min) / row_span).astype(np.float32)

    print("Subsetting and splitting preprocessed data...")
    mask = np.isin(y_all, list(POS_IDS | NEG_IDS))
    X = X_all_preprocessed[mask].astype(np.float32)
    y = y_all[mask]
    y_bin = np.where(np.isin(y, list(POS_IDS)), 1, 0).astype(np.int64)

    print(f"Binary classification setup:")
    print(f"  Positive classes: {POS_IDS}")
    print(f"  Negative classes: {NEG_IDS}")
    print(f"  Binary samples: {X.shape[0]} (Positive: {(y_bin==1).sum()}, Negative: {(y_bin==0).sum()})")

    # Save full preprocessed CSV (all tissue types, one column per wavelength)
    headers = [f"{wl:.3f}" for wl in wavelengths]
    df_preprocessed = pd.DataFrame(X_all_preprocessed, columns=headers)
    df_preprocessed["tissue_type"] = y_all
    df_preprocessed.to_csv(OUT_CSV, index=False)
    print("Wrote preprocessed CSV ->", OUT_CSV)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y_bin, test_size=TEST_SIZE, stratify=y_bin, random_state=RANDOM_SEED
    )

    tmp = REPO_ROOT / "tmp_ucr_build"
    staging = tmp / "ucr"
    zip_path = REPO_ROOT / ZIP_NAME

    # Start from an empty staging area so leftovers of an aborted run are
    # never packed into the bundle.
    shutil.rmtree(tmp, ignore_errors=True)
    staging.mkdir(parents=True, exist_ok=True)
    try:
        # Unpack the existing bundle BEFORE writing the new arrays: extracting
        # afterwards would overwrite the freshly built files with any older
        # copy of this dataset already stored in the bundle.
        if zip_path.exists():
            with zipfile.ZipFile(zip_path, "r") as z:
                z.extractall(tmp)

        np.savez_compressed(staging / f"{NAME}_TRAIN.npz", x=X_tr, y=y_tr)
        np.savez_compressed(staging / f"{NAME}_TEST.npz", x=X_te, y=y_te)

        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
            for p in sorted(staging.glob("*")):
                if p.is_file():
                    z.write(p, arcname=f"ucr/{p.name}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)

    (REPO_ROOT / SHA_NAME).write_text(_sha1(zip_path))

    manifest_path = REPO_ROOT / MANIFEST_NAME
    if manifest_path.exists():
        manifest = json.loads(manifest_path.read_text())
    else:
        manifest = {
            "name": REPO_NAME,
            "version": "1.0",
            "wildboar_requires": "1.0",
            "bundle_url": f"{HTTP_BASE}/{{bundle}}-v{{version}}-{{tag}}",
            "bundles": []
        }

    # Always point the manifest at the current HTTP_BASE.
    manifest["bundle_url"] = f"{HTTP_BASE}/{{bundle}}-v{{version}}-{{tag}}"
    bundles = manifest.setdefault("bundles", [])
    ucr = next(
        (
            b for b in bundles
            if b.get("key") == "ucr"
            and b.get("version") == BUNDLE_VERSION
            and b.get("tag", "default") == BUNDLE_TAG
        ),
        None,
    )

    if ucr is None:
        ucr = {"key": "ucr", "name": "ucr", "version": BUNDLE_VERSION, "tag": BUNDLE_TAG, "datasets": []}
        bundles.append(ucr)

    # Register each part once; re-running the build must not duplicate entries.
    datasets = ucr.setdefault("datasets", [])
    for part, suffix in (("train", "TRAIN"), ("test", "TEST")):
        if not any(d.get("name") == NAME and d.get("part") == part for d in datasets):
            datasets.append({"name": NAME, "file": f"{NAME}_{suffix}.npz", "part": part})

    manifest_path.write_text(json.dumps(manifest, indent=2))

    print(f"\nTrain set: {X_tr.shape} (Positive: {(y_tr==1).sum()}, Negative: {(y_tr==0).sum()})")
    print(f"Test set:  {X_te.shape} (Positive: {(y_te==1).sum()}, Negative: {(y_te==0).sum()})")
    print("\n Built and added new preprocessed DRS dataset to the repository.")
    print(f"   Dataset name: '{NAME}'")


if __name__ == "__main__":
    main()

# Bacteria

In [None]:
from __future__ import annotations
import json, hashlib, zipfile, shutil
from pathlib import Path
import numpy as np
import ramanspy
from sklearn.model_selection import train_test_split

# Configuration for the bacteria Raman dataset build (wildboar repository variant).
NAME = "EcoliVsKpneumoniae_ramanspy_singular"  # dataset name used for the .npz files and the manifest entry
POS_IDS = {3}  # E. coli -- original class IDs encoded as 1
NEG_IDS = {9}  # K. pneumoniae -- original class IDs encoded as 0
TEST_SIZE = 0.20  # fraction of the binary subset held out as the test split
RANDOM_SEED = 4  # random_state passed to train_test_split

# Input arrays: spectra and their class IDs.
RAMAN_DIR = Path.home() / "data" / "raman"
X_REF = RAMAN_DIR / "X_reference.npy"
Y_REF = RAMAN_DIR / "y_reference.npy"

# wildboar repository settings
REPO_ROOT = Path.home() / "local-datasets"  # directory holding the bundle zip, its .sha1 file and the manifest
HTTP_BASE = "http://127.0.0.1:8765"  # base URL written into the manifest's bundle_url
MANIFEST_NAME = "repo2.json"
REPO_NAME = "repotwo"
BUNDLE_VERSION = "1.2"
BUNDLE_TAG = "default"

# Bundle file names are derived from the version and tag above.
ZIP_NAME_NO_EXT = f"ucr-v{BUNDLE_VERSION}-{BUNDLE_TAG}"
ZIP_NAME = f"{ZIP_NAME_NO_EXT}.zip"
SHA_NAME = f"{ZIP_NAME_NO_EXT}.sha1"


def _sha1(path: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    Hashing is done incrementally over 1 MiB reads instead of loading the
    entire file, which keeps memory bounded for large bundle archives.
    """
    sha = hashlib.sha1()
    with open(path, "rb") as handle:
        # The two-argument form of iter() stops once read() returns b"".
        for piece in iter(lambda: handle.read(1 << 20), b""):
            sha.update(piece)
    return sha.hexdigest()



def main():
    """Preprocess the bacteria Raman spectra and add them to the local wildboar repository.

    Steps:
      1. Load the spectra and class IDs from ``X_REF`` / ``Y_REF``.
      2. Apply a ramanspy pipeline: Whitaker-Hayes despiking, ASLS baseline
         correction and min-max normalisation.
      3. Keep only the ``POS_IDS``/``NEG_IDS`` samples, encode labels as
         1 (positive) / 0 (negative) and make a stratified train/test split.
      4. Merge the two ``.npz`` files into the ``ucr`` bundle zip, rewrite the
         bundle's SHA-1 file and register the dataset in the manifest.
    """
    print("Loading raw data...")
    X_all_raw = np.load(X_REF)
    y_all = np.load(Y_REF)

    # No wavenumber axis is loaded here; the column index serves as the spectral axis.
    raman_spectra = ramanspy.Spectrum(X_all_raw, np.arange(X_all_raw.shape[1]))

    print("Applying ramanspy preprocessing...")

    pipeline = ramanspy.preprocessing.Pipeline([
        ramanspy.preprocessing.despike.WhitakerHayes(),
        ramanspy.preprocessing.baseline.ASLS(),
        ramanspy.preprocessing.normalise.MinMax(),
    ])

    X_all_preprocessed = pipeline.apply(raman_spectra)

    print("Subsetting and splitting preprocessed data...")
    mask = np.isin(y_all, list(POS_IDS | NEG_IDS))
    X = X_all_preprocessed.spectral_data[mask].astype(np.float32)
    y = y_all[mask]
    y_bin = np.where(np.isin(y, list(POS_IDS)), 1, 0).astype(np.int64)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y_bin, test_size=TEST_SIZE, stratify=y_bin, random_state=RANDOM_SEED
    )

    tmp = REPO_ROOT / "tmp_ucr_build"
    staging = tmp / "ucr"
    zip_path = REPO_ROOT / ZIP_NAME

    # Start from an empty staging area so leftovers of an aborted run are
    # never packed into the bundle.
    shutil.rmtree(tmp, ignore_errors=True)
    staging.mkdir(parents=True, exist_ok=True)
    try:
        # Unpack the existing bundle BEFORE writing the new arrays: extracting
        # afterwards would overwrite the freshly built files with any older
        # copy of this dataset already stored in the bundle.
        if zip_path.exists():
            with zipfile.ZipFile(zip_path, "r") as z:
                z.extractall(tmp)

        np.savez_compressed(staging / f"{NAME}_TRAIN.npz", x=X_tr, y=y_tr)
        np.savez_compressed(staging / f"{NAME}_TEST.npz", x=X_te, y=y_te)

        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
            for p in sorted(staging.glob("*")):
                if p.is_file():
                    z.write(p, arcname=f"ucr/{p.name}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)

    (REPO_ROOT / SHA_NAME).write_text(_sha1(zip_path))

    manifest_path = REPO_ROOT / MANIFEST_NAME
    if manifest_path.exists():
        manifest = json.loads(manifest_path.read_text())
    else:
        manifest = {
            "name": REPO_NAME,
            "version": "1.0",
            "wildboar_requires": "1.0",
            "bundle_url": f"{HTTP_BASE}/{{bundle}}-v{{version}}-{{tag}}",
            "bundles": []
        }

    # Always point the manifest at the current HTTP_BASE.
    manifest["bundle_url"] = f"{HTTP_BASE}/{{bundle}}-v{{version}}-{{tag}}"
    bundles = manifest.setdefault("bundles", [])
    ucr = next(
        (
            b for b in bundles
            if b.get("key") == "ucr"
            and b.get("version") == BUNDLE_VERSION
            and b.get("tag", "default") == BUNDLE_TAG
        ),
        None,
    )

    if ucr is None:
        ucr = {"key": "ucr", "name": "ucr", "version": BUNDLE_VERSION, "tag": BUNDLE_TAG, "datasets": []}
        bundles.append(ucr)

    # Register each part once; re-running the build must not duplicate entries.
    datasets = ucr.setdefault("datasets", [])
    for part, suffix in (("train", "TRAIN"), ("test", "TEST")):
        if not any(d.get("name") == NAME and d.get("part") == part for d in datasets):
            datasets.append({"name": NAME, "file": f"{NAME}_{suffix}.npz", "part": part})

    manifest_path.write_text(json.dumps(manifest, indent=2))

    print("\n Built and added new preprocessed dataset to the repository.")
    print(f"  Dataset name: '{NAME}'")

if __name__ == "__main__":
    main()

To load these datasets for GLACIER and RSF, first serve the local repository directory over HTTP:

cd /home/cok7/local-datasets && python -m http.server 8765

Then in Python:
  from wildboar.datasets import install_repository, refresh_repositories, list_repositories, list_datasets
  install_repository('http://127.0.0.1:8765/repo2.json', refresh=True)
  print(list_repositories())
  print(list_datasets('repotwo/ucr'))  # should now include added datasets

# Without Both

In [None]:
from wildboar.datasets import install_repository, refresh_repositories, list_repositories, list_datasets
# First use only: uncomment to register the locally served repository with wildboar
# (the HTTP server on port 8765 must be running).
#install_repository('http://127.0.0.1:8765/repo2.json', refresh=True)
# Show the registered repositories, then the datasets listed for the 'repotwo/ucr' bundle.
print(list_repositories())
print(list_datasets('repotwo/ucr', force=True))

# CELS

# Bacteria

In [None]:
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import ramanspy
from sklearn.model_selection import train_test_split

# Configuration for the bacteria Raman dataset build (CELS .npz variant).
NAME = "EcoliVsKpneumoniae_ramanspy_singular"  # stem of the output .npz and label-map files

POS_IDS = {3}  # original class IDs encoded as 1
NEG_IDS = {9}  # original class IDs encoded as 0
TEST_SIZE = 0.20  # fraction of the binary subset held out as the test split
RANDOM_SEED = 4  # random_state passed to train_test_split

# Input arrays: spectra and their class IDs.
RAMAN_DIR = Path.home() / "data" / "raman"
X_REF = RAMAN_DIR / "X_reference.npy"
Y_REF = RAMAN_DIR / "y_reference.npy"

# Output directory, relative to the current working directory; created on cell execution.
OUT_DIR = Path("cels_datasets")
OUT_DIR.mkdir(exist_ok=True)

def main():
    """Build the binary bacteria Raman dataset as a single ``.npz`` file in ``OUT_DIR``.

    The spectra from ``X_REF`` are despiked, baseline-corrected and min-max
    normalised with ramanspy, restricted to the ``POS_IDS``/``NEG_IDS``
    classes, relabelled as 1/0 and split into stratified train and test sets.
    A JSON label map is written next to the archive.
    """
    print("Loading raw data...")
    spectra_raw = np.load(X_REF)
    labels_all = np.load(Y_REF)

    # No wavenumber axis is loaded here; the column index serves as the spectral axis.
    band_index = np.arange(spectra_raw.shape[1])
    spectra = ramanspy.Spectrum(spectra_raw, band_index)

    print("Applying ramanspy preprocessing...")
    steps = [
        ramanspy.preprocessing.despike.WhitakerHayes(),
        ramanspy.preprocessing.baseline.ASLS(),
        ramanspy.preprocessing.normalise.MinMax(),
    ]
    processed = ramanspy.preprocessing.Pipeline(steps).apply(spectra)

    print("Subsetting and splitting preprocessed data...")
    keep = np.isin(labels_all, list(POS_IDS.union(NEG_IDS)))
    X = processed.spectral_data[keep].astype(np.float32)
    # True -> 1 (positive class), False -> 0 (negative class)
    y_bin = np.isin(labels_all[keep], list(POS_IDS)).astype(np.int64)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y_bin, test_size=TEST_SIZE, stratify=y_bin, random_state=RANDOM_SEED
    )

    output_path = OUT_DIR / f"{NAME}.npz"
    print(f"Saving new preprocessed dataset to {output_path}...")
    arrays = {"X_train": X_tr, "y_train": y_tr, "X_test": X_te, "y_test": y_te}
    np.savez_compressed(output_path, **arrays)

    # Record which original class IDs each binary label stands for.
    label_map = {
        "binary": {"1": sorted(POS_IDS), "0": sorted(NEG_IDS)},
        "note": "binary labels correspond to these original class IDs",
    }
    labelmap_path = OUT_DIR / f"{NAME}_labelmap.json"
    labelmap_path.write_text(json.dumps(label_map, indent=2))

    print(f"Wrote preprocessed data to {output_path.resolve()}")

if __name__ == "__main__":
    main()

# COVID

In [None]:
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import pandas as pd
import ramanspy
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Configuration for the COVID-19 serum Raman dataset build (CELS .npz variant).
NAME = "RamanCOVID19_ramanspy_preprocessed"  # stem of the output .npz and label-map files

POS_IDS = {"COVID-19"}  # class encoded as 1
NEG_IDS = {"Healthy"}  # class encoded as 0
TEST_SIZE = 0.20  # fraction of the binary subset held out as the test split
RANDOM_SEED = 4  # random_state passed to train_test_split

ROOT = Path.home() / "local-datasets" / "covid19"
# Class label -> raw text file under ROOT. All three classes are preprocessed;
# only POS_IDS/NEG_IDS go into the train/test split.
# NOTE(review): "raw_Helthy.txt" presumably mirrors the spelling of the
# downloaded file -- confirm before "fixing" it.
CLASS_FILES = {
    "COVID-19":   "raw_COVID.txt",
    "Healthy":    "raw_Helthy.txt",
    "Suspected":  "raw_Suspected.txt",
}

# NOTE(review): absolute, user-specific output path (the bacteria CELS cell
# uses a relative "cels_datasets"); mkdir without parents=True requires
# /home/cok7/MScProject to exist already.
OUT_DIR = Path("/home/cok7/MScProject/cels_datasets")
OUT_DIR.mkdir(exist_ok=True)

def read_matrix(p: Path) -> np.ndarray:
    """Load a headerless text table from *p* as a 2-D array.

    Fields may be separated by any run of whitespace, commas or semicolons.
    """
    frame = pd.read_csv(
        p,
        header=None,
        sep=r"[\s,;]+",
        engine="python",  # a regex separator needs the Python parser
    )
    return frame.values

def get_wavenumbers_from_mat(mat_path: Path) -> np.ndarray:
    """Extract the wavenumber vector from a MAT file.

    Every variable in the file is inspected. A variable qualifies when, after
    squeezing, it is a one-dimensional real numeric array with more than 200
    and fewer than 5000 entries, all within [350, 4000]. If several variables
    qualify, the one with the most entries is returned.

    Args:
        mat_path: Path of the MAT file to inspect.

    Returns:
        The selected vector as a float array.

    Raises:
        RuntimeError: If no variable qualifies.
    """
    cands = []
    for v in loadmat(mat_path).values():
        if not isinstance(v, np.ndarray):
            continue
        arr = v.squeeze()
        # Cell, struct and char variables load with non-numeric dtypes; the
        # range comparison below is only meaningful for real numbers.
        if arr.ndim != 1 or arr.dtype.kind not in "iuf":
            continue
        if 200 < arr.size < 5000 and np.all((arr >= 350) & (arr <= 4000)):
            cands.append(arr.astype(float))
    if not cands:
        # Without this check max() fails with an unhelpful "empty sequence" error.
        raise RuntimeError(f"No suitable wavenumber vector found in {mat_path}")
    return max(cands, key=lambda a: a.size)

def main():
    """Build the binary COVID-19 Raman dataset as a single ``.npz`` file in ``OUT_DIR``.

    The class matrices listed in ``CLASS_FILES`` are stacked, despiked,
    baseline-corrected and min-max normalised with ramanspy, restricted to the
    ``POS_IDS``/``NEG_IDS`` classes, relabelled as 1/0 and split into
    stratified train and test sets. A JSON label map is written next to the
    archive.

    Raises:
        ValueError: If neither dimension of a class matrix matches the number
            of wavenumbers.
    """
    print("Loading raw data...")

    # The spectral axis comes from the MAT file shipped with the raw data.
    wn = get_wavenumbers_from_mat(ROOT / "data.mat")
    n_bands = wn.size

    # Load every class, making sure rows are samples and columns are bands.
    blocks, block_labels = [], []
    for label, fname in CLASS_FILES.items():
        mat = read_matrix(ROOT / fname)
        n_rows, n_cols = mat.shape
        if n_cols != n_bands:
            if n_rows != n_bands:
                raise ValueError(f"{fname}: shape {mat.shape} doesn't match wavenumber length {n_bands}")
            mat = mat.T
        samples = mat.astype(np.float32)
        blocks.append(samples)
        block_labels.append(np.full(samples.shape[0], label, dtype=object))

    spectra_raw = np.vstack(blocks)
    labels_all = np.concatenate(block_labels)

    spectra = ramanspy.Spectrum(spectra_raw, wn)

    print("Applying ramanspy preprocessing...")
    steps = [
        ramanspy.preprocessing.despike.WhitakerHayes(),
        ramanspy.preprocessing.baseline.ASLS(),
        ramanspy.preprocessing.normalise.MinMax(),
    ]
    processed = ramanspy.preprocessing.Pipeline(steps).apply(spectra)

    print("Subsetting and splitting preprocessed data...")
    keep = np.isin(labels_all, list(POS_IDS.union(NEG_IDS)))
    X = processed.spectral_data[keep].astype(np.float32)
    # True -> 1 (positive class), False -> 0 (negative class)
    y_bin = np.isin(labels_all[keep], list(POS_IDS)).astype(np.int64)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y_bin, test_size=TEST_SIZE, stratify=y_bin, random_state=RANDOM_SEED
    )

    output_path = OUT_DIR / f"{NAME}.npz"
    print(f"Saving new preprocessed dataset to {output_path}...")
    arrays = {"X_train": X_tr, "y_train": y_tr, "X_test": X_te, "y_test": y_te}
    np.savez_compressed(output_path, **arrays)

    # Record which original class names each binary label stands for.
    label_map = {
        "binary": {"1": sorted(POS_IDS), "0": sorted(NEG_IDS)},
        "note": "binary labels correspond to these original class IDs",
    }
    labelmap_path = OUT_DIR / f"{NAME}_labelmap.json"
    labelmap_path.write_text(json.dumps(label_map, indent=2))

    print(f"Wrote preprocessed data to {output_path.resolve()}")

if __name__ == "__main__":
    main()

# DRS

In [None]:
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Configuration for the DRS tissue dataset build (CELS .npz variant).
NAME = "DRS_TissueClassification"  # stem of the output .npz and label-map files

POS_IDS = {"cortBone"}  # tissue type encoded as 1
NEG_IDS = {"muscle"}  # tissue type encoded as 0
TEST_SIZE = 0.20  # fraction of the binary subset held out as the test split
RANDOM_SEED = 4  # random_state passed to train_test_split

CSV_PATH = Path.home() / "local-datasets" / "DRS.csv"  # input: one column per wavelength plus 'target_y'

# NOTE(review): absolute, user-specific output path (the bacteria CELS cell
# uses a relative "cels_datasets"); mkdir without parents=True requires
# /home/cok7/MScProject to exist already.
OUT_DIR = Path("/home/cok7/MScProject/cels_datasets")
OUT_DIR.mkdir(exist_ok=True)

def main():
    """Build the binary DRS tissue dataset as a single ``.npz`` file in ``OUT_DIR``.

    Steps:
      1. Load ``CSV_PATH``; every column except ``target_y`` (and a stray
         ``Unnamed: 0`` index column) is treated as a wavelength feature.
      2. Min-max normalise each spectrum (row) to [0, 1].
      3. Keep only the ``POS_IDS``/``NEG_IDS`` samples, encode labels as
         1 (positive) / 0 (negative) and make a stratified train/test split.
      4. Save the split to ``{NAME}.npz`` and a JSON label map with summary
         metadata to ``{NAME}_labelmap.json``.
    """
    print("Loading DRS data...")

    # Load DRS CSV
    df = pd.read_csv(CSV_PATH)

    # Remove index column
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])

    # Extract features and labels
    feature_cols = [col for col in df.columns if col != 'target_y']
    wavelengths = np.array([float(col) for col in feature_cols])

    X_all_raw = df[feature_cols].to_numpy(np.float32)
    y_all = df['target_y'].values

    print(f"Loaded DRS data: {X_all_raw.shape[0]} samples, {X_all_raw.shape[1]} wavelengths")
    print(f"Wavelength range: {wavelengths.min():.1f} - {wavelengths.max():.1f} nm")
    print(f"Class distribution: {dict(zip(*np.unique(y_all, return_counts=True)))}")

    print("Applying DRS-appropriate preprocessing...")

    # Per-spectrum min-max normalisation. A perfectly flat spectrum has zero
    # range; dividing by 1 instead maps it to all zeros rather than NaN.
    row_min = X_all_raw.min(axis=1, keepdims=True)
    row_span = X_all_raw.max(axis=1, keepdims=True) - row_min
    row_span[row_span == 0] = 1
    X_all_preprocessed = (X_all_raw - row_min) / row_span

    print("Subsetting and splitting preprocessed data...")
    mask = np.isin(y_all, list(POS_IDS | NEG_IDS))

    X = X_all_preprocessed[mask].astype(np.float32)
    y = y_all[mask]
    y_bin = np.where(np.isin(y, list(POS_IDS)), 1, 0).astype(np.int64)

    print(f"Binary classification setup:")
    print(f"  Positive classes: {POS_IDS}")
    print(f"  Negative classes: {NEG_IDS}")
    print(f"  Binary samples: {X.shape[0]} (Positive: {(y_bin==1).sum()}, Negative: {(y_bin==0).sum()})")

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y_bin, test_size=TEST_SIZE, stratify=y_bin, random_state=RANDOM_SEED
    )

    output_path = OUT_DIR / f"{NAME}.npz"
    print(f"Saving new preprocessed dataset to {output_path}...")
    np.savez_compressed(
        output_path,
        X_train=X_tr, y_train=y_tr,
        X_test=X_te,  y_test=y_te
    )

    # Label map plus summary metadata describing how the archive was produced.
    label_map = {
        "binary": {"1": sorted(POS_IDS), "0": sorted(NEG_IDS)},
        "note": "DRS tissue classification - binary labels correspond to these tissue types",
        "wavelength_range": f"{wavelengths.min():.1f}-{wavelengths.max():.1f} nm",
        "preprocessing": "Min-max normalisation",
        "total_samples": int(X.shape[0]),
        "train_samples": int(len(X_tr)),
        "test_samples": int(len(X_te))
    }
    (OUT_DIR / f"{NAME}_labelmap.json").write_text(json.dumps(label_map, indent=2))

    print(f"Train set: {X_tr.shape} (Positive: {(y_tr==1).sum()}, Negative: {(y_tr==0).sum()})")
    print(f"Test set:  {X_te.shape} (Positive: {(y_te==1).sum()}, Negative: {(y_te==0).sum()})")
    print(f"Wrote preprocessed data to {output_path.resolve()}")

if __name__ == "__main__":
    main()

Loading DRS data...
Loaded DRS data: 5215 samples, 1531 wavelengths
Wavelength range: 355.0 - 1849.7 nm
Class distribution: {'boneCement': 215, 'boneMarrow': 1000, 'cartilage': 1000, 'cortBone': 1000, 'muscle': 1000, 'traBone': 1000}
Applying DRS-appropriate preprocessing...
Subsetting and splitting preprocessed data...
Binary classification setup:
  Positive classes: {'cortBone'}
  Negative classes: {'muscle'}
  Binary samples: 2000 (Positive: 1000, Negative: 1000)
Saving new preprocessed dataset to /home/cok7/MScProject/cels_datasets/DRS_TissueClassification.npz...
Train set: (1600, 1531) (Positive: 800, Negative: 800)
Test set:  (400, 1531) (Positive: 200, Negative: 200)
Wrote preprocessed data to /home/cok7/MScProject/cels_datasets/DRS_TissueClassification.npz
