In [65]:
def get_stats(tensor, p=True, r=False, minmax=False):
    if not minmax:
        mean, std = tensor.mean(), tensor.std()
        if p: print(f"Mean: {mean}, Std: {std}")
        if r: return mean, std
        
    min, max = tensor.min(), tensor.max()
    mean, std = tensor.mean(), tensor.std()
    if p: print(f"Min: {min}, Max: {max} ,Mean: {mean}, Std: {std}")
    if r: return min, max, mean, std

In [43]:
import os

path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
files = os.listdir(path)
[(i, files[i]) for i in range(len(files))]

[(0, 'sample_submission.csv'),
 (1, 'timegate.csv'),
 (2, 'mettler_toledo.csv'),
 (3, 'kaiser.csv'),
 (4, 'anton_532.csv'),
 (5, 'transfer_plate.csv'),
 (6, '96_samples.csv'),
 (7, 'tornado.csv'),
 (8, 'tec5.csv'),
 (9, 'metrohm.csv'),
 (10, 'anton_785.csv')]

In [46]:
ds_names = [
    "timegate.csv",
    "mettler_toledo.csv",
    "kaiser.csv",
    "anton_532.csv",
    #"transfer_plate.csv",
    "tornado.csv",
    "tec5.csv",
    "metrohm.csv",
    "anton_785.csv"
]

In [None]:
import numpy as np
import pandas as pd


def get_dataset(name, lower, upper):
    path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
    df = pd.read_csv(os.path.join(path, name))
    
    spectra_selection = np.logical_and(
        lower <= np.array([float(one) for one in df.columns[:-5]]),
        np.array([float(one) for one in df.columns[:-5]]) <= upper,
    )
    
    spectra = df.iloc[:, :-5].iloc[:, spectra_selection].values
    label = df.iloc[:, -5:-1].values
    
    wavenumbers = np.array([
        float(one) for one in df.columns[:-5]
    ])[spectra_selection]

    
    return spectra, label, wavenumbers


def load_datasets(ds_names, lower, upper):
    datasets = [get_dataset(name, lower, upper) for name in ds_names]
    wavenumbers = np.arange(lower, upper + 1)

    interpolated_data = [
        np.array([
            np.interp(
                wavenumbers,
                xp=wns,
                fp=spectrum,
            )
            for spectrum in spectra
        ])
        for spectra, _, wns in datasets
    ]

    normed_spectra = np.concatenate(
        [
            spectra / np.max(spectra)
            for spectra in interpolated_data
        ],
        axis=0,
    )

    labels = np.concatenate([ds[1] for ds in datasets])
    return normed_spectra, labels


In [101]:
ds = load_datasets(ds_names, 300, 1600)

In [102]:
ds.shape

(2261, 4)

In [66]:
[get_stats(d, minmax=True) for d in ds]

Min: 0.026162647968249613, Max: 1.0 ,Mean: 0.16025456165443094, Std: 0.10085296226587226
Min: 0.023178450816787294, Max: 1.0 ,Mean: 0.1647588356759129, Std: 0.09868560777166825
Min: 0.0588133846135364, Max: 1.0 ,Mean: 0.2853361901452322, Std: 0.10792517077797799
Min: 0.2691255470244491, Max: 1.0 ,Mean: 0.4161058424891426, Std: 0.11217985811778325
Min: 0.03357494499476438, Max: 1.0 ,Mean: 0.1805342634914567, Std: 0.11437222238568806
Min: 0.0017089935745804783, Max: 1.0 ,Mean: 0.19025765172710124, Std: 0.08984439939291486
Min: 0.06505873030068815, Max: 1.0 ,Mean: 0.15848769574031102, Std: 0.07818925946536216
Min: 0.3670528117480713, Max: 1.0 ,Mean: 0.5421399799865979, Std: 0.09276928746792369


[None, None, None, None, None, None, None, None]