In [1]:
import os

path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
files = os.listdir(path)
[(i, files[i]) for i in range(len(files))]

[(0, 'sample_submission.csv'),
 (1, 'timegate.csv'),
 (2, 'mettler_toledo.csv'),
 (3, 'kaiser.csv'),
 (4, 'anton_532.csv'),
 (5, 'transfer_plate.csv'),
 (6, '96_samples.csv'),
 (7, 'tornado.csv'),
 (8, 'tec5.csv'),
 (9, 'metrohm.csv'),
 (10, 'anton_785.csv')]

In [53]:
import numpy as np
import pandas as pd


def load_transfer_data():
    csv_path = os.path.join(path, files[5])
    df = pd.read_csv(csv_path)

    input_cols = df.columns[1:2049]
    target_cols = df.columns[2050:]

    targets  = df[target_cols].dropna().to_numpy()

    df = df[input_cols]
    df['Unnamed: 1'] = df['Unnamed: 1'].str.replace("[\[\]]", "", regex=True).astype('int64')
    df['Unnamed: 2048'] = df['Unnamed: 2048'].str.replace("[\[\]]", "", regex=True).astype('int64')

    inputs = df.to_numpy().reshape(-1, 2, 2048)
    inputs = inputs.mean(axis=1)
    wavenumbers = np.arange(0, 2049, dtype=np.float32)
    return inputs, targets, wavenumbers


def load_test_data():
    test = pd.read_csv(os.path.join(path, files[6]))

    row1 = test.columns[1:].to_numpy().copy()
    row1[-1] = "5611"
    row1 = row1.astype(np.float64)


    cols = test.columns[1:]
    test = test[cols]
    test[" 5611]"] = test[" 5611]"].str.replace('[\[\]]', '', regex=True).astype('int64')
    test = test.to_numpy()

    test = np.insert(test, 0, row1, axis=0)
    return test.reshape(-1, 2, 2048).mean(axis=1)


In [54]:

lower_bounds = {
    'anton_532': 200,
    'anton_785': 100,
    'kaiser': -37,
    'mettler_toledo': 300,
    'metrohm': 200,
    'tec5': 85,
    'timegate': 200,
    'tornado': 300,
    "transfer_plate": 1
}


upper_bounds = {
    'anton_532': 3500,
    'anton_785': 2300,
    'kaiser': 1942,
    'mettler_toledo': 3350,
    'metrohm': 3350,
    'tec5': 3210,
    'timegate': 2000,
    'tornado': 3300,
    "transfer_plate": 2048
}

In [55]:

def get_dataset(name, lower=-1000, upper=10000):
    path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
    df = pd.read_csv(os.path.join(path, name))

    lower = max(lower, lower_bounds[name[:-4]])
    upper = min(upper, upper_bounds[name[:-4]])

    spectra_selection = np.logical_and(
        lower <= np.array([float(one) for one in df.columns[:-5]]),
        np.array([float(one) for one in df.columns[:-5]]) <= upper,
    )
    
    spectra = df.iloc[:, :-5].iloc[:, spectra_selection].values
    label = df.iloc[:, -5:-2].values

    wavenumbers = np.array([
        float(one) for one in df.columns[:-5]
    ])[spectra_selection]

    #indices = get_indices(spectra, num_samples)                         
    return spectra, label, wavenumbers

In [60]:
lower_bounds, upper_bounds

({'anton_532': 200,
  'anton_785': 100,
  'kaiser': -37,
  'mettler_toledo': 300,
  'metrohm': 200,
  'tec5': 85,
  'timegate': 200,
  'tornado': 300,
  'transfer_plate': 1},
 {'anton_532': 3500,
  'anton_785': 2300,
  'kaiser': 1942,
  'mettler_toledo': 3350,
  'metrohm': 3350,
  'tec5': 3210,
  'timegate': 2000,
  'tornado': 3300,
  'transfer_plate': 2048})

In [56]:
kaiser = get_dataset("kaiser.csv")
ds = load_transfer_data()

In [88]:
wavenumbers = np.arange(300, 1942 + 1)
wns = kaiser[2]

x = [np.interp(
    wavenumbers,
    xp=wns,
    fp=spectrum,
)for spectrum in kaiser[0]]

In [89]:
np.stack(x).shape

(134, 1643)

In [77]:
kaiser[0].shape

(134, 6593)

In [74]:
wavenumbers.shape, wns.shape

((1980,), (6593,))

In [None]:
wavenumbers = np.arange(-37, 1942 + 1)

interpolated_data = [
    np.array([
        np.interp(
            wavenumbers,
            xp=wns,
            fp=spectrum,
        )
        for spectrum in spectra
    ])
    for spectra, _, wns in datasets
]

In [76]:
df = pd.read_csv(os.path.join(path, files[3]))
df

Unnamed: 0,-36.3,-36.0,-35.7,-35.4,-35.1,-34.8,-34.5,-34.2,-33.9,-33.6,...,1940.1001,1940.4001,1940.7001,1941.0001,1941.3001,glucose,Na_acetate,Mg_SO4,MSM_present,fold_idx
0,10.3585,10.2157,10.1018,10.0218,9.9651,9.9239,9.8962,9.8803,9.8744,9.8776,...,259.2205,258.3166,257.4057,256.6129,255.8397,0.554460,0.000000,0.000000,0.0,0
1,11.0992,11.1688,11.2328,11.2880,11.3322,11.3636,11.3810,11.3836,11.3716,11.3454,...,272.7687,272.7478,272.7581,272.7973,272.9031,0.000000,0.000000,0.000000,0.0,0
2,8.5593,8.5672,8.5637,8.5448,8.5100,8.4597,8.3954,8.3190,8.2341,8.1446,...,304.5617,304.1005,303.7981,303.6360,303.5873,0.574024,0.210141,0.000000,1.0,0
3,14.3677,13.8007,13.2977,12.8765,12.5280,12.2442,12.0212,11.8532,11.7308,11.6443,...,297.7085,296.9513,295.8577,294.6594,293.0710,4.632610,0.699690,0.022235,1.0,0
4,9.9371,9.9609,10.0106,10.0917,10.1972,10.3215,10.4622,10.6163,10.7794,10.9467,...,301.8498,301.0805,300.4763,300.0391,299.7130,3.633410,1.108970,0.051811,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,11.0373,10.9870,10.9461,10.9185,10.9058,10.9088,10.9278,10.9620,11.0091,11.0658,...,253.3485,253.1788,253.0501,252.9488,252.8235,10.353900,0.000000,0.000000,0.0,4
130,13.0093,12.7577,12.5281,12.3298,12.1657,12.0381,11.9475,11.8936,11.8734,11.8832,...,432.6355,432.7879,432.7451,432.5565,432.1094,0.000000,0.652860,0.046091,0.0,4
131,8.1120,8.1863,8.2504,8.3013,8.3394,8.3654,8.3793,8.3814,8.3724,8.3528,...,230.5279,230.5069,230.5576,230.6549,230.8311,1.213120,2.114350,0.048350,0.0,4
132,8.4466,8.4495,8.4301,8.3843,8.3200,8.2437,8.1594,8.0713,7.9847,7.9040,...,249.1279,249.4063,249.7078,249.9865,250.2569,1.083790,1.986670,0.000000,0.0,4


In [None]:
import pandas as pd


lower_bounds = {
    'anton_532': 200,
    'anton_785': 100,
    'kaiser': -37,
    'mettler_toledo': 300,
    'metrohm': 200,
    'tec5': 85,
    'timegate': 200,
    'tornado': 300,
}


upper_bounds = {
    'anton_532': 3500,
    'anton_785': 2300,
    'kaiser': 1942,
    'mettler_toledo': 3350,
    'metrohm': 3350,
    'tec5': 3210,
    'timegate': 2000,
    'tornado': 3300,
}

def get_dataset(name, lower=-1000, upper=10000):
    path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
    df = pd.read_csv(os.path.join(path, name))

    lower = max(lower, lower_bounds[name[:-4]])
    upper = min(upper, upper_bounds[name[:-4]])

    spectra_selection = np.logical_and(
        lower <= np.array([float(one) for one in df.columns[:-5]]),
        np.array([float(one) for one in df.columns[:-5]]) <= upper,
    )
    
    spectra = df.iloc[:, :-5].iloc[:, spectra_selection].values
    label = df.iloc[:, -5:-2].values

    wavenumbers = np.array([
        float(one) for one in df.columns[:-5]
    ])[spectra_selection]

    #indices = get_indices(spectra, num_samples)                         
    return spectra, label, wavenumbers


def load_datasets(ds_names, lower=-1000, upper=10000):
        
    lower = max(
        lower,
        *[lower_bounds[n[:-4]] for n in ds_names])
    
    upper = min(
        upper,
        *[upper_bounds[n[:-4]] for n in ds_names]
    )

    datasets = [get_dataset(name, lower, upper) for name in ds_names]
    wavenumbers = np.arange(lower, upper + 1)

    interpolated_data = [
        np.array([
            np.interp(
                wavenumbers,
                xp=wns,
                fp=spectrum,
            )
            for spectrum in spectra
        ])
        for spectra, _, wns in datasets
    ]
    return interpolated_data
    normed_spectra = np.concatenate(
        [
            spectra / np.max(spectra)
            for spectra in interpolated_data
        ],
        axis=0,
    )

    labels = np.concatenate([ds[1] for ds in datasets])
    return normed_spectra, labels

ds_names = ["anton_532.csv", "anton_785.csv", "kaiser.csv", "mettler_toledo.csv", "metrohm.csv", "tornado.csv", "tec5.csv", "timegate.csv"]
inputs, targets = load_datasets(ds_names)

300 1942


ValueError: too many values to unpack (expected 2)

In [86]:
inputs= load_datasets(ds_names)

300


In [84]:
inputs[2].shape

(134, 1643)