In [1]:
import os

# Directory holding the competition data, and the names of the entries inside it.
path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
files = os.listdir(path)

# Display every entry next to its position in `files` so it can be referenced by index.
list(enumerate(files))

[(0, 'sample_submission.csv'),
 (1, 'timegate.csv'),
 (2, 'mettler_toledo.csv'),
 (3, 'kaiser.csv'),
 (4, 'anton_532.csv'),
 (5, 'transfer_plate.csv'),
 (6, '96_samples.csv'),
 (7, 'tornado.csv'),
 (8, 'tec5.csv'),
 (9, 'metrohm.csv'),
 (10, 'anton_785.csv')]

In [195]:
import numpy as np
import pandas as pd


def load_transfer_data(csv_path=None):
    """Load the transfer-plate spectra and their targets as NumPy arrays.

    Parameters
    ----------
    csv_path : str or path-like, optional
        CSV file to read. Defaults to ``transfer_plate.csv`` inside the
        notebook-level ``path`` directory.

    Returns
    -------
    inputs : numpy.ndarray
        Shape ``(n_rows // 2, 2048)``: the 2048 spectrum columns (column
        positions 1..2048) with each pair of consecutive rows averaged.
    targets : numpy.ndarray
        The columns from position 2050 onwards, restricted to the rows in
        which none of those columns is missing.
    """
    if csv_path is None:
        # Look the file up by name: os.listdir() returns entries in arbitrary
        # order, so a positional lookup such as files[5] is not reproducible.
        csv_path = os.path.join(path, "transfer_plate.csv")
    df = pd.read_csv(csv_path)

    input_cols = df.columns[1:2049]
    target_cols = df.columns[2050:]

    # NOTE(review): presumably targets are filled in on only one row of each
    # averaged pair, so dropping NaN rows lines them up with `inputs` - confirm.
    targets = df[target_cols].dropna().to_numpy()

    # Work on an independent copy so the column assignments below do not act
    # on a slice of `df` (avoids pandas' SettingWithCopyWarning).
    spectra = df[input_cols].copy()

    # These two columns are read as strings; strip the "[" / "]" characters so
    # they can be converted to integers like the remaining spectrum columns.
    for col in ("Unnamed: 1", "Unnamed: 2048"):
        spectra[col] = (
            spectra[col].str.replace(r"[\[\]]", "", regex=True).astype("int64")
        )

    # Group consecutive row pairs and average each pair into one spectrum.
    inputs = spectra.to_numpy().reshape(-1, 2, 2048).mean(axis=1)

    return inputs, targets


def preprocess_transfer_data():
    """Crop and rescale the transfer-plate spectra; targets pass through unchanged.

    The column position (0..2047) is used as the spectral axis. Positions
    300..1942 inclusive are kept, the kept columns are passed through
    ``np.interp`` onto the integer grid 300..1942, and the whole array is
    divided by its single largest value.

    Returns
    -------
    tuple of numpy.ndarray
        ``(normed_spectra, targets)`` where ``normed_spectra`` has 1643 columns.
    """
    raw_spectra, targets = load_transfer_data()

    # One float axis 0.0 .. 2047.0, shared by the mask and the source grid.
    axis = np.arange(2048, dtype=float)
    in_window = (axis >= 300) & (axis <= 1942)

    cropped = raw_spectra[:, in_window]
    source_grid = axis[in_window]
    target_grid = np.arange(300, 1943)

    # The source grid and the target grid hold the same values (300..1942), so
    # each row is sampled exactly at its own points.
    resampled = np.array(
        [np.interp(target_grid, source_grid, row) for row in cropped]
    )

    # Scale by the global maximum of the whole array, not per spectrum.
    peak = np.max(resampled)
    return resampled / peak, targets

# Build the preprocessed transfer-plate arrays and display both shapes as a sanity check.
inputs, targets = preprocess_transfer_data()
tuple(arr.shape for arr in (inputs, targets))

((96, 1643), (96, 3))

In [None]:
def load_test_data(csv_path=None):
    """Load the test spectra and average every pair of consecutive rows.

    The file has no header line: its first line is already a spectrum. It is
    therefore read with ``header=None`` so that line is parsed as data like
    every other row. Reading it as a header and converting the column names
    back to numbers is fragile, because pandas renames duplicate header values
    (``"123"`` becomes ``"123.1"``) and the bracketed last value would have to
    be patched by hand.

    Parameters
    ----------
    csv_path : str or path-like, optional
        CSV file to read. Defaults to ``96_samples.csv`` inside the
        notebook-level ``path`` directory.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_rows // 2, 2048)``.
    """
    if csv_path is None:
        # Look the file up by name: os.listdir() returns entries in arbitrary
        # order, so a positional lookup such as files[6] is not reproducible.
        csv_path = os.path.join(path, "96_samples.csv")
    raw = pd.read_csv(csv_path, header=None)

    # Drop the first column and keep the rest; copy so the assignment below
    # acts on an independent frame rather than a slice of `raw`.
    spectra = raw.iloc[:, 1:].copy()

    # The last column is read as strings because of a trailing "]"; strip the
    # bracket characters so it converts to integers.
    last_col = spectra.columns[-1]
    spectra[last_col] = (
        spectra[last_col].str.replace(r"[\[\]]", "", regex=True).astype("int64")
    )

    # Group consecutive row pairs and average each pair into one spectrum.
    return spectra.to_numpy().reshape(-1, 2, 2048).mean(axis=1)


def get_test_data():
    """Return the test spectra cropped to positions 300..1942 and globally rescaled.

    Applies these steps to the output of ``load_test_data()``: keep the columns
    whose position lies in 300..1942 inclusive, pass each row through
    ``np.interp`` onto the integer grid 300..1942, then divide the whole array
    by its maximum value.

    Returns
    -------
    numpy.ndarray
        Array with 1643 columns whose largest entry is 1.0 after scaling.
    """
    spectra = load_test_data()

    # Column positions as floats; they act as the x-axis of each spectrum.
    channel = np.arange(2048, dtype=float)
    keep = (channel >= 300) & (channel <= 1942)

    kept_spectra = spectra[:, keep]
    kept_channels = channel[keep]
    grid = np.arange(300, 1943)

    # `grid` holds the same values as `kept_channels`, so every row is
    # evaluated exactly at its own sample points.
    resampled = np.array(
        [np.interp(grid, kept_channels, row) for row in kept_spectra]
    )

    # NOTE(review): the divisor is this array's own maximum, so the scale is
    # computed independently of the transfer-plate data - confirm intended.
    return resampled / np.max(resampled)
    

(96, 2048)

In [197]:
# Cell-level walk-through of the preprocessing steps, keeping every
# intermediate as a variable so it can be inspected.
#
# The raw spectra are loaded explicitly instead of reusing `inputs`: the
# `inputs` produced by the earlier preprocessing cell is already cropped to
# 1643 columns, so masking it with a 2048-long boolean array raises an
# IndexError on a fresh Restart & Run All. Using separate names also leaves
# `inputs` untouched, so this cell can be re-run safely.
# NOTE(review): presumably this cell was developed against the output of
# load_test_data() - confirm that is the intended dataset.
test_inputs_raw = load_test_data()

# Column positions 0.0 .. 2047.0 act as the spectral axis.
channel_index = np.arange(2048, dtype=float)
spectra_selection = np.logical_and(300 <= channel_index, channel_index <= 1942)

test_inputs_cropped = test_inputs_raw[:, spectra_selection]

wns = channel_index[spectra_selection]
wavenumbers = np.arange(300, 1943)

interpolated_data = np.array(
    [np.interp(wavenumbers, xp=wns, fp=row) for row in test_inputs_cropped]
)

# Scale by the global maximum of the whole array, not per spectrum.
normed_spectra = interpolated_data / np.max(interpolated_data)
normed_spectra


array([[0.9931171 , 0.97427514, 0.97573776, ..., 0.43405317, 0.42803063,
        0.42364278],
       [0.90983395, 0.88806676, 0.89047578, ..., 0.41159769, 0.40729588,
        0.40764002],
       [0.9883851 , 0.9701454 , 0.96773638, ..., 0.43964553, 0.43250452,
        0.42519143],
       ...,
       [0.98038372, 0.96068141, 0.9632625 , ..., 0.43826895, 0.42458918,
        0.42742837],
       [0.97943732, 0.96610169, 0.96223006, ..., 0.43310677, 0.42398692,
        0.42458918],
       [0.99733287, 0.97444722, 0.96988729, ..., 0.43061172, 0.42398692,
        0.4222662 ]])