In [4]:
import os

path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
files = os.listdir(path)
[(i, files[i]) for i in range(len(files))]

[(0, 'sample_submission.csv'),
 (1, 'timegate.csv'),
 (2, 'mettler_toledo.csv'),
 (3, 'kaiser.csv'),
 (4, 'anton_532.csv'),
 (5, 'transfer_plate.csv'),
 (6, '96_samples.csv'),
 (7, 'tornado.csv'),
 (8, 'tec5.csv'),
 (9, 'metrohm.csv'),
 (10, 'anton_785.csv')]

In [9]:
import pandas as pd

df = pd.read_csv(os.path.join(path, files[6]))
df

Unnamed: 0,sample1,6409,7097,8247,9853,11768,14434,19349,36210,65535,...,1709,1706.1,1598.3,1627.2,1653,1008,1035,1061,1282.5,5611]
0,,6445,7147,8385,9837,11930,14613,19526,37225,65535,...,1668,1659,1594,1625,1656,1022,1020,1057,1301,5731]
1,sample2,5749,6351,7447,8777,10491,12729,17102,32323,65535,...,1650,1617,1544,1558,1617,1012,1012,1059,1295,5112]
2,,5888,6475,7461,8822,10565,12960,17295,32686,65535,...,1629,1644,1551,1545,1559,1013,1036,1070,1274,5161]
3,sample3,6247,6975,8166,9554,11401,14010,18883,35979,65535,...,1687,1705,1602,1634,1620,1008,1016,1051,1278,5513]
4,,6376,7074,8328,9823,11846,14419,19359,36710,65535,...,1721,1711,1594,1652,1603,1007,1018,1056,1259,5659]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,,6101,6945,8118,9620,11647,14123,18593,35243,65535,...,1685,1701,1573,1638,1615,1037,1003,1039,1271,5402]
187,sample95,6579,7337,8605,10249,12368,14983,19810,37117,65535,...,1658,1690,1569,1585,1640,1001,1007,1043,1260,5863]
188,,6283,7057,8234,9832,11784,14313,19266,35806,65535,...,1680,1678,1563,1607,1641,984,1002,1044,1241,5601]
189,sample96,6102,6821,7989,9493,11505,13939,18305,34519,65535,...,1679,1657,1553,1606,1631,994,991,1052,1272,5458]


In [52]:
def load_comp_data(filepath, is_train=True):
    """Load and preprocess the Raman spectroscopy data"""
    if is_train:
        df = pd.read_csv(filepath)
        # Extract target variables
        target_cols = ['Glucose (g/L)', 'Sodium Acetate (g/L)', 'Magnesium Acetate (g/L)']
        y = df[target_cols].dropna().values
        
        # Process spectral data
        X = df.iloc[:, :-4] # Remove last 4 columns (analyte info and targets)
    else:
        df = pd.read_csv(filepath, header=None)
        X = df
        y = None
    
    # Set column names
    X.columns = ["sample_id"] + [str(i) for i in range(X.shape[1]-1)]
    
    # Fill sample_id using forward fill
    X['sample_id'] = X['sample_id'].ffill()
    
    # Clean sample_id
    if is_train:
        X['sample_id'] = X['sample_id'].str.strip()
    else:
        X['sample_id'] = X['sample_id'].str.strip().str.replace('sample', '').astype(int)
    
    # Clean spectral data (remove brackets)
    spectral_cols = X.columns[1:]
    for col in spectral_cols:
        X[col] = X[col].astype(str).str.replace('[', '', regex=False).str.replace(']', '', regex=False)
        X[col] = pd.to_numeric(X[col], errors='coerce')

    return X, y


inputs, targets = load_comp_data(os.path.join(path, files[5]))
inputs, targets

(    sample_id     0     1     2      3      4      5      6      7      8  \
 0     sample1  6293  7095  8325   9934  11917  14394  18925  34874  65535   
 1     sample1  6505  7332  8482  10175  12132  14792  19594  35813  65535   
 2     sample2  6478  7158  8444   9979  11932  14503  19309  35118  65535   
 3     sample2  6511  7308  8520  10205  12260  14777  19569  35825  65535   
 4     sample3  6561  7342  8562  10166  12202  14838  19593  35869  65535   
 ..        ...   ...   ...   ...    ...    ...    ...    ...    ...    ...   
 187  sample94  6652  7453  8641  10270  12168  15014  20000  36732  65535   
 188  sample95  6798  7514  8786  10431  12372  15419  20547  37854  65535   
 189  sample95  6764  7534  8828  10532  12454  15504  20566  37705  65535   
 190  sample96  6847  7545  8795  10452  12588  15515  20492  37710  65535   
 191  sample96  6811  7577  8818  10471  12570  15407  20425  37509  65535   
 
      ...  2038  2039  2040  2041  2042  2043  2044  2045  204

In [55]:
import numpy as np
import pandas as pd


def load_transfer_data():
    csv_path = os.path.join(path, files[5])
    df = pd.read_csv(csv_path)

    input_cols = df.columns[1:2049]
    target_cols = df.columns[2050:]

    targets  = df[target_cols].dropna().to_numpy()

    df = df[input_cols]
    df['Unnamed: 1'] = df['Unnamed: 1'].str.replace("[\[\]]", "", regex=True).astype('int64')
    df['Unnamed: 2048'] = df['Unnamed: 2048'].str.replace("[\[\]]", "", regex=True).astype('int64')

    inputs = df.to_numpy().reshape(-1, 2, 2048)
    inputs = inputs.mean(axis=1)
    
    return inputs, targets


def preprocess_transfer_data():
    inputs, targets = load_transfer_data()
    
    spectra_selection = np.logical_and(
        300 <= np.array([float(one) for one in range(2048)]),
        np.array([float(one) for one in range(2048)]) <= 1942,
    )
    
    inputs = inputs[:, spectra_selection]
    
    wns = np.array([
        float(one) for one in range(2048)
    ])[spectra_selection]
    wavenumbers = np.arange(300, 1943)
    
    interpolated_data = np.array(
        [np.interp(wavenumbers, xp=wns, fp=i) for i in inputs]
    )
    
    normed_spectra = interpolated_data / np.max(interpolated_data)
    return normed_spectra, targets

i, t = preprocess_transfer_data()
i.shape, t.shape

((96, 1643), (96, 3))

In [53]:
inputs = inputs.drop('sample_id', axis=1).values.reshape(-1, 2, 2048).mean(axis=1)
inputs

array([[6399. , 7213.5, 8403.5, ..., 1058. , 1274. , 5687. ],
       [6494.5, 7233. , 8482. , ..., 1037.5, 1262.5, 5762. ],
       [6550. , 7343. , 8545.5, ..., 1053. , 1306.5, 5823. ],
       ...,
       [6666. , 7354.5, 8598.5, ..., 1071.5, 1288.5, 5841.5],
       [6781. , 7524. , 8807. , ..., 1068. , 1275.5, 5955. ],
       [6829. , 7561. , 8806.5, ..., 1056.5, 1279.5, 6015.5]])

In [56]:
i

array([[0.97299983, 0.95949975, 0.95461322, ..., 0.42661918, 0.41394732,
        0.41195958],
       [0.96513169, 0.95386782, 0.94931257, ..., 0.42405168, 0.41717741,
        0.41253934],
       [0.97672685, 0.96024515, 0.95800894, ..., 0.4226437 , 0.41477555,
        0.41659765],
       ...,
       [0.95759483, 0.9431837 , 0.93597813, ..., 0.4189995 , 0.40723869,
        0.40988902],
       [0.96720225, 0.9501408 , 0.94409475, ..., 0.41924797, 0.41634918,
        0.41668047],
       [0.9750704 , 0.95991386, 0.95378499, ..., 0.4189995 , 0.41485837,
        0.4156866 ]])

In [54]:
inputs = fix_val_test_shape(inputs)
inputs / inputs.max()

array([[0.96486347, 0.96433742, 0.95887131, ..., 0.14550203, 0.14882057,
        0.14882057],
       [0.96486347, 0.96487858, 0.96503565, ..., 0.14494311, 0.14660238,
        0.14660238],
       [0.96688001, 0.96661093, 0.96381505, ..., 0.14461416, 0.1470561 ,
        0.1470561 ],
       ...,
       [0.94605924, 0.94614692, 0.94705793, ..., 0.14493574, 0.147409  ,
        0.147409  ],
       [0.96309899, 0.96300225, 0.96199699, ..., 0.1453768 , 0.14882057,
        0.14882057],
       [0.96299817, 0.96324305, 0.96578762, ..., 0.14481857, 0.14866933,
        0.14866933]])

In [47]:
def fix_val_test_shape(X):
    lower_wns = 300
    upper_wns = 1942
    joint_wns = np.arange(lower_wns, upper_wns + 1)
    spectral_values = np.linspace(65, 3350, 2048)

    spectra_selection = np.logical_and(
        lower_wns <= spectral_values, spectral_values <= upper_wns,
    )
    wns = spectral_values[spectra_selection]
    X = X[:, spectra_selection]
    X = np.array([np.interp(joint_wns, xp=wns, fp=spectrum,)for spectrum in X])
    return X