In [None]:
import numpy as np
import pandas as pd

import os

import itertools
from astropy.stats import sigma_clip
from tqdm import tqdm

import glob
import gc


## Data Preprocessing

In [None]:
# Root of the competition data and the derived training directory.
dir_name = "/kaggle/input/ariel-data-challenge-2025/"
train_dir_name = os.path.join(dir_name, "train/")
star_0_dir = f"{train_dir_name}1010375142"   # directory of one example planet
path_out = '/kaggle/working/'   # path to store the signal data

# ADC gain/offset table read by adc_airs / adc_fgs. It has to exist before those
# cells run, otherwise a fresh kernel fails with NameError.
# NOTE(review): the file name is assumed to be "adc_info.csv" inside dir_name — confirm
# against the dataset listing.
adc_df = pd.read_csv(os.path.join(dir_name, "adc_info.csv"))

Analogue to digital conversion 

In [None]:
def adc_airs(signal, gain=None, offset=None):
    """Undo the analogue-to-digital conversion of AIRS-CH0 frames.

    Computes ``signal / gain + offset`` in float64 on a copy of the input.

    Parameters
    ----------
    signal : np.ndarray
        Raw detector values (any numeric dtype, any shape). Not modified.
    gain, offset : float, optional
        Conversion constants. When omitted they are read from the first row of
        the module-level ``adc_df`` table at call time, so defining this
        function no longer requires ``adc_df`` to exist already.

    Returns
    -------
    np.ndarray
        float64 array with the same shape as ``signal``.
    """
    # Resolve the defaults lazily: evaluating them in the signature would run
    # at definition time and fail if adc_df has not been loaded yet.
    if gain is None:
        gain = float(adc_df["AIRS-CH0_adc_gain"].iloc[0])
    if offset is None:
        offset = float(adc_df["AIRS-CH0_adc_offset"].iloc[0])

    signal = signal.astype(np.float64)   # astype copies, the caller's array is untouched
    signal /= gain
    signal += offset

    return signal

In [None]:
def adc_fgs(signal, gain=None, offset=None):
    """Undo the analogue-to-digital conversion of FGS1 frames.

    Computes ``signal / gain + offset`` in float64 on a copy of the input.

    Parameters
    ----------
    signal : np.ndarray
        Raw detector values (any numeric dtype, any shape). Not modified.
    gain, offset : float, optional
        Conversion constants. When omitted they are read from the first row of
        the module-level ``adc_df`` table at call time, so defining this
        function no longer requires ``adc_df`` to exist already.

    Returns
    -------
    np.ndarray
        float64 array with the same shape as ``signal``.
    """
    # Resolve the defaults lazily: evaluating them in the signature would run
    # at definition time and fail if adc_df has not been loaded yet.
    if gain is None:
        gain = float(adc_df["FGS1_adc_gain"].iloc[0])
    if offset is None:
        offset = float(adc_df["FGS1_adc_offset"].iloc[0])

    signal = signal.astype(np.float64)   # astype copies, the caller's array is untouched
    signal /= gain
    signal += offset

    return signal

### Calibration and binning

In [None]:
def masking_dead_hot(signal, dead, dark):
    '''
    Mask dead and hot pixels in every frame of ``signal``.

    Parameters
    ----------
    signal : array of shape (n_frames, H, W)
    dead   : array of shape (H, W); non-zero / True marks a dead pixel
    dark   : array of shape (H, W); dark-current map used to find hot pixels

    Returns
    -------
    numpy.ma.MaskedArray of shape (n_frames, H, W) whose mask is True at every
    dead or hot pixel. The data is a copy of ``signal``.
    '''
    # sigma_clip returns a MaskedArray; the outliers (beyond 5 sigma on either
    # side, up to 5 iterations) are flagged in its mask. Only that mask may be
    # used as the condition: passing the MaskedArray itself to masked_where
    # would treat every non-zero dark value as True.
    clipped_dark = sigma_clip(data = dark, sigma = 5, maxiters = 5)
    hot = np.ma.getmaskarray(clipped_dark)   # always a full (H, W) boolean array

    # Merge both defect maps first so the stack is tiled and masked only once.
    bad_pixels = np.asarray(dead, dtype = bool) | hot
    bad_pixels = np.tile(bad_pixels, (signal.shape[0], 1, 1))   # same map for every frame

    return np.ma.masked_where(bad_pixels, signal)

Dark current subtraction

In [None]:
def dark_current_sub(signal, dead, dark, dt):
    """Subtract the dark current, scaled by integration time, from every frame.

    Parameters
    ----------
    signal : array of shape (n_frames, H, W); modified IN PLACE
    dead   : array of shape (H, W); non-zero / True marks a dead pixel
    dark   : array of shape (H, W); dark-current map
    dt     : 1-D array of length n_frames; integration time of each frame

    Returns
    -------
    The same ``signal`` object after subtracting ``dark * dt`` frame by frame.
    """
    dark = np.ma.masked_where(dead, dark)    # flag dead pixels in the dark map

    # (H, W) * (n_frames, 1, 1) broadcasts to (n_frames, H, W); no need to
    # materialise a tiled copy of the dark map first.
    # NOTE(review): at dead pixels the subtracted value is whatever numpy's
    # masked arithmetic leaves in the data buffer — treat those pixels as undefined.
    signal -= dark * dt[:, np.newaxis, np.newaxis]

    return signal

Non-Linearity correction 

In [None]:
def apply_lin_corr(lin_corr, clean_signal):
    """Apply the per-pixel polynomial non-linearity correction.

    Parameters
    ----------
    lin_corr : array of shape (n_coeffs, H, W)
        Polynomial coefficients of each pixel in ascending power order
        (index 0 is the constant term).
    clean_signal : array of shape (n_frames, H, W); overwritten IN PLACE

    Returns
    -------
    The same ``clean_signal`` object, every pixel value ``v`` replaced by
    ``sum_k lin_corr[k] * v**k`` for that pixel.
    """
    coeffs_desc = np.flip(lin_corr, axis = 0)   # highest power first, as Horner's scheme needs

    # Evaluate one detector row at a time: vectorised over frames and columns
    # (instead of building one poly1d per pixel) while keeping the temporaries
    # at (n_frames, W) rather than the size of the whole cube.
    for row in range(clean_signal.shape[1]):
        values = clean_signal[:, row, :]
        acc = np.zeros_like(values)
        for coeff in coeffs_desc[:, row, :]:   # coeff has shape (W,)
            acc = acc * values + coeff
        clean_signal[:, row, :] = acc
    return clean_signal

Flat Field correction

In [3]:
def correct_flat_field(flat,dead, signal):
    """Divide every frame of ``signal`` by the flat field.

    ``flat`` and ``dead`` are 2-D maps of shape (H, W); both are transposed to
    (W, H) before use, so ``signal`` must be of shape (n_frames, W, H).
    Pixels flagged in ``dead`` are masked in the flat, hence the result is a
    masked array with those pixels masked in every frame.
    """
    n_frames = signal.shape[0]

    # Bring both calibration maps into the (W, H) layout of the signal frames.
    flat_t = np.transpose(flat, (1, 0))
    dead_t = np.transpose(dead, (1, 0))

    masked_flat = np.ma.masked_where(dead_t, flat_t)
    flat_stack = np.tile(masked_flat, (n_frames, 1, 1))   # one copy of the map per frame

    return signal / flat_stack

Correlated double sampling (CDS): difference between each odd-indexed frame and the even-indexed frame before it

In [4]:
def get_cds(signal):
    """Correlated double sampling along axis 1.

    Frames are consumed in consecutive pairs; the result is
    ``frame[2k + 1] - frame[2k]`` for every pair, so axis 1 is halved.

    Parameters
    ----------
    signal : array of shape (n_obs, n_frames, ...) with an even ``n_frames``

    Raises
    ------
    ValueError
        If ``n_frames`` is odd. Without this check some odd lengths would be
        silently broadcast (e.g. 1 frame against 2) and give wrong differences.
    """
    n_frames = signal.shape[1]
    if n_frames % 2:
        raise ValueError(
            f"get_cds needs an even number of frames along axis 1, got {n_frames}"
        )
    # second read of each pair minus the first read
    cds = signal[:, 1::2, ...] - signal[:, ::2, ...]
    return cds

Binning the frames in time

In [5]:
def bin_obs(binning, cds_signals):
    """Sum consecutive groups of ``binning`` frames along the time axis.

    ``cds_signals`` has shape (n_obs, n_frames, A, B). The last two axes are
    swapped first, so the result has shape
    (n_obs, n_frames // binning, B, A). Frames left over after the last full
    group are ignored.
    """
    swapped = np.transpose(cds_signals, (0, 1, 3, 2))   # exchange the two image axes
    n_obs, n_frames, dim_a, dim_b = swapped.shape
    n_bins = n_frames // binning

    binned = np.zeros((n_obs, n_bins, dim_a, dim_b))

    # Accumulate one time bin per iteration; slicing keeps the work on views.
    for bin_idx, start in enumerate(range(0, n_bins * binning, binning)):
        group = swapped[:, start:start + binning, :, :]
        binned[:, bin_idx, :, :] = np.sum(group, axis = 1)

    return binned

Collecting the planet IDs of the training data and grouping them into chunks:

In [None]:
def get_index(files, chunk_size):
    """Group the planet ids found in ``files`` into chunks.

    A planet is recognised by a file named ``AIRS-CH0_signal_0.parquet``; its id
    is the (integer) name of the directory containing that file.

    Parameters
    ----------
    files : iterable of str
        Paths as returned by ``glob``; entries with any other name are ignored.
    chunk_size : int
        Maximum number of planets per chunk (must be >= 1).

    Returns
    -------
    list of np.ndarray
        Sorted ids split into consecutive chunks of exactly ``chunk_size``
        entries; only the last chunk may be shorter.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1 or no planet file is found.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # os.path keeps this independent of the path separator, and matching the
    # complete file name avoids indexing into a '_'-split of unrelated names.
    planet_ids = sorted(
        int(os.path.basename(os.path.dirname(path)))
        for path in files
        if os.path.basename(path) == 'AIRS-CH0_signal_0.parquet'
    )
    if not planet_ids:
        raise ValueError("no AIRS-CH0_signal_0.parquet file found in `files`")

    index = np.array(planet_ids)

    # Slice at multiples of chunk_size. Splitting into len // chunk_size equal
    # sections instead would create chunks larger than chunk_size whenever the
    # number of planets is not an exact multiple.
    index = [index[start:start + chunk_size] for start in range(0, len(index), chunk_size)]

    return index

Calibrating all the data 

In [None]:
files = glob.glob(os.path.join(dir_name + 'train/', '*/*'))   # every entry inside every planet directory
CHUNKS_SIZE = 10
index = get_index(files, CHUNKS_SIZE)   # sorted planet ids, grouped into chunks

axis_info = pd.read_parquet(os.path.join(dir_name, 'axis_info.parquet'))

# Switches for the individual calibration steps.
DO_MASK = True
DO_THE_NL_CORR = False
DO_DARK = True
DO_FLAT = True
TIME_BINNING = True

# Only the AIRS detector columns cut_inf .. cut_sup - 1 are kept.
# NOTE(review): the original comment says this matches the wavelength points of the targets — confirm.
cut_inf, cut_sup = 39, 321
l = cut_sup - cut_inf   # number of retained AIRS columns

# Detector geometry and number of frames per observation.
AIRS_FRAME_SHAPE = (32, 356)
FGS1_FRAME_SHAPE = (32, 32)
AIRS_N_FRAMES = 11250
FGS1_N_FRAMES = 135000

# FGS1 has 12x more frames than AIRS (135000 vs 11250), so a 12x larger bin
# yields the same number of time bins for both instruments.
AIRS_BINNING = 30
FGS1_BINNING = 30 * 12

# Integration time of every AIRS frame. It does not depend on the planet, so it
# is built once; to_numpy(copy=True) guarantees a private, writable array.
dt_airs = axis_info["AIRS-CH0-integration_time"].dropna().to_numpy(copy=True)
dt_airs[1::2] += 0.1   # every second frame integrates 0.1 longer


def _planet_file(planet_id, relative_name):
    """Path of a file inside the training directory of one planet."""
    return os.path.join(dir_name, f'train/{planet_id}/{relative_name}')


def _read_frames(planet_id, instrument, frame_shape):
    """Read the raw signal of one instrument as an (n_frames, *frame_shape) array."""
    df = pd.read_parquet(_planet_file(planet_id, f'{instrument}_signal_0.parquet'))
    # No float cast here: adc_airs / adc_fgs convert to float64 themselves.
    return df.values.reshape((df.shape[0],) + frame_shape)


def _read_calibration(planet_id, instrument, name, shape):
    """Read one calibration map of one instrument as a float64 array of ``shape``."""
    table = pd.read_parquet(_planet_file(planet_id, f'{instrument}_calibration_0/{name}.parquet'))
    return table.values.astype(np.float64).reshape(shape)


# One iteration per chunk of planets; n is the chunk number used in the output file names.
for n, index_chunk in enumerate(tqdm(index)):
    n_planets = len(index_chunk)
    AIRS_CH0_clean = np.zeros((n_planets, AIRS_N_FRAMES, 32, l))
    FGS1_clean = np.zeros((n_planets, FGS1_N_FRAMES, 32, 32))

    # Dead-pixel maps are remembered per planet: the flat-field step below must
    # use the map of the planet being corrected, not the one loaded last.
    dead_airs_maps = []
    dead_fgs1_maps = []

    for i, planet_id in enumerate(index_chunk):
        # ---------------- AIRS-CH0 ----------------
        raw = _read_frames(planet_id, 'AIRS-CH0', AIRS_FRAME_SHAPE)
        chopped_signal = adc_airs(raw)[:, :, cut_inf:cut_sup]   # ADC conversion, then column cut
        del raw

        dark = _read_calibration(planet_id, 'AIRS-CH0', 'dark', AIRS_FRAME_SHAPE)[:, cut_inf:cut_sup]
        dead_airs = _read_calibration(planet_id, 'AIRS-CH0', 'dead', AIRS_FRAME_SHAPE)[:, cut_inf:cut_sup]
        dead_airs_maps.append(dead_airs)

        if DO_MASK:    # masking dead and hot pixels
            chopped_signal = masking_dead_hot(chopped_signal, dead_airs, dark)
        # NOTE: AIRS_CH0_clean is a plain ndarray, so only the data of a masked
        # array is stored here; the mask itself is not carried further.
        AIRS_CH0_clean[i] = chopped_signal
        del chopped_signal

        if DO_THE_NL_CORR:    # correcting non-linearity of pixels
            linear_corr = _read_calibration(
                planet_id, 'AIRS-CH0', 'linear_corr', (6,) + AIRS_FRAME_SHAPE
            )[:, :, cut_inf:cut_sup]
            AIRS_CH0_clean[i] = apply_lin_corr(linear_corr, AIRS_CH0_clean[i])
            del linear_corr

        if DO_DARK:    # subtracting the dark current
            AIRS_CH0_clean[i] = dark_current_sub(AIRS_CH0_clean[i], dead_airs, dark, dt_airs)
        del dark

        # ---------------- FGS1 ----------------
        raw = _read_frames(planet_id, 'FGS1', FGS1_FRAME_SHAPE)
        chopped_FGS1 = adc_fgs(raw)
        del raw

        dt_fgs1 = np.ones(len(chopped_FGS1)) * 0.1
        dt_fgs1[1::2] += 0.1

        dark = _read_calibration(planet_id, 'FGS1', 'dark', FGS1_FRAME_SHAPE)
        dead_fgs1 = _read_calibration(planet_id, 'FGS1', 'dead', FGS1_FRAME_SHAPE)
        dead_fgs1_maps.append(dead_fgs1)

        if DO_MASK:
            chopped_FGS1 = masking_dead_hot(chopped_FGS1, dead_fgs1, dark)
        FGS1_clean[i] = chopped_FGS1   # as above: only the data is stored
        del chopped_FGS1

        if DO_THE_NL_CORR:
            linear_corr = _read_calibration(planet_id, 'FGS1', 'linear_corr', (6,) + FGS1_FRAME_SHAPE)
            FGS1_clean[i] = apply_lin_corr(linear_corr, FGS1_clean[i])
            del linear_corr

        if DO_DARK:
            FGS1_clean[i] = dark_current_sub(FGS1_clean[i], dead_fgs1, dark, dt_fgs1)
        del dark, dt_fgs1

        gc.collect()   # release the per-planet temporaries before the next planet

    # Correlated double sampling on the whole chunk, then drop the inputs.
    AIRS_cds = get_cds(AIRS_CH0_clean)
    FGS1_cds = get_cds(FGS1_clean)

    del AIRS_CH0_clean, FGS1_clean
    gc.collect()

    # Time binning (bin_obs also swaps the two image axes).
    if TIME_BINNING:
        AIRS_cds_binned = bin_obs(AIRS_BINNING, AIRS_cds)
        FGS1_cds_binned = bin_obs(FGS1_BINNING, FGS1_cds)
    else:
        # Same axis swap as bin_obs, so the flat-field step sees one layout either way.
        AIRS_cds_binned = AIRS_cds.transpose(0, 1, 3, 2)
        FGS1_cds_binned = FGS1_cds.transpose(0, 1, 3, 2)

    del AIRS_cds, FGS1_cds
    gc.collect()

    # Flat-field correction, planet by planet, each with its own dead-pixel map.
    if DO_FLAT:
        for i, planet_id in enumerate(index_chunk):
            flat_airs = _read_calibration(planet_id, 'AIRS-CH0', 'flat', AIRS_FRAME_SHAPE)[:, cut_inf:cut_sup]
            flat_fgs = _read_calibration(planet_id, 'FGS1', 'flat', FGS1_FRAME_SHAPE)
            AIRS_cds_binned[i] = correct_flat_field(flat_airs, dead_airs_maps[i], AIRS_cds_binned[i])
            FGS1_cds_binned[i] = correct_flat_field(flat_fgs, dead_fgs1_maps[i], FGS1_cds_binned[i])
            del flat_airs, flat_fgs
        gc.collect()

    # saving the data chunk by chunk
    np.save(os.path.join(path_out, f"AIRS_clean_train_{n}.npy"), AIRS_cds_binned)
    np.save(os.path.join(path_out, f"FGS1_clean_train_{n}.npy"), FGS1_cds_binned)

    del AIRS_cds_binned
    del FGS1_cds_binned

    gc.collect()

Concatenating all the saved chunks into a single array

In [7]:
def load_data(file, chunk_size, nb_files):
    """Load ``nb_files`` chunk files and stack them along the first axis.

    Parameters
    ----------
    file : str
        Common path prefix; chunk ``i`` is read from ``f"{file}_{i}.npy"``.
    chunk_size : int
        Nominal number of rows per chunk. Kept for backward compatibility; the
        actual row count of every chunk is taken from the file itself, so a
        shorter (or longer) chunk is stacked correctly instead of being
        broadcast or rejected.
    nb_files : int
        Number of chunk files (must be >= 1).

    Returns
    -------
    np.ndarray
        float64 array holding the rows of all chunks in file order. With
        equally sized chunks its first dimension is ``nb_files * chunk_size``.
    """
    chunk_paths = [f"{file}_{i}.npy" for i in range(nb_files)]
    if not chunk_paths:
        raise ValueError("nb_files must be at least 1")

    # First pass: memory-map each file just to learn its shape, so the output
    # can be allocated once without holding two copies of the data.
    shapes = []
    for path in chunk_paths:
        header = np.load(path, mmap_mode="r")
        shapes.append(header.shape)
        del header

    total_rows = sum(shape[0] for shape in shapes)
    data_all = np.zeros((total_rows,) + tuple(shapes[0][1:]))

    # Second pass: copy every chunk into its slot.
    start = 0
    for path, shape in zip(chunk_paths, shapes):
        stop = start + shape[0]
        data_all[start:stop] = np.load(path)
        start = stop
    return data_all


Loading the chunk files and saving the merged arrays

In [None]:
# One .npy file per chunk was written to path_out by the calibration loop. The
# prefix is built with os.path.join, exactly like the paths used when saving,
# so loading does not depend on path_out ending with a separator.
data_train_airs = load_data(os.path.join(path_out, "AIRS_clean_train"), CHUNKS_SIZE, len(index))
data_train_fgs1 = load_data(os.path.join(path_out, "FGS1_clean_train"), CHUNKS_SIZE, len(index))

In [None]:
# Persist the merged arrays in the current working directory (written once).
np.save("./" + "airs_data_train.npy", data_train_airs)
np.save("./" + "FGS1_data_train.npy", data_train_fgs1)