In [1]:
# import library
import numpy as np
import pandas as pd
import itertools
import os
import glob 
# from astropy.stats import sigma_clip

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
code_folder = "/kaggle/input/ariel-submission/"
exec(open(code_folder+'feature_engineering.py', 'r').read())
exec(open(code_folder+'preprocess_data.py', 'r').read())
exec(open(code_folder+'mean_transit.py', 'r').read())
exec(open(code_folder+'preprocess_targets.py', 'r').read())
exec(open(code_folder+'models.py', 'r').read())
exec(open(code_folder+'MC_dropout.py', 'r').read())
exec(open(code_folder+'atmospheric_feature.py', 'r').read())
exec(open(code_folder+'postprocessing.py', 'r').read())

data preprocess

In [3]:
path_folder = "/kaggle/input/ariel-data-challenge-2024/"
model_folder = "/kaggle/input/ariel_sub/pytorch/default/1/"

In [4]:
CHUNKS_SIZE = 1

In [5]:
test_adc_info = pd.read_csv(os.path.join(path_folder, 'test_adc_info.csv'))
test_adc_info = test_adc_info.set_index('planet_id')
axis_info = pd.read_parquet(os.path.join(path_folder,'axis_info.parquet'))

index = test_adc_info.index
data_train_AIRS, data_train_FGS = preprocess_data(index, CHUNKS_SIZE, path_folder, test_adc_info, axis_info)

100%|██████████| 1/1 [00:20<00:00, 20.57s/it]


In [6]:
SEED = 42

train_solution = np.loadtxt(f'{path_folder}/train_labels.csv', delimiter = ',', skiprows = 1)

targets, targets_mean = get_targets(train_solution)

# mean transit
wc = get_wc(data_train_AIRS)
wc_norm = normalize_wlc(wc)
targets_norm, min_targets_wc, max_targets_wc = get_targets_wc(targets_mean)

model_wc = CNN1D()
model_wc.load_state_dict(torch.load(model_folder + 'model_1dcnn.pth'))

nb_dropout_wc = 1000 # predict 1000 times
prediction_wc = MC_dropout_WC(model_wc, wc_norm, nb_dropout_wc)
spectre_wc_all = unstandardizing(prediction_wc.numpy(), min_targets_wc, max_targets_wc)
spectre_wc = spectre_wc_all.mean(axis=0)
spectre_std_wc = spectre_wc_all.std(axis=0)

# residuals = targets_mean - spectre_wc
# print('RMSE : ', np.sqrt((residuals**2).mean())*1e6, 'ppm') #（ppm，parts per million）

# atmospheric features
targets_shift = suppress_mean(targets, targets_mean)
targets_norm, targets_abs_max = targets_normalization(targets_shift)

dataset = combine_data(data_train_AIRS, data_train_FGS)
dataset_norm = norm_star_spectrum(dataset)
ingress, egress = 75, 115
data_in = suppress_out_transit(dataset_norm, ingress, egress)
data_in_mean = substract_data_mean(data_in)
data_in_norm, data_abs_max = data_norm(data_in_mean)

model = CNN2D()
model.load_state_dict(torch.load(model_folder + 'model_2dcnn.pth'))

nb_dropout = 5  # the number of MC Dropout
spectre_data_shift, spectre_data_shift_std = NN_uncertainty(model, data_in_norm, targets_abs_max, T=nb_dropout)

# residuals = targets_shift - spectre_data_shift
# print('RMSE : ', np.sqrt((residuals**2).mean())*1e6, 'ppm')


  model_wc.load_state_dict(torch.load(model_folder + 'model_1dcnn.pth'))
  model.load_state_dict(torch.load(model_folder + 'model_2dcnn.pth'))


In [7]:
# combine the mean of transit and atmospheric spectrum to get final results
predictions = spectre_data_shift + spectre_wc[:, np.newaxis]

# final uncertainty
predictions_std = np.sqrt(spectre_data_shift_std**2 + spectre_std_wc[:, np.newaxis]**2)

sub_df = postprocessing(predictions, predictions_std, index)
sub_df.to_csv('/kaggle/working/submission.csv')