# Download and preprocessing of the DRIAMS datasets to obtain the files necessary to train the ResMLP model

The raw data from Weis et al. (2022) is available through the Dryad platform at:
[https://datadryad.org/stash/dataset/doi:10.5061/dryad.bzkh1899q](https://datadryad.org/stash/dataset/doi:10.5061/dryad.bzkh1899q)

This notebook requires the original files to be downloaded and extracted first; the processed matrices are then built from the extracted files.


In [None]:
import os
import json
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

In [None]:
# We make use of the preprocessed long table that includes information on all samples suitable for the analysis.
# The cells below filter it on its "dataset" column ("A", "B", "C", "D") and use the
# "sample_id" column to build the file name of each sample's binned spectrum.
driams_long_table = pd.read_csv("../processed_data/DRIAMS_combined_long_table.csv")
# Preview of the first rows (shown as the cell output when run in a notebook).
driams_long_table.head()

# DRIAMS A

In [None]:
# Build the DRIAMS-A spectra matrix: one row per sample whose binned spectrum
# file can be found, rows ordered by sorted sample ID.
os.makedirs("DRIAMS-A", exist_ok=True)

# Sorted unique IDs of the samples that the long table assigns to dataset "A".
in_driams_a = driams_long_table["dataset"] == "A"
driams_a_samples = sorted(driams_long_table.loc[in_driams_a, "sample_id"].unique())

# Replace with the path where you downloaded the data if needed
dataset_folder = "DRIAMS/DRIAMS-A/binned_6000"
# DRIAMS-A files are spread over one sub-folder per year; they are probed in this order.
year_subfolders = ("2015", "2016", "2017", "2018")

spectra_rows = []
for sample_id in tqdm(driams_a_samples):
    # Lazily build the candidate path in each year sub-folder and keep the
    # first one that exists on disk (None when the sample is in none of them).
    candidate_paths = (
        os.path.join(dataset_folder, year, f"{sample_id}.txt")
        for year in year_subfolders
    )
    spectrum_path = next(
        (path for path in candidate_paths if os.path.exists(path)), None
    )
    if spectrum_path is None:
        # The sample is reported and skipped, so the saved matrix can have
        # fewer rows than there are entries in driams_a_samples.
        print(f"File for sample {sample_id} not found")
        continue
    sample_data = pd.read_csv(spectrum_path, sep=" ", index_col=0)
    spectra_rows.append(sample_data["binned_intensity"].values)

# Stack the per-sample intensity vectors into a single 2-D array and store it.
spectra_matrix = np.vstack(spectra_rows)
print(spectra_matrix.shape)
np.save("DRIAMS-A/spectra_binned_6000_all.npy", spectra_matrix)

# DRIAMS B

In [None]:
# Build the DRIAMS-B spectra matrix: one row per sample, rows ordered by sorted sample ID.
os.makedirs("DRIAMS-B", exist_ok=True)

# Sorted unique IDs of the samples that the long table assigns to dataset "B".
in_driams_b = driams_long_table["dataset"] == "B"
driams_b_samples = sorted(driams_long_table.loc[in_driams_b, "sample_id"].unique())

# Replace with the path where you downloaded the data if needed
dataset_folder = "DRIAMS/DRIAMS-B/binned_6000/2018"

# Read the "binned_intensity" column of every sample file. A missing file is
# not skipped here: pd.read_csv raises and the cell stops.
spectra_rows = [
    pd.read_csv(
        os.path.join(dataset_folder, f"{sample_id}.txt"), sep=" ", index_col=0
    )["binned_intensity"].values
    for sample_id in tqdm(driams_b_samples)
]

# Stack the per-sample intensity vectors into a single 2-D array and store it.
spectra_matrix = np.vstack(spectra_rows)
print(spectra_matrix.shape)
np.save("DRIAMS-B/spectra_binned_6000_2018.npy", spectra_matrix)

# DRIAMS C

In [None]:
# Build the DRIAMS-C spectra matrix: one row per sample, rows ordered by sorted sample ID.
os.makedirs("DRIAMS-C", exist_ok=True)

# Sorted unique IDs of the samples that the long table assigns to dataset "C".
in_driams_c = driams_long_table["dataset"] == "C"
driams_c_samples = sorted(driams_long_table.loc[in_driams_c, "sample_id"].unique())

# Replace with the path where you downloaded the data if needed
dataset_folder = "DRIAMS/DRIAMS-C/binned_6000/2018"

# Read the "binned_intensity" column of every sample file. A missing file is
# not skipped here: pd.read_csv raises and the cell stops.
spectra_rows = [
    pd.read_csv(
        os.path.join(dataset_folder, f"{sample_id}.txt"), sep=" ", index_col=0
    )["binned_intensity"].values
    for sample_id in tqdm(driams_c_samples)
]

# Stack the per-sample intensity vectors into a single 2-D array and store it.
spectra_matrix = np.vstack(spectra_rows)
print(spectra_matrix.shape)
np.save("DRIAMS-C/spectra_binned_6000_2018.npy", spectra_matrix)

# DRIAMS D

In [None]:
# Build the DRIAMS-D spectra matrix: one row per sample, rows ordered by sorted sample ID.
os.makedirs("DRIAMS-D", exist_ok=True)

# Sorted unique IDs of the samples that the long table assigns to dataset "D".
in_driams_d = driams_long_table["dataset"] == "D"
driams_d_samples = sorted(driams_long_table.loc[in_driams_d, "sample_id"].unique())

# Replace with the path where you downloaded the data if needed
dataset_folder = "DRIAMS/DRIAMS-D/binned_6000/2018"

# Read the "binned_intensity" column of every sample file. A missing file is
# not skipped here: pd.read_csv raises and the cell stops.
spectra_rows = [
    pd.read_csv(
        os.path.join(dataset_folder, f"{sample_id}.txt"), sep=" ", index_col=0
    )["binned_intensity"].values
    for sample_id in tqdm(driams_d_samples)
]

# Stack the per-sample intensity vectors into a single 2-D array and store it.
spectra_matrix = np.vstack(spectra_rows)
print(spectra_matrix.shape)
np.save("DRIAMS-D/spectra_binned_6000_2018.npy", spectra_matrix)