In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative locations of the three input parquet files
# (Astral, timsTOF and ProteomeTools directories respectively).
path_ast, path_tof, path_pro = (
    "./astral/PXD046453_duplicates.parquet",
    "./timsTOF/tof_train.parquet",
    "./proteome_tools/proteome_tools_0.parquet",
)

In [3]:
# Read each source parquet file into its own DataFrame
# (Astral first, then timsTOF, then ProteomeTools).
df_ast, df_tof, df_pro = [
    pd.read_parquet(source_path)
    for source_path in (path_ast, path_tof, path_pro)
]

In [4]:
# Columns kept from every source so the three frames can be concatenated.
dataset_cols = [
    'prosit_sequence', 'charge', 'collision_energy',
    'method_nr', 'machine', 'intensities_raw',
]

# timsTOF: take collision_energy from the collision_energy_aligned_normed
# column, then keep only the shared columns.
df_tof = df_tof.assign(
    collision_energy=df_tof['collision_energy_aligned_normed']
)[dataset_cols]

# Astral: interim normalisation - divide the stored collision energy by 100 -
# then keep only the shared columns.
# NOTE(review): this cell rebinds df_ast/df_tof, so running it a second time
# without reloading divides by 100 again (and the TOF source column is gone).
df_ast = df_ast.assign(
    collision_energy=df_ast['collision_energy'] / 100
)[dataset_cols]

# ProteomeTools: no collision-energy adjustment here, only the column subset.
df_pro = df_pro[dataset_cols]

In [5]:
# Shuffle every source frame with the same fixed seed and renumber its rows.
df_ast, df_pro, df_tof = [
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (df_ast, df_pro, df_tof)
]

# Upper bound on the number of rows taken from each source.
rows_per_dataset = 210_000

# Stack the leading rows of each shuffled source (Astral, timsTOF, ProteomeTools).
df_combined = pd.concat(
    [frame.head(rows_per_dataset) for frame in (df_ast, df_tof, df_pro)],
    axis=0,
    ignore_index=True,
)

# Cast every per-row intensity array to float64 so all sources share one dtype.
df_combined['intensities_raw'] = df_combined['intensities_raw'].apply(
    lambda spectrum: spectrum.astype(np.float64)
)

# Shuffle once more so rows from the three sources are interleaved.
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [22]:
# Preview the first 100 rows of the shuffled, combined frame.
df_combined.head(100)

Unnamed: 0,prosit_sequence,charge,collision_energy,method_nr,machine,intensities_raw
0,FEFHHGDYEK,3,0.340000,HCD,TOF,"[0.0012383208003038033, 0.0, 0.0, 0.0, 0.0, 0...."
1,GRDDITVHDNIK,2,0.280000,HCD,TOF,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03308445538..."
2,VTSSGVLLLDNYSDR,2,0.300000,CID,Astral,"[0.4873846769332886, 0.0, -1.0, 0.0, 0.0, -1.0..."
3,LLESDYFR,2,0.310000,HCD,TOF,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.02768777614..."
4,LDSSDIYNELK,2,0.300000,CID,Astral,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.41207116842..."
...,...,...,...,...,...,...
95,YRNFmIDTY,2,0.379769,HCD,Orbitrap_Fusion_Lumos,"[1.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.45783132314..."
96,KEFEEVFLQSm,2,0.300000,HCD,TOF,"[0.048480930833872005, 0.0, -1.0, 0.0, 0.0, -1..."
97,HEYFMHATDK,2,0.280000,HCD,TOF,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.26278465421..."
98,HCGYTQLSPFSEDSAK,2,0.230000,HCD,TOF,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03128232010..."


In [25]:
# Label -> one-hot position for the fragmentation method.
method_map = {
    'CID': 0,
    'HCD': 1,
}

# Label -> one-hot position for the instrument.
machine_map = {
    'Astral': 0,
    'TOF': 1,
    'Orbitrap_Fusion_Lumos': 2
}

# Width of the charge one-hot vector: charge z is encoded at position z - 1.
NUM_CHARGE_STATES = 6


def _one_hot_rows(codes, depth, column):
    """Return one length-`depth` integer one-hot array per entry of `codes`.

    Parameters
    ----------
    codes : array-like of int
        Zero-based position of the hot bit for every row.
    depth : int
        Length of each one-hot vector.
    column : str
        Source column name, used only in the error message.

    Raises
    ------
    IndexError
        If any code lies outside ``[0, depth)``. Negative codes are rejected
        explicitly because NumPy would otherwise wrap them around and
        silently produce a wrong encoding.
    """
    codes = np.asarray(codes)
    out_of_range = (codes < 0) | (codes >= depth)
    if out_of_range.any():
        offending = np.unique(codes[out_of_range]).tolist()
        raise IndexError(
            f"{column}: codes outside the one-hot range [0, {depth}): {offending}"
        )
    # Integer-array indexing copies the selected rows into a new (n, depth)
    # array; list() then yields one 1-D row per input code.
    return list(np.eye(depth, dtype=int)[codes])


def _label_codes(labels, mapping, column):
    """Translate a Series of labels into int64 codes via `mapping`.

    Raises
    ------
    KeyError
        If a label has no entry in `mapping`.
    """
    codes = labels.map(mapping)
    unmapped = codes.isna()
    if unmapped.any():
        raise KeyError(
            f"{column}: labels without a mapping: {labels[unmapped].unique().tolist()}"
        )
    return codes.to_numpy(dtype=np.int64)


# Build each one-hot column with a single vectorised lookup instead of
# constructing a fresh identity matrix for every row.
df_combined['charge_oh'] = _one_hot_rows(
    df_combined['charge'].to_numpy(dtype=np.int64) - 1, NUM_CHARGE_STATES, 'charge'
)
df_combined['method_nr_oh'] = _one_hot_rows(
    _label_codes(df_combined['method_nr'], method_map, 'method_nr'),
    len(method_map),
    'method_nr',
)
df_combined['machine_oh'] = _one_hot_rows(
    _label_codes(df_combined['machine'], machine_map, 'machine'),
    len(machine_map),
    'machine',
)

# modified_sequence is a plain copy of prosit_sequence.
df_combined['modified_sequence'] = df_combined['prosit_sequence']

In [26]:
# Display the combined frame, which now also carries the one-hot columns
# and modified_sequence.
df_combined

Unnamed: 0,prosit_sequence,charge,collision_energy,method_nr,machine,intensities_raw,charge_oh,method_nr_oh,machine_oh,modified_sequence
0,FEFHHGDYEK,3,0.34,HCD,TOF,"[0.0012383208003038033, 0.0, 0.0, 0.0, 0.0, 0....","[0, 0, 1, 0, 0, 0]","[0, 1]","[0, 1, 0]",FEFHHGDYEK
1,GRDDITVHDNIK,2,0.28,HCD,TOF,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03308445538...","[0, 1, 0, 0, 0, 0]","[0, 1]","[0, 1, 0]",GRDDITVHDNIK
2,VTSSGVLLLDNYSDR,2,0.30,CID,Astral,"[0.4873846769332886, 0.0, -1.0, 0.0, 0.0, -1.0...","[0, 1, 0, 0, 0, 0]","[1, 0]","[1, 0, 0]",VTSSGVLLLDNYSDR
3,LLESDYFR,2,0.31,HCD,TOF,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.02768777614...","[0, 1, 0, 0, 0, 0]","[0, 1]","[0, 1, 0]",LLESDYFR
4,LDSSDIYNELK,2,0.30,CID,Astral,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.41207116842...","[0, 1, 0, 0, 0, 0]","[1, 0]","[1, 0, 0]",LDSSDIYNELK
...,...,...,...,...,...,...,...,...,...,...
629995,DPETLVGYSMVGCQR,2,0.30,CID,Astral,"[0.07622350007295609, 0.0, -1.0, 0.0, 0.0, -1....","[0, 1, 0, 0, 0, 0]","[1, 0]","[1, 0, 0]",DPETLVGYSMVGCQR
629996,YLEKSGVL,1,0.32,HCD,TOF,"[0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.0, -1.0, ...","[1, 0, 0, 0, 0, 0]","[0, 1]","[0, 1, 0]",YLEKSGVL
629997,LQHGTQQQDLNKK,3,0.32,HCD,TOF,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2686567164179...","[0, 0, 1, 0, 0, 0]","[0, 1]","[0, 1, 0]",LQHGTQQQDLNKK
629998,FMADIDLDPGCTLNK,2,0.30,CID,Astral,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.05899238213...","[0, 1, 0, 0, 0, 0]","[1, 0]","[1, 0, 0]",FMADIDLDPGCTLNK


In [27]:
# Write the complete combined frame to a parquet file.
df_combined.to_parquet("./combined_dlomix_format.parquet")

In [28]:
# Write only the first 10 rows to a separate, small parquet file.
df_combined.head(10).to_parquet("./combined_dlomix_format_testing.parquet")

In [12]:
# Write the first 10 rows whose prosit_sequence has at least 28 characters
# to a third parquet file.
# NOTE(review): this cell's execution count is out of sequence and the table
# rendered beneath it lacks the one-hot columns; to_parquet itself displays
# nothing, so the output looks stale - confirm with Restart & Run All.
df_combined.loc[
    lambda frame: frame['prosit_sequence'].str.len() >= 28
].head(10).to_parquet("./combined_dlomix_format_testing2.parquet")

Unnamed: 0,prosit_sequence,charge,collision_energy,method_nr,machine,intensities_raw
88,KDPEGLFLQDNIVAEFSVDETGQmSATAK,3,0.334532,HCD,Orbitrap_Fusion_Lumos,"[0.4300000071525574, 0.0, 0.0, 0.8799999952316..."
678,NHGVVMPDANKENTLNQLVGAAFGAAGQR,3,0.213971,HCD,Orbitrap_Fusion_Lumos,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
679,TGLAGAPGPPGVKGSSGLPGSPGIQGPK,2,0.35,CID,Orbitrap_Fusion_Lumos,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.0, 0.0, -1...."
722,AGLPCQDLEFVQFHPTGIYGAGCLITEGCR,4,0.3,CID,Astral,"[0.11255692690610886, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1571,EGSVVVDLAAEAGGNFETTKPGELYIHK,4,0.3,CID,Astral,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2667414844036..."
1635,ILLSQTTGVAIPLHASSLDDVSLASTPK,3,0.3,CID,Astral,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2109750658273..."
1939,PSHTLSSLATGASGGPPVSKAPTMDAQQDR,4,0.235426,HCD,Orbitrap_Fusion_Lumos,"[0.0476190485060215, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2825,LIAALSTPSQQVQESVASCLPPLVPAIK,3,0.3,CID,Astral,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3803,SSGATPVSGPPPPSASSTPAGQPTAVSR,2,0.317961,HCD,Orbitrap_Fusion_Lumos,"[0.03999999910593033, 0.0, -1.0, 0.0, 0.0, -1...."
4183,DLEEDREENPGLTSPEPQLPNSPTGVVGAK,3,0.3853,HCD,Orbitrap_Fusion_Lumos,"[0.10999999940395355, 0.0, 0.0, 0.0, 0.0, 0.0,..."
