# build a dataset

In [1]:
# Training set parameter settings
training = 'training'  # Name of the training set folder
group_num = [73, 49, 99]  # Number of sample columns per group; arranged in Windows system order
group_name = ['H', 'MP', 'P']  # Group labels, parallel to group_num
mass_rate = 0.90  # m/z detection ratio: fraction of a group's samples that must contain the m/z

# Test set parameter settings
tolerance = 0.5  # Mass spectrum peak resolution selection, do not change, tests show 0.5 is better than 0.3!
peak_int_learn = 0.01  # Select mass spectrum peaks based on peak intensity
pvalues = 0.05  # p-value screening

# Dataset paths (relative to the notebook's working directory)
path = './' + training + '/'  # folder of the single mass files containing m/z and intensity
path_store = './' + training + '_store/'  # folder where the generated training files are stored

# Perform m/z alignment and filtering through pyopenms

In [2]:
import numpy as np
import pyopenms as oms
import numpy as np
from scipy import signal

# Data quality control, mass spectrum peak denoising, remove flat-top peaks
def noise_removal(prim, tolerance):
    """Denoise a peak table by comparing every peak with its successor.

    Parameters
    ----------
    prim : object exposing ``.values.tolist()`` (e.g. a pandas DataFrame)
        Rows are read as ``[mass, intensity]``; only the first two entries of
        each row are compared.
    tolerance : float
        Minimum mass gap between a row and its successor for the successor to
        be kept as a separate peak.

    Returns
    -------
    list
        The retained rows in order, with rows that repeat an earlier row
        (by value) removed. An empty table yields an empty list.

    Notes
    -----
    A ``[0, 0]`` sentinel is compared against the last row, so a ``[0, 0]``
    row is emitted when the last intensity is not positive.
    """
    total = prim.values.tolist()
    if not total:
        # Nothing to denoise; the original indexing of total[0] would fail here.
        return []

    tf = [total[0]]
    # Pair each row with the one that follows it; the sentinel closes the last pair.
    for to, ref in zip(total, total[1:] + [[0, 0]]):
        if ref[0] - to[0] >= tolerance:
            # Far enough apart in mass: the successor is its own peak.
            tf.append(ref)
        else:
            # Too close: the most recently kept row is replaced by whichever of
            # the pair has the higher intensity (ties go to the successor).
            # It is written twice, exactly as before; repeats are removed below.
            keep = ref if ref[1] - to[1] >= 0 else to
            tf[-1:] = [keep, keep]

    # Drop rows equal to an earlier row, keeping first occurrences in order.
    seen = set()
    unique = []
    for row in tf:
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            unique.append(row)
    return unique

# Generate OpenMS data format
def openms_data_format(mass,intensity,decimal=5):
    """Wrap m/z and intensity columns in a position-sorted ``oms.MSSpectrum``.

    Parameters
    ----------
    mass : object exposing ``.values``
        m/z values; rounded to ``decimal`` places before being stored.
    intensity : object exposing ``.values``
        Intensities, stored as given.
    decimal : int, optional
        Number of decimal places kept for the m/z values (default 5).

    Returns
    -------
    oms.MSSpectrum
        Spectrum holding the peaks, after ``sortByPosition()`` has been called.
    """
    rounded_mz = np.round(mass.values, decimals=decimal)
    peaks = [rounded_mz, intensity.values]

    result = oms.MSSpectrum()
    result.set_peaks(peaks)
    result.sortByPosition()
    return result

# Mass alignment
def mass_align(ref_spectrum,obs_spectrum,tolerance=0.5):
    """Align two spectra with ``oms.SpectrumAlignment``.

    Parameters
    ----------
    ref_spectrum, obs_spectrum
        The reference and observed spectra handed to ``getSpectrumAlignment``.
    tolerance : float, optional
        Value stored under the aligner's "tolerance" parameter (default 0.5).

    Returns
    -------
    list
        The list that ``getSpectrumAlignment`` filled in; other code in this
        notebook reads each entry as (reference index, observed index).
    """
    aligner = oms.SpectrumAlignment()

    # "is_relative_tolerance" = "false" keeps the tolerance absolute (0.5 Da by
    # default); for high-resolution data a relative (ppm) tolerance could be
    # selected by setting it to true instead.
    settings = aligner.getParameters()
    for key, value in (("tolerance", tolerance), ("is_relative_tolerance", "false")):
        settings.setValue(key, value)
    aligner.setParameters(settings)

    # The aligner appends its matches to the list passed in.
    matches = []
    aligner.getSpectrumAlignment(matches, ref_spectrum, obs_spectrum)
    return matches

# Mass calculation: average (version 1)
def mass_calculation(re_spectrum,ob_spectrum,alignment,decimal=4):
    """Replace the masses of aligned peaks by their rounded average, in place.

    Parameters
    ----------
    re_spectrum, ob_spectrum : pandas.DataFrame
        Reference and observed peak tables. Each needs a ``mass`` column; the
        averaged value is written to column 0 (assumed to be that same ``mass``
        column, as elsewhere in this notebook).
    alignment : sequence of pairs
        Each entry is (row position in ``re_spectrum``, row position in
        ``ob_spectrum``).
    decimal : int, optional
        Decimal places kept for the averaged mass (default 4).

    Returns
    -------
    tuple
        ``(re_spectrum, ob_spectrum)``: the same objects that were passed in,
        modified in place.
    """
    ref = [pair[0] for pair in alignment]
    obs = [pair[1] for pair in alignment]

    # Read by position (.iloc) so reads agree with the positional writes below
    # even when the frames do not carry a default 0..n-1 index.
    ref_mass = [re_spectrum['mass'].iloc[i] for i in ref]
    obs_mass = [ob_spectrum['mass'].iloc[j] for j in obs]

    # All averages are computed before anything is written back.
    ave_mass = np.round((np.array(ref_mass) + np.array(obs_mass)) / 2, decimal)
    for i, j, averaged in zip(ref, obs, ave_mass):
        re_spectrum.iloc[i, 0] = averaged
        ob_spectrum.iloc[j, 0] = averaged
    return re_spectrum, ob_spectrum

# Mass calculation based on reference file (version 2)
def mass_calculation_ref(re_spectrum,ob_spectrum,alignment,decimal=4):
    """Copy the reference value in column 0 onto each aligned observed row.

    For every alignment entry, whose first two items are read as
    (reference row position, observed row position), column 0 of the observed
    row is overwritten with column 0 of the reference row. ``ob_spectrum`` is
    modified in place; ``decimal`` is accepted but not used.

    Returns the two frames that were passed in, as ``(re_spectrum, ob_spectrum)``.
    """
    ref_rows = [entry[0] for entry in alignment]
    obs_rows = [entry[1] for entry in alignment]
    for pos in range(min(len(ref_rows), len(obs_rows))):
        ob_spectrum.iloc[obs_rows[pos], 0] = re_spectrum.iloc[ref_rows[pos], 0]
    return re_spectrum, ob_spectrum

# Generation of initial file (default to use the first data file)

In [3]:
import os
import pandas as pd
from natsort import natsorted
from jcamp import jcamp_readfile

# Read file list 2023-09-22
# Directory entries in natural sort order; each column label is the part of
# the file name before its first dot.
file_list = natsorted(os.listdir(path))
column_list = [entry.split('.')[0] for entry in file_list]

# Add initial file values 2023-09-22
# Everything after the first entry is added to the table in a later cell.
name_list, col_list = file_list[1:], column_list[1:]

# Generate initial file 2023-09-22
# The first file seeds the table: read it, denoise it and label its two
# columns 'mass' and <first column label>.
first_file = file_list[0]
first_column = column_list[0]
prim = pd.DataFrame(
    noise_removal(pd.read_excel(path + first_file), tolerance),
    columns=['mass', first_column],
)
# Intensity-threshold filter, currently disabled (max_peak is not defined in this cell):
#prim = prim[prim[first_column] >= peak_int_learn*max_peak].reset_index(drop=True)

# Remaining samples addition

In [4]:
import numpy as np
import pandas as pd

# Training set generation 20241124
# Fold every remaining sample into `prim`, one file at a time.
for name, col in zip(name_list, col_list):
    # Read the sample and denoise it into a two-column ['mass', <sample>] frame
    framefile = pd.DataFrame(
        noise_removal(pd.read_excel(path + name), tolerance),
        columns=['mass', col],
    )

    # Align the current table (mass + its first intensity column) with the new sample
    ref_spectrum = openms_data_format(prim.mass, prim.iloc[:, 1])
    obs_spectrum = openms_data_format(framefile.mass, framefile.iloc[:, 1])
    alignment = mass_align(ref_spectrum, obs_spectrum, tolerance)

    # Aligned sample peaks take the reference mass so the outer merge on 'mass'
    # puts them on the same row; the table is then re-sorted by mass.
    r_spectrum, o_spectrum = mass_calculation_ref(prim, framefile, alignment)
    prim = (
        prim.merge(o_spectrum, how='outer', on='mass')
        .sort_values('mass', ascending=True)
        .reset_index(drop=True)
    )

# Final data generation
# Zeros become NaN, then rows left with no value at all are dropped.
outfile = prim.replace(0, np.nan).dropna(axis=0, how='all')

# Data statistics

In [5]:
# Generate and save training set
# Work on a copy so the counting columns added below do not leak back into
# `outfile` from the previous cell.
prim_int = outfile.copy()

# Calculate data statistics
# data_num holds the cumulative group boundaries: [0, n1, n1+n2, ...]
data_num = [0]
for group_size in group_num:
    data_num.append(data_num[-1] + group_size)

# Minimum number of samples per group in which an m/z must be detected
minum_num = [round(m * mass_rate) for m in group_num]
new_group_name = ['num_' + name for name in group_name]

# Generate new data columns
# Per group, count the non-NaN intensities in that group's sample columns
# (the +1 skips the leading 'mass' column).
for start, stop, name in zip(data_num[:-1], data_num[1:], new_group_name):
    prim_int[name] = prim_int.iloc[:, start + 1:stop + 1].count(axis=1)

# File filtering
# Keep an m/z row when at least one group reaches its minimum detection count.
# A boolean mask gives the union of the per-group selections directly, without
# merging whole frames on every column.
keep_row = pd.Series(False, index=prim_int.index)
for name, mini in zip(new_group_name, minum_num):
    keep_row |= prim_int[name] >= mini
total_file = prim_int[keep_row]
int_scd = total_file.sort_values(by='mass').reset_index(drop=True)

# Data preprocessing
#SimpleImputer, NaN data imputation, optional
#strategy=['mean','median','most_frequent']
#Data Regularization
Normalizer `norm` options: `'l1'` scales each sample by the sum of the absolute values of its features, `'l2'` by the square root of the sum of the squared feature values, and `'max'` by the maximum absolute feature value.

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,Normalizer
#-----------------Dataset processing------------------
# Pipeline steps, in order: median imputation of NaN, L1 normalisation,
# then standard scaling.
makepip = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='median'),
    Normalizer(norm='l1'),
    StandardScaler(),
)

# Transpose so that samples are rows and m/z values are columns for the pipeline
int_sc = int_scd.loc[:, column_list].transpose()
treat = makepip.fit(int_sc)

# Transpose back: one row per m/z, one column per sample
int_sc_norm = pd.DataFrame(treat.transform(int_sc).transpose(), columns=column_list)

# Statistical analysis and data storage
Statistical analysis No. 2: inter-group comparison and statistics. Revised on September 16th to use np.nanmean() for the averages; revised on February 19th, 2025 to add ANOVA.

In [7]:
# Generate and save training set
import os
from scipy import stats

# Split the dataset
# One array per group, shaped (samples in group, m/z rows), cut out of the
# normalised table at the cumulative group boundaries in data_num.
internal_list = [
    int_sc_norm.iloc[:, start:stop].T.values
    for start, stop, _ in zip(data_num[:-1], data_num[1:], new_group_name)
]

# Determine whether ANOVA analysis is needed
# Three or more groups: one-way ANOVA; otherwise Welch's t-test (equal_var=False).
# Both branches define `statistic`, so the two-group case no longer fails on
# an undefined f_statistic.
if len(group_num) >= 3:
    f_statistic, p_value = stats.f_oneway(*internal_list)
    statistic, p_column, stat_column = f_statistic, 'ANOVA', 'ANOVA_f'
else:
    s_statistic, p_value = stats.ttest_ind(*internal_list, equal_var=False)
    statistic, p_column, stat_column = s_statistic, 'ttest', 'ttest_t'

# Filter m/z with p-value less than threshold
# The threshold is the configured `pvalues` setting rather than a literal 0.05.
p_value = pd.DataFrame(p_value, columns=[p_column])
f_value = pd.DataFrame(statistic, columns=[stat_column])
int_scf = pd.concat([int_scd, p_value, f_value], axis=1)
int_F = int_scf[int_scf[p_column] <= pvalues]
int_F_sort = int_F.sort_values(by=['mass'])

# Save data
# Create the output folder first so to_excel does not fail on a fresh checkout.
os.makedirs(path_store, exist_ok=True)
int_F_sort.to_excel(path_store + training + '_screened_int.xlsx', index=False)
int_scf.to_excel(path_store + training + '_int.xlsx', index=False)