In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
import pickle 
import sys
sys.path.append('pyGlyNet')
from pyGlyNet import glycan as gly

# Function to generate bin edges using ppm
def bin_ppm(min_val, max_val, ppm):
    value = min_val
    bin_edges = []
    while value < max_val:
        bin_edges.append(value)
        value = value * (1 + ppm/1000000)
    
    return bin_edges


# Function to generate bin edges using constant bin size
def bin_linear(min_val, max_val, size):
    bin_edges = np.arange(min_val, max_val+size, size)
    
    return bin_edges


# Function to normalize RT values
def normalize_rt(group):
    # Convert RT to numeric, setting non-convertible values to NaN. 
    # This is to allow RTs to be unspecified.
    rt_numeric = pd.to_numeric(group['RT'], errors='coerce')
    
    # Perform normalization
    max_rt = max(rt_numeric.max(), 30)
    normalized_values = rt_numeric / max_rt
    group['RT'].loc[rt_numeric.notna()] = normalized_values

    return group


# Function to bin RT 
def rt_binning(rt_value, RT_bin_edges):
    if isinstance(rt_value, (int, float)):
        # Bin the numeric value
        binned_index = np.digitize([rt_value], RT_bin_edges)[0]
        return f'rt{binned_index}'
    else:
        # Return string value, for example unspecified
        return str(rt_value)
    
    
# Function to bin MZ 
def mz_binning(mz_values, bin_edges):
    binned_indices = np.digitize(mz_values, bin_edges)
    return ['mz' + str(index) for index in binned_indices]


# Function to bin peaks 
def peak_binning(peak_values, bin_edges):
    binned_indices = np.digitize(peak_values, bin_edges)
    return ['pk' + str(index) for index in binned_indices]


# Function to process, normalize, sort the dictionary, and bin both keys and values
def process_peak_d(dict_string, mz_bin_edges, peak_bin_edges, threshold):
    try:
        # convert peak_d to dictionary
        dict_data = ast.literal_eval(dict_string) 
    except ValueError:
        return [], []

    # Normalize and threshold peak intensity 
    total = sum(dict_data.values())
    normalized_dict = {k: v / total for k, v in dict_data.items() if (v / total) > threshold}
    
    # Sort and bin the peaks
    sorted_dict = dict(sorted(normalized_dict.items(), key=lambda item: item[1], reverse=True))
    mzs = list(sorted_dict.keys())
    peaks = list(sorted_dict.values())
    binned_mzs = mz_binning(mzs, mz_bin_edges)
    binned_peaks = peak_binning(peaks, peak_bin_edges)

    return binned_mzs, binned_peaks


# Function to process data
def process_data(df, RT_bin_edges, mz_bin_edges, peak_bin_edges, threshold):

    # Normalize RTs from each file
    df = df.groupby('filename', group_keys=False).apply(normalize_rt)

    # Get RT bin indices
    df['binned_RT'] = df['RT'].apply(lambda x: rt_binning(x, RT_bin_edges))
    
    # Get m/z bin index for precursor mass
    df['binned_mass'] = df['reducing_mass'].apply(lambda x: 'mz' + str(np.digitize(x, mz_bin_edges)))
    
    # Process peak_d column
    df['processed_peak_d'] = df['peak_d'].apply(lambda x: process_peak_d(x, mz_bin_edges, peak_bin_edges, threshold))

    # Get m/z and peak bin indices
    df['binned_mz'] = df['processed_peak_d'].apply(lambda x: x[0])
    df['binned_peak'] = df['processed_peak_d'].apply(lambda x: x[1])

    return df


# Function to generate corpus
def generate_corpus_mz(df):

    # Construct sentences
    corpus = pd.DataFrame()
    corpus['sentence'] = df[['LC_type', 'mode', 'ionization', 'modification', 'trap', 'fragmentation', 'glycan_type', 'binned_RT', 'binned_mass']].agg(' '.join, axis=1)
    corpus['sentence'] += ' ' + df['binned_mz'].apply(' '.join)

    return corpus['sentence'].tolist()

def generate_corpus_mz_peak(df):

    # Function to interleave mz and peak values
    def interleave_mz_peak(mz, peak):
        paired = [f"{m} {p}" for m, p in zip(mz, peak)]
        return ' '.join(paired)

    # Construct sentences
    corpus = pd.DataFrame()
    corpus['sentence'] = df[['LC_type', 'mode', 'ionization', 'modification', 'trap', 'fragmentation', 'glycan_type', 'binned_RT', 'binned_mass']].agg(' '.join, axis=1)
    corpus['sentence'] += ' ' + df.apply(lambda x: interleave_mz_peak(x['binned_mz'], x['binned_peak']), axis=1)

    return corpus['sentence'].tolist()

# Function to generate glycan corpus
def split_antennae(antennae):
    # Process each line
    processed_lines = []
    for line in antennae:
        # Apply replacements and stripping
        line = line.replace('(', ' ')  # Replacing open parenthesis with space
        line = line.replace(')', ' ')  # Replacing close parenthesis with space
        line = line.replace(',', ' ')  # Replacing commas with space
#        line = line.replace('{', ' ')   # Replacing open curly bracket with space
#        line = line.replace('}', ' ')   # Replacing close curly bracket with space
        line = line.strip()  # Removing any leading or trailing whitespace

        # Add the processed line to the list
        processed_lines.append(line)

    return processed_lines

def split_antenna(antenna):
    # Apply replacements and stripping
    antenna = antenna.replace('(', ' ')  # Replacing open parenthesis with space
    antenna = antenna.replace(')', ' ')  # Replacing close parenthesis with space
    antenna = antenna.replace(',', ' ')  # Replacing commas with space
    antenna = antenna.strip()  # Removing any leading or trailing whitespace

    return antenna

def calculate_antenna(iupac):
    g = gly.glycan(iupac)
    return ' '.join(g.Antennae())

In [2]:
# Settings for binning
threshold = 0.001 #peak intensity thresholding

minMZ = 39.714 #minimum m/z
maxMZ = 3000 #maximum m/z
sizeMZ = 0.3 #m/z bin size
mz_bin_edges = bin_linear(minMZ, maxMZ, sizeMZ) #use linear binning
#mz_bin_edges = bin_ppm(minMZ, maxMZ, 20) #use ppm binning

minI = 0
maxI = 1
sizeI = 0.001 #peak intensity bin size
peak_bin_edges = bin_linear(minI, maxI, sizeI)

minRT = 0
maxRT = 1
sizeRT = 0.01 #relative retention time bin size
RT_bin_edges = bin_linear(minRT, maxRT, sizeRT)


In [3]:
# Load data
dataDir = '/Users/rudi/Data/CandyCrunch/Apr2024/'
fulldata_file = ''.join([dataDir, 'training.xlsx'])

# Load full dataset
df = pd.read_excel(fulldata_file)

# Process data
df = process_data(df, RT_bin_edges, mz_bin_edges, peak_bin_edges, threshold)

In [4]:
# Replace glycan labels with antennae
# Get the value counts of unique glycans and reset the index to turn it into a DataFrame
glycan_labels = df['glycan'].value_counts().reset_index()
glycan_labels.columns = ['glycan', 'frequency']
glycan_labels = glycan_labels.sort_values(by='glycan').reset_index(drop=True)

# Create a mapping from glycans to their labels (indices)
glycan_to_antenna = {row['glycan']: calculate_antenna(row['glycan']) for _, row in glycan_labels.iterrows()}

# Search glycan_to_label for glycans with curly brackets and mark them for removal
for glycan in glycan_to_antenna.keys():
    if '{' in glycan or '}' in glycan:
        glycan_to_antenna[glycan] = 'remove'

# Assign class label to spectra
df['antenna'] = df['glycan'].map(glycan_to_antenna)

# Remove rows where class_label is 'remove'
df = df[df['antenna'] != 'remove']

# Split antenna
df['antenna'] = df['antenna'].apply(split_antenna)

In [5]:
# Prepare vocab for MS
# Initialize an empty set to store unique words
words = set()

# Loop through each specified column
for column in ['LC_type', 'mode', 'ionization', 'modification', 'trap', 'fragmentation', 'glycan_type']:
    # Extract unique values from the column and put them into words
    words.update(df[column].astype(str).dropna().unique())

# Convert the set to a list
words = list(words)

# Prepare vocabulary for m/z
for index in range(0, len(mz_bin_edges) + 1):
    words.append('mz' + str(index))

# Prepare vocabulary for retention time
for index in range(0, len(RT_bin_edges) + 1):
    words.append('rt' + str(index))


In [6]:
# Prepare glycan vocab 
antennae = df['antenna'].unique().tolist()
antennae = split_antennae(antennae)

# If desired, print out the processed antennae:
#for line in antennae:
#    print(line)

# Get unique glycan words
glycan_words = []
for line in antennae:
    # First split by space
    glycan_words = glycan_words + line.split()

# Filter out any empty strings that might have been generated
glycan_words = [word for word in glycan_words if word]
glycan_vocab = list(set(glycan_words))  

# To check the list of unique words:
for word in glycan_vocab:
    print(word)


GalOMe
ManOS
Gal
Glc
HexNAc
GalNAcOPCho
Fuc
GalNAcOMe
GlcOS
IdoA2S
GlcNS6S
.1-3
.1-4
IdoA
HexA
GalNAc
GlcNAc6S
GalNAc4S
Gal6S
Ins
a2-.
.1-6
GalN
GlcOP
b1-3
GalNAc6S
GalNAcOS
Man
Glc-ol
Rha
ManOMe
GlcNAc
FucOS
1-.
a1-.
Ara
GlcN
GlcN6S
Xyl
Man6P
GlcNAcOS
HexNAcOS
b1-.
a1-4
GalOS
GlcA
Neu5Ac
HexA2S
Gal4S
GlcNS3S6S
a1-6
a2-8
a2-6
a1-3
Neu5Gc
a2-3
Rha3S
Hex
GlcNS3S
b1-4
b1-6
.1-.
b1-2
Gal3S
Neu5Ac8S
Kdn
a1-2
GlcNS


In [7]:
# Prepare vocabulary

words = words + glycan_vocab

# Write the unique words to a file
with open('vocab.txt', 'w') as file:
    for word in words:
        file.write(word + '\n')

print(f"Vocabulary file created with {len(words)} words.")

Vocabulary file created with 10073 words.


In [8]:
# Split into train and test
train_df, test_df = train_test_split(df, test_size=0.15, random_state=0)

print('Size of train set: ', len(train_df))
print('Size of test set: ', len(test_df))

Size of train set:  426688
Size of test set:  75299


In [9]:
# Create text corpus for spectra using mz only
sentences = generate_corpus_mz(train_df)

with open('train_corpus_mz_bart.txt', 'w') as file:
    for sentence in sentences:
        file.write(sentence + '\n')

sentences = generate_corpus_mz(test_df)

with open('test_corpus_mz_bart.txt', 'w') as file:
    for sentence in sentences:
        file.write(sentence + '\n')

In [10]:
# Create text corpus for glycan antennae

with open('train_corpus_glycan.txt', 'w') as file:
    for antenna in train_df['antenna']:
        file.write(antenna + '\n')
        
with open('test_corpus_glycan.txt', 'w') as file:
    for antenna in test_df['antenna']:
        file.write(antenna + '\n')

In [11]:
# Create text corpus for spectra for full training
sentences = generate_corpus_mz(df)

with open('corpus_mz_bart.txt', 'w') as file:
    for sentence in sentences:
        file.write(sentence + '\n')

with open('corpus_glycan.txt', 'w') as file:
    for antenna in df['antenna']:
        file.write(antenna + '\n')