In [54]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
from hdx.utils_series_HDX import find_series

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Batch analysis of the number of H/D exchanges for all samples in the pivot table

## Load the dataset

In [55]:
# Location of the sample table (edit to point at your own data)
file_path = Path('../data/Test-samples.csv')

# Stop early with an explicit message when the table is missing
if not file_path.is_file():
    raise FileNotFoundError(f"File {file_path} not found!")

# Comma-separated input; column names are taken from the first row
data = pd.read_csv(file_path, sep=',')


## Process original data

In [56]:
# Order the rows by mass, renumber them, shift the mass column and switch to
# short lower-case column names, all in a single chain.
# The shift of 1.00728 turns the values in "M" into the "mz" values previewed
# below (presumably neutral mass -> [M-H]- ion m/z; confirm against the
# acquisition mode).
# NOTE: this cell expects the freshly loaded table. Once "M" has been renamed
# to "mz", running it a second time raises KeyError on "M".
data = (
    data
    .sort_values('M')
    .reset_index(drop=True)
    .assign(M=lambda table: table['M'] - 1.00728)
    .rename(columns={"M": "mz", "C": 'c', "H": 'h', "O": 'o', "N": 'n', 'S': 's'})
)

# Preview the first rows of the processed table
data.head()


Unnamed: 0,mz,MF,c,h,o,n,s,Test_sample_1,Test_sample_2,AI,DBE,O/C,H/C,class,AI_class
0,123.008765,C6H4O3,6,4,3,0,0,0.242446,0.009707,0.8,5,0.5,0.666667,CHO,condensed
1,125.024415,C6H6O3,6,6,3,0,0,0.171305,0.011602,0.6,4,0.5,1.0,CHO,aromatic
2,125.0608,C7H10O2,7,10,2,0,0,0.218891,0.012492,0.333333,3,0.285714,1.428571,CHO,unsaturated
3,126.019664,C5H5O3N,5,5,3,1,0,0.023526,0.00579,0.666667,4,0.6,1.0,CHON,condensed
4,127.040065,C6H8O3,6,8,3,0,0,0.246685,0.014441,0.4,3,0.5,1.333333,CHO,unsaturated


In [57]:
# Number of columns in the table at this point
l = data.shape[1]
l

15

## Loading H/D data

In [58]:
# Directory holding the H/D-exchange (D2O) spectra
D2O_path = Path("../data/")

# Collect the base names (without extension) of the CSV files whose name
# contains "HDX".
# - Only regular *.csv files are kept: the loading loop re-appends ".csv" to
#   every name, so any other match (a directory, a *.txt file, ...) could not
#   be read anyway.
# - iterdir() yields entries in arbitrary order, so the list is sorted to keep
#   the order of the appended columns reproducible between runs and machines.
#   The sort is lexicographic ("HDX_10" comes before "HDX_2").
D2O_files = sorted(
    entry.stem
    for entry in D2O_path.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".csv" and "HDX" in entry.name
)

D2O_files

['HDX_1', 'HDX_2']

# Calculating and adding H/D data

## Parameters

In [59]:
# Error tolerance used when searching for HDX series (passed to find_series).
# Choose a conservative value, e.g. by inspecting the spectra in Xcalibur.
Err = 1e-4

# Window for searching a peak around its expected mass (passed to find_series).
# Normally there is no need to change it.
mz_accrcy = 0.1

# Mass step passed to find_series. Presumably the deuterium-protium mass
# difference (2.01410 - 1.00783 = 1.00627), not the mass of deuterium itself.
# TODO confirm against find_series.
d = 1.00628

## Adding H/D data

In [68]:
# Loop through the files and append new columns directly to the original dataframe
for idx, file_name in enumerate(D2O_files, start=1):
    print(f"\t{idx} out of {len(D2O_files)}")
    
    hdx_path = D2O_path / f"{file_name}.csv"
    hdx = pd.read_csv(hdx_path, sep=",", skiprows=1, header=None)
    
    hdx.columns = ["mz", "int"]
    hdx.dropna(inplace=True)
    hdx.reset_index(drop=True, inplace=True)
    
    data_w_hdx1, _ = find_series(data, hdx, Err, mz_accrcy, d)
    
    # Add the new column to the original data
    data[f'{file_name}_D'] = data_w_hdx1['H_Labile']

	1 out of 2


Finding series: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2141/2141 [00:05<00:00, 394.76it/s]
2024-09-19 16:28:41,622 - INFO - 2141


	2 out of 2


Finding series: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2141/2141 [00:05<00:00, 376.79it/s]
2024-09-19 16:28:47,362 - INFO - 2141


In [53]:
# Write the table, including the appended H/D columns, to the results folder.
# The folder is created first: to_csv() raises an error when the target
# directory does not exist (e.g. on a fresh checkout).
results_path = Path("../results/Results-HDX.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)

# The row index is written as the first, unnamed column (to_csv default).
data.to_csv(results_path)