In [15]:
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm

# Load the best model.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load 'Fentanyl_Finder.pkl' from a trusted source.
best_rf_model = joblib.load('Fentanyl_Finder.pkl')

# Read the data
data = pd.read_excel("peak_clean.xlsx")

# Split every spectrum string into one entry per line. Cells that are not
# strings (e.g. empty cells, which pandas reads as NaN) become an empty peak
# list instead of raising AttributeError on .split().
data['Spectra'] = data['Spectra'].apply(
    lambda x: x.split('\n') if isinstance(x, str) else []
)

# Define parameters
ms2int_threshold = 10.0  # peaks with intensity below this value are ignored
mz_min = 50.0            # lower m/z bound (inclusive)
mz_max = 400.0           # upper m/z bound (inclusive)
num_bins = 3500          # number of equal-width bins spanning [mz_min, mz_max]

# Data preprocessing: work on a copy so `data` itself is left untouched
data_processed = data.copy()

# Initialize m/z bins matrix: one row per spectrum, one column per bin.
# Each cell accumulates the summed intensity of the peaks falling in that bin.
mz_bins_matrix = np.zeros((len(data_processed), num_bins))

# Loop invariants, computed once
mz_span = mz_max - mz_min
last_bin = num_bins - 1

for i, row in tqdm(enumerate(data_processed.itertuples()), total=len(data_processed), desc=f"Processing rows for num_bins = {num_bins}"):
    # Parse each non-empty line as "<m/z> <intensity>".
    peaks = []
    for pair in row.Spectra:
        if not pair:
            continue
        try:
            mz_text, intensity_text = pair.split()
            # Convert BOTH numbers before storing either one. Appending the
            # m/z first and failing on the intensity would leave the two
            # sequences misaligned and shift every later intensity onto the
            # wrong m/z value.
            peak = (float(mz_text), float(intensity_text))
        except ValueError:
            # Malformed line (wrong number of fields or non-numeric text)
            continue
        peaks.append(peak)

    for mz_val, int_val in peaks:
        # Keep only peaks above the intensity threshold and inside the m/z
        # window. Written as a positive test so NaN values are rejected.
        if not (int_val >= ms2int_threshold and mz_min <= mz_val <= mz_max):
            continue
        bin_index = int((mz_val - mz_min) / mz_span * num_bins)
        # mz_val == mz_max maps to num_bins, so clamp into the valid range
        bin_index = max(0, min(bin_index, last_bin))
        mz_bins_matrix[i, bin_index] += int_val

# Wrap the binned intensities in a DataFrame with one 'bin_<k>' column per bin
bin_column_names = [f'bin_{k}' for k in range(num_bins)]
mz_bins_df = pd.DataFrame(mz_bins_matrix, columns=bin_column_names)

# Give both frames a fresh 0..n-1 index so they line up row for row,
# then place them side by side
metadata_part = data_processed.reset_index(drop=True)
bins_part = mz_bins_df.reset_index(drop=True)
data_processed = pd.concat([metadata_part, bins_part], axis=1)

# The raw 'Spectra' column is not needed once it has been binned
data_processed = data_processed.drop(columns='Spectra')

# Columns that are removed before prediction; whatever remains in
# data_processed afterwards is what gets passed to the model.
columns_to_drop = [
    'peak ID',
    'Title',
    'Scans',
    'RT left(min)',
    'RT (min)',
    'RT right (min)',
    'Precursor m/z',
    'Height',
    'Area',
    'Model masses',
    'Adduct',
    'Isotope',
    'Comment',
    'Reference RT',
    'Reference m/z',
    'Formula',
    'Ontology',
    'InChIKey',
    'SMILES',
    'Annotation tag (VS1.0)',
    'RT matched',
    'm/z matched',
    'MS/MS matched',
    'RT similarity',
    'Dot product',
    'Reverse dot product',
    'Fragment presence %',
    'Total score',
    'S/N',
    'MS1 isotopes',
    'MSMS spectrum',
]

# Build the feature matrix. A KeyError is raised if any listed column is
# absent from data_processed.
X = data_processed.drop(columns=columns_to_drop)

# Make predictions
y_pred = best_rf_model.predict(X)
# Column 1 of predict_proba is taken as the reported probability.
# NOTE(review): this assumes a binary classifier whose class of interest is
# the second entry of best_rf_model.classes_ -- confirm against training.
y_proba = best_rf_model.predict_proba(X)[:, 1]

# Add predictions to the processed data
data_processed['Predicted Label'] = y_pred
data_processed['Prediction Probability'] = y_proba

# Drop columns starting with 'bin_'. A prefix test is used on purpose:
# DataFrame.filter(like='bin_') matches the substring anywhere in the name
# and would also remove unrelated columns that merely contain 'bin_'.
bin_columns = [col for col in data_processed.columns if str(col).startswith('bin_')]
data_processed = data_processed.drop(columns=bin_columns)

# Export the results to a new CSV file
output_file = "Predicted.csv"
data_processed.to_csv(output_file, index=False)


Processing rows for num_bins = 3500: 100%|█████████████████████████████████████████| 771/771 [00:00<00:00, 8886.02it/s]
