# Example for PdCl2 and CuCl data

This example shows a routine that finds clusters in test data and identifies them using classes and helpers from the MSARIS package

The example combines a linear optimization search with cluster detection and plots the spectra that were found

For now, in this prototype, most of the routine is carried out in user-defined functions

## Installing dependencies

Install dependencies to run package

In [None]:
%pip install -r ../requirements.txt
%pip install -e ..

Run imports to run example scripts

In [None]:
from collections import defaultdict
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from msaris.clusterisation.clusterisation import ClusterSearch, MaxClustering
from msaris.molecule.molecule import Molecule, compare_and_visualize
from msaris.search.optimization_search import SearchClusters
from msaris.reader.reader import load_data
from msaris.reader.preprocessing import filter_intensities, reduce_data_resolution
from msaris.utils.intensities_util import norm
from msaris.utils.molecule_utils import plot_results

## Running data loading

Read the `.mzML` file and pre-process the data by reducing its dimensionality

In [None]:
# Base name shared by the input spectrum file and the output directory.
file_name = "file_name"
# Directory (relative to the notebook) where the reports are written.
path = "./{}".format(file_name)
# Input spectrum: the `.mzML` file with the same base name.
DATA_SOURCE = path + ".mzML"

In [None]:
# Load the m/z and intensity arrays from the spectrum file.
mz, it = load_data(
    DATA_SOURCE,
    range_spectrum=(0, 1000),
    min_intensity=None,
    mz_binning_width=25.0,
)

# Ask for a tenth of the original number of points for each array.
# NOTE(review): presumably these are the target sizes of the reduced arrays —
# confirm against `reduce_data_resolution`.
reduced_mz_size = int(mz.shape[0] / 10)
reduced_it_size = int(it.shape[0] / 10)
mz_processed, it_processed = reduce_data_resolution(
    mz, it, reduced_mz_size, reduced_it_size
)

### Estimation of individual spectrum

This allows comparing the measured data with predefined spectra to estimate their validity

In [None]:
# Formula whose predefined spectrum is compared against the loaded data.
formula = "Cu3Pd1Cl4(CH3CN)"
# Single switch for saving the comparison; it is now actually passed to the
# call below instead of being shadowed by a hard-coded literal. Kept at
# False, which is the value the call was effectively using.
save = False
compare_and_visualize(
    mz,
    it,
    formula=formula,
    save=save,
    path=path,
    window=0.1,
)

## Clustering spectrum to find existing peaks

Use `MaxClustering` to find clusters in the data and obtain the weighted masses of the peaks

In [None]:
# Clustering threshold: 1% of the highest intensity in the spectrum.
intensity_threshold = max(it) * 0.01
clust = MaxClustering(window=7, threshold=intensity_threshold)
# Masses found for the clusters; used as search targets further below.
masses_ = clust.find(mz, it, cluster_mod="max")

## Running script to find brutto formula

The search is run step by step with different sets of calculated coefficients,
starting from the metals and then adding various ligands

In [None]:
# Parameter sets tried, in this order, for every target mass.
# NOTE(review): each `no_<species>` flag presumably switches that species
# off (True) or on (False) in the search — confirm against
# `SearchClusters.recognise_masses`.
iteration_steps = [
    # Steps with no_Cu=True
    {"no_Cu": True},
    {"no_Cu": True, "no_CH3CN": False, "no_Pd": False},
    {"no_Cu": True, "no_CH3CN": False, "no_Pd": False, "no_N2": False},
    {"no_Cu": True, "no_CH3CN": False, "no_H2O": False, "no_Pd": False},
    # Steps with no_Pd=True
    {"no_Pd": True},
    {"no_Cu": False, "no_CH3CN": False, "no_Pd": True},
    {"no_Cu": False, "no_CH3CN": False, "no_Pd": True, "no_N2": False},
    {"no_Cu": False, "no_CH3CN": False, "no_H2O": False, "no_Pd": True},
    # Steps toggling a single flag each
    {"no_CH3CN": False},
    {"no_Na": False},
    {"no_H2O": False},
    {"no_O2": False},
    {"no_O": False},
    {"no_N2": False},
]

Run the custom search over the found weighted masses to determine formulas from their isotope patterns

Running class with function for search
Here the threshold for the cosine value is set to 0.5

Data is saved in `ranked` dictionary with all found metrics

In [None]:
# Search object shared by all target masses.
srch = SearchClusters(
    mz=mz,
    it=it,
    charge=-1,
    threshold=0.5,
    verbose=True,
    adjusted=False,
    njobs=5,
)

# Maps each target mass to its candidate matches, ordered by `delta_max`.
ranked = defaultdict(list)
for target_mass in tqdm(sorted(masses_)):
    # Collect the matches from every parameter set, in the order of the steps.
    candidates = [
        candidate
        for params in iteration_steps
        for candidate in srch.recognise_masses(
            target_mass,
            params,
            epsilon_range=(0, 5, 0.1),
        )
    ]
    # Stable sort: ties keep the order in which the steps produced them.
    candidates.sort(key=lambda candidate: candidate["delta_max"])
    ranked[target_mass] = candidates

In [None]:
# Flat, column-oriented report: one row per candidate match in `ranked`.
report = {
    "mass": [],
    "brutto": [],
    "brutto_formal": [],
    "cosine": [],
    "relative": [],
    "delta_max": [],
    "delta_avg": [],
}
# Composition keys that each get their own count column in the report.
variables = [
    "Pd1",
    "Pd2",
    "Cu1",
    "Cu2",
    "Na",
    "K",
    "O",
    "CF3COO",
    "TBA",
    "OH",
    "Cl",
    "Br",
    "CH3CN",
    "CH3OH",
    "H2O",
    "N2",
    "O2_1",
    "O2_2",
]
report.update({variable: [] for variable in variables})

for mass, data in ranked.items():
    for match in data:
        composition = match["composition"]
        report["mass"].append(round(mass))
        report["brutto"].append(match["formula"])
        # NOTE(review): `formal_formula` is not imported in the visible
        # cells; presumably it lives in `msaris.utils.recognizing_utils` —
        # confirm and add the import, otherwise this line raises NameError.
        report["brutto_formal"].append(formal_formula(composition))
        report["cosine"].append(match["cosine"])
        report["relative"].append(match["relative"])
        # Deltas are reported as absolute values.
        report["delta_max"].append(abs(match["delta_max"]))
        report["delta_avg"].append(abs(match["delta_avg"]))
        # Keys missing from the composition are reported as 0.
        for variable in variables:
            report[variable].append(
                composition[variable] if variable in composition else 0
            )

# `exist_ok=True` replaces the separate exists-check, so the directory is
# created only when missing without a check-then-create race.
os.makedirs(path, exist_ok=True)
df = pd.DataFrame(report)
df.to_csv(f"{path}/{file_name}.csv")

## Representation of found results

Draw an overall plot of the found spectra and add the individually drawn spectra

### Plotting all found peaks

Results from `ranked` are presented in total.png plot where all found peaks could be observed

### Generating csv report with total found data

Build the `.csv` file with the defined and found peaks

In [None]:
# Report columns other than the two formula columns.
cols = [col for col in df.columns if col not in ["brutto_formal", "brutto"]]

# Bin the masses into 2-unit-wide intervals over (0, 998] and take the
# smallest cosine value inside each bin. Selecting the "cosine" column before
# aggregating avoids passing a list into `min()`'s `numeric_only` slot and
# skips aggregating every other column; `observed=False` states the grouping
# behaviour explicitly (empty bins yield NaN).
mass_bins = pd.cut(df["mass"], np.arange(0, 1000, 2))
cos = df.groupby(mass_bins, observed=False)["cosine"].min().values

# Keep the rows whose cosine equals one of the per-bin minima.
min_cosine = df[df.cosine.isin(cos)]
min_cosine.to_csv(f'{path}/{file_name}_min_cosine.csv')
# Bare expression so the notebook displays the table as the cell output.
min_cosine

In [None]:
# Plot the loaded spectrum (mz, it) together with the rows selected in
# `min_cosine`.
plot_results(mz, it, min_cosine)