# Example for PdCl2 and CuCl data

Example routine that finds clusters in test data and identifies them using classes from the MSARIS package

The example combines a linear optimization search with cluster detection and plots the recognised spectra

In this prototype, most of the routine is carried out in user-defined functions

## Installing dependencies

Install dependencies to run package

In [None]:
%pip install -r ../requirements.txt
%pip install -e ..

Import everything needed to run the example scripts

In [None]:
from collections import defaultdict
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from msaris.clusterisation.clusterisation import ClusterSearch, MaxClustering
from msaris.molecule.molecule import Molecule, plot_graph_for_comparing_molecules
from msaris.search.optimization_search import SearchClusters
from msaris.reader.reader import load_data
from msaris.reader.preprocessing import filter_intensities, reduce_data_resolution
from msaris.utils.intensities_util import norm
from msaris.utils.recognizing_utils import formal_formula

## Running data loading

Read the `.mzML` file and pre-process the data by reducing its dimensionality

In [None]:
# Base name of the input file; later cells reuse it for the output folder and report file names.
file_name: str = "Test_entry"
# Relative path to the raw `.mzML` spectrum that is loaded below.
DATA_SOURCE: str = f"../data/{file_name}.mzML"

In [None]:
# Load the spectrum from the mzML file as two arrays: m/z values and intensities.
mz, it = load_data(
    DATA_SOURCE,
    range_spectrum=(0, 1000),
    min_intensity=None,
    mz_binning_width=25.0,
)

# Request a tenth of the original number of points for each array
# (presumably the target sizes - confirm against `reduce_data_resolution`).
# NOTE(review): none of the later cells visible here use `mz_processed` /
# `it_processed`; they all work on the full-resolution `mz` / `it`.
reduced_mz_size = int(mz.shape[0] / 10)
reduced_it_size = int(it.shape[0] / 10)
mz_processed, it_processed = reduce_data_resolution(
    mz, it, reduced_mz_size, reduced_it_size
)

### Estimation of individual spectrum

Compare the measured spectrum with a predefined theoretical spectrum to estimate its validity

In [None]:
# Compare the measured spectrum with the pattern of a single, hand-picked formula.
formula = "Cu3Pd3Cl10O2"
# Toggle passed to the plotting helper; previously this variable was assigned
# but the call hard-coded `save=True`, so changing it had no effect.
save = True
plot_graph_for_comparing_molecules(
    mz,
    it,
    formula=formula,
    save=save,
)

## Clustering spectrum to find existing peaks

Use MaxClustering to find clusters in the data and obtain the weighted masses of their peaks

In [None]:
# Threshold handed to the clustering: 1 % of the highest intensity in the spectrum.
intensity_cutoff = max(it) * 0.01
clust = MaxClustering(window=7, threshold=intensity_cutoff)

# Target masses that the formula search below iterates over.
masses_ = clust.find(mz, it, cluster_mod="max")

## Running script to find brutto formula

Search parameters are applied in order, each step with its own set of calculated coefficients,
starting from the metal and then adding various ligands

In [None]:
# Ordered list of keyword sets tried for every target mass by the search loop.
# Each entry is passed as-is to the search as its parameter dictionary.
iteration_steps = [
    # Steps with no_Cu=True, switching further `no_*` flags to False one by one.
    {"no_Cu": True},
    {"no_Cu": True, "no_CH3CN": False, "no_Pd": False},
    {"no_Cu": True, "no_CH3CN": False, "no_Pd": False, "no_N2": False},
    {"no_Cu": True, "no_CH3CN": False, "no_H2O": False, "no_Pd": False},
    # Mirror-image steps with no_Pd=True.
    {"no_Pd": True},
    {"no_Cu": False, "no_CH3CN": False, "no_Pd": True},
    {"no_Cu": False, "no_CH3CN": False, "no_Pd": True, "no_N2": False},
    {"no_Cu": False, "no_CH3CN": False, "no_H2O": False, "no_Pd": True},
    # Single-flag steps.
    {"no_CH3CN": False},
    {"no_Na": False},
    {"no_H2O": False},
    {"no_O2": False},
    {"no_O": False},
    {"no_N2": False},
]

Run the custom search over the found weighted masses and determine the formulas of their isotope patterns

In [None]:
# Maps each target mass to its list of candidate matches; filled by the search loop below.
ranked = defaultdict(list)
# Output folder (named after the input file) for the plots and the CSV report.
path = f"./{file_name}"

Run the search class for every target mass
Here the threshold for the cosine value is set to 0.7

Results are stored in the `ranked` dictionary together with all computed metrics

In [None]:
# Search object shared by all target masses.
srch = SearchClusters(
    mz=mz,
    it=it,
    charge=-1,
    threshold=0.7,
    verbose=True,
    adjusted=False,
    njobs=5,
)

for target_mass in tqdm(sorted(masses_)):
    # Collect the candidates produced by every parameter set for this mass.
    found = []
    for params in iteration_steps:
        found.extend(
            srch.recognise_masses(
                target_mass,
                params,
                epsilon_range=(0, 5, 0.1),
            )
        )
    # Rank by the magnitude of the deviation so that the closest match comes
    # first. The report cell stores abs(delta_max), which suggests the value
    # may be negative; sorting the signed value would then put the most
    # negative deviation first. For non-negative values the order is unchanged.
    ranked[target_mass] = sorted(found, key=lambda match: abs(match["delta_max"]))

## Representation of found results

Draw an overall plot of the recognised spectra, with each individual spectrum drawn on top of the measured data

In [None]:
import matplotlib as mpl

def color_fader(c1: str, c2: str, *, mix: float = 0) -> str:
    """
    Linearly blend two colours and return the result as a hex string

    :param c1: the first colour, in a format accepted by ``matplotlib.colors.to_rgb``
    :param c2: the second colour, in a format accepted by ``matplotlib.colors.to_rgb``
    :param mix: blending fraction; 0 returns ``c1`` and 1 returns ``c2``

    :return: the blended colour as a hex string
    """
    start = np.array(mpl.colors.to_rgb(c1))
    end = np.array(mpl.colors.to_rgb(c2))
    # Weighted average of the two RGB triples, channel by channel.
    return mpl.colors.to_hex((1 - mix) * start + mix * end)

### Plotting all found peaks

Results from `ranked` are presented in the `total.png` plot, where all found peaks can be observed

In [None]:
# Draw the measured spectrum and, on top of it, the first-ranked candidate
# spectrum of every target mass, then save the figure as `total.png`.
count_entries = len(ranked)
fig, ax = plt.subplots(1, 1, figsize=(60, 40))
max_it = max(it)
# Measured spectrum in percent of the tallest peak. The overlays below are
# scaled by 100 as well; without the factor here the two were on different
# scales (0-1 against 0-100).
ax.plot(mz, (it / max_it) * 100, color="black")
count: int = 0
c1: str = "blue"
c2: str = "red"
for mass, data in ranked.items():
    if not data:
        # Nothing was recognised for this mass.
        continue
    # Colours fade from c1 to c2 in the order the matches are drawn.
    colour = color_fader(c1, c2, mix=count / (count_entries + 1))
    best_spectrum = data[0]["spectrum"]
    it_n = (best_spectrum[1] / max_it) * 100
    ax.plot(best_spectrum[0], it_n, color=colour)
    # Label the tallest point of the overlay with the rounded target mass.
    max_ind = np.argmax(it_n)
    ax.text(
        best_spectrum[0][max_ind],
        it_n[max_ind],
        round(mass),
        color=colour,
        horizontalalignment="center",
        verticalalignment="center",
        fontsize=30,
    )
    count += 1
ax.set_title("Recognised masses for total spectrum", fontsize=40)
ax.set_xlabel("M/Z", fontsize=40)
ax.set_ylabel("Intensity", fontsize=40)
os.makedirs(path, exist_ok=True)
fig.savefig(f"{path}/total.png", dpi=300)
plt.close(fig)

### Generating csv report with total found data

Build the `.csv` report listing the target peaks and the formulas found for them

In [None]:
# Flatten `ranked` into a column-oriented table (one row per candidate match)
# and write it to `<path>/<file_name>.csv`.
report = {
    "mass": [],
    "brutto": [],
    "brutto_formal": [],
    "cosine": [],
    "relative": [],
    "delta_max": [],
    "delta_avg": [],
}
# Composition keys that each get their own column in the report.
variables = [
    "Pd1",
    "Pd2",
    "Cu1",
    "Cu2",
    "Na",
    "K",
    "O",
    "CF3COO",
    "TBA",
    "OH",
    "Cl",
    "Br",
    "CH3CN",
    "CH3OH",
    "H2O",
    "N2",
    "O2_1",
    "O2_2",
]
for variable in variables:
    report[variable] = []

for mass, data in ranked.items():
    for match in data:
        composition = match["composition"]
        report["mass"].append(round(mass))
        report["brutto"].append(match["formula"])
        report["brutto_formal"].append(formal_formula(composition))
        report["cosine"].append(match["metrics"]["cosine"])
        report["relative"].append(match["relative"])
        report["delta_max"].append(abs(match["delta_max"]))
        report["delta_avg"].append(abs(match["delta_avg"]))
        for variable in variables:
            # Keys absent from the composition are reported as 0.
            report[variable].append(
                composition[variable] if variable in composition else 0
            )

# Build and write the table once, after all masses are collected. Previously
# this ran inside the loop above, rewriting the file for every mass and
# leaving `df` undefined when `ranked` was empty.
os.makedirs(path, exist_ok=True)
df = pd.DataFrame(report)
df.to_csv(f"{path}/{file_name}.csv")

In [None]:
# Plot the measured spectrum with one calculated isotope pattern per selected
# report row, number the patterns, and save `<file_name>_total.png`.
# `data_entries` and `colors` are kept for the summary table drawn afterwards.

# NOTE(review): `min_cosine` was used here without being defined anywhere in
# the notebook as shown, which raises NameError on a fresh run. Going by its
# name it is assumed to be the report row with the lowest cosine value for
# each mass (sorted by mass) - confirm this is the intended selection.
min_cosine = df.loc[df.groupby("mass")["cosine"].idxmin()]

data_entries = defaultdict(list)
colors = []
count_entries = min_cosine.shape[0]
# Cache of calculated molecules keyed by brutto formula, so that a formula
# occurring in several rows is only calculated once.
saved_mol = {}

plt.figure(figsize=(20, 10))
max_it = max(it)
plt.plot(mz, (it / max_it) * 100, color="black")
c1: str = "blue"
c2: str = "red"
heights, locations = [], []
for count, (_, row) in enumerate(tqdm(min_cosine.iterrows(), total=count_entries)):
    colour = color_fader(c1, c2, mix=count / (count_entries + 1))
    colors.append(colour)
    brut = row["brutto"]
    if brut not in saved_mol:
        ml = Molecule(formula=brut)
        ml.calculate()
        saved_mol[brut] = ml
    ml = saved_mol[brut]

    # Rescale the calculated pattern so that its tallest peak equals the
    # row's `relative` value.
    it_n = row["relative"] / max(ml.it)
    plt.plot(ml.mz, it_n * ml.it, color=colour)

    data_entries["#"].append(count + 1)
    data_entries["Mass"].append(row["mass"])
    data_entries["Formula"].append(row["brutto_formal"])
    data_entries["Cosine"].append(f"{row['cosine']:.3f}")
    data_entries["Relative int., %"].append(f"{row['relative']:.3f}")

    heights.append(row["relative"])
    locations.append(row["mass"])

for ind, colour in enumerate(colors):
    # A label within 10 m/z of the previous one is moved up and to the right,
    # with a line connecting it to its original position. The `ind > 0` guard
    # matters: index -1 would otherwise compare the first label with the last.
    if ind > 0 and abs(locations[ind - 1] - locations[ind]) <= 10:
        peak_x, peak_y = locations[ind], heights[ind]
        heights[ind] = heights[ind] + 2
        locations[ind] = locations[ind - 1] + 15
        # Drawn only for moved labels; previously a zero-length line at the
        # origin was plotted for every label that stayed in place.
        plt.plot(
            [peak_x, locations[ind]],
            [peak_y, heights[ind] + 0.7],
            "-",
            color=colour,
        )
    plt.text(
        locations[ind],
        heights[ind] + 1,
        f"{ind+1}",
        color=colour,
        horizontalalignment="center",
        verticalalignment="center",
        fontsize=10,
    )

plt.xticks(list(range(0, 1600, 200)))
plt.yticks(list(range(0, 120, 10)))
plt.title("Recognised isotope patterns", fontsize=15)
plt.xlabel("M/Z", fontsize=10)
plt.ylabel("Intensity, %", fontsize=10)
plt.autoscale(enable=True, axis="x", tight=True)
plt.autoscale(enable=True, axis="y", tight=True)
os.makedirs(path, exist_ok=True)
plt.savefig(f"{path}/{file_name}_total.png", dpi=300)
plt.close()

# Render the per-pattern summary as a table image, each data row coloured
# like its pattern in the spectrum plot, and save `<file_name>_total_table.png`.
df_d = pd.DataFrame(data_entries)
fig, ax = plt.subplots()
fig.patch.set_visible(False)
fig.tight_layout()
ax.axis("off")
table = ax.table(
    cellText=df_d.values,
    colLabels=df_d.columns,
    loc="center",
    cellLoc="center",
)
table.scale(1, 2)

# Cells are keyed by (row, column); with column labels present, row 0 is the
# header and data row k sits at table row k + 1. Using the key avoids the
# hard-coded column count of 5 and the counter that lagged one cell behind
# (the first column of each row got the previous row's colour). The bare
# `except` that hid out-of-range colour indexes is no longer needed.
for (row_idx, _col_idx), cell in table.get_celld().items():
    if row_idx == 0:
        # The header keeps its default style.
        continue
    cell_text = cell.get_text()
    cell_text.set_fontsize(15)
    cell_text.set_color(colors[row_idx - 1])

table.auto_set_column_width(col=list(range(len(df_d.columns))))
os.makedirs(path, exist_ok=True)
fig.savefig(
    f"{path}/{file_name}_total_table.png",
    dpi=300,
    bbox_inches="tight",
)