# Normalization toolbox

Data used: AUT-440-446_FIAMSMatrixLinearityR2-Reproducibility_DataProcessing\20201223_AddAdducts-SumInt-BackSub

This data set is blank corrected

In [1]:
# Step up one directory; the path echoed below shows the resulting working directory.
# NOTE(review): presumably needed so that the `BFAIR` import in the next cell resolves - confirm.
# Gotcha: this cell is not idempotent - running it twice moves up two levels.
%cd ..

/Users/matmat/Documents/GitHub/AutoFlow-OmicsDataHandling


In [2]:
import pandas as pd
import numpy as np
import copy
import pickle
import BFAIR.normalization as normalization

In [3]:
# Change into the examples directory; the data file loaded in the Data section
# is opened through a relative path ("data/normalization_Data/...").
%cd examples

/Users/matmat/Documents/GitHub/AutoFlow-OmicsDataHandling/examples


## Data

In [4]:
# Load the E. coli intensity table that every cell below normalizes.
# NOTE: despite its .txt extension the file is opened in binary mode and fed to
# pickle, i.e. it is a pickle file. Unpickling can execute arbitrary code, so
# only load this file from a trusted source.
with open("data/normalization_Data/Ecoli_intensities_linearity.txt", 'rb') as handle:
    # pickle.load reads directly from the file object; there is no need to
    # copy the whole file into an intermediate bytes object first.
    intensities_Ecoli = pickle.load(handle)

In [5]:
# Show the loaded table as the cell output (the rendering below is a truncated
# preview: first and last rows with an elided middle).
intensities_Ecoli

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,9.583652e+03
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,1.276414e+03
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,1.282265e+03
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,2.968548e+04
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,2.144749e+04
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,1.575200e+06
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,9.985480e+04
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,3.546930e+05
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,9.593014e+05


## Normalization methods

### min/max feature scaling: (value - min) / (max - min)

In [6]:
# Apply min/max feature scaling to the 'Intensity' column; the result is only
# displayed, not assigned, so later cells still work on the raw intensities.
normalization.min_max_norm(intensities_Ecoli, columnname='Intensity')

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,1.416323e-05
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,4.546221e-07
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,4.642775e-07
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,4.733527e-05
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,3.374094e-05
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,5.051276e-04
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,3.167472e-05
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,1.134548e-04
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,3.074797e-04


### Total Sum Intensity, value / tsi

In [7]:
# Apply total-sum-intensity (TSI) normalization to the 'Intensity' column; the
# result is only displayed, not assigned.
normalization.tsi_norm(intensities_Ecoli, columnname='Intensity')

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,1.401056e-06
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,1.866019e-07
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,1.874573e-07
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,4.339788e-06
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,3.135457e-06
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,5.544914e-05
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,3.515023e-06
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,1.248567e-05
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,3.376870e-05


## These methods are based on the E. coli model that we used in the INCA example for now; iJS2012.
### Biomass reactions and the corresponding metabolite identifiers will be added for different organisms. All metabolites that are part of the biomass function, both substrates and products, are considered.

### Main Biomass components

In [8]:
# Identifiers of the metabolites in the biomass function. Order matters:
# entry i here is paired with entry i of bm_vals. Both lists are laid out
# five per row so the pairing can be checked by eye.
biomass_mets = [
    'phe__L_c', 'mlthf_c', 'oaa_c', 'lys__L_c', 'atp_c',
    'ser__L_c', 'g3p_c', 'tyr__L_c', 'pep_c', 'met__L_c',
    'g6p_c', 'akg_c', 'glu__L_c', 'gln__L_c', 'r5p_c',
    'f6p_c', 'pyr_c', 'gly_c', 'thr_c', 'asp__L_c',
    'nadph_c', 'cys__L_c', '3pg_c', 'val__L_c', 'ala__L_c',
    'ile__L_c', 'asn__L_c', 'his__L_c', 'leu__L_c', 'accoa_c',
    'arg__L_c', 'pro__L_c', 'trp__L_c', 'nadh_c',
]

# Value stored for each metabolite above, in the same order (used further down
# as the stoichiometric coefficients). Note the single negative entry, nadh_c.
bm_vals = [
    0.176, 0.443, 0.34, 0.326, 33.247,
    0.205, 0.129, 0.131, 0.051, 0.146,
    0.205, 0.087, 0.25, 0.25, 0.754,
    0.071, 0.083, 0.582, 0.241, 0.229,
    5.363, 0.087, 0.619, 0.402, 0.488,
    0.276, 0.229, 0.09, 0.428, 2.51,
    0.281, 0.21, 0.054, -1.455,
]

# Passed as `biomass_value` to lim_tsi_norm in the coefficient-based variant.
# NOTE(review): presumably the biomass coefficient of the model - confirm.
biomass_value = 39.68

# Two-column lookup table: one row per biomass metabolite.
biomass_df = pd.DataFrame({'Metabolite': biomass_mets, 'Value': bm_vals})

#### Using just the metabolites

In [9]:
# Pass only the 'Metabolite' column of biomass_df, i.e. the identifiers without
# their coefficients, as the reference set.
# NOTE(review): presumably the intensities are then normalized by the summed
# intensity of just these metabolites - confirm against BFAIR.normalization.
normalization.lim_tsi_norm(biomass_df['Metabolite'], intensities_Ecoli, columnname='Intensity')

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,0.000073
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,0.000010
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,0.000010
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,0.000227
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,0.000164
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,0.000289
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,0.000018
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,0.000065
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,0.000176


#### Also using the stoichiometric coefficients

In [10]:
# Pass the whole biomass_df (identifiers plus their 'Value' column) together
# with biomass_value, so the coefficients are available to the normalization.
normalization.lim_tsi_norm(biomass_df, intensities_Ecoli, biomass_value=biomass_value, columnname='Intensity')

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,0.000896
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,0.000119
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,0.000120
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,0.002774
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,0.002004
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,0.007044
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,0.000447
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,0.001586
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,0.004290


### Amino acids
Same approach as above, but using the amino acids (AAs) as the reference metabolites.

In [11]:
# Identifiers of the 20 amino acids used as the reference set.
# TODO: consider adding L-selenocysteine and L-pyrrolysine.
# NOTE(review): threonine is written 'thr_c', without the '__L' infix that the
# other L-amino acids carry - confirm it matches the identifiers in the data.
amino_acids = [
    'ala__L_c', 'arg__L_c', 'asn__L_c', 'asp__L_c', 'cys__L_c',
    'glu__L_c', 'gln__L_c', 'gly_c', 'his__L_c', 'ile__L_c',
    'leu__L_c', 'lys__L_c', 'met__L_c', 'phe__L_c', 'pro__L_c',
    'ser__L_c', 'thr_c', 'trp__L_c', 'tyr__L_c', 'val__L_c',
]

In [13]:
# Use the plain list of amino acid identifiers as the reference set (no
# coefficients and no biomass_value are passed in this variant).
normalization.lim_tsi_norm(amino_acids, intensities_Ecoli, columnname='Intensity')

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,0.000101
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,0.000013
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,0.000014
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,0.000313
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,0.000226
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,0.000394
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,0.000025
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,0.000089
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,0.000240


### Probabilistic Quotient Normalization
This method adjusts for dilutions so we would use it to compare different dilutions of the same sample, e.g. E. coli 1x, 10x, 100x etc

**0)** Put all the corresponding samples into one dataframe. Given the architecture, columns should correspond to samples and rows to metabolites. Use suitable values for missing metabolites; NA is probably fine, but check that this is compatible with the mean/median functions

That step is numbered 0 because it is still pre-processing. The logic of the method itself is as follows:

**1)** tsi correct each sample separately

**2)** Set up a QC vector (or additional column) with the mean/median for each metabolite over all samples

**3)** Divide each value for each sample by the corresponding QC value

**4)** Get the mean/median over all metabolites for each sample -> This is your dilution factor for this sample

**5)** Take the values from 1) and multiply each value with the dilution factor corresponding to this sample

And to finish that, export it like the input

**6)** Copy the input df and insert the new values

In [29]:
# PQN using the median variant.
# NOTE(review): the arguments are positional; presumably they are the
# sample-grouping column, the value column and the summary statistic -
# confirm against the pqn_norm signature.
normalization.pqn_norm(intensities_Ecoli, 'sample_group_name', 'Intensity', 'median')

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,1.401056e-06
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,1.866019e-07
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,1.874573e-07
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,4.339788e-06
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,3.135457e-06
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,5.773747e-05
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,3.660085e-06
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,1.300094e-05
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,3.516230e-05


In [16]:
# PQN using the mean variant.
# NOTE(review): the arguments are positional; presumably they are the
# sample-grouping column, the value column and the summary statistic -
# confirm against the pqn_norm signature.
normalization.pqn_norm(intensities_Ecoli, 'sample_group_name', 'Intensity', 'mean')

Unnamed: 0,sample_group_name,Metabolite,Formula,Intensity
0,Linearity_P1Ecoli_100xDil_1,12dgr120_c,C27H52O5,1.386942e-06
1,Linearity_P1Ecoli_100xDil_1,12dgr140_c,C31H60O5,1.847222e-07
2,Linearity_P1Ecoli_100xDil_1,12dgr141_c,C31H56O5,1.855689e-07
3,Linearity_P1Ecoli_100xDil_1,12dgr160_c,C35H68O5,4.296071e-06
4,Linearity_P1Ecoli_100xDil_1,12dgr161_c,C35H64O5,3.103872e-06
...,...,...,...,...
10388,Linearity_P1Ecoli_1xDil_3,xdp_c,C10H14N4O12P2,6.273600e-05
10389,Linearity_P1Ecoli_1xDil_3,xmp_c,C10H13N4O9P1,3.976950e-06
10390,Linearity_P1Ecoli_1xDil_3,xtp_c,C10H15N4O15P3,1.412648e-05
10391,Linearity_P1Ecoli_1xDil_3,xtsn_c,C10H12N4O6,3.820641e-05
