# Process proteomics data for Bayesian MCA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cobra, gzip, re

## Get genes from model

In [3]:
# model needs to updated for round 2 to include ethanol, erythritol, and citric acid secretion
model = cobra.io.load_json_model(
    "../../models/iJB1325_HP.nonnative_genes.pubchem.flipped.nonzero.reduced.json"
)
len(model.metabolites)
# 

171

In [6]:
model.boundary

[<Reaction BOUNDARY_ACe at 0x7fec245957c0>,
 <Reaction BOUNDARY_CO2e at 0x7fec245c4790>,
 <Reaction BOUNDARY_FORe at 0x7fec245cf730>,
 <Reaction BOUNDARY_GLCe at 0x7fec245cfa60>,
 <Reaction BOUNDARY_GLCNTe at 0x7fec245d7dc0>,
 <Reaction BOUNDARY_He at 0x7fec245dbdf0>,
 <Reaction BOUNDARY_H2Oe at 0x7fec245dbd90>,
 <Reaction BOUNDARY_HNO3e at 0x7fec245df520>,
 <Reaction BOUNDARY_O2e at 0x7fec245e7be0>,
 <Reaction BOUNDARY_TARe at 0x7fec245e7d90>,
 <Reaction BOUNDARY_SMPYRKe at 0x7fec25714820>,
 <Reaction EX_3hpp_e at 0x7fec25754ee0>]

# transcript to protein map for Aspni7



In [7]:
prot_gene_re = re.compile(r"proteinId=(\d+);.*transcriptId=(\d+)")

In [10]:
# this maps transcript IDs to protein ID 
prot_gene_map, gene_prot_map = {}, {}
prot_gene_re = re.compile(r"proteinId=(\d+);.*transcriptId=(\d+)")
with gzip.open("../../data/round1/Aspni7_FilteredModels1_deflines.gff3.gz", "rt") as gff3:
    for line in gff3:
        m = prot_gene_re.search(line)
        # print(line)
        if m:
            gene_prot_map[m.group(2)] = m.group(1)
            prot_gene_map[m.group(1)] = m.group(2)
len(gene_prot_map)

11910

In [21]:
print(len([g for g in model.genes if g.id in gene_prot_map]))
print(len([g for g in model.genes if g.id not in gene_prot_map]))

1317
11


In [23]:
[g for g in model.genes if g.id not in gene_prot_map]

[<Gene 1097533 at 0x7fec1c3a2220>,
 <Gene 1090687 at 0x7fec1c3da190>,
 <Gene 1123432 at 0x7fec1c3da250>,
 <Gene Unknown5 at 0x7fec1c3dad30>,
 <Gene 1070139 at 0x7fec1c3e5460>,
 <Gene 205484 at 0x7fec1c3fc040>,
 <Gene 1092679 at 0x7fec1c4071f0>,
 <Gene 1172305 at 0x7fec1c407610>,
 <Gene PAND_Tribolium_castaneum at 0x7fec1c429220>,
 <Gene HPDH_escherichia_coli at 0x7fec1c429250>,
 <Gene BAPAT_Bacillus_cereus at 0x7fec1c429280>]

## Read round 2 data

In [25]:
data_file = '../../data/round2/ABF_Aniger_BMCA_2_DATA.xlsx'

In [26]:
pd.ExcelFile(data_file).sheet_names

['notes',
 'Extracellular data summary',
 'Multiomics metadata',
 'Extracellular_metabolite',
 'global_protein',
 'targeted_protein',
 'intracellular_metabolite']

In [31]:
pd.read_excel(data_file, sheet_name='Extracellular data summary', index_col=[0,1,2], header=[0,1,2])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates,averages of the 3 replicates
Unnamed: 0_level_1,Unnamed: 1_level_1,ICE_ID,Unnamed: 3_level_1,Glucose (g/L),Glucose (g/L),Glucose (g/L),3-Hydroxy-propionic Acid (g/L),3-Hydroxy-propionic Acid (g/L),3-Hydroxy-propionic Acid (g/L),Ethanol (g/L),Ethanol (g/L),Ethanol (g/L),Erythritol (g/L),Erythritol (g/L),Erythritol (g/L),Citric Acid (g/L),Citric Acid (g/L)
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,t = 0,t = 92,t=160,0,92,160,0,92,160,0,92,160,0,92,160
1,wild-type,ABF_008340,120,47.422211,16.022751,0,0.0,0.0,0,3.367684,1.47396,0,0.519323,0.570694,0,0.279435,0.382329
2,(βAl-3HP)+,ABF_008343,120,49.789625,18.922934,0,8.426783,13.80534,0,2.299179,1.827184,0,0.43294,0.486393,0,0.172415,0.168575
3,"(βAl-3HP)+, pyc2+",ABF_008345,120,50.478728,23.368617,0,15.345327,22.491995,0,0.726514,0.0,0,0.454602,0.592637,0,0.0,0.149135
4,"(βAl-3HP)+, pyc2+, Δald6a",ABF_008348,120,50.585983,22.113392,0,16.448291,27.682851,0,0.776775,0.0,0,0.504219,0.696082,0,0.0,0.135727
5,"(βAl-3HP)+, pyc2+, Δald6a, aat2(cy)+",ABF_011231,120,57.240266,31.496616,0,13.878095,25.147361,0,0.986835,0.0,0,0.382474,0.423012,0,0.0,0.023021
6,"(βAl-3HP)+, pyc2+, Δald6a, aat2(mt)+",ABF_011232,120,49.985311,25.908226,0,15.763841,22.578719,0,0.605911,0.0,0,0.554831,0.86156,0,0.139337,0.159394
7,"(βAl-3HP)+, pyc2++, Δald6a",ABF_011233,120,46.198211,22.300653,0,19.209051,30.163453,0,0.597598,0.0,0,0.501672,0.634209,0,0.0,0.056057
8,"(βAl-3HP)+, pyc2+, Δald6a, mdhA+",ABF_011234,120,46.368167,21.331636,0,17.981485,28.407465,0,0.826655,0.0,0,0.50094,0.712425,0,0.0,0.0
9,"(βAl-3HP)+, pyc2+, Δald6a, purU+",ABF_011236,120,47.643453,18.972825,0,17.442338,27.230663,0,0.92507,0.0,0,0.505712,0.735149,0,0.0,0.0
10,"(βAl-3HP)+, pyc2+, Δald6a, Δadh1",ABF_011239,120,47.875584,19.16741,0,18.431711,30.560001,0,0.0,0.0,0,0.48004,0.674704,0,0.0,0.0


## Reference strain SF ABF180_17_R2

**Best performing strain: [(βAl-3HP)++, pyc2++, Δald6a+NeoR] (Strain ABF-10216R8, ICE ID ABF_011245)**

In [113]:
reference_strain = "SF ABF180_17_R2"

# Global proteomics at 92 hours

In [94]:
global_proteomics = pd.read_excel(data_file, sheet_name='global_protein')
global_proteomics.head()

Unnamed: 0,Protein,Redundancy,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
0,jgi|Aspni7|1169809,1,32.176158,33.153051,33.02862,32.895799,33.449526,33.073643,33.113237,32.740618,...,33.138326,33.273066,32.658036,33.370635,33.284373,33.477577,33.231232,32.414363,32.79951,32.67257
1,jgi|Aspni7|1141219,1,31.46843,31.376778,31.415586,31.452133,31.314783,31.380998,32.073355,31.353739,...,31.404182,31.683614,31.175667,31.716532,31.392035,31.774963,31.876349,31.544927,31.568684,31.174506
2,jgi|Aspni7|201546,1,31.821116,31.846684,31.613288,31.870536,32.422308,32.420785,32.220315,31.869863,...,32.118247,32.307963,31.825711,32.26426,32.544977,32.375467,32.288612,31.90245,32.053782,31.860052
3,jgi|Aspni7|1147622,1,33.517377,33.677164,33.641382,34.362115,34.341117,34.455973,33.899787,33.849046,...,33.894014,33.701087,33.921813,33.806313,33.762136,34.156727,34.426063,33.736694,33.53066,33.751044
4,jgi|Aspni7|1142593,1,34.154545,34.126483,34.117472,34.047973,33.851506,33.671474,34.104701,33.881193,...,33.55236,33.609413,33.737357,33.67607,33.590278,33.876611,33.284638,33.554197,33.490469,33.466278


### Remove redundant rows

In [95]:
print(len(global_proteomics))
print(len(global_proteomics[global_proteomics['Redundancy'] == 1]))

4585
4155


In [99]:
print(len(global_proteomics.Protein.unique()))
print(len(global_proteomics.drop(columns='Redundancy').drop_duplicates()))

4233
4233


In [101]:
global_proteomics = global_proteomics.drop(columns='Redundancy').drop_duplicates()
global_proteomics.head()

Unnamed: 0,Protein,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,SF ABF180_3_R3,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
0,jgi|Aspni7|1169809,32.176158,33.153051,33.02862,32.895799,33.449526,33.073643,33.113237,32.740618,32.76824,...,33.138326,33.273066,32.658036,33.370635,33.284373,33.477577,33.231232,32.414363,32.79951,32.67257
1,jgi|Aspni7|1141219,31.46843,31.376778,31.415586,31.452133,31.314783,31.380998,32.073355,31.353739,31.441164,...,31.404182,31.683614,31.175667,31.716532,31.392035,31.774963,31.876349,31.544927,31.568684,31.174506
2,jgi|Aspni7|201546,31.821116,31.846684,31.613288,31.870536,32.422308,32.420785,32.220315,31.869863,32.164412,...,32.118247,32.307963,31.825711,32.26426,32.544977,32.375467,32.288612,31.90245,32.053782,31.860052
3,jgi|Aspni7|1147622,33.517377,33.677164,33.641382,34.362115,34.341117,34.455973,33.899787,33.849046,33.773548,...,33.894014,33.701087,33.921813,33.806313,33.762136,34.156727,34.426063,33.736694,33.53066,33.751044
4,jgi|Aspni7|1142593,34.154545,34.126483,34.117472,34.047973,33.851506,33.671474,34.104701,33.881193,34.417227,...,33.55236,33.609413,33.737357,33.67607,33.590278,33.876611,33.284638,33.554197,33.490469,33.466278


### Map protein IDs to transcript IDs in model

In [109]:
for x in global_proteomics.Protein:
    if x.split('|')[-1] not in prot_gene_map:
        print(x, x in model.genes)

HPDH_escherichia_coli True
BAPAT_Bacillus_cereus True
PAND_Tribolium_castaneum True


In [111]:
global_proteomics.index = [prot_gene_map[x.split('|')[-1]] if x.split('|')[-1] in prot_gene_map
                           else x for x in global_proteomics.Protein]
global_proteomics.index.name = 'Transcript'
global_proteomics = global_proteomics.drop(columns='Protein')
global_proteomics.head()

Unnamed: 0_level_0,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,SF ABF180_3_R3,SF ABF180_4_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
Transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1170085,32.176158,33.153051,33.02862,32.895799,33.449526,33.073643,33.113237,32.740618,32.76824,32.839315,...,33.138326,33.273066,32.658036,33.370635,33.284373,33.477577,33.231232,32.414363,32.79951,32.67257
1141495,31.46843,31.376778,31.415586,31.452133,31.314783,31.380998,32.073355,31.353739,31.441164,31.516954,...,31.404182,31.683614,31.175667,31.716532,31.392035,31.774963,31.876349,31.544927,31.568684,31.174506
201546,31.821116,31.846684,31.613288,31.870536,32.422308,32.420785,32.220315,31.869863,32.164412,32.174083,...,32.118247,32.307963,31.825711,32.26426,32.544977,32.375467,32.288612,31.90245,32.053782,31.860052
1147898,33.517377,33.677164,33.641382,34.362115,34.341117,34.455973,33.899787,33.849046,33.773548,33.887928,...,33.894014,33.701087,33.921813,33.806313,33.762136,34.156727,34.426063,33.736694,33.53066,33.751044
1142869,34.154545,34.126483,34.117472,34.047973,33.851506,33.671474,34.104701,33.881193,34.417227,33.874547,...,33.55236,33.609413,33.737357,33.67607,33.590278,33.876611,33.284638,33.554197,33.490469,33.466278


### Normalize data to the reference strain

In [114]:
normalized_global_proteomics = global_proteomics.divide(
    global_proteomics[reference_strain], axis=0).replace(-np.inf, 1).replace(np.inf,1).fillna(1)
normalized_global_proteomics.head()

Unnamed: 0_level_0,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,SF ABF180_3_R3,SF ABF180_4_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
Transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1170085,0.967266,0.996633,0.992893,0.9889,1.005546,0.994246,0.995436,0.984235,0.985065,0.987202,...,0.996191,1.000241,0.981752,1.003174,1.000581,1.006389,0.998983,0.974427,0.986005,0.982189
1141495,1.012991,1.010041,1.01129,1.012467,1.008045,1.010177,1.032464,1.009299,1.012114,1.014553,...,1.010923,1.019918,1.003567,1.020978,1.010532,1.022859,1.026123,1.015454,1.016219,1.00353
201546,0.979618,0.980405,0.97322,0.98114,0.998126,0.998079,0.991908,0.981119,0.990187,0.990484,...,0.988765,0.994606,0.97976,0.99326,1.001902,0.996684,0.99401,0.982122,0.986781,0.980817
1147898,0.995988,1.000736,0.999673,1.02109,1.020466,1.023879,1.007351,1.005843,1.0036,1.006999,...,1.00718,1.001447,1.008006,1.004574,1.003261,1.014986,1.02299,1.002505,0.996382,1.002931
1142869,1.018378,1.017542,1.017273,1.015201,1.009343,1.003975,1.016892,1.010228,1.026211,1.01003,...,1.000423,1.002124,1.005939,1.004112,1.001554,1.010091,0.99244,1.000478,0.998578,0.997856


In [115]:
normalized_global_proteomics.to_csv('../../data/round2/normalized_global_proteomics.csv')

# Targeted proteomics at 92 hours

In [147]:
targeted_proteomics = pd.read_excel(data_file, sheet_name='targeted_protein')
targeted_proteomics.head()

Unnamed: 0,Protein,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,SF ABF180_3_R3,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
0,jgi|Aspni7|1145331,0.863964,0.86252,0.784762,1.387217,1.222693,1.338794,2.253985,2.312679,2.219523,...,1.949292,1.939182,2.000392,2.072654,1.588859,1.738713,1.701155,2.27992,2.16391,2.22194
1,jgi|Aspni7|1136064,-0.36572,-0.177613,-0.216719,-0.697055,-0.819734,-0.585171,-1.308847,-1.083605,-1.19297,...,-1.261217,-1.341484,-1.355311,-1.309444,-0.832876,-0.755307,-0.674192,-1.03483,-1.133321,-1.127115
2,jgi|Aspni7|1099056,-6.522509,-6.291478,-6.417873,-6.335275,-6.45088,-6.323233,-6.591499,-6.581977,-6.606718,...,-6.559411,-6.516137,-6.610014,-6.661304,-6.809476,-6.743531,-6.711409,-6.644841,-6.440649,-6.430682
3,jgi|Aspni7|1128408,-0.423381,-0.402166,-0.347003,-1.168125,-1.355903,-1.260792,-1.863573,-1.630895,-1.810275,...,-1.945666,-1.964705,-2.096481,-2.029213,-1.790707,-1.846235,-1.801093,-2.114836,-2.029731,-2.032081
4,pand,-4.342076,-5.353449,-4.744258,4.954097,4.760225,5.051498,5.349948,5.566743,5.271986,...,5.600039,5.312262,5.209465,5.45894,4.827779,5.020662,4.765847,5.724487,5.634604,5.672021


### Map protein IDs to transcript IDs in model

In [148]:
for x in targeted_proteomics.Protein:
    if x.split('|')[-1] not in prot_gene_map:
        print(x, x in model.genes)

In [118]:
prot_gene_map['pand'] = 'PAND_Tribolium_castaneum'
prot_gene_map['bapat'] = 'BAPAT_Bacillus_cereus'
prot_gene_map['hpdh'] = 'HPDH_escherichia_coli'

In [149]:
targeted_proteomics.index = [prot_gene_map[x.split('|')[-1]] if x.split('|')[-1] in prot_gene_map
                           else x for x in targeted_proteomics.Protein]
targeted_proteomics.index.name = 'Transcript'
targeted_proteomics = targeted_proteomics.drop(columns='Protein')
targeted_proteomics = np.exp2(targeted_proteomics)
targeted_proteomics.head()

Unnamed: 0_level_0,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,SF ABF180_3_R3,SF ABF180_4_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
Transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1145607,1.820032,1.818211,1.722808,2.615737,2.333819,2.529397,4.769985,4.968046,4.657393,4.772252,...,3.86185,3.834883,4.001088,4.206598,3.008113,3.337374,3.251612,4.856509,4.481277,4.665205
1136340,0.776081,0.884164,0.86052,0.61683,0.566546,0.66657,0.403643,0.471848,0.437401,0.436557,...,0.417192,0.394615,0.390851,0.403476,0.561409,0.59242,0.626683,0.488073,0.455865,0.45783
1099332,0.010878,0.012767,0.011696,0.012385,0.011431,0.012489,0.01037,0.010438,0.010261,0.009591,...,0.010603,0.010926,0.010237,0.00988,0.008915,0.009332,0.009543,0.009993,0.011513,0.011592
1128684,0.745675,0.756721,0.786216,0.444999,0.39069,0.417315,0.274795,0.322888,0.285137,0.294728,...,0.259595,0.256192,0.233828,0.244989,0.28903,0.278117,0.286957,0.230872,0.244901,0.244502
PAND_Tribolium_castaneum,0.049307,0.02446,0.037311,30.997876,27.100073,33.162901,40.784456,47.397633,38.639,41.074168,...,48.504239,39.732883,37.000289,43.984995,28.399219,32.461593,27.205893,52.874032,49.680377,50.985713


### Normalize data to the reference strain

In [150]:
normalized_targeted_proteomics = targeted_proteomics.divide(
    targeted_proteomics[reference_strain], axis=0).replace(-np.inf, 1).replace(np.inf,1).fillna(1)
normalized_targeted_proteomics.head()

Unnamed: 0_level_0,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,SF ABF180_3_R3,SF ABF180_4_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
Transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1145607,0.495569,0.495073,0.469096,0.712228,0.635466,0.688719,1.2988,1.352729,1.268142,1.299417,...,1.051527,1.044184,1.08944,1.145397,0.819067,0.90872,0.885368,1.322359,1.220189,1.27027
1136340,1.713099,1.951679,1.899487,1.361573,1.250578,1.471368,0.89099,1.041544,0.965507,0.963642,...,0.920897,0.871061,0.862752,0.890622,1.239238,1.307692,1.383322,1.077359,1.006263,1.010601
1099332,1.387406,1.628361,1.491771,1.57967,1.458029,1.592912,1.322621,1.331379,1.308742,1.22329,...,1.352368,1.393547,1.305755,1.260149,1.13715,1.190335,1.217135,1.274612,1.468405,1.478584
1128684,3.022591,3.067366,3.186922,1.803803,1.583661,1.691583,1.11388,1.308824,1.1558,1.194678,...,1.052267,1.038471,0.947821,0.99306,1.171583,1.127347,1.163179,0.935838,0.992704,0.991088
PAND_Tribolium_castaneum,0.001022,0.000507,0.000773,0.642476,0.561688,0.687349,0.845317,0.982385,0.800849,0.851322,...,1.005321,0.823522,0.766885,0.911653,0.588615,0.672814,0.563882,1.095891,1.029698,1.056753


In [153]:
normalized_targeted_proteomics.to_csv('../../data/round2/normalized_targeted_proteomics.csv')

### Calculate enzyme activity from normalized targeted proteomics

In [154]:
def get_enzyme_activity_expression(proteins, model):
    """Get enzyme activity expression as a function of protein expression

    Take the min over all subunits for each isoenzyme and sum over all isoenzymes

    :param proteins:  protein x experiment dataframe
    :param model:  cobra model
    :returns:  reaction x experiment dataframe
    """
    enzyme_expression = {}
    transcripts = list(proteins.index)
    for r in model.reactions:
        if r.gene_reaction_rule and (
            len(set([g.id for g in r.genes]) & set(transcripts)) > 0
        ):
            subunits_expression = {}
            for x in [x.strip("() ") for x in r.gene_reaction_rule.split(" or ")]:
                # Take the min over all subunits for each study line
                subunits = [
                    y.strip("() ")
                    for y in x.split(" and ")
                    if y.strip("() ") in transcripts
                ]
                if len(subunits) > 0:
                    subunits_expression[x] = proteins.loc[subunits].min(axis=0)
            enzyme_expression[r.id] = pd.DataFrame(subunits_expression).sum(axis=1)
    enzyme_expression = pd.DataFrame(enzyme_expression).T
    enzyme_expression.index.name = "rxn"
    return enzyme_expression

In [155]:
normalized_rxns_targeted_prot = get_enzyme_activity_expression(
    normalized_targeted_proteomics, model
)
normalized_rxns_targeted_prot.to_csv("../../data/round2/normalized_targeted_enzyme_activities.csv")
normalized_rxns_targeted_prot.head()

Unnamed: 0_level_0,SF ABF180_1_R1,SF ABF180_1_R2,SF ABF180_1_R3,SF ABF180_2_R1,SF ABF180_2_R2,SF ABF180_2_R3,SF ABF180_3_R1,SF ABF180_3_R2,SF ABF180_3_R3,SF ABF180_4_R1,...,SF ABF180_21_R3,SF ABF180_22_R1,SF ABF180_22_R2,SF ABF180_22_R3,SF ABF180_23_R1,SF ABF180_23_R2,SF ABF180_23_R3,SF ABF180_24_R1,SF ABF180_24_R2,SF ABF180_24_R3
rxn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
r5a,1.91238,1.89575,2.080383,1.481414,1.258474,1.433795,1.332868,1.552501,1.273918,1.378337,...,1.338826,1.335671,1.244583,1.39606,1.112577,1.120157,1.117108,1.241495,1.220584,1.214739
r7,1.312238,1.40022,1.464486,1.310432,1.101322,1.20279,1.142955,1.451123,1.286141,1.274674,...,0.912842,0.943922,0.915576,0.969757,0.863084,0.976217,0.934266,1.090511,1.049418,1.074448
r8,1.312238,1.40022,1.464486,1.310432,1.101322,1.20279,1.142955,1.451123,1.286141,1.274674,...,0.912842,0.943922,0.915576,0.969757,0.863084,0.976217,0.934266,1.090511,1.049418,1.074448
r10,3.022591,3.067366,3.186922,1.803803,1.583661,1.691583,1.11388,1.308824,1.1558,1.194678,...,1.052267,1.038471,0.947821,0.99306,1.171583,1.127347,1.163179,0.935838,0.992704,0.991088
r12a,2.102054,2.361921,2.32058,1.342503,1.245856,1.312038,1.003191,1.163386,1.017943,1.063756,...,1.021976,1.089393,1.031384,1.106492,1.120588,1.100068,1.182511,0.987195,0.919679,0.952225
