In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Switch the working directory so that the relative file names used further down
# ('C_lim_PDRs_full.xlsx', 'C_lim_PDRs_MIA.csv', 'C_lim_PDRs_MIA.tsv') resolve
# against this folder.
# NOTE(review): the path contains no '~', so os.path.expanduser() returns it
# unchanged; it is also a hard-coded absolute location that has to match the
# machine the notebook runs on.
os.chdir(os.path.expanduser('/models/ecoli/analysis/mia_local_notebooks/C_limited_PDR_analyses/'))
import plotly.graph_objects as go

In [32]:
# Raw C-lim measurements published by Gupta, et al., 2023:
# https://www.biorxiv.org/content/10.1101/2022.08.01.502339v2.full
# The spreadsheet is read from the current working directory.
raw_workbook = 'C_lim_PDRs_full.xlsx'
df_original = pd.read_excel(raw_workbook)

# Remember the column layout of the untouched sheet for later cells.
original_columns = df_original.columns

# Last expression of the cell -> rendered as the cell output.
df_original

Unnamed: 0,Protein ID,Gene name,C-lim3_1,C-lim3_2,C-lim3_difference,C-lim3_mean,C-lim6_1,C-lim6_2,C-lim6_difference,C-lim6_mean,C-lim12_1,C-lim12_2,C-lim12_difference,C-lim12_mean
0,sp|A5A614|YCIZ_ECOLI,yciZ,,,,,3.932642,3.601144,0.331498,3.766893,,,,
1,sp|O32583|THIS_ECOLI,thiS,,,,,,,,,,4.469682,,4.469682
2,sp|P00350|6PGD_ECOLI,gnd,3.249310,3.163151,0.086160,3.206231,5.572427,6.131487,0.559061,5.851957,10.201045,9.866243,0.334801,10.033644
3,sp|P00363|FRDA_ECOLI,frdA,2.994039,2.946457,0.047583,2.970248,6.080384,6.670409,0.590025,6.375396,13.518765,10.761427,2.757338,12.140096
4,sp|P00370|DHE4_ECOLI,gdhA,3.079699,2.810401,0.269298,2.945050,4.462108,4.855534,0.393427,4.658821,7.653326,6.918810,0.734516,7.286068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3248,sp|P77736|YAHD_ECOLI,yahD,,,,,,,,,,,,
3249,sp|Q2M7X4|YICS_ECOLI,yicS,,,,,,,,,,,,
3250,sp|Q46796|YGEP_ECOLI,ygeP,,,,,,,,,,,,
3251,sp|Q47013|ELAD_ECOLI,elaD,,,,,,,,,,,,


In [37]:
# Build one "hybrid" half-life value (column 'HL'; HL stands for half life) per
# row from the three C-lim mean columns.
#
# Every row that has no missing value is placed in exactly one bucket. The
# buckets are tested in this order, and a row leaves the pool as soon as it
# matches:
#
#   bucket     condition             value written to 'HL'
#   HL12plus   C-lim12_mean > 12     12 (treated as outliers, capped)
#   HL12       C-lim6_mean  > 6      C-lim12_mean
#   HL6        C-lim6_mean  > 3      C-lim6_mean
#   HL3        C-lim3_mean  > 0      C-lim3_mean
#
# NOTE(review): the earlier comment on the HL12 rule described it as
# '"C-lim12_mean" > 6', while the code tests 'C-lim6_mean' > 6. The code's
# behaviour is kept here because the exported values were produced with it;
# please confirm which rule was intended.
#
# Values are written back through the row index of each bucket rather than by
# matching gene names: a gene name that occurs on more than one row would
# otherwise select rows that belong to a different bucket and overwrite them
# with values that do not align.

# remove all rows that have a NaN value in any column:
df_original_no_nan = df_original.dropna()  # 2022 rows

# the genes that remain
genes = df_original_no_nan['Gene name']

# gene names plus an 'HL' column that starts at zero and is filled in below
# (same row index as df_original_no_nan):
df_hybrid = df_original_no_nan[['Gene name']].copy()
df_hybrid['HL'] = 0.0

# --- HL12plus: "C-lim12_mean" above 12 is considered an outlier, HL is set to 12
HL12plus = df_original_no_nan[df_original_no_nan['C-lim12_mean'] > 12]
df_hybrid.loc[HL12plus.index, 'HL'] = 12
df_original_no_H12plus = df_original_no_nan.drop(HL12plus.index)

# --- HL12: of the rest, "C-lim6_mean" above 6 -> HL is the "C-lim12_mean" value
HL12 = df_original_no_H12plus[df_original_no_H12plus['C-lim6_mean'] > 6]
df_hybrid.loc[HL12.index, 'HL'] = HL12['C-lim12_mean']
df_original_no_HL12 = df_original_no_H12plus.drop(HL12.index)

# --- HL6: of the rest, "C-lim6_mean" above 3 -> HL is the "C-lim6_mean" value
HL6 = df_original_no_HL12[df_original_no_HL12['C-lim6_mean'] > 3]
df_hybrid.loc[HL6.index, 'HL'] = HL6['C-lim6_mean']
df_original_no_HL6 = df_original_no_HL12.drop(HL6.index)

# --- HL3: everything that is left and has a positive "C-lim3_mean" -> that value
HL3 = df_original_no_HL6[df_original_no_HL6['C-lim3_mean'] > 0]
df_hybrid.loc[HL3.index, 'HL'] = HL3['C-lim3_mean']
df_original_no_HL3 = df_original_no_HL6.drop(HL3.index)

# Rows that matched no bucket would silently keep HL == 0 and end up in the
# exported table, so make the "nothing is left over" expectation explicit.
assert df_original_no_HL3.empty, (
    f"{len(df_original_no_HL3)} row(s) matched no bucket and still have HL == 0"
)

# Bucket sizes recorded for this data set:
#   HL3: 112 rows, HL6: 1176 rows, HL12: 372 rows, HL12plus: 362 rows
HL12plus


Unnamed: 0,Protein ID,Gene name,C-lim3_1,C-lim3_2,C-lim3_difference,C-lim3_mean,C-lim6_1,C-lim6_2,C-lim6_difference,C-lim6_mean,C-lim12_1,C-lim12_2,C-lim12_difference,C-lim12_mean
3,sp|P00363|FRDA_ECOLI,frdA,2.994039,2.946457,0.047583,2.970248,6.080384,6.670409,0.590025,6.375396,13.518765,10.761427,2.757338,12.140096
8,sp|P00490|PHSM_ECOLI,malP,3.246059,3.255595,0.009536,3.250827,7.155026,6.312058,0.842968,6.733542,16.000000,15.222989,0.777011,15.611495
11,sp|P00550|PTM3C_ECOLI,mtlA,3.128073,3.577450,0.449377,3.352762,6.542198,7.304136,0.761938,6.923167,12.602487,12.717865,0.115379,12.660176
12,sp|P00561|AK1H_ECOLI,thrA,3.233443,2.931791,0.301651,3.082617,5.741585,6.051427,0.309842,5.896506,13.303670,13.774148,0.470478,13.538909
16,sp|P00634|PPB_ECOLI,phoA,3.198057,3.267965,0.069908,3.233011,5.601799,6.828251,1.226452,6.215025,12.954219,16.000000,3.045781,14.477110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2619,sp|P33225|TORA_ECOLI,torA,3.242210,3.350187,0.107976,3.296199,2.755052,6.534516,3.779464,4.644784,11.913944,14.538864,2.624920,13.226404
2634,sp|P39365|SGCC_ECOLI,sgcC,3.477748,4.000000,0.522252,3.738874,5.520102,6.483361,0.963259,6.001732,11.111960,16.000000,4.888040,13.555980
2659,sp|P68187|MALK_ECOLI,malK,3.482673,3.221725,0.260948,3.352199,8.000000,6.398394,1.601606,7.199197,16.000000,16.000000,0.000000,16.000000
2665,sp|P69856|NANC_ECOLI,nanC,3.084356,3.099205,0.014850,3.091780,6.057304,7.654714,1.597410,6.856009,9.834367,16.000000,6.165633,12.917184


In [38]:
# Display the hybrid table built above: the gene name and the HL value that was
# assigned to each retained row.
df_hybrid

Unnamed: 0,Gene name,HL
2,gnd,5.851957
3,frdA,12.000000
4,gdhA,4.658821
5,ndh,11.781129
6,sodA,5.114563
...,...,...
2712,xdhC,1.463444
2713,ygeW,4.887572
2714,idi,8.961890
2715,otnI,3.682324


In [39]:
# Export the hybrid table as a CSV and a TSV file (written to the current
# working directory, without the row index).
#
# The unit conversion and the column renames only run while the 'HL' column is
# still present. Without that guard a second run of this cell fails with a
# KeyError, because 'HL' has already been renamed by the first run.
if 'HL' in df_hybrid.columns:
    df_hybrid = (
        df_hybrid
        # x60 expresses the values in minutes, matching the "(units.min)" header
        # (presumably the stored values are hours -- confirm against the data set).
        .assign(HL=lambda frame: frame['HL'] * 60)
        # headers used in the exported files
        .rename(columns={'HL': 'half_life (units.min)', 'Gene name': 'Gene id'})
    )

# save the dataframe to a csv file:
df_hybrid.to_csv('C_lim_PDRs_MIA.csv', index=False)

# save the dataframe to a tsv file:
df_hybrid.to_csv('C_lim_PDRs_MIA.tsv', sep='\t', index=False)

In [29]:
# Display the hybrid table again.
# NOTE(review): the output recorded below still shows the column names
# 'Gene name' and 'HL', although the export cell above renames both columns,
# and its execution count (29) is lower than that of the cells before it --
# the output looks stale; re-run the notebook top to bottom to refresh it.
df_hybrid

Unnamed: 0,Gene name,HL
2,gnd,351.117415
3,frdA,720.000000
4,gdhA,279.529258
5,ndh,706.867727
6,sodA,306.873785
...,...,...
2712,xdhC,87.806639
2713,ygeW,293.254340
2714,idi,537.713370
2715,otnI,220.939456


In [40]:
# Display the rows of the last bucket, i.e. the rows whose HL value was taken
# from the 'C-lim3_mean' column.
HL3

Unnamed: 0,Protein ID,Gene name,C-lim3_1,C-lim3_2,C-lim3_difference,C-lim3_mean,C-lim6_1,C-lim6_2,C-lim6_difference,C-lim6_mean,C-lim12_1,C-lim12_2,C-lim12_difference,C-lim12_mean
23,sp|P00888|AROF_ECOLI,aroF,2.291959,2.192231,0.099728,2.242095,3.046524,2.523461,0.523063,2.784993,3.435439,4.241195,0.805756,3.838317
44,sp|P00963|ASNA_ECOLI,asnA,2.472492,2.593467,0.120975,2.532980,3.007113,2.786435,0.220678,2.896774,4.982878,5.545865,0.562988,5.264371
59,sp|P03007|DPO3E_ECOLI,dnaQ,0.859817,0.833309,0.026508,0.846563,2.980549,1.283221,1.697328,2.131885,3.406678,1.724953,1.681725,2.565816
79,sp|P04949|FLIC_ECOLI,fliC,0.572848,0.861368,0.288520,0.717108,0.985057,0.462050,0.523007,0.723554,1.371948,0.540590,0.831358,0.956269
133,sp|P06993|MALT_ECOLI,malT,0.536392,0.683805,0.147413,0.610098,1.114213,0.980639,0.133574,1.047426,1.502937,1.250091,0.252846,1.376514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,sp|P77302|YDAM_ECOLI,ydaM,0.663451,0.916264,0.252813,0.789858,1.830381,1.917561,0.087180,1.873971,1.567987,1.264097,0.303890,1.416042
2696,sp|P77432|LSRK_ECOLI,lsrK,1.304411,1.804001,0.499590,1.554206,2.532520,3.129401,0.596880,2.830961,2.895077,2.938459,0.043383,2.916768
2703,sp|P77671|ALLB_ECOLI,allB,0.842160,1.616314,0.774153,1.229237,1.939537,3.068969,1.129432,2.504253,2.581318,2.175953,0.405365,2.378636
2711,sp|Q46799|XDHA_ECOLI,xdhA,1.938953,1.812483,0.126469,1.875718,3.295268,2.331981,0.963287,2.813625,4.527569,2.593228,1.934341,3.560398
