In [74]:
import pandas as pd
import os
import re
import sys

# Project root. Defaults to the original checkout location, but can be
# overridden with the BIAS_PROJECT_ROOT environment variable so the notebook
# can run on another machine without editing this cell.
project_root = os.path.abspath(
    os.environ.get("BIAS_PROJECT_ROOT", "/home/adri/Projects/phd/bias_2")
)
# Every data/results path used below is relative to the project root.
os.chdir(project_root)

# Make the helper modules under src/utils importable. The entry is absolute so
# it keeps working if the working directory changes later, and it is only
# added once so re-running this cell does not pile up duplicates.
utils_dir = os.path.join(project_root, 'src/utils')
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
from get_BW_nomenclature import get_bw

# Coupling data

In [49]:
# Load the quantitative alanine-scanning table (Emax / EC50 columns).
quant_data = "data/raw/alanin_scanning-cuantitative_values-emax_ec50.csv"
quant_df = pd.read_csv(quant_data)

# Drop the last two rows of the table.
quant_df = quant_df.iloc[:-2]
# Drop every column whose name mentions SR144528, GoB or bArr2.
for unwanted in ('SR144528', 'GoB', 'bArr2'):
    quant_df = quant_df.loc[:, ~quant_df.columns.str.contains(unwanted)]
# Drop rows whose mutant code contains letter-digits-letter (e.g. "A12V").
# The pattern is a raw string: "\d" inside a plain string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
quant_df = quant_df[~quant_df["Mutant"].str.contains(r"[A-Z]\d+[A-Z]")]
# Drop the GPCRdb column.
quant_df = quant_df.drop(columns=["GPCRdb"])

In [50]:
# Flag the rows whose Position appears among the mutant ids of the processed
# feature table.
# NOTE(review): the recorded output of this cell shows a warning pointing at
# the read_csv call below - presumably a mixed-dtype warning; confirm and
# consider declaring the column dtypes explicitly.
features_df = pd.read_csv('data/processed/features_new.csv', index_col=0)
mutant_ids_as_text = features_df["mutant_id"].astype(str)
simulated_mutants = mutant_ids_as_text.unique()

simulated_flag = quant_df["Position"].isin(simulated_mutants)
quant_df["simulated"] = simulated_flag

  features_df = pd.read_csv('data/processed/features_new.csv', index_col=0)


In [51]:
# Build a categorical signaling profile for every alanine-scan mutant.
# Read signaling data
path = 'data/raw/alanin_scanning-cuantitative_values-emax_ec50.csv'
signaling_df = pd.read_csv(path)

# Keep only alanine-scan entries: a single capital letter followed by digits
# and nothing else. The pattern is a raw string so that "\d" is not treated as
# an (invalid) string escape sequence.
alanin_scan_mask = signaling_df.Mutant.apply(lambda x: bool(re.match(r'([A-Z])(\d+)$', str(x))))
signaling_df = signaling_df.loc[alanin_scan_mask]

# Keep only the columns needed for the profile
cols_of_interst = ['Position','Emax_avg_corr_Gi2_HU210', 'Emax_avg_corr_bArr1_HU210']
signaling_df = signaling_df[cols_of_interst]

# Change column names
signaling_df.columns = ['position', 'gi_emax', 'barr_emax']

# Set the position as index
signaling_df = signaling_df.set_index('position')

# Boolean masks: True where the corresponding Emax is strictly positive.
# NOTE(review): a missing Emax compares as False here, so such rows end up
# labelled 'NoCoup' - confirm that this is intended.
gi_mask = signaling_df.gi_emax > 0
barr_mask = signaling_df.barr_emax > 0

# Create a field with categorical signal information:
#   both positive -> 'Coup', only Gi -> 'PrefCoup_Gi',
#   only bArr -> 'PrefCoup_barr', neither -> 'NoCoup'
signaling_df['profile'] = 'Coup'
signaling_df.loc[gi_mask & ~barr_mask, 'profile'] = 'PrefCoup_Gi'
signaling_df.loc[~gi_mask & barr_mask, 'profile'] = 'PrefCoup_barr'
signaling_df.loc[~gi_mask & ~barr_mask, 'profile'] = 'NoCoup'

# Keep only the categorical labels
signaling_df = signaling_df[['profile']]

In [52]:
# Attach the categorical profile to each row of the quantitative table
# (left join, so rows without a matching position are kept).
quant_df = pd.merge(
    quant_df,
    signaling_df,
    how="left",
    left_on="Position",
    right_on="position",
)

In [53]:
# Label the wild-type row as "Coup".
wt_rows = quant_df["Position"].eq('wt')
quant_df.loc[wt_rows, "profile"] = "Coup"

In [54]:
# Format the mutant column so that each code reads ANB, where A is the initial amino acid, N is the position and B is the final amino acid.
# All residues mutate to alanine, except alanine, which mutates to valine. The wt row is left unchanged.
def format_mutant(mutant):
    """Return the full substitution code ``ANB`` for an alanine-scan entry.

    Parameters
    ----------
    mutant : str
        Either ``"wt"`` or a code made of the original residue letter
        followed by the position (e.g. ``"M1"``).

    Returns
    -------
    str
        ``"wt"`` unchanged; otherwise the input with the target residue
        appended: ``"V"`` when the original residue is alanine (``"A"``),
        ``"A"`` for every other residue. A code that already ends with a
        target residue (e.g. ``"M1A"``) is returned unchanged, so applying
        the function twice - for instance by re-running the cell - does not
        append a second letter.
    """
    if mutant == "wt":
        return mutant
    # Already in ANB form: nothing to append.
    if re.fullmatch(r"[A-Z]\d+[A-Z]", mutant):
        return mutant
    target_residue = "V" if mutant[0] == "A" else "A"
    return mutant + target_residue
    
# Rewrite every entry of the Mutant column with format_mutant.
quant_df["Mutant"] = quant_df["Mutant"].map(format_mutant)

In [56]:
# Show the current column labels; the next cell renames them positionally.
quant_df.columns

Index(['Mutant', 'Position', '%wt expression', 'Emax_avg_corr_Gi2_HU210',
       'Emax_avg_corr_bArr1_HU210', 'EC50_avg_corr_Gi2_HU210',
       'EC50_avg_corr_bArr1_HU210', 'simulated', 'profile'],
      dtype='object')

In [57]:
# Relabel the columns positionally, following the frame's current column order.
new_column_names = [
    "mutant",
    "position",
    "%wt expression",
    "Gi Emax",
    "bArr Emax",
    "Gi EC50",
    "bArr EC50",
    "simulated",
    "coupling profile",
]
quant_df.columns = new_column_names

In [59]:
# Turn the boolean "simulated" flag into a readable text label.
simulated_labels = {True: "simulated", False: "not simulated"}
quant_df["simulated"] = quant_df["simulated"].replace(simulated_labels)

In [61]:
# Spell out the two symmetric profiles: "Coup" becomes "Coup_Gi_bArr" and
# "NoCoup" becomes "NoCoup_Gi_bArr"; other labels are left as they are.
profile_labels = {"Coup": "Coup_Gi_bArr", "NoCoup": "NoCoup_Gi_bArr"}
quant_df["coupling profile"] = quant_df["coupling profile"].replace(profile_labels)

In [62]:
# Display the final table before it is written to disk.
quant_df

Unnamed: 0,mutant,position,%wt expression,Gi Emax,bArr Emax,Gi EC50,bArr EC50,simulated,coupling profile
0,M1A,1,104.277644,0.825515,1.122299,0.098706,0.385988,not simulated,Coup_Gi_bArr
1,E2A,2,116.723038,0.615114,0.907206,0.468060,0.393758,not simulated,Coup_Gi_bArr
2,E3A,3,121.328063,1.072523,0.826491,-0.109611,-0.361701,not simulated,Coup_Gi_bArr
3,C4A,4,84.004864,1.203155,1.060027,-0.047019,-0.022091,not simulated,Coup_Gi_bArr
4,W5A,5,119.051109,1.003370,1.008177,-0.119622,-0.119924,not simulated,Coup_Gi_bArr
...,...,...,...,...,...,...,...,...,...
355,L357A,357,97.655672,1.167825,0.747814,-0.240117,0.007049,not simulated,Coup_Gi_bArr
356,S358A,358,98.189135,1.107844,0.862811,-0.101797,0.468228,not simulated,Coup_Gi_bArr
357,D359A,359,91.057497,1.158079,0.744454,0.059214,-0.265148,not simulated,Coup_Gi_bArr
358,C360A,360,123.213115,1.052632,0.665300,0.038943,0.099321,not simulated,Coup_Gi_bArr


In [63]:
# Save supplementary data 1 as tab-separated text, without the index.
# NOTE(review): the file has a .csv extension although the separator is a
# tab - confirm that downstream readers expect this.
supp_1_text_path = "results/suplementary_data/suplementary_data_1.csv"
quant_df.to_csv(supp_1_text_path, sep="\t", index=False)

In [68]:
# Write the same table as an Excel workbook, without the index.
supp_1_excel_path = "results/suplementary_data/suplementary_data_1.xlsx"
quant_df.to_excel(supp_1_excel_path, index=False)

# Degeneracy data

In [32]:
from glob import glob

In [67]:

# CSV files to collect. glob() returns its matches in arbitrary order, so the
# list is sorted to make the worksheet order reproducible between runs.
csv_files = sorted(glob("results/allosteric_network_distance/ACN_filering_thresholds/ACN_0.1_1/transition_matrices/*.csv"))

# Destination workbook
excel_file = 'results/suplementary_data/suplementary_data_2.xlsx'

# Write every CSV file to its own worksheet of a single workbook
with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:

    for csv_file in csv_files:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_file)

        # The part of the file name before the first "_" labels the worksheet.
        # os.path.basename handles the platform's path separator.
        mutant = os.path.basename(csv_file).split('_')[0]
        sheet_name = f"{mutant}_degeneracy"

        # Write the DataFrame to its worksheet, without the index
        df.to_excel(writer, sheet_name=sheet_name, index=False)

# Simulated mutants data

In [84]:
# Keep only the rows labelled as simulated.
simulated_rows = quant_df["simulated"].eq("simulated")
sim_mutants_df = quant_df[simulated_rows]

In [85]:
# Keep position and coupling profile columns.
# Take an explicit copy: a new column is assigned to this frame further down,
# and assigning to a subset of another frame is ambiguous in pandas (it can
# raise SettingWithCopyWarning).
sim_mutants_df = sim_mutants_df[["position", "coupling profile"]].copy()

In [86]:
# Add Ballesteros-Weinstein (BW) notation
bw_dict = get_bw('6PT0')

# Reformat the dict: keep the first "<digits>.<digits>" token found in each
# value and key it by the original key without its first character.
# NOTE(review): presumably the keys are a residue letter followed by the
# position (e.g. "V33") - confirm against get_bw.
clean_bw_dict = {}
for k, v in bw_dict.items():
    # Raw string so that "\d" and "\." are not treated as string escapes.
    bw = re.findall(r'\d+\.\d+', v)

    # Skip entries that contain no BW number
    if not bw:
        continue

    clean_bw_dict[k[1:]] = bw[0]

# Add BW notation to the simulated-mutants frame. The position is converted to
# text for the lookup because the dict keys are strings; positions without an
# entry keep their own value.
sim_mutants_df["BW notation"] = sim_mutants_df["position"].apply(lambda x: clean_bw_dict.get(str(x), x))

In [88]:
# Write the simulated-mutants table as CSV, leaving out the index.
sim_mutants_csv_path = "results/suplementary_data/simulated_mutants_data.csv"
sim_mutants_df.to_csv(sim_mutants_csv_path, index=False)

In [90]:
# Display the simulated-mutants table that was written to disk.
sim_mutants_df

Unnamed: 0,position,coupling profile,BW notation
32,33,Coup_Gi_bArr,1.32
46,47,Coup_Gi_bArr,1.46
48,49,Coup_Gi_bArr,1.48
51,52,Coup_Gi_bArr,1.51
52,53,Coup_Gi_bArr,1.52
60,61,PrefCoup_Gi,1.60
76,77,PrefCoup_Gi,2.47
90,91,Coup_Gi_bArr,2.61
108,109,PrefCoup_Gi,3.28
111,112,Coup_Gi_bArr,3.31
