In [17]:
import os
import re
from collections import Counter

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import Bio.PDB.Polypeptide as pp
import freesasa
%matplotlib inline

In [2]:
# Amino-acid letters taken from Biopython's `aa1`, in its order; used below as the row
# index of every network table.
AA = list(pp.aa1)

In [None]:
# Let pandas print up to 1000 rows before truncating a displayed table.
pd.options.display.max_rows = 1000

In [3]:
# Relative path, so it resolves against the notebook's working directory.
# NOTE(review): presumably where figures are saved; it is not referenced in the cells shown here.
figures_path = "../../../Dropbox/perturbation_networks/draft/figures"

In [4]:
# Root folder of the input files; the functional, structure, output and pdb paths
# used in this notebook are all built from it.
DATA = 'data/'

### Functional Data

In [5]:
# Identifiers used in the data file names, one per protein analysed.
proteins = [
    "1be9",
    "1d5r",
    "1nd4",
    "3dqw",
    "4bz3",
]
# Display names, listed in the same order as `proteins`.
protein_names = [
    "PSD95",
    "PTEN",
    "APH(3')II",
    "Src CD",
    "VIM-2",
]

In [6]:
# Load the processed functional data: one DataFrame per protein, stored in
# `functional_data` under the protein identifier. In every file the first column
# (the ordered amino-acid list) becomes the index and the header row gives the positions.
functional_data = {}
for protein in proteins:
    csv_file = os.path.join(DATA, f'functional_{protein}.csv')
    functional_data[protein] = pd.read_csv(csv_file, header=0, index_col=0)

### Perturbation Network Data and Related Functions

In [7]:
# Names of the network measures; each one has its own CSV per protein and threshold.
measures = ['nodes', 'edges', 'weight', 'distance']
# Folder holding the perturbation-network CSVs, one subfolder per protein.
data_path = os.path.join(DATA, 'structure')
# 71 evenly spaced thresholds from 3 to 10 (step 0.1), rounded to one decimal.
thresholds = [round(value, 1) for value in np.linspace(3, 10, 71)]
# 8 evenly spaced thresholds from 3 to 10 (step 1).
sample_thresholds = [round(value, 1) for value in np.linspace(3, 10, 8)]

In [8]:
def ReadNetworkCSV(protein, threshold, measure):
    """Load one perturbation-network table and average it over chains.

    Reads ``{data_path}/{protein}/{protein}_{threshold}_{measure}.csv``. The rows of the
    file are relabelled with ``AA``. Column labels are read as
    first character + chain character + remainder, and each position of
    ``functional_data[protein]`` (first character + remainder) is matched against every
    chain found in the file.

    Parameters:
        protein (str): key of ``functional_data`` and name of the data subfolder.
        threshold: value formatted into the file name.
        measure (str): measure name formatted into the file name.
    Returns:
        DataFrame indexed by ``AA`` with one float64 column per position of
        ``functional_data[protein]``. Each entry is the mean over the chains in which
        the position occurs; positions found in no chain stay NaN.
    """
    file = os.path.join(data_path, f"{protein}/{protein}_{threshold}_{measure}.csv")
    network_df = pd.read_csv(file, header=0)
    network_df.index = AA
    # The chain identifier is the second character of each column label. Sorting makes
    # the averaging order independent of set iteration order, so results are reproducible.
    chains = sorted({column[1] for column in network_df.columns})
    # Positions without chain distinction come from the functional data.
    positions = list(functional_data[protein].columns)
    average = pd.DataFrame(index=AA, columns=positions, dtype=np.float64)
    for position in positions:
        chain_columns = [
            label
            for label in (position[0] + chain + position[1:] for chain in chains)
            if label in network_df.columns
        ]
        if chain_columns:
            # One row-wise mean per position instead of a scalar lookup per cell.
            # skipna=False keeps the previous behaviour: a NaN in any chain gives NaN.
            average[position] = network_df[chain_columns].mean(axis=1, skipna=False)
    return average

In [9]:
def Standardize(protein, threshold, measure):
    """Return the chain-averaged network table as z-scores.

    Entries whose row label equals the first character of the column label are set to
    NaN first, so they do not enter the statistics and stay NaN in the result. The mean
    and (population) standard deviation are then taken over all remaining entries of
    the whole table, not per column, and every entry is transformed as
    ``(value - mean) / std``.

    Parameters:
        protein, threshold, measure: passed unchanged to ``ReadNetworkCSV``.
    Returns:
        DataFrame with the same index and columns as the table from ``ReadNetworkCSV``.
    """
    network_df = ReadNetworkCSV(protein, threshold, measure)
    for position in network_df.columns:
        residue = position[0]
        # Guard the lookup: assigning through .at with an unknown row label would add a row.
        if residue in network_df.index:
            network_df.at[residue, position] = np.nan
    values = network_df.to_numpy(dtype=np.float64)
    data_mean = np.nanmean(values)
    data_std = np.nanstd(values)
    return (network_df - data_mean) / data_std

In [10]:
def GetPercentage(percentage, which, data, return_score=False):
    """Return the top or bottom fraction of positions ranked by mean functional score.

    Parameters:
        percentage (float): between 0 and 1, fraction of positions wanted. The number
            returned is ``int(number_of_columns * percentage)``, counted over all
            columns of ``data``.
        which (str): 'highest' ranks from the largest mean down; any other value
            (e.g. 'lowest') ranks from the smallest mean up.
        data (dataframe): functional data; the mean of every column is ranked.
        return_score (bool): If True, return a list of (mean, position) tuples in
            rank order instead of a set.
    Returns:
        Set of positions, or list of (mean, position) tuples if return_score is True.
        Positions whose mean is NaN (no measured value) are never returned, so fewer
        than the requested number can come back when such positions exist.
    """
    functional_mean = data.mean()
    positions = list(data.columns)
    # NaN compares False against everything, which makes list.sort order-dependent;
    # leave unmeasured positions out of the ranking altogether.
    pairs = [
        (functional_mean[pos], pos)
        for pos in positions
        if not pd.isna(functional_mean[pos])
    ]
    pairs.sort(key=lambda pair: pair[0])
    if which == 'highest':
        pairs.reverse()
    n = int(len(positions) * percentage)
    selected = pairs[:n]
    if return_score:
        return selected
    return {pos for _, pos in selected}

In [11]:
def GetSD(sd, data):
    """Select the positions whose mean functional score lies beyond the cutoff ``sd``.

    A positive ``sd`` keeps the columns of ``data`` whose mean is strictly greater than
    ``sd``; any other value (zero included) keeps the columns whose mean is strictly
    smaller than ``sd``. The selected column labels are returned as a set.
    """
    column_means = data.mean()
    if sd > 0:
        beyond_cutoff = lambda value: value > sd
    else:
        beyond_cutoff = lambda value: value < sd
    return {column for column in data.columns if beyond_cutoff(column_means[column])}

In [12]:
def GetNetworkExtremes(protein, mincount, measure_cutoffs, thresh=9.0):
    """Return the set of positions flagged as extreme by at least ``mincount`` measures.

    For every entry of ``measures`` the standardized table is loaded (at threshold 3.8
    for 'distance', at ``thresh`` for the others). A position is flagged for a measure
    when any value in its column is above the matching entry of ``measure_cutoffs``
    (positive cutoff) or below it (cutoff of zero or less).
    """
    flagged_counts = Counter()
    for index, measure in enumerate(measures):
        threshold = thresh if measure != 'distance' else 3.8
        standardized = Standardize(protein, threshold, measure)
        cutoff = measure_cutoffs[index]
        beyond = standardized > cutoff if cutoff > 0 else standardized < cutoff
        flagged_counts.update(standardized.columns[beyond.any()].tolist())
    return {position for position, count in flagged_counts.items() if count >= mincount}

In [13]:
def ToPercentage(a,b):
    """Return 100*a/b rounded to one decimal place, or NaN when b is zero.

    Each argument may be a number (Python or NumPy integer/float) or a sized
    collection such as a set or list, in which case its length is used.
    """
    # isinstance rather than an exact type comparison, so NumPy scalars such as the
    # results of pandas/NumPy reductions count as numbers instead of hitting len().
    numeric = (int, float, np.integer, np.floating)
    x = a if isinstance(a, numeric) else len(a)
    y = b if isinstance(b, numeric) else len(b)

    if y == 0:
        return np.nan
    return round(100*x/y, 1)

### Calculating buriedness

In [14]:
# output from buriedness.py is in DATA/output/output_{protein}.csv

In [15]:
def GetList(protein, mincount, measure_cutoffs, thresh=9.0, loss=True):
    """Return the predicted positions as a list, with the leading amino-acid letter
    converted by ``pp.one_to_three``.

    With ``loss=True`` the positions are those returned by ``GetNetworkExtremes``; with
    ``loss=False`` they are the columns of ``functional_data[protein]`` that
    ``GetNetworkExtremes`` did not return (the complement, used for gain predictions).
    """
    extremes = GetNetworkExtremes(protein, mincount, measure_cutoffs, thresh=thresh)
    if loss:
        selected = extremes
    else:
        selected = [
            column for column in functional_data[protein].columns
            if column not in extremes
        ]
    return [pp.one_to_three(position[0]) + position[1:] for position in selected]

In [24]:
# Folder containing the `{protein}.pdb` structure files that are passed to freesasa.
pdb_path = os.path.join(DATA, 'pdb')

In [71]:
# Load the buriedness.py output for every protein into `buriedness_data`, one table per
# protein, indexed by residue label with a single 'buriedness' column.
# (The unused per-protein GetList call was dropped: it re-read and re-standardized
# four network tables per protein and its result was never used here.)
buriedness_data = dict()
for protein in proteins:
    file = os.path.join(DATA, f"output/output_{protein}.csv")
    df = pd.read_csv(file)
    df.columns = ['residue', 'buriedness']
    df = df[~df.residue.str.contains('HOH')] # remove water
    df = df.set_index('residue')
    buriedness_data[protein] = df

In [72]:
# Run freesasa on each structure and store the per-residue `total` area in the 'ASA'
# column of that protein's buriedness table.
for protein in proteins:
    structure = freesasa.Structure(os.path.join(pdb_path, f"{protein}.pdb"))
    result = freesasa.calc(structure)
    # residueAreas() is nested: chain -> residue number -> area record.
    for chain, residues in result.residueAreas().items():
        for position, area in residues.items():
            # Row labels are chain + residue type + residue number.
            key = chain + area.residueType + position
            # NOTE(review): .at adds a new row when `key` is not already in the index,
            # so residues missing from the buriedness output end up with NaN buriedness
            # - confirm that is intended.
            buriedness_data[protein].at[key, 'ASA'] = area.total

In [78]:
## Classify every residue of the buriedness tables as predicted 'loss' or 'gain';
## residues matching neither list are left unset in the 'prediction' column.
## (Original author's note: this part should be doable more efficiently.)
for protein in proteins:
    loss = GetList(protein, 4, [1.5]*4)
    gain = GetList(protein, 1, [1]*4, loss=False)
    # A plain substring test lets a prediction such as 'LYS31' also label residues
    # 310-319. A match therefore only counts when it is not followed by another digit.
    loss_patterns = [re.compile(re.escape(pred) + r'(?!\d)') for pred in loss]
    gain_patterns = [re.compile(re.escape(pred) + r'(?!\d)') for pred in gain]
    df = buriedness_data[protein]
    for position in df.index:
        if any(pattern.search(position) for pattern in loss_patterns):
            df.at[position, 'prediction'] = 'loss'
        # Checked second, so 'gain' wins if a residue matches both lists.
        if any(pattern.search(position) for pattern in gain_patterns):
            df.at[position, 'prediction'] = 'gain'

In [None]:
#### Getting position numbers to color in pdbs

In [None]:
## loss: residue numbers of the predicted loss positions, per protein
for protein in proteins:
    predictions = GetNetworkExtremes(protein, 4, [1.5]*4)
    # `predictions` is a set, so sort to get the same printed order on every run.
    numbers = sorted(int(x[1:]) for x in predictions)
    print(protein, numbers)

In [None]:
## gain: residue numbers of all positions that GetNetworkExtremes did not flag
for protein in proteins:
    flagged = GetNetworkExtremes(protein, 1, [1]*4)
    numbers = [
        int(column[1:])
        for column in functional_data[protein].columns
        if column not in flagged
    ]
    print(protein, numbers)