In [1]:
import os
import sys
import random
import itertools
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import Bio.PDB.Polypeptide as pp
from collections import Counter
from getmutations import MutationsDict, GetMutations
%matplotlib inline

In [2]:
# One-letter amino-acid codes from Bio.PDB.Polypeptide, in that module's order.
# ReadNetworkCSV uses this list as the row index of every network table.
AA = list(pp.aa1)

In [3]:
# Relative directory for figures; it is not used by any cell visible in this section.
# NOTE(review): the path points into a personal Dropbox tree - confirm it exists before saving figures.
figures_path = "../../../Dropbox/perturbation_networks/draft/figures"

In [4]:
# Root directory of the input data: functional CSVs live directly in it,
# network CSVs under its 'structure' subdirectory.
DATA = 'data/'

In [5]:
# Display DataFrames in full (no column/row truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and the
# old names were removed afterwards; use whichever name this installation provides.
_plot_style = 'seaborn-colorblind'
if _plot_style not in mpl.style.available:
    _plot_style = 'seaborn-v0_8-colorblind'
mpl.style.use(_plot_style)
# White grid lines on white figure/axes/savefig backgrounds.
mpl.rcParams['grid.color'] = 'xkcd:white'
mpl.rcParams['grid.linestyle'] = '-'
mpl.rcParams['grid.linewidth'] = 0.5
mpl.rcParams['figure.facecolor'] = 'xkcd:white'
mpl.rcParams["axes.facecolor"] = 'xkcd:white'
mpl.rcParams["savefig.facecolor"] = 'xkcd:white'

### Functional Data

In [6]:
# Protein identifiers used in the data file names, and their display names in the same order.
proteins = ['1be9', '1d5r', '1nd4', '3dqw', '4bz3']
protein_names = ['PSD95', 'PTEN', 'APH(3\')II', 'Src CD', 'VIM-2']
# Lookup from identifier to display name.
to_names = dict(zip(proteins, protein_names))

In [7]:
# Load the processed functional data, one CSV per protein (DATA/functional_<protein>.csv).
# The first CSV column becomes the row index and the first row the column labels; per the
# original note the rows are the ordered amino-acid list and the columns are positions.
# Results are stored in functional_data, keyed by protein identifier.
functional_data = dict()
for protein in proteins:
    csv_file = os.path.join(DATA, f'functional_{protein}.csv')
    functional_data[protein] = pd.read_csv(csv_file, index_col=0, header=0)

### Perturbation Network Data and Related Functions

In [8]:
# Directory holding the perturbation-network CSVs (ReadNetworkCSV rebuilds the same path itself).
data_path = os.path.join(DATA, 'structure')
# Thresholds 3.0, 3.1, ..., 10.0 (71 values); rounding removes linspace float noise.
thresholds = [round(i, 1) for i in np.linspace(3, 10, 71)]
# Coarser subset: 3.0, 4.0, ..., 10.0 (8 values).
sample_thresholds = [round(i, 1) for i in np.linspace(3, 10, 8)]
# Network measures; each one has its own CSV per protein and threshold.
measures = ['nodes', 'edges', 'weight', 'distance']

In [9]:
def ReadNetworkCSV(protein, threshold, measure):
    """Load one network CSV and average it over chains.

    The CSV columns are labelled <wild-type aa><chain><residue number>; the functional
    data for the same protein supplies the chain-free position labels. For every
    functional position and amino acid, the values found on the chains present in the
    CSV are averaged.

    Returns:
        DataFrame indexed by AA with the functional positions as columns; cells for
        positions that appear on no chain stay NaN.
    """
    csv_path = os.path.join(os.path.join(DATA, 'structure'), f"{protein}/{protein}_{threshold}_{measure}.csv")
    network_df = pd.read_csv(csv_path, header=0)
    network_df.index = AA
    # The chain identifier is the second character of every column label.
    chains = list(set(label[1] for label in list(network_df.columns)))
    # Chain-free position labels come from the functional data.
    positions = list(functional_data[protein].columns)
    average = pd.DataFrame(index=AA, columns=positions, dtype=np.float64)
    for position in positions:
        # Labels of this position on each chain, restricted to those present in the CSV.
        candidates = (position[0] + chain + position[1:] for chain in chains)
        present = [label for label in candidates if label in network_df.columns]
        if not present:
            continue
        for aa in AA:
            per_chain = [network_df.at[aa, label] for label in present]
            average.at[aa, position] = sum(per_chain) / len(per_chain)
    return average

In [10]:
def Standardize(protein, threshold, measure):
    """Return the network data for one protein/threshold/measure as z-scores.

    Cells whose row amino acid equals the first character of the position label
    (i.e. the substitution of a residue by itself) are set to NaN first. The mean and
    the population standard deviation (ddof=0) are then taken over all remaining
    non-NaN cells of the whole table, not per column.

    Returns:
        DataFrame with the same index and columns as ReadNetworkCSV's result.
    """
    network_df = ReadNetworkCSV(protein, threshold, measure)
    for position in network_df.columns:
        # Guard with `in` so that .at never enlarges the frame with a new row.
        if position[0] in network_df.index:
            network_df.at[position[0], position] = np.nan
    values = network_df.to_numpy(dtype=np.float64)
    data_mean = np.nanmean(values)
    data_std = np.nanstd(values)
    return (network_df - data_mean) / data_std

In [11]:
def GetPercentage(percentage, which, data, return_score=False):
    """Select the positions with the most extreme column means.

    Parameters:
        percentage (float): fraction (0 to 1) of the columns to keep; the count is
            int(number of columns * percentage).
        which (str): 'highest' ranks from the largest mean down; any other value
            ranks from the smallest mean up.
        data (dataframe): functional data whose column means are ranked.
        return_score (bool): if True, return the ranked (mean, position) tuples.
    Returns:
        Set of positions, or a list of (mean, position) tuples when return_score is True.
    """
    column_means = data.mean()
    labels = list(data.columns)
    # Stable ascending sort; 'highest' is the exact reverse of that order, ties included.
    ranked = sorted(((column_means[label], label) for label in labels), key=lambda pair: pair[0])
    if which == 'highest':
        ranked = ranked[::-1]
    selected = ranked[:int(len(labels) * percentage)]
    if return_score:
        return list(selected)
    return {label for _, label in selected}

In [12]:
def GetNetworkExtremes(protein, mincount, measure_cutoffs, thresh=9.0):
    """Return the set of positions flagged by at least mincount measures.

    For each entry of `measures`, the standardized table is compared with the cutoff at
    the same index of measure_cutoffs: a positive cutoff flags positions with any value
    above it, a non-positive cutoff flags positions with any value below it. The
    'distance' measure is always read at threshold 3.8; all others use `thresh`.
    """
    tally = Counter()
    for idx in range(len(measures)):
        measure = measures[idx]
        threshold = thresh if measure != 'distance' else 3.8
        standardized = Standardize(protein, threshold, measure)
        cutoff = measure_cutoffs[idx]
        if cutoff > 0:
            flagged = (standardized > cutoff).any()
        else:
            flagged = (standardized < cutoff).any()
        # One vote per measure for every position that has an extreme value.
        tally.update(standardized.columns[flagged].tolist())
    return {position for position, votes in tally.items() if votes >= mincount}

In [13]:
def ToPercentage(a,b):
    """Return 100*a/b rounded to two decimals, or NaN when b is zero.

    Each argument may be a number (Python or numpy int/float) or a sized container
    such as a set or list, in which case its length is used.
    """
    def _as_number(value):
        # Accept numpy scalars too: values read back from a DataFrame are often
        # np.float64 / np.int64, which an exact type(...) == float check rejects.
        if isinstance(value, (int, float, np.integer, np.floating)):
            return value
        return len(value)

    x = _as_number(a)
    y = _as_number(b)

    if y == 0:
        return np.nan
    return round(100*x/y,2)

## Predictions to test 

### Structurally Sensitive Positions

In [14]:
def ComparePredictionsLoss(functional_percentage, network_mincount, measure_cutoffs=[1,1,1,1], thresh=9.0,
                           string=True):
    """Score the network-based predictions against the positions with the LOWEST mean
    functional values.

    For every protein, the predicted set is GetNetworkExtremes(protein, network_mincount,
    measure_cutoffs, thresh) and the reference set is the `functional_percentage`
    fraction of positions with the lowest functional mean.

    Parameters:
        functional_percentage (float): fraction of positions taken as the reference set.
        network_mincount (int): minimum number of measures that must flag a position.
        measure_cutoffs (list): one cutoff per entry of `measures` (never modified here).
        thresh (float): threshold passed on to GetNetworkExtremes.
        string (bool): if True return a Styler with percentage formatting, else the DataFrame.

    Returns:
        Table indexed by protein (plus a 'Mean' row for the percentage columns) with
        True Positives, False Positives, Coverage (hits / reference set), Accuracy
        (hits / predicted set), Prediction % and Functional % (set size / all positions).
    """
    percentages = ['Coverage', 'Accuracy','Prediction %', 'Functional %']
    predict = pd.DataFrame(index=proteins, columns=['True Positives', 'False Positives', 'Coverage', 'Accuracy',
                                                          'Prediction %', 'Functional %'])
    for protein in proteins:
        network_extremes = GetNetworkExtremes(protein, network_mincount, measure_cutoffs, thresh=thresh)
        funct_extremes = GetPercentage(functional_percentage, 'lowest', functional_data[protein])

        how_many = len(network_extremes.intersection(funct_extremes))
        positions = len(functional_data[protein].columns)

        predict.at[protein, 'True Positives'] = how_many
        predict.at[protein, 'False Positives'] = len(network_extremes) - how_many
        predict.at[protein,'Coverage']= ToPercentage(how_many, funct_extremes)
        predict.at[protein, 'Accuracy'] = ToPercentage(how_many, network_extremes)
        predict.at[protein,'Functional %'] = ToPercentage(funct_extremes, positions)
        predict.at[protein,'Prediction %'] = ToPercentage(network_extremes, positions)

    # Mean over proteins: ToPercentage multiplies by 100, so divide by 100 * (number of
    # proteins) instead of a hard-coded 500 that is only right for five proteins.
    for score in percentages:
        total = sum(predict.at[protein, score] for protein in proteins)
        predict.at['Mean', score] = ToPercentage(total, 100*len(proteins))

    if string:
        return predict.style.format({col:'{0:,.1f}%' for col in percentages})
    return predict

In [15]:
def ComparePredictionsGain(functional_percentage, network_mincount, measure_cutoffs=[1,1,1,1], thresh=9.0,
                          string=True):
    """Score the complement of the loss-of-function predictions against the positions
    with the HIGHEST mean functional values.

    For every protein, the predicted set is every position NOT returned by
    GetNetworkExtremes(protein, network_mincount, measure_cutoffs, thresh); the
    reference set is the `functional_percentage` fraction of positions with the highest
    functional mean.

    Parameters:
        functional_percentage (float): fraction of positions taken as the reference set.
        network_mincount (int): minimum number of measures that must flag a position.
        measure_cutoffs (list): one cutoff per entry of `measures` (never modified here).
        thresh (float): threshold passed on to GetNetworkExtremes.
        string (bool): if True return a Styler with percentage formatting, else the DataFrame.

    Returns:
        Table indexed by protein (plus a 'Mean' row for the percentage columns) with
        True Positives, False Positives, Coverage (hits / reference set), Accuracy
        (hits / predicted set), Prediction % and Functional % (set size / all positions).
    """
    percentages = ['Coverage', 'Accuracy','Prediction %', 'Functional %']
    predict = pd.DataFrame(index=proteins, columns=['True Positives', 'False Positives', 'Coverage', 'Accuracy',
                                                          'Prediction %', 'Functional %'])
    for protein in proteins:
        network_extremes_loss = GetNetworkExtremes(protein, network_mincount, measure_cutoffs, thresh=thresh)
        total_positions = functional_data[protein].columns
        funct_extremes = GetPercentage(functional_percentage, 'highest', functional_data[protein])
        # Gain predictions are all positions that were not predicted as loss of function.
        network_extremes = set(total_positions) - network_extremes_loss

        how_many = len(network_extremes.intersection(funct_extremes))
        positions = len(total_positions)

        predict.at[protein, 'True Positives'] = how_many
        predict.at[protein, 'False Positives'] = len(network_extremes) - how_many
        predict.at[protein,'Coverage']= ToPercentage(how_many, funct_extremes)
        predict.at[protein, 'Accuracy'] = ToPercentage(how_many, network_extremes)
        predict.at[protein,'Functional %'] = ToPercentage(funct_extremes, positions)
        predict.at[protein,'Prediction %'] = ToPercentage(network_extremes, positions)

    # Mean over proteins: ToPercentage multiplies by 100, so divide by 100 * (number of
    # proteins) instead of a hard-coded 500 that is only right for five proteins.
    for score in percentages:
        total = sum(predict.at[protein, score] for protein in proteins)
        predict.at['Mean', score] = ToPercentage(total, 100*len(proteins))

    if string:
        return predict.style.format({col:'{0:,.1f}%' for col in percentages})
    return predict

In [16]:
# Column layout of the summary tables: one group per prediction setting,
# each holding the three reported scores.
headers = ['Maximizing Accuracy', 'Maximizing Coverage', 'Maximizing Both']
scores = ['Accuracy', 'Coverage', 'Prediction %']
# Two-level column index (Prediction, Score) = headers x scores.
index = pd.MultiIndex.from_product([headers, scores], names=['Prediction', 'Score'])

In [17]:
# Empty summary table for the loss predictions: one row per protein display name,
# (Prediction, Score) columns; filled in by the following cell.
loss_predictions = pd.DataFrame(index=protein_names, columns=index)

In [18]:
# Three parameter settings for the loss predictions, in the same order as `headers`.
accuracy = ComparePredictionsLoss(0.4, 4, [1.5,1.5,1.5,1.5], string=False)
coverage = ComparePredictionsLoss(0.4, 2, [1,1,1,1], string=False)
both = ComparePredictionsLoss(0.4, 2, [1.5,1.5,1.5,1.5], string=False)
best_loss = [accuracy, coverage, both]

# Copy the scores over; the results are indexed by protein identifier,
# the summary table by display name.
for j,prediction in enumerate(headers):
    for i, protein in enumerate(protein_names):
        for score in scores:
            loss_predictions.at[protein, (prediction, score)] = best_loss[j].at[proteins[i], score]

# 'Mean' row: ToPercentage multiplies by 100, so divide by 100 * (number of proteins)
# rather than a hard-coded 500 that is only right for five proteins.
for prediction in headers:
    for score in scores:
        total = 0
        for protein in protein_names:
            total += loss_predictions.at[protein, (prediction, score)]
        loss_predictions.at['Mean', (prediction, score)] = ToPercentage(total, 100*len(protein_names))

In [19]:
# Empty summary table for the gain predictions: one row per protein display name,
# (Prediction, Score) columns; filled in by the following cell.
gain_predictions = pd.DataFrame(index=protein_names, columns=index)

In [20]:
# Three parameter settings for the gain predictions, in the same order as `headers`.
accuracy = ComparePredictionsGain(0.4, 1, [1,1,1,1], string=False)
coverage = ComparePredictionsGain(0.4, 3, [1,1,1,1], string=False)
both = ComparePredictionsGain(0.4, 2, [1,1,1,1], string=False)
best_gain = [accuracy, coverage, both]

# Copy the scores over; the results are indexed by protein identifier,
# the summary table by display name.
for j,prediction in enumerate(headers):
    for i, protein in enumerate(protein_names):
        for score in scores:
            gain_predictions.at[protein, (prediction, score)] = best_gain[j].at[proteins[i], score]

# 'Mean' row: ToPercentage multiplies by 100, so divide by 100 * (number of proteins)
# rather than a hard-coded 500 that is only right for five proteins.
for prediction in headers:
    for score in scores:
        total = 0
        for protein in protein_names:
            total += gain_predictions.at[protein, (prediction, score)]
        gain_predictions.at['Mean', (prediction, score)] = ToPercentage(total, 100*len(protein_names))

## Testing for statistical significance / null model

In [21]:
def PredictRandom(protein, percentage):
    '''Draw a random subset of the protein's positions, without replacement.

    The subset size is len(positions) * percentage rounded to the nearest integer.
    Uses numpy's global random state. Returns the drawn positions as a set.
    '''
    candidates = list(functional_data[protein].columns)
    sample_size = int(round(len(candidates)*percentage, 0))
    return set(np.random.choice(candidates, sample_size, replace=False))

In [92]:
def TestRandom(functional_percentage, prediction_percentage, runs, seed=None):
    '''Estimate Coverage and Accuracy of random predictions (null model).

    For every protein, `runs` random position sets are drawn with PredictRandom and
    scored against the `functional_percentage` fraction of positions with the lowest
    functional mean.

    Parameters:
        functional_percentage (float): fraction of positions in the reference set.
        prediction_percentage (number or sequence): fraction of positions to draw;
            a single number is used for every protein, a sequence must hold one
            value per protein in the order of `proteins`.
        runs (int): number of random draws per protein.
        seed (int, optional): if given, seeds numpy's global random state first so
            that the simulation is reproducible. The default leaves the state untouched.

    Returns:
        DataFrame with two rows per protein (Coverage, then Accuracy) and columns
        'Mean', 'SD', 'Type', 'Protein', 'Normal test p val' (Shapiro-Wilk p value
        of the simulated scores).
    '''
    # Import the submodule explicitly: `import scipy as sp` alone does not guarantee
    # that sp.stats is loaded on every SciPy version.
    from scipy import stats

    columns = ['Mean', 'SD', 'Type', 'Protein', 'Normal test p val']

    if seed is not None:
        np.random.seed(seed)

    if np.isscalar(prediction_percentage): # same value for every protein
        prediction_percentage = [prediction_percentage]*len(proteins)

    rows = []
    for i, protein in enumerate(proteins):
        funct_extremes = GetPercentage(functional_percentage, 'lowest', functional_data[protein])

        coverage, accuracy = [], []
        for _ in range(runs):
            predictions = PredictRandom(protein, prediction_percentage[i])
            how_many = len(predictions.intersection(funct_extremes))
            coverage.append(ToPercentage(how_many, funct_extremes))
            accuracy.append(ToPercentage(how_many, predictions))

        # Mean, SD and normality check of the simulated scores; collect the rows and
        # build the frame once instead of growing it row by row.
        rows.append([np.mean(coverage), np.std(coverage), 'Coverage', protein_names[i],
                     stats.shapiro(coverage)[1]])
        rows.append([np.mean(accuracy), np.std(accuracy), 'Accuracy', protein_names[i],
                     stats.shapiro(accuracy)[1]])

    return pd.DataFrame(rows, columns=columns)

### Testing Structurally Sensitive Predictions

In [36]:
# Summary of the loss predictions: Accuracy, Coverage and Prediction % for each
# prediction setting, per protein, plus the 'Mean' row.
loss_predictions

Prediction,Maximizing Accuracy,Maximizing Accuracy,Maximizing Accuracy,Maximizing Coverage,Maximizing Coverage,Maximizing Coverage,Maximizing Both,Maximizing Both,Maximizing Both
Score,Accuracy,Coverage,Prediction %,Accuracy,Coverage,Prediction %,Accuracy,Coverage,Prediction %
PSD95,66.67,42.42,25.3,60.87,84.85,55.42,68.42,78.79,45.78
PTEN,83.95,55.74,26.38,66.87,90.98,54.07,74.26,82.79,44.3
APH(3')II,90.48,37.25,16.47,65.89,83.33,50.59,73.12,66.67,36.47
Src CD,72.55,38.54,21.07,50.99,80.21,62.4,62.26,68.75,43.8
VIM-2,90.74,53.26,23.38,65.93,96.74,58.44,73.15,85.87,46.75
Mean,80.88,45.44,22.52,62.11,87.22,56.18,70.24,76.57,43.42


In [106]:
def get_pvalues(prediction_df, prediction, functional_percentage=0.4, runs=3000):
    """Compare observed Coverage/Accuracy with a random-prediction null model.

    For each protein, TestRandom simulates `runs` random predictions that select the
    same fraction of positions as the observed prediction ('Prediction %'). The observed
    scores are then turned into z-scores against the simulated mean and SD, and into
    one-sided p values under a normal approximation.

    Parameters:
        prediction_df (DataFrame): loss_predictions or gain_predictions; its last row
            is assumed to be the 'Mean' row and is dropped.
        prediction (str): 'Maximizing Accuracy', 'Maximizing Coverage' or 'Maximizing Both'.
        functional_percentage (float): fraction of positions in the functional
            reference set handed to TestRandom (default 0.4).
        runs (int): number of random draws per protein (default 3000).

    Returns:
        data (DataFrame): TestRandom's table (two rows per protein, Coverage then
        Accuracy) extended with 'Score', 'Z score' and 'p value'.
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not guarantee
    # that sp.stats is loaded on every SciPy version.
    from scipy import stats

    # Fraction of positions predicted for each protein (trailing 'Mean' row dropped).
    percentages = [round(x/100, 3) for x in prediction_df[(prediction, 'Prediction %')]][:-1]
    data = TestRandom(functional_percentage, percentages, runs)

    # Observed scores; TestRandom emits its rows in protein order, as does prediction_df.
    for score_type in ('Accuracy', 'Coverage'):
        data.loc[data['Type'] == score_type, 'Score'] = prediction_df[(prediction, score_type)].tolist()[:-1]

    # z-score of the observed score against the simulated distribution.
    observed = data['Score'].astype(float)
    simulated_mean = data['Mean'].astype(float)
    simulated_sd = data['SD'].astype(float)
    data['Z score'] = (observed - simulated_mean) / simulated_sd

    ### z-test
    # Null hypothesis: the coverage and accuracy scores come from random predictions
    # with the observed prediction percentage. The p value is the upper tail of |z|.
    # NOTE(review): this relies on the simulated scores being roughly normal; check the
    # 'Normal test p val' column before trusting very small p values.
    data['p value'] = stats.norm.sf(data['Z score'].abs().to_numpy(dtype=float))
    return data

In [107]:
# Null-model comparison for the loss predictions, 'Maximizing Accuracy' setting.
get_pvalues(loss_predictions, 'Maximizing Accuracy')

Unnamed: 0,Mean,SD,Type,Protein,Normal test p val,Score,Z score,p value
0,25.18435,5.880488,Coverage,PSD95,1.763646e-21,42.42,2.93099,0.00168942
1,39.580233,9.241811,Accuracy,PSD95,1.714122e-21,66.67,2.931219,0.001688176
2,26.429807,3.153426,Coverage,PTEN,1.388107e-10,55.74,9.294714,7.380148e-21
3,39.807367,4.747645,Accuracy,PTEN,1.323748e-10,83.95,9.297796,7.169375e-21
4,16.517477,2.807786,Coverage,APH(3')II,3.346631e-14,37.25,7.383939,7.683679e-14
5,40.10786,6.819587,Accuracy,APH(3')II,3.319552e-14,90.48,7.386392,7.543357e-14
6,21.009103,3.249383,Coverage,Src CD,2.963794e-13,38.54,5.395146,3.423395e-08
7,39.545697,6.11737,Accuracy,Src CD,2.978688e-13,72.55,5.395179,3.422771e-08
8,23.43322,3.410081,Coverage,VIM-2,1.056936e-12,53.26,8.74665,1.098905e-18
9,39.923073,5.810081,Accuracy,VIM-2,1.027832e-12,90.74,8.746337,1.101952e-18


In [108]:
# Null-model comparison for the loss predictions, 'Maximizing Coverage' setting.
get_pvalues(loss_predictions, 'Maximizing Coverage')

Unnamed: 0,Mean,SD,Type,Protein,Normal test p val,Score,Z score,p value
0,55.37802,6.889129,Coverage,PSD95,1.590013e-18,84.85,4.278042,9.427236e-06
1,39.725663,4.940652,Accuracy,PSD95,1.579591e-18,60.87,4.279666,9.358719e-06
2,53.994707,3.525371,Coverage,PTEN,1.129966e-08,90.98,10.491179,4.741875e-26
3,39.68145,2.590459,Accuracy,PTEN,1.173184e-08,66.87,10.49565,4.522627e-26
4,50.539983,3.81229,Coverage,APH(3')II,2.348501e-10,83.33,8.601133,3.946642e-18
5,39.961797,3.016853,Accuracy,APH(3')II,2.298716e-10,65.89,8.594455,4.183059e-18
6,62.26838,3.740179,Coverage,Src CD,1.222209e-10,80.21,4.796996,8.053157e-07
7,39.588253,2.378103,Accuracy,Src CD,1.190481e-10,50.99,4.79447,8.155259e-07
8,58.37452,4.006488,Coverage,VIM-2,1.087289e-10,96.74,9.575837,5.051764e-22
9,39.781277,2.729131,Accuracy,VIM-2,1.093255e-10,65.93,9.581337,4.78984e-22


In [109]:
# Null-model comparison for the loss predictions, 'Maximizing Both' setting.
get_pvalues(loss_predictions, 'Maximizing Both')

Unnamed: 0,Mean,SD,Type,Protein,Normal test p val,Score,Z score,p value
0,45.660647,6.68666,Coverage,PSD95,7.360659999999999e-19,78.79,4.954544,3.624998e-07
1,39.654767,5.805196,Accuracy,PSD95,7.648878999999999e-19,68.42,4.955084,3.614954e-07
2,44.3791,3.43264,Coverage,PTEN,2.868708e-09,82.79,11.189899,2.28477e-29
3,39.813027,3.078778,Accuracy,PTEN,2.790017e-09,74.26,11.188522,2.320518e-29
4,36.39088,3.787771,Coverage,APH(3')II,6.040878e-10,66.67,7.993916,6.535948e-16
5,39.91329,4.152819,Accuracy,APH(3')II,5.937268e-10,73.12,7.996185,6.416666e-16
6,43.639613,3.960409,Coverage,Src CD,5.637967e-10,68.75,6.340352,1.146204e-10
7,39.523433,3.586176,Accuracy,Src CD,5.423047e-10,62.26,6.340059,1.148383e-10
8,46.85015,4.152442,Coverage,VIM-2,5.74804e-10,85.87,9.396844,2.811204e-21
9,39.908943,3.536996,Accuracy,VIM-2,5.332297e-10,73.15,9.398103,2.777776e-21


In [110]:
# Null-model comparison for the gain predictions, 'Maximizing Accuracy' setting.
get_pvalues(gain_predictions, 'Maximizing Accuracy')

Unnamed: 0,Mean,SD,Type,Protein,Normal test p val,Score,Z score,p value
0,30.162657,6.11131,Coverage,PSD95,1.046227e-20,54.55,3.990526,3.29634e-05
1,39.818667,8.067659,Accuracy,PSD95,1.068114e-20,72.0,3.988931,3.318588e-05
2,29.30822,3.228685,Coverage,PTEN,7.272505e-10,50.0,6.408732,7.336739e-11
3,39.726657,4.374953,Accuracy,PTEN,7.109634e-10,67.78,6.412261,7.168839e-11
4,30.688487,3.602172,Coverage,APH(3')II,8.446537e-11,57.84,7.53754,2.394598e-14
5,40.134213,4.711622,Accuracy,APH(3')II,8.454413e-11,75.64,7.535788,2.426979e-14
6,24.45647,3.44036,Coverage,Src CD,2.226707e-12,39.58,4.395915,5.515351e-06
7,39.792733,5.59914,Accuracy,Src CD,2.153714e-12,64.41,4.396615,5.497594e-06
8,26.8279,3.617711,Coverage,VIM-2,4.592796e-12,53.26,7.306304,1.372949e-13
9,39.809893,5.368851,Accuracy,VIM-2,4.734275e-12,79.03,7.305121,1.38509e-13


In [111]:
# Null-model comparison for the gain predictions, 'Maximizing Coverage' setting.
get_pvalues(gain_predictions, 'Maximizing Coverage')

Unnamed: 0,Mean,SD,Type,Protein,Normal test p val,Score,Z score,p value
0,50.702237,6.932716,Coverage,PSD95,3.515795e-18,69.7,2.740306,0.003069099
1,39.838437,5.44579,Accuracy,PSD95,3.665033e-18,54.76,2.740018,0.003071791
2,52.47419,3.519917,Coverage,PTEN,5.422561e-09,80.33,7.91377,1.248549e-15
3,39.761777,2.666168,Accuracy,PTEN,5.734019e-09,60.87,7.917063,1.215929e-15
4,59.561867,3.744646,Coverage,APH(3')II,1.345374e-10,89.22,7.920143,1.186186e-15
5,39.970437,2.511984,Accuracy,APH(3')II,1.349437e-10,59.87,7.921853,1.169987e-15
6,46.303663,3.927119,Coverage,Src CD,4.685426e-10,65.62,4.918705,4.355942e-07
7,39.68874,3.365984,Accuracy,Src CD,4.805674e-10,56.25,4.920184,4.323139e-07
8,50.63324,4.093709,Coverage,VIM-2,1.875985e-10,85.87,8.607539,3.732283e-18
9,39.815013,3.219595,Accuracy,VIM-2,1.737381e-10,67.52,8.605115,3.81201e-18


In [112]:
# Null-model comparison for the gain predictions, 'Maximizing Both' setting.
get_pvalues(gain_predictions, 'Maximizing Both')

Unnamed: 0,Mean,SD,Type,Protein,Normal test p val,Score,Z score,p value
0,44.69259,6.782598,Coverage,PSD95,1.497972e-18,69.7,3.686996,0.0001134585
1,39.863707,6.04701,Accuracy,PSD95,1.4858440000000001e-18,62.16,3.68716,0.0001133854
2,45.9006,3.486413,Coverage,PTEN,8.475972e-09,73.77,7.993718,6.546449e-16
3,39.71698,3.015064,Accuracy,PTEN,9.004101e-09,63.83,7.997515,6.347751e-16
4,49.466867,3.947663,Coverage,APH(3')II,2.369796e-09,84.31,8.826269,5.411263e-19
5,40.044633,3.196791,Accuracy,APH(3')II,2.356463e-09,68.25,8.823025,5.570439e-19
6,37.578163,3.865341,Coverage,Src CD,2.887854e-10,55.21,4.561522,2.539209e-06
7,39.64239,4.078581,Accuracy,Src CD,3.004408e-10,58.24,4.559823,2.559835e-06
8,41.531623,3.919037,Coverage,VIM-2,5.277362e-11,75.0,8.539949,6.714027e-18
9,39.80089,3.755227,Accuracy,VIM-2,5.159736e-11,71.88,8.542522,6.5662080000000005e-18
