In [1]:
import os
import sys
import itertools
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import Bio.PDB.Polypeptide as pp
from collections import Counter
sys.path.append('/Users/macbook/Documents/GitHub/perturbation-networks-get-data')
from getmutations import MutationsDict, GetMutations
%matplotlib inline

In [2]:
# One-letter amino acid codes taken from Bio.PDB.Polypeptide; used below as the row index of the network tables
AA = list(pp.aa1)

In [3]:
# Directory holding the processed functional CSV files (functional_<protein>.csv)
# NOTE(review): both paths are absolute and machine-specific; adjust them before running elsewhere
path = "/Users/macbook/Documents/perturbation-networks/dms_data"
# Root directory of the perturbation network output; its 'data' subfolder is read further below
protein_path = "/Users/macbook/Documents/perturbation-networks/proteins"

In [4]:
# Do not truncate wide DataFrames when they are displayed
pd.set_option('display.max_columns', None)

In [5]:
def PlotHeatmap(dataframe, save_as=None, vmin=None, vmax=None):
    """Plot heatmap of dataframe values.
    
    Parameters:
        dataframe: data to plot
        save_as: name of file to save (without extension, '.png' is appended). If None, image is not saved
        vmin, vmax: min and max values for colorscale

    Returns:
        None. The figure is drawn on a new matplotlib figure and optionally saved.
    """
    # Set missing values gray (note: this changes the global seaborn/matplotlib style)
    sns.set(rc={'axes.facecolor':'94A3A8'})
    # Set size to match dataframe dimensions
    n_columns = len(dataframe.columns)
    n_rows = len(dataframe.index)
    # Integer ratio of width to height, but never below 1: a frame with fewer columns than rows used to give
    # ratio 0 (zero-width figure), and a frame without rows raised ZeroDivisionError
    ratio = max(1, n_columns // max(n_rows, 1))
    plt.figure(figsize=(10*ratio,10))
    heatmap = sns.heatmap(dataframe, cmap='RdBu_r', center=0, cbar=True, square=True, vmin=vmin, vmax=vmax)
    if save_as:
        fig = heatmap.get_figure()
        fig.savefig(f'{save_as}.png')
    return

### Functional data

In [6]:
# Identifiers of the proteins analysed; they appear in the CSV file names and index every summary table below
# (presumably PDB IDs -- confirm)
proteins = ['1be9', '1d5r', '1nd4', '3dqw', '4bz3']

In [7]:
# Import processed functional data as DataFrames, all files have ordered AA list as index, positions as columns
# Save data in functional_data, a dict keyed by protein identifier
functional_data = dict()
for protein in proteins:
    csv_file = os.path.join(path, f'functional_{protein}.csv')
    # First CSV column becomes the row index, first row the column names
    functional_data[protein] = pd.read_csv(csv_file, index_col=0, header=0)

In [8]:
# Loss of function estimate: every positive score is replaced by zero, then the table is expressed in units of
# its own standard deviation (computed over all cells, ignoring NaN) and rounded to 3 decimals
# Results are kept per protein in all_negatives
all_negatives = {}
for protein in proteins:
    functional_df = functional_data[protein].mask(functional_data[protein] > 0, 0)
    array = functional_df.to_numpy()
    array_std = np.nanstd(array)
    all_negatives[protein] = (functional_df / array_std).round(3)

In [9]:
# Gain of function estimate: every negative score is replaced by zero, then the table is expressed in units of
# its own standard deviation (computed over all cells, ignoring NaN) and rounded to 3 decimals
# Results are kept per protein in all_positives
all_positives = {}
for protein in proteins:
    functional_df = functional_data[protein].mask(functional_data[protein] < 0, 0)
    array = functional_df.to_numpy()
    array_std = np.nanstd(array)
    all_positives[protein] = (functional_df / array_std).round(3)

### Perturbation Network Data

In [10]:
# Folder with one subfolder per protein containing <protein>_<threshold>_<measure>.csv files
data_path = os.path.join(protein_path, 'data')
# 3.0, 3.1, ..., 10.0 (71 values, step 0.1); rounding removes floating point noise from linspace
thresholds = [round(i, 1) for i in np.linspace(3, 10, 71)]
# 3.0, 4.0, ..., 10.0 (8 values, step 1)
sample_thresholds = [round(i, 1) for i in np.linspace(3, 10, 8)]
# Network measures, in the order expected by the measure_cutoffs lists used further below
measures = ['nodes', 'edges', 'weight', 'distance']

In [11]:
def ReadNetworkCSV(protein, threshold, measure):
    """Return DataFrame from corresponding CSV. If protein has multiple identical chains, return average value for 
    each position amongst all chains.

    The file read is data_path/<protein>/<protein>_<threshold>_<measure>.csv. Its column names are taken to be the
    functional position names with a chain character inserted at index 1 (that is how they are matched here).

    Parameters:
        protein (str): protein identifier, must be a key of functional_data
        threshold: threshold value as it appears in the file name
        measure (str): measure name as it appears in the file name

    Returns:
        float64 DataFrame with AA as index and the positions of functional_data[protein] as columns. Positions 
        that are not found in any chain stay NaN.
    """
    file = os.path.join(data_path, f"{protein}/{protein}_{threshold}_{measure}.csv")
    network_df = pd.read_csv(file, header=0)
    network_df.index = AA
    # Get chains from columns (second character of each name). Sorted so that the order in which chains are 
    # summed, and therefore the floating point result, does not depend on set iteration order
    chains = sorted({column[1] for column in network_df.columns})
    available = set(network_df.columns)
    # Get positions without chain distinction from functional files
    positions = list(functional_data[protein].columns)
    average = pd.DataFrame(index=AA, columns=positions, dtype=np.float64)
    for position in positions:
        candidates = (position[0] + chain + position[1:] for chain in chains)
        chain_columns = [name for name in candidates if name in available]
        if chain_columns:
            # Whole-column arithmetic replaces one scalar lookup per amino acid and chain
            total = sum(network_df[name] for name in chain_columns)
            average[position] = total / len(chain_columns)
    return average

In [12]:
def Standardize(protein, threshold, measure):
    """Return standardized (z-score) values from network data. 

    The entry whose amino acid equals the first character of the position name (the wild-type residue, which is 
    not a mutation) is set to NaN before standardizing. Mean and standard deviation are computed over all 
    remaining cells of the table, ignoring NaN.

    Parameters:
        protein, threshold, measure: passed on to ReadNetworkCSV

    Returns:
        DataFrame with the same index and columns as the ReadNetworkCSV result.
    """
    network_df = ReadNetworkCSV(protein, threshold, measure)
    for position in network_df.columns:
        wild_type = position[0]
        if wild_type in network_df.index:
            network_df.at[wild_type, position] = np.nan
    data_array = network_df.to_numpy(dtype=np.float64)
    data_mean = np.nanmean(data_array)
    data_std = np.nanstd(data_array)
    return (network_df - data_mean) / data_std

In [13]:
def CheckDistribution(cutoffs, data=all_negatives, absolute=False):
    """Return df with number of positions with mean above cutoffs (if positive), or below cutoffs (if negative) and 
    percentage. If absolute, check above and below.

    Parameters:
        cutoffs (sequence of numbers): cutoffs to test. Without absolute they must be non-zero, since the sign 
                                       selects the direction of the comparison. With absolute they must be positive.
        data (dict): dataframes by protein, default all_negatives
        absolute (bool): count positions with mean above cutoff or below -cutoff

    Returns:
        DataFrame indexed by protein with columns '<cutoff> std' (count) and '% <cutoff> std' (percentage string).

    Raises:
        ValueError: if absolute is False and a cutoff is 0
        AssertionError: if absolute is True and a cutoff is not positive
    """
    # A zero cutoff used to match neither branch, leaving the count undefined or reusing the previous one
    if not absolute and any(cutoff == 0 for cutoff in cutoffs):
        raise ValueError('Cutoffs should be non-zero when absolute is False')
    columns = []
    for cutoff in cutoffs:
        columns.extend([f'{cutoff} std', f'% {cutoff} std'])
    values = pd.DataFrame(index=proteins, columns=columns)
    for protein in proteins:
        data_mean = list(data[protein].mean())
        n = len(data_mean)
        for cutoff in cutoffs:
            if absolute:
                assert cutoff > 0, 'Cutoffs should be positive'
                m = sum(1 for i in data_mean if i > cutoff or i < -cutoff)
            elif cutoff > 0:
                m = sum(1 for i in data_mean if i > cutoff)
            else:
                m = sum(1 for i in data_mean if i < cutoff)
            values.at[protein, f'{cutoff} std'] = m
            values.at[protein, f'% {cutoff} std'] = str(round(100*m/n, 1))+'%'
    return values

In [14]:
def CheckDistribution2(cutoffs, data, absolute=False):
    """Return df with number of positions with at least one value above cutoff (if positive), or below cutoff 
    (if negative) and percentage. If absolute, check above cutoff and below -cutoff.

    Parameters:
        cutoffs (sequence of numbers): cutoffs to test. Without absolute they must be non-zero, since the sign 
                                       selects the direction of the comparison. With absolute they must be positive.
        data (dict): dataframes by protein
        absolute (bool): count positions with any value above cutoff or below -cutoff

    Returns:
        DataFrame indexed by protein with columns '<cutoff> std' (count) and '% <cutoff> std' (percentage string).

    Raises:
        ValueError: if absolute is False and a cutoff is 0
        AssertionError: if absolute is True and a cutoff is not positive
    """
    # A zero cutoff used to match neither branch, leaving the count undefined or reusing the previous one
    if not absolute and any(cutoff == 0 for cutoff in cutoffs):
        raise ValueError('Cutoffs should be non-zero when absolute is False')
    columns = []
    for cutoff in cutoffs:
        columns.extend([f'{cutoff} std', f'% {cutoff} std'])
    values = pd.DataFrame(index=proteins, columns=columns)
    for protein in proteins:
        data_df = data[protein]
        n = len(data_df.columns)
        for cutoff in cutoffs:
            if absolute:
                assert cutoff > 0, 'Cutoffs should be positive'
                above = set(data_df.columns[(data_df > cutoff).any()])
                # Lower tail is -cutoff; comparing against +cutoff counted nearly every position
                below = set(data_df.columns[(data_df < -cutoff).any()])
                m = len(above.union(below))
            elif cutoff > 0:
                m = len(set(data_df.columns[(data_df > cutoff).any()]))
            else:
                m = len(set(data_df.columns[(data_df < cutoff).any()]))
            values.at[protein, f'{cutoff} std'] = m
            values.at[protein, f'% {cutoff} std'] = str(round(100*m/n, 1))+'%'
    return values

In [15]:
# all_positives and all_negatives data is much better distributed than other standardized functional datasets
# First table: positions whose mean scaled score in all_negatives (the default data) is below each negative cutoff
display(CheckDistribution([-2,-1.5,-1]))
# Second table: positions whose mean scaled score in all_positives is above each positive cutoff
display(CheckDistribution([1, 1.5, 2], data=all_positives))

Unnamed: 0,-2 std,% -2 std,-1.5 std,% -1.5 std,-1 std,% -1 std
1be9,3,3.6%,11,13.3%,17,20.5%
1d5r,33,10.7%,69,22.5%,98,31.9%
1nd4,5,2.0%,83,32.5%,154,60.4%
3dqw,45,18.6%,71,29.3%,108,44.6%
4bz3,38,16.5%,83,35.9%,112,48.5%


Unnamed: 0,1 std,% 1 std,1.5 std,% 1.5 std,2 std,% 2 std
1be9,17,20.5%,9,10.8%,6,7.2%
1d5r,26,8.5%,7,2.3%,1,0.3%
1nd4,40,15.7%,18,7.1%,11,4.3%
3dqw,25,10.3%,14,5.8%,10,4.1%
4bz3,29,12.6%,13,5.6%,6,2.6%


In [16]:
# Check distributions to find best cutoff for each measure
# For every measure, count the positions having at least one standardized value beyond each cutoff
for measure in measures:
    # 'distance' is read at threshold 3.8, every other measure at 5.0
    threshold = 3.8 if measure == 'distance' else 5.0
    data = {protein:Standardize(protein, threshold, measure) for protein in proteins}
    print(measure)
    display(CheckDistribution2([-2,-1.5, -1, 1, 1.5, 2, 3], data))

nodes


Unnamed: 0,-2 std,% -2 std,-1.5 std,% -1.5 std,-1 std,% -1 std,1 std,% 1 std,1.5 std,% 1.5 std,2 std,% 2 std,3 std,% 3 std
1be9,1,1.2%,5,6.0%,32,38.6%,46,55.4%,36,43.4%,25,30.1%,4,4.8%
1d5r,4,1.3%,49,16.0%,146,47.6%,152,49.5%,113,36.8%,72,23.5%,31,10.1%
1nd4,1,0.4%,23,9.0%,103,40.4%,119,46.7%,75,29.4%,43,16.9%,18,7.1%
3dqw,5,2.1%,46,19.0%,134,55.4%,131,54.1%,85,35.1%,54,22.3%,13,5.4%
4bz3,6,2.6%,44,19.0%,124,53.7%,129,55.8%,91,39.4%,60,26.0%,16,6.9%


edges


Unnamed: 0,-2 std,% -2 std,-1.5 std,% -1.5 std,-1 std,% -1 std,1 std,% 1 std,1.5 std,% 1.5 std,2 std,% 2 std,3 std,% 3 std
1be9,0,0.0%,1,1.2%,29,34.9%,44,53.0%,33,39.8%,24,28.9%,11,13.3%
1d5r,0,0.0%,26,8.5%,156,50.8%,144,46.9%,114,37.1%,82,26.7%,30,9.8%
1nd4,0,0.0%,4,1.6%,96,37.6%,116,45.5%,85,33.3%,58,22.7%,23,9.0%
3dqw,0,0.0%,25,10.3%,134,55.4%,130,53.7%,91,37.6%,61,25.2%,23,9.5%
4bz3,0,0.0%,13,5.6%,114,49.4%,119,51.5%,91,39.4%,65,28.1%,25,10.8%


weight


Unnamed: 0,-2 std,% -2 std,-1.5 std,% -1.5 std,-1 std,% -1 std,1 std,% 1 std,1.5 std,% 1.5 std,2 std,% 2 std,3 std,% 3 std
1be9,0,0.0%,0,0.0%,19,22.9%,42,50.6%,33,39.8%,22,26.5%,14,16.9%
1d5r,0,0.0%,1,0.3%,157,51.1%,169,55.0%,130,42.3%,87,28.3%,43,14.0%
1nd4,0,0.0%,0,0.0%,112,43.9%,131,51.4%,88,34.5%,63,24.7%,27,10.6%
3dqw,0,0.0%,8,3.3%,146,60.3%,138,57.0%,97,40.1%,75,31.0%,31,12.8%
4bz3,0,0.0%,1,0.4%,113,48.9%,124,53.7%,98,42.4%,68,29.4%,37,16.0%


distance


Unnamed: 0,-2 std,% -2 std,-1.5 std,% -1.5 std,-1 std,% -1 std,1 std,% 1 std,1.5 std,% 1.5 std,2 std,% 2 std,3 std,% 3 std
1be9,0,0.0%,0,0.0%,0,0.0%,52,62.7%,52,62.7%,27,32.5%,10,12.0%
1d5r,0,0.0%,0,0.0%,224,73.0%,177,57.7%,177,57.7%,75,24.4%,32,10.4%
1nd4,28,11.0%,52,20.4%,179,70.2%,137,53.7%,96,37.6%,67,26.3%,7,2.7%
3dqw,13,5.4%,50,20.7%,171,70.7%,146,60.3%,95,39.3%,50,20.7%,8,3.3%
4bz3,0,0.0%,0,0.0%,153,66.2%,138,59.7%,105,45.5%,46,19.9%,15,6.5%


For predictions, a position is selected if at least one of its mutations crosses the corresponding cutoff. We found that 'Distance' data achieves peak Spearman correlation with thresholds around 3.6 to 3.8 Å, so data for 3.8 Å is used for predictions. For 'Nodes', 'Edges' and 'Weight', we found that Spearman correlations are consistent for thresholds above 4.0 Å (**TODO:** add data on how consistent they are), so data for 5.0 Å is used. (**TODO:** explain the biological meaning of these thresholds.)

We compare these predictions to different cutoffs of the functional data, considering the average value for each position in the corresponding positive or negative scaled functional data. 

In [17]:
def GetPercentage(percentage, which, data):
    """Return set with top or bottom percentage of positions according to functional data. 

    Positions are ranked by their column mean. Positions whose mean is NaN (no data) are never selected: NaN 
    cannot be ordered, and sorting with it gave an unpredictable ranking. The number of positions requested is 
    still computed from the total number of columns.

    Parameters:
        percentage (float): between 0 and 1, percentage of positions that we want.
        which (str): 'highest' or 'lowest'. Any value other than 'highest' selects the lowest means.
        data (dataframe): functional data to consider mean of
    
    Returns:
        Set of positions, with at most int(number of columns * percentage) elements.
    """
    functional_mean = data.mean()
    positions = list(data.columns)

    ranked = [pos for pos in positions if not pd.isna(functional_mean[pos])]
    # Stable ascending sort followed by a reversal keeps the tie order of the original implementation
    ranked.sort(key=lambda pos: functional_mean[pos])
    if which == 'highest':
        ranked.reverse()
    n = int(len(positions)*percentage)
    return set(ranked[:n])

In [18]:
def GetSD(sd, data):
    """Return the set of positions (columns of data) selected by their mean score.

    If sd > 0, positions with mean strictly above sd are returned, otherwise positions with mean strictly 
    below sd.
    """
    column_means = data.mean()
    if sd > 0:
        selected = column_means > sd
    else:
        selected = column_means < sd
    return set(column_means.index[selected])

In [19]:
def GetNetworkExtremes(protein, mincount, measure_cutoffs, thresh=5.0, distance_thresh=3.8):
    """ Return set with positions that pass measure sd cutoffs for at least mincount measures. 

    A position passes a measure if at least one of its standardized values is above the cutoff (cutoff > 0) or 
    below the cutoff (otherwise).

    Parameters:
        protein (str): protein identifier
        mincount (int): minimum number of measures for which the position must pass
        measure_cutoffs (list): one cutoff per entry of measures, in the same order
        thresh (float): threshold of the network data used for every measure except 'distance', default 5.0
        distance_thresh (float): threshold of the network data used for 'distance', default 3.8

    Returns:
        Set of positions.
    """
    counter = Counter()
    for i,measure in enumerate(measures): 
        threshold = distance_thresh if measure == 'distance' else thresh
        network_df = Standardize(protein, threshold, measure)
        cutoff = measure_cutoffs[i]
        if cutoff > 0:
            passes = (network_df > cutoff).any()
        else:
            passes = (network_df < cutoff).any()
        # Each position appears at most once per measure, so the counter holds the number of measures passed
        counter.update(network_df.columns[passes].tolist())

    return {pos for pos, count in counter.items() if count >= mincount}

In [20]:
def ToPercentage(a,b):
    """Return percentage form of a/b, if b != 0. If given set or list, use len of. 

    Parameters:
        a, b: numbers (Python or numpy int/float) or sized containers, whose length is then used

    Returns:
        String such as '25.0%' (one decimal), or np.nan if b is 0 or empty.
    """
    # isinstance also accepts numpy scalars and subclasses; the former exact type check sent them to len()
    numeric_types = (int, float, np.integer, np.floating)
    x = a if isinstance(a, numeric_types) else len(a)
    y = b if isinstance(b, numeric_types) else len(b)
    
    if y == 0:
        return np.nan
    return f'{round(100*x/y, 1)}%'

```
True Positives: positions deemed sensitive by functional data and predicted sensitive by network data. 
False Positives: positions predicted sensitive that are not below functional data cutoff. 
Coverage (%): accurately predicted positions / functionally sensitive positions (i.e. recall)
Accuracy (%): accurately predicted positions / predicted positions (i.e. precision, not classification accuracy)
Functional percentage (%): percentage of positions considered sensitive by functional data
Prediction percentage (%): percentage of positions predicted as sensitive```

In [21]:
def ComparePredictionsLoss(functional_cutoff, network_mincount, measure_cutoffs=[1,1,1,1], 
                           funct_method='percentage', data=all_negatives, thresh=5.0):
    """Compare positions selected by low mean functional values with positions predicted from perturbation 
    network data, and display a table with True Positives, False Positives, Coverage, Accuracy, and percentages 
    of positions for every protein. 
    
    Parameters:
        functional_cutoff (float): If funct_method is 'sd', positions with mean score beyond this number of standard 
                                   deviations are selected (see GetSD). If funct_method is 'percentage', fraction 
                                   of positions with lowest mean scores that is selected. 
        network_mincount (int): Minimum number of measures for which a position needs to pass cutoff to be predicted 
                              functionally sensitive. Int between 1 and 4, inclusive. 
        measure_cutoffs (list, 4 numbers): Standard deviation cutoffs that determine predictions for each measure.
                                           In order, [nodes, edges, weight, distance].
        funct_method (str): 'sd' selects by standard deviation, any other value selects by percentage. 
                            Default 'percentage'. 
        data (dict): Contains dataframes with functional data, default all_negatives.
        thresh (float): Forwarded to GetNetworkExtremes as thresh. Default 5.0.

    Returns:
        None. The parameters used are printed and the table is displayed.
    """
    column_names = ['True Positives', 'False Positives', 'Coverage', 'Accuracy', 'Functional %', 'Prediction %']
    predict = pd.DataFrame(index=proteins, columns=column_names)
    for protein in proteins:
        predicted = GetNetworkExtremes(protein, network_mincount, measure_cutoffs, thresh=thresh)
        if funct_method == 'sd':
            sensitive = GetSD(functional_cutoff, data[protein])
        else:
            sensitive = GetPercentage(functional_cutoff, 'lowest', data[protein])

        hits = len(predicted & sensitive)
        total_positions = len(data[protein].columns)

        row = {
            'True Positives': hits,
            'False Positives': len(predicted) - hits,
            'Coverage': ToPercentage(hits, sensitive),
            'Accuracy': ToPercentage(hits, predicted),
            'Functional %': ToPercentage(sensitive, total_positions),
            'Prediction %': ToPercentage(predicted, total_positions),
        }
        for column, value in row.items():
            predict.at[protein, column] = value

    if funct_method == 'sd':
        method = f'{functional_cutoff} SD'
    else:
        method = f'{functional_cutoff*100}%'
    print("Loss of function predictions:")
    print(f'mincount = {network_mincount}, functional cutoff = {method}, measure cutoffs = {measure_cutoffs}')
    display(predict)
    return 

No están tan bien distribuidos los datos de redes abajo del promedio, pero abajo de -1 da posiciones para casi todas las proteínas y medidas. 

In [22]:
def ComparePredictionsGain(functional_cutoff, network_mincount, measure_cutoffs=[-1,-1,-1,-1], 
                           funct_method='percentage', data=all_positives, thresh=5.0):
    """Compare positions selected by high mean functional values with positions predicted from perturbation 
    network data, and display a table with True Positives, False Positives, Coverage, Accuracy, and percentages 
    of positions for every protein. 
    
    Parameters:
        functional_cutoff (float): If funct_method is 'sd', positions with mean score beyond this number of standard 
                                   deviations are selected (see GetSD). If funct_method is 'percentage', fraction 
                                   of positions with highest mean scores that is selected. 
        network_mincount (int): Minimum number of measures for which a position needs to pass cutoff to be predicted 
                              functionally sensitive. Int between 1 and 4, inclusive. 
        measure_cutoffs (list, 4 numbers): Standard deviation cutoffs that determine predictions for each measure.
                                           In order, [nodes, edges, weight, distance].
        funct_method (str): 'sd' selects by standard deviation, any other value selects by percentage. 
                            Default 'percentage'. 
        data (dict): Contains dataframes with functional data, default all_positives.
        thresh (float): Forwarded to GetNetworkExtremes as thresh. Default 5.0.

    Returns:
        None. The parameters used are printed and the table is displayed.
    """
    column_names = ['True Positives', 'False Positives', 'Coverage', 'Accuracy', 'Functional %', 'Prediction %']
    predict = pd.DataFrame(index=proteins, columns=column_names)
    for protein in proteins:
        predicted = GetNetworkExtremes(protein, network_mincount, measure_cutoffs, thresh=thresh)
        if funct_method == 'sd':
            sensitive = GetSD(functional_cutoff, data[protein])
        else:
            sensitive = GetPercentage(functional_cutoff, 'highest', data[protein])

        hits = len(predicted & sensitive)
        total_positions = len(data[protein].columns)

        row = {
            'True Positives': hits,
            'False Positives': len(predicted) - hits,
            'Coverage': ToPercentage(hits, sensitive),
            'Accuracy': ToPercentage(hits, predicted),
            'Functional %': ToPercentage(sensitive, total_positions),
            'Prediction %': ToPercentage(predicted, total_positions),
        }
        for column, value in row.items():
            predict.at[protein, column] = value

    if funct_method == 'sd':
        method = f'{functional_cutoff} SD'
    else:
        method = f'{functional_cutoff*100}%'
    print("Gain of function predictions:")
    print(f'mincount = {network_mincount}, functional cutoff = {method}, measure cutoffs = {measure_cutoffs}')
    display(predict)
    return 

In [23]:
# Loss of function: positions with mean scaled score below -1 SD, against positions passing the default cutoffs
# for at least one measure
ComparePredictionsLoss(-1, 1, funct_method = 'sd')

Loss of function predictions:
mincount = 1, functional cutoff = -1 SD, measure cutoffs = [1, 1, 1, 1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,16,42,94.1%,27.6%,20.5%,69.9%
1d5r,94,127,95.9%,42.5%,31.9%,72.0%
1nd4,135,35,87.7%,79.4%,60.4%,66.7%
3dqw,89,78,82.4%,53.3%,44.6%,69.0%
4bz3,106,56,94.6%,65.4%,48.5%,70.1%


In [24]:
# Sweep mincount from 1 to 4 against the 40% of positions with lowest mean functional score
for i in range(1,5):
    ComparePredictionsLoss(0.4, i, measure_cutoffs=[1,1,1.5,1.5])

Loss of function predictions:
mincount = 1, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,30,28,90.9%,51.7%,39.8%,69.9%
1d5r,113,97,92.6%,53.8%,39.7%,68.4%
1nd4,89,55,87.3%,61.8%,40.0%,56.5%
3dqw,73,74,76.0%,49.7%,39.7%,60.7%
4bz3,89,58,96.7%,60.5%,39.8%,63.6%


Loss of function predictions:
mincount = 2, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,27,18,81.8%,60.0%,39.8%,54.2%
1d5r,105,51,86.1%,67.3%,39.7%,50.8%
1nd4,75,40,73.5%,65.2%,40.0%,45.1%
3dqw,65,64,67.7%,50.4%,39.7%,53.3%
4bz3,82,40,89.1%,67.2%,39.8%,52.8%


Loss of function predictions:
mincount = 3, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,25,16,75.8%,61.0%,39.8%,49.4%
1d5r,101,34,82.8%,74.8%,39.7%,44.0%
1nd4,69,27,67.6%,71.9%,40.0%,37.6%
3dqw,58,52,60.4%,52.7%,39.7%,45.5%
4bz3,74,30,80.4%,71.2%,39.8%,45.0%


Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,21,10,63.6%,67.7%,39.8%,37.3%
1d5r,78,24,63.9%,76.5%,39.7%,33.2%
1nd4,48,16,47.1%,75.0%,40.0%,25.1%
3dqw,39,28,40.6%,58.2%,39.7%,27.7%
4bz3,57,21,62.0%,73.1%,39.8%,33.8%


In [25]:
# Same sweep over mincount with every measure cutoff raised by 0.5 SD
for i in range(1,5):
    ComparePredictionsLoss(0.4, i, measure_cutoffs=[1.5,1.5,2,2])

Loss of function predictions:
mincount = 1, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 2, 2]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,27,17,81.8%,61.4%,39.8%,53.0%
1d5r,99,50,81.1%,66.4%,39.7%,48.5%
1nd4,76,37,74.5%,67.3%,40.0%,44.3%
3dqw,57,54,59.4%,51.4%,39.7%,45.9%
4bz3,79,35,85.9%,69.3%,39.8%,49.4%


Loss of function predictions:
mincount = 2, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 2, 2]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,23,12,69.7%,65.7%,39.8%,42.2%
1d5r,89,25,73.0%,78.1%,39.7%,37.1%
1nd4,62,19,60.8%,76.5%,40.0%,31.8%
3dqw,50,39,52.1%,56.2%,39.7%,36.8%
4bz3,69,22,75.0%,75.8%,39.8%,39.4%


Loss of function predictions:
mincount = 3, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 2, 2]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,17,7,51.5%,70.8%,39.8%,28.9%
1d5r,75,18,61.5%,80.6%,39.7%,30.3%
1nd4,50,12,49.0%,80.6%,40.0%,24.3%
3dqw,42,30,43.8%,58.3%,39.7%,29.8%
4bz3,51,18,55.4%,73.9%,39.8%,29.9%


Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 2, 2]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,10,5,30.3%,66.7%,39.8%,18.1%
1d5r,29,4,23.8%,87.9%,39.7%,10.7%
1nd4,28,6,27.5%,82.4%,40.0%,13.3%
3dqw,20,9,20.8%,69.0%,39.7%,12.0%
4bz3,14,8,15.2%,63.6%,39.8%,9.5%


Creo que de lo que tenemos justo arriba, min count 2, functional 40%, measure cutoffs [1.5, 1.5, 2, 2] es un buen ejemplo, coverage y accuracy son ambos buenos en promedio, y muy parecidos entre sí. Además, el porcentaje de funcionales y predicciones se parece, que creo igual es importante considerar. 

3dqw es consistentemente menor, lo que puede deberse a que le faltan varios valores funcionales. 

Podríamos comparar tomando el mismo 'Prediction %' de posiciones al azar, y ver cómo cambian Accuracy y Coverage.

In [26]:
# Same settings with thresh 5.0 (default), 9.0 and 10.0; thresh does not apply to 'distance' (see GetNetworkExtremes)
ComparePredictionsLoss(0.4, 4, measure_cutoffs=[1.5,1.5,1.5,1.5])
ComparePredictionsLoss(0.4, 4, measure_cutoffs=[1.5,1.5,1.5,1.5], thresh=9.0)
ComparePredictionsLoss(0.4, 4, measure_cutoffs=[1.5,1.5,1.5,1.5], thresh=10.0)

Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,18,9,54.5%,66.7%,39.8%,32.5%
1d5r,72,16,59.0%,81.8%,39.7%,28.7%
1nd4,40,11,39.2%,78.4%,40.0%,20.0%
3dqw,34,20,35.4%,63.0%,39.7%,22.3%
4bz3,48,15,52.2%,76.2%,39.8%,27.3%


Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,14,7,42.4%,66.7%,39.8%,25.3%
1d5r,68,13,55.7%,84.0%,39.7%,26.4%
1nd4,37,5,36.3%,88.1%,40.0%,16.5%
3dqw,37,14,38.5%,72.5%,39.7%,21.1%
4bz3,49,5,53.3%,90.7%,39.8%,23.4%


Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,13,6,39.4%,68.4%,39.8%,22.9%
1d5r,65,8,53.3%,89.0%,39.7%,23.8%
1nd4,40,5,39.2%,88.9%,40.0%,17.6%
3dqw,34,14,35.4%,70.8%,39.7%,19.8%
4bz3,38,5,41.3%,88.4%,39.8%,18.6%


Usar 5 a usar 9 como threshold sí parece hacer algo de diferencia en accuracy. Estoy muy confundida al respecto. 1nd4 es la que tiene exactamente la misma correlación de spearman para thresholds > 4, pero la diferencia con los dos thresholds es casi 10% de precisión. 3dqw sí crece bastante su correlación de spearman conforme crece el threshold. Creo que sería interesante ver cómo se comparan las predicciones para un threshold y para otro. 

In [27]:
# Gain of function: positions with mean scaled score above 1 SD, sweeping mincount from 1 to 4
for i in range(1,5):
    ComparePredictionsGain(1, i, funct_method = 'sd', measure_cutoffs = [-1, -1, -1, -1])

Gain of function predictions:
mincount = 1, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,9,23,52.9%,28.1%,20.5%,38.6%
1d5r,20,215,76.9%,8.5%,8.5%,76.5%
1nd4,35,150,87.5%,18.9%,15.7%,72.5%
3dqw,21,166,84.0%,11.2%,10.3%,77.3%
4bz3,24,154,82.8%,13.5%,12.6%,77.1%


Gain of function predictions:
mincount = 2, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,9,20,52.9%,31.0%,20.5%,34.9%
1d5r,15,164,57.7%,8.4%,8.5%,58.3%
1nd4,32,90,80.0%,26.2%,15.7%,47.8%
3dqw,16,142,64.0%,10.1%,10.3%,65.3%
4bz3,22,113,75.9%,16.3%,12.6%,58.4%


Gain of function predictions:
mincount = 3, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,7,12,41.2%,36.8%,20.5%,22.9%
1d5r,15,132,57.7%,10.2%,8.5%,47.9%
1nd4,30,68,75.0%,30.6%,15.7%,38.4%
3dqw,16,116,64.0%,12.1%,10.3%,54.5%
4bz3,20,90,69.0%,18.2%,12.6%,47.6%


Gain of function predictions:
mincount = 4, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,0,0,0.0%,,20.5%,0.0%
1d5r,12,110,46.2%,9.8%,8.5%,39.7%
1nd4,27,58,67.5%,31.8%,15.7%,33.3%
3dqw,10,98,40.0%,9.3%,10.3%,44.6%
4bz3,11,70,37.9%,13.6%,12.6%,35.1%


In [28]:
# Gain of function against the 40% of positions with highest mean functional score, sweeping mincount from 1 to 4
for i in range(1,5):
    ComparePredictionsGain(0.4, i, measure_cutoffs = [-1, -1, -1, -1])

Gain of function predictions:
mincount = 1, functional cutoff = 40.0%, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,18,14,54.5%,56.2%,39.8%,38.6%
1d5r,106,129,86.9%,45.1%,39.7%,76.5%
1nd4,92,93,90.2%,49.7%,40.0%,72.5%
3dqw,81,106,84.4%,43.3%,39.7%,77.3%
4bz3,78,100,84.8%,43.8%,39.8%,77.1%


Gain of function predictions:
mincount = 2, functional cutoff = 40.0%, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,18,11,54.5%,62.1%,39.8%,34.9%
1d5r,92,87,75.4%,51.4%,39.7%,58.3%
1nd4,74,48,72.5%,60.7%,40.0%,47.8%
3dqw,72,86,75.0%,45.6%,39.7%,65.3%
4bz3,71,64,77.2%,52.6%,39.8%,58.4%


Gain of function predictions:
mincount = 3, functional cutoff = 40.0%, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,14,5,42.4%,73.7%,39.8%,22.9%
1d5r,80,67,65.6%,54.4%,39.7%,47.9%
1nd4,66,32,64.7%,67.3%,40.0%,38.4%
3dqw,64,68,66.7%,48.5%,39.7%,54.5%
4bz3,65,45,70.7%,59.1%,39.8%,47.6%


Gain of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,0,0,0.0%,,39.8%,0.0%
1d5r,66,56,54.1%,54.1%,39.7%,39.7%
1nd4,57,28,55.9%,67.1%,40.0%,33.3%
3dqw,50,58,52.1%,46.3%,39.7%,44.6%
4bz3,45,36,48.9%,55.6%,39.8%,35.1%


Las predicciones se vuelven complicadas porque no están tan bien distribuidos los datos de redes hacia abajo. Parece que son casi al azar, fuera de 1nd4 y 1be9, no parece que tengamos mucha información, los porcentajes de accuracy se parecen al porcentaje de predicciones. Creo que decir que tiene al menos una mutación estable no es tan buena métrica para ganancia de función, podemos considerar el promedio. 

In [29]:
def ComparePredictionsGain2(functional_cutoff, network_mincount, measure_cutoffs=[-1,-1,-1,-1], 
                           funct_method='percentage', data=all_positives, thresh=5.0):
    """ComparePredictionsGain with network mean for prediction selection.

    A position is predicted for a measure when the mean of its standardized network values is below the cutoff 
    of that measure, instead of requiring a single mutation below it.

    Parameters:
        functional_cutoff (float): If funct_method is 'sd', positions with mean score beyond this number of standard 
                                   deviations are selected (see GetSD). If funct_method is 'percentage', fraction 
                                   of positions with highest mean scores that is selected. 
        network_mincount (int): Minimum number of measures for which a position needs to pass cutoff to be 
                                predicted.
        measure_cutoffs (list, 4 numbers): Cutoff for each measure, in order [nodes, edges, weight, distance].
        funct_method (str): 'sd' selects by standard deviation, any other value selects by percentage.
        data (dict): Contains dataframes with functional data, default all_positives.
        thresh (float): Threshold of the network data used for every measure except 'distance' (which uses 3.8). 
                        Default 5.0, the value that was previously fixed.

    Returns:
        None. The parameters used are printed and the table is displayed.
    """
    predict = pd.DataFrame(index=proteins, columns=['True Positives', 'False Positives', 'Coverage', 'Accuracy',
                                                          'Functional %', 'Prediction %'])
    for protein in proteins:
        # Number of measures for which each position has a mean below the cutoff
        counter = Counter()
        for i,measure in enumerate(measures): 
            threshold = 3.8 if measure == 'distance' else thresh
            network_df = Standardize(protein, threshold, measure)
            network_mean = network_df.mean()
            counter.update(pos for pos in network_df.columns if network_mean[pos] < measure_cutoffs[i])

        network_extremes = {pos for pos, count in counter.items() if count >= network_mincount}
        funct_extremes = GetSD(functional_cutoff, data[protein]) if funct_method == 'sd' \
                                        else GetPercentage(functional_cutoff, 'highest', data[protein])
        
        how_many = len(network_extremes.intersection(funct_extremes))
        n_positions = len(data[protein].columns)

        predict.at[protein, 'True Positives'] = how_many
        predict.at[protein, 'False Positives'] = len(network_extremes) - how_many 
        predict.at[protein,'Coverage']= ToPercentage(how_many, funct_extremes)
        predict.at[protein, 'Accuracy'] = ToPercentage(how_many, network_extremes)
        predict.at[protein,'Functional %'] = ToPercentage(funct_extremes, n_positions)
        predict.at[protein,'Prediction %'] = ToPercentage(network_extremes, n_positions)
        
    method = f'{functional_cutoff} SD' if funct_method == 'sd' else f'{functional_cutoff*100}%'
    print("Gain of function predictions:")
    print(f'mincount = {network_mincount}, functional cutoff = {method}, measure cutoffs = {measure_cutoffs}')
    display(predict)
    return 

In [30]:
# Gain of function with mean-based network predictions, sweeping mincount from 1 to 4
for i in range(1,5):
    ComparePredictionsGain2(1, i, funct_method = 'sd', measure_cutoffs = [-1, -1, -1, -1])

Gain of function predictions:
mincount = 1, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,3,6,17.6%,33.3%,20.5%,10.8%
1d5r,2,36,7.7%,5.3%,8.5%,12.4%
1nd4,13,26,32.5%,33.3%,15.7%,15.3%
3dqw,2,25,8.0%,7.4%,10.3%,11.2%
4bz3,7,25,24.1%,21.9%,12.6%,13.9%


Gain of function predictions:
mincount = 2, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,2,5,11.8%,28.6%,20.5%,8.4%
1d5r,1,30,3.8%,3.2%,8.5%,10.1%
1nd4,9,22,22.5%,29.0%,15.7%,12.2%
3dqw,2,19,8.0%,9.5%,10.3%,8.7%
4bz3,6,10,20.7%,37.5%,12.6%,6.9%


Gain of function predictions:
mincount = 3, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,0,0,0.0%,,20.5%,0.0%
1d5r,1,19,3.8%,5.0%,8.5%,6.5%
1nd4,8,14,20.0%,36.4%,15.7%,8.6%
3dqw,2,16,8.0%,11.1%,10.3%,7.4%
4bz3,4,6,13.8%,40.0%,12.6%,4.3%


Gain of function predictions:
mincount = 4, functional cutoff = 1 SD, measure cutoffs = [-1, -1, -1, -1]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,0,0,0.0%,,20.5%,0.0%
1d5r,1,10,3.8%,9.1%,8.5%,3.6%
1nd4,4,7,10.0%,36.4%,15.7%,4.3%
3dqw,1,13,4.0%,7.1%,10.3%,5.8%
4bz3,2,3,6.9%,40.0%,12.6%,2.2%


No mejora demasiado considerando los promedios, y es interesante que para tres de las proteínas podemos predecir más o menos, y para dos parece que no tenemos nada de información. Podríamos intentar hacer más pruebas, pero creo que no podemos sacar demasiada información sobre las posiciones con ganancia de función. 

Volviendo a comparar los thresholds, vamos a checar cómo se comparan las predicciones de posiciones para 5.0 y para 9.0. 

In [31]:
# For each measure, compare the positions predicted (at least one value above 1.5 SD) with thresholds 5.0 and 9.0:
# fraction of the 9.0 predictions also found with 5.0, and fraction of the 5.0 predictions also found with 9.0
for measure in ['nodes', 'edges', 'weight']:
    df = pd.DataFrame(index=proteins, columns=['in 5.0/total 9.0', 'in 9.0/total 5.0'])
    for protein in proteins:
        network_df_5 = Standardize(protein, 5.0, measure)
        network_extremes_5 = set(network_df_5.columns[(network_df_5 > 1.5).any()].tolist())
        
        network_df_9 = Standardize(protein, 9.0, measure)
        network_extremes_9 = set(network_df_9.columns[(network_df_9 > 1.5).any()].tolist())

        both = network_extremes_5.intersection(network_extremes_9)
        # An empty prediction set has no defined overlap fraction: keep NaN instead of dividing by zero
        if network_extremes_9:
            df.at[protein, 'in 5.0/total 9.0'] = round(len(both)/len(network_extremes_9), 2)
        if network_extremes_5:
            df.at[protein, 'in 9.0/total 5.0'] = round(len(both)/len(network_extremes_5), 2)
    print(measure)
    display(df)

nodes


Unnamed: 0,in 5.0/total 9.0,in 9.0/total 5.0
1be9,0.93,0.75
1d5r,0.91,0.84
1nd4,0.88,0.75
3dqw,0.77,0.72
4bz3,0.87,0.75


edges


Unnamed: 0,in 5.0/total 9.0,in 9.0/total 5.0
1be9,0.93,0.85
1d5r,0.95,0.92
1nd4,0.93,0.82
3dqw,0.86,0.87
4bz3,0.91,0.96


weight


Unnamed: 0,in 5.0/total 9.0,in 9.0/total 5.0
1be9,0.86,0.91
1d5r,0.79,0.84
1nd4,0.74,0.93
3dqw,0.71,0.94
4bz3,0.83,0.88


No está completamente contenido uno en el otro, y parece que la contención es algo distinta según la medida... Vamos a comparar las de pérdida de función con la intersección entre los de 5.0 y los de 9.0. La hipótesis es que debería tener mayor accuracy. 

In [32]:
def _ExtremePositions(protein, threshold, measure, cutoff):
    """Return the set of columns of Standardize(protein, threshold, measure)
    in which at least one value is greater than `cutoff`."""
    network_df = Standardize(protein, threshold, measure)
    return set(network_df.columns[(network_df > cutoff).any()].tolist())


def ComparePredictionsLoss2(functional_cutoff, network_mincount, measure_cutoffs=[1,1,1,1],
                           funct_method='percentage', data=all_negatives):
    """ComparePredictionsLoss using the intersection of thresholds 5.0 and 9.0 for predictions.

    For every protein and every measure in `measures`, positions are flagged as
    network extremes: for 'distance' using threshold 3.8 only, for any other
    measure only when the position is flagged at both threshold 5.0 and 9.0.
    Positions flagged by at least `network_mincount` measures are the predictions,
    which are compared against the functional extremes of `data[protein]`.

    Parameters:
        functional_cutoff: cutoff passed to GetSD (funct_method == 'sd') or to
            GetPercentage with 'lowest' (any other funct_method)
        network_mincount: minimum number of measures that must flag a position
        measure_cutoffs: one cutoff per entry of `measures`, in the same order.
            The list is only read, never modified.
        funct_method: 'sd' selects GetSD, anything else selects GetPercentage
        data: mapping protein -> functional DataFrame with positions as columns

    Prints the parameters used and displays the per-protein summary table.
    Returns None.
    """
    predict = pd.DataFrame(index=proteins, columns=['True Positives', 'False Positives', 'Coverage', 'Accuracy',
                                                          'Functional %', 'Prediction %'])
    for protein in proteins:
        # Number of measures that flag each position.
        flagged_by = Counter()
        for i, measure in enumerate(measures):
            # Use the caller's cutoff for every measure. The 5.0/9.0 branch used to
            # hard-code 1.5, silently ignoring measure_cutoffs while the printed
            # summary still reported them as applied.
            cutoff = measure_cutoffs[i]
            if measure == 'distance':
                extremes = _ExtremePositions(protein, 3.8, measure, cutoff)
            else:
                extremes = (_ExtremePositions(protein, 5.0, measure, cutoff)
                            & _ExtremePositions(protein, 9.0, measure, cutoff))
            flagged_by.update(extremes)

        network_extremes = {pos for pos, count in flagged_by.items() if count >= network_mincount}
        if funct_method == 'sd':
            funct_extremes = GetSD(functional_cutoff, data[protein])
        else:
            funct_extremes = GetPercentage(functional_cutoff, 'lowest', data[protein])

        how_many = len(network_extremes.intersection(funct_extremes))
        total_positions = len(data[protein].columns)

        predict.at[protein, 'True Positives'] = how_many
        predict.at[protein, 'False Positives'] = len(network_extremes) - how_many
        predict.at[protein, 'Coverage'] = ToPercentage(how_many, funct_extremes)
        predict.at[protein, 'Accuracy'] = ToPercentage(how_many, network_extremes)
        predict.at[protein, 'Functional %'] = ToPercentage(funct_extremes, total_positions)
        predict.at[protein, 'Prediction %'] = ToPercentage(network_extremes, total_positions)

    method = f'{functional_cutoff} SD' if funct_method == 'sd' else f'{functional_cutoff*100}%'
    print("Loss of function predictions:")
    print(f'mincount = {network_mincount}, functional cutoff = {method}, measure cutoffs = {measure_cutoffs}')
    display(predict)
    return

In [33]:
# Same functional cutoff (0.4) and mincount (4) for the three variants:
# ComparePredictionsLoss with its default thresh, with thresh=9.0, and the
# 5.0/9.0 intersection version. A fresh cutoff list is built for every call.
loss_args = (0.4, 4)
ComparePredictionsLoss(*loss_args, measure_cutoffs=[1.5] * 4)
ComparePredictionsLoss(*loss_args, measure_cutoffs=[1.5] * 4, thresh=9.0)
ComparePredictionsLoss2(*loss_args, measure_cutoffs=[1.5] * 4)

Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,18,9,54.5%,66.7%,39.8%,32.5%
1d5r,72,16,59.0%,81.8%,39.7%,28.7%
1nd4,40,11,39.2%,78.4%,40.0%,20.0%
3dqw,34,20,35.4%,63.0%,39.7%,22.3%
4bz3,48,15,52.2%,76.2%,39.8%,27.3%


Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,14,7,42.4%,66.7%,39.8%,25.3%
1d5r,68,13,55.7%,84.0%,39.7%,26.4%
1nd4,37,5,36.3%,88.1%,40.0%,16.5%
3dqw,37,14,38.5%,72.5%,39.7%,21.1%
4bz3,49,5,53.3%,90.7%,39.8%,23.4%


Loss of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1.5, 1.5, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,13,7,39.4%,65.0%,39.8%,24.1%
1d5r,62,11,50.8%,84.9%,39.7%,23.8%
1nd4,33,5,32.4%,86.8%,40.0%,14.9%
3dqw,30,11,31.2%,73.2%,39.7%,16.9%
4bz3,43,5,46.7%,89.6%,39.8%,20.8%


Casi no hay cambio entre usar la intersección y usar solo 9.0, lo cual coincide con que 9.0 da más información que 5.0, pero no estoy segura de por qué. Tal vez estamos logrando capturar más interacciones con los aminoácidos vecinos, o tal vez captura más efectos en cadena de cambios estructurales. 

Volviendo a la parte de ganancia de función, tal vez tengamos más información considerando el complemento de las predicciones para pérdida de función. 

In [37]:
def ComparePredictionsGain3(functional_cutoff, network_mincount, measure_cutoffs=[1,1,1,1], 
                           funct_method='percentage', data=all_positives, thresh=5.0):
    """ComparePredictionsGain using the complement of the loss predictions.

    For each protein, every position (column of data[protein]) that
    GetNetworkExtremes does NOT return is taken as a gain-of-function prediction
    and compared against the functional extremes obtained with GetSD
    (funct_method == 'sd') or GetPercentage with 'highest' (otherwise).

    Prints the parameters used and displays the per-protein summary table.
    Returns None.
    """
    summary_columns = ['True Positives', 'False Positives', 'Coverage', 'Accuracy',
                       'Functional %', 'Prediction %']
    predict = pd.DataFrame(index=proteins, columns=summary_columns)
    use_sd = funct_method == 'sd'

    for protein in proteins:
        loss_predicted = GetNetworkExtremes(protein, network_mincount, measure_cutoffs, thresh=thresh)
        functional_df = data[protein]
        # Complement: positions not predicted as loss of function.
        network_extremes = {pos for pos in functional_df.columns if pos not in loss_predicted}

        if use_sd:
            funct_extremes = GetSD(functional_cutoff, functional_df)
        else:
            funct_extremes = GetPercentage(functional_cutoff, 'highest', functional_df)

        true_positives = len(network_extremes.intersection(funct_extremes))
        n_positions = len(functional_df.columns)

        # Filled cell by cell, in the same column order as the table header.
        row = {
            'True Positives': true_positives,
            'False Positives': len(network_extremes) - true_positives,
            'Coverage': ToPercentage(true_positives, funct_extremes),
            'Accuracy': ToPercentage(true_positives, network_extremes),
            'Functional %': ToPercentage(funct_extremes, n_positions),
            'Prediction %': ToPercentage(network_extremes, n_positions),
        }
        for column, value in row.items():
            predict.at[protein, column] = value

    if use_sd:
        method = f'{functional_cutoff} SD'
    else:
        method = f'{functional_cutoff*100}%'
    print("Gain of function predictions:")
    print(f'mincount = {network_mincount}, functional cutoff = {method}, measure cutoffs = {measure_cutoffs}')
    display(predict)

In [38]:
# Sweep network_mincount over 1, 2, 3 and 4 with fixed functional and measure cutoffs.
for mincount in (1, 2, 3, 4):
    ComparePredictionsGain3(0.4, mincount, measure_cutoffs=[1, 1, 1.5, 1.5])

Gain of function predictions:
mincount = 1, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,16,9,48.5%,64.0%,39.8%,30.1%
1d5r,55,42,45.1%,56.7%,39.7%,31.6%
1nd4,76,35,74.5%,68.5%,40.0%,43.5%
3dqw,49,46,51.0%,51.6%,39.7%,39.3%
4bz3,54,30,58.7%,64.3%,39.8%,36.4%


Gain of function predictions:
mincount = 2, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,19,19,57.6%,50.0%,39.8%,45.8%
1d5r,87,64,71.3%,57.6%,39.7%,49.2%
1nd4,85,55,83.3%,60.7%,40.0%,54.9%
3dqw,56,57,58.3%,49.6%,39.7%,46.7%
4bz3,66,43,71.7%,60.6%,39.8%,47.2%


Gain of function predictions:
mincount = 3, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,20,22,60.6%,47.6%,39.8%,50.6%
1d5r,98,74,80.3%,57.0%,39.7%,56.0%
1nd4,89,70,87.3%,56.0%,40.0%,62.4%
3dqw,63,69,65.6%,47.7%,39.7%,54.5%
4bz3,74,53,80.4%,58.3%,39.8%,55.0%


Gain of function predictions:
mincount = 4, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,24,28,72.7%,46.2%,39.8%,62.7%
1d5r,106,99,86.9%,51.7%,39.7%,66.8%
1nd4,94,97,92.2%,49.2%,40.0%,74.9%
3dqw,77,98,80.2%,44.0%,39.7%,72.3%
4bz3,78,75,84.8%,51.0%,39.8%,66.2%


In [36]:
# Loss-of-function predictions with mincount = 1 and the same functional and
# measure cutoffs as the gain-of-function sweep, shown for comparison.
# NOTE(review): this cell's execution count (36) is lower than the cells that
# precede it in the notebook (37, 38) -- confirm it still runs in document order.
ComparePredictionsLoss(0.4, 1,measure_cutoffs=[1,1,1.5,1.5])

Loss of function predictions:
mincount = 1, functional cutoff = 40.0%, measure cutoffs = [1, 1, 1.5, 1.5]


Unnamed: 0,True Positives,False Positives,Coverage,Accuracy,Functional %,Prediction %
1be9,30,28,90.9%,51.7%,39.8%,69.9%
1d5r,113,97,92.6%,53.8%,39.7%,68.4%
1nd4,89,55,87.3%,61.8%,40.0%,56.5%
3dqw,73,74,76.0%,49.7%,39.7%,60.7%
4bz3,89,58,96.7%,60.5%,39.8%,63.6%


Tenemos ligeramente más información, aunque no es tan fácil predecir las de ganancia como las de pérdida. 