# Measuring the Correlation Between MutPred2 Pathogenicity Score and Y2H Score Change
> Here, I investigate whether there is a correlation between the MutPred2 probability of pathogenicity and the change in Y2H assay scores resulting from mutations.

## Read in MutPred2 Output

In [None]:
import pandas as pd

In [None]:
import seaborn as sns

In [None]:
def validateSeqs(r):
    """Check that a row's amino-acid change is consistent with its sequence.

    ``r.aa_change_mt`` is parsed as a three-letter reference residue, a
    1-based position, and a three-letter alternate residue (the first three
    characters, the middle, and the last three characters respectively).
    Both residues are converted to one-letter codes through ``aaTable``.

    Parameters
    ----------
    r : row-like object exposing ``aa_change_mt`` and ``p_seq_db``.

    Returns
    -------
    bool
        True when the residue of ``r.p_seq_db`` at the parsed position equals
        the reference residue and the change is not synonymous. False when
        the position lies outside the sequence.

    Raises
    ------
    KeyError
        If either three-letter code is not a key of ``aaTable``.
    ValueError
        If the position part is not an integer.
    """
    change = r.aa_change_mt
    ref,loc,alt = aaTable[change[:3]],int(change[3:-3]),aaTable[change[-3:]]
    # Without this guard, loc == 0 would index position -1 (the last residue)
    # and a position past the end would raise IndexError.
    if not 1 <= loc <= len(r.p_seq_db):
        return False
    return (r.p_seq_db[loc-1] == ref) and (ref != alt)

In [None]:
# One-letter -> three-letter amino-acid codes for the 20 standard residues.
aaPairs = {
    "A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys",
    "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile",
    "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro",
    "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val",
}

# Inverse lookup: three-letter -> one-letter code.
aaTable = {threeLetter: oneLetter for oneLetter, threeLetter in aaPairs.items()}

In [None]:
# Load the MutPred2 output as a CSV table. Later cells read its "ID",
# "Substitution", "MutPred2 score" and
# "Molecular mechanisms with Pr >= 0.01 and P < 0.99" columns.
mp = pd.read_csv("data/y2hEdgotyping/mutpred2Results/variants.faa.out")

In [None]:
# Show the raw MutPred2 table as the cell output.
mp

## Read in Y2H pre-processed version from nb_01

In [None]:
# Load the merged Y2H table, using the first CSV column as the index.
# Later cells read db_orf_id, ad_orf_id, aa_change_mt, p_seq_db and the
# per-medium *_f_wt / *_f_mt score columns from it.
y2h = pd.read_csv("data/y2hEdgotyping/y2HMerged.csv",index_col=0)

### Get the degree counts of each "central node"

In [None]:
# Number of non-null ad_orf_id entries per (db_orf_id, aa_change_mt) pair,
# sorted ascending.
y2h.groupby(by=["db_orf_id","aa_change_mt"]).ad_orf_id.count().sort_values()#.hist()

## Merge the MutPred2 and Y2H dataframes

In [None]:
# Derive the two merge keys in the form the Y2H table uses:
#   db_orf_id    - the MutPred2 ID with the "db_orf_" text removed
#   aa_change_mt - the substitution with its first and last characters
#                  expanded from one-letter to three-letter codes
orfIds = mp.ID.apply(lambda orfId: orfId.replace("db_orf_",""))
threeLetterChanges = mp.Substitution.apply(
    lambda sub: aaPairs[sub[0]] + sub[1:-1] + aaPairs[sub[-1]])
mp = mp.assign(db_orf_id=orfIds,
               aa_change_mt=threeLetterChanges)

In [None]:
# Show the MutPred2 table, now with the db_orf_id and aa_change_mt keys.
mp

In [None]:
# Cast db_orf_id to str so it has the same type as the string key derived
# from the MutPred2 IDs when the two tables are merged.
y2h = y2h.assign(db_orf_id=y2h.db_orf_id.astype(str))

In [None]:
# Show the Y2H table after the db_orf_id cast.
y2h

In [None]:
# Inner-join the Y2H rows that pass validateSeqs with the MutPred2
# predictions, matching on ORF id and amino-acid change.
mergeKeys = ["db_orf_id","aa_change_mt"]
validY2h = y2h[y2h.apply(validateSeqs,axis=1)]
df = pd.merge(validY2h,mp,
              left_on=mergeKeys,right_on=mergeKeys,
              how="inner")

In [None]:
# Show the merged Y2H / MutPred2 table.
df

# Calculate the relative score for each variant-PPI pair
> Each row of df is composed of scores for a single PPI in 5 selective media (settings) before and after mutating one of the two interacting proteins.
Each of the 10 scores can range from 0-4, indicating the number of technical replicates in which a colony grew.
To measure a variant's effect on a single protein-protein interaction, I calculate the change in counts (wild type minus mutant) for each of the 5 selective media. Previously, a change of 2 or more was interpreted as disrupting the PPI.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-medium change in score for each variant-PPI row, computed as wild type
# minus mutant. Plain column arithmetic gives the same values as a row-wise
# apply, without calling a Python lambda once per row.
df = df.assign(delta1=df.LWH1_f_wt - df.LWH1_f_mt,
               delta2=df.LWH10_f_wt - df.LWH10_f_mt,
               delta3=df.LWH25_f_wt - df.LWH25_f_mt,
               delta4=df.LWA_f_wt - df.LWA_f_mt,
               delta5=df.LWAH1_f_wt - df.LWAH1_f_mt)

In [None]:
# Compact view of the identifying columns and the five deltas, keeping only
# rows where none of them is missing.
preview = df[["ad_orf_id", "db_orf_id","Substitution","delta1","delta2","delta3","delta4","delta5"]].dropna()

In [None]:
# Drop rows missing any identifier or delta value. dropna(subset=...) is
# used instead of `.isna().any(1)` because the positional axis argument is
# rejected by pandas >= 2.0.
df = df.dropna(subset=["ad_orf_id", "db_orf_id","Substitution",
                       "delta1","delta2","delta3","delta4","delta5"])

In [None]:
# Show the table after removing rows with missing identifiers or deltas.
df

In [None]:
import numpy as np

In [None]:
# Store the five delta columns of the preview table as 32-bit integers.
for deltaCol in ("delta1","delta2","delta3","delta4","delta5"):
    preview[deltaCol] = preview[deltaCol].astype(np.int32)

In [None]:
# Show the preview table with integer deltas.
preview

## Parse probabilities and p-values of the functional effects of each variant estimated by MutPred2

In [None]:
import re

In [None]:
# Verbose-mode pattern for a decimal number with optional sign and exponent.
# Written as a raw string so `\d` and `\.` reach the regex engine without
# relying on invalid string escapes, and compiled once instead of per call.
_NUMERIC_CONST_RX = re.compile(
    r'[-+]? (?: (?: \d* \. \d+ ) | (?: \d+ \.? ) )(?: [Ee] [+-]? \d+ ) ?',
    re.VERBOSE)


def getFloats(s):
    """Return every numeric constant found in ``s`` as a list of floats.

    Parameters
    ----------
    s : str
        Text to scan.

    Returns
    -------
    list of float
        The numbers in order of appearance; empty when ``s`` has none.
    """
    return [float(f) for f in _NUMERIC_CONST_RX.findall(s)]

In [None]:
# For every row, map each PPI-related mechanism name (the text before the
# first "(") to the list of numbers found in that mechanism's entry.
ppiMechanisms = ("Altered PPI_residue","Altered PPI_hotspot","Altered MoRF")
ppiVals = [
    {entry[:entry.find("(")].strip(): getFloats(entry)
     for entry in mechanisms.split(";")
     if any(name in entry for name in ppiMechanisms)}
    for mechanisms in df["Molecular mechanisms with Pr >= 0.01 and P < 0.99"]
]

In [None]:
# Attach the per-row mechanism dictionaries as a new "ppiVals" column.
df = df.assign(ppiVals=ppiVals)

In [None]:
# Show the table including the new ppiVals column.
df

In [None]:
# Non-null count of every column for each (db_orf_id, aa_change_mt) variant.
df.groupby(["db_orf_id","aa_change_mt"]).count()

In [None]:
# Show the ad_ensembl_gene_id_mt column.
df.ad_ensembl_gene_id_mt

## Prepare for Plotting

In [None]:
def getVal(d,key):
    """Return the first value stored under ``key`` in ``d``, or NaN.

    Parameters
    ----------
    d : dict
        Maps names to sequences of numbers.
    key : hashable
        Name to look up.

    Returns
    -------
    The first element of ``d[key]``, or ``np.nan`` when ``key`` is absent or
    its sequence is empty.
    """
    if key not in d:
        return np.nan
    values = d[key]
    # An entry with no parsed numbers counts as missing instead of raising
    # IndexError.
    return values[0] if len(values) > 0 else np.nan

In [None]:
# Aggregate per variant (db_orf_id, aa_change_mt). Each row of `vals` holds:
#   columns 0-4   fraction of the variant's PPIs that are affected, per delta
#   columns 5-9   number of the variant's PPIs that are affected, per delta
#   columns 10-12 mean probability for the three PPI-related mechanisms
#   column  13    mean MutPred2 score
#   column  14    number of PPI rows for the variant
# A PPI is "affected" when delta >= 2, or |delta| >= 2 if GAIN_OR_LOSS is set.
grp = df.groupby(["db_orf_id","aa_change_mt"])
GAIN_OR_LOSS = False
keys = []
vals = np.zeros((len(grp),15))
for row,(variantKey,variantPPIs) in enumerate(grp):
    keys.append(variantKey)
    for level in range(5):
        delta = variantPPIs[f"delta{level+1}"]
        if GAIN_OR_LOSS:
            delta = np.abs(delta)
        affected = delta >= 2
        vals[row,level] = affected.mean()
        vals[row,5+level] = affected.sum()
    for offset,mechanism in enumerate(("Altered PPI_residue",
                                       "Altered PPI_hotspot",
                                       "Altered MoRF")):
        # nanmean skips rows where the mechanism is absent.
        vals[row,10+offset] = np.nanmean(
            variantPPIs["ppiVals"].apply(lambda d: getVal(d,mechanism)))
    vals[row,13] = variantPPIs["MutPred2 score"].mean()
    vals[row,14] = len(variantPPIs)

In [None]:
# Wrap the statistics matrix in a DataFrame indexed by the variant keys.
# fD{i} / numD{i} are 0-indexed, so fD0 / numD0 belong to delta1, and so on.
statColumns = ([f"fD{i}" for i in range(5)]
               + [f"numD{i}" for i in range(5)]
               + ["AlteredPPIResiduePr",
                  "AlteredPPIHotspotPr",
                  "AlteredMoRFPr",
                  "MutPred2 score",
                  "db_degree"])
variantIndex = pd.MultiIndex.from_tuples(keys)
statDF = pd.DataFrame(vals,columns=statColumns,index=variantIndex)

In [None]:
# Write the per-variant statistics table to a CSV file.
statDF.to_csv("data/y2hEdgotyping/variantStatistics.csv")

In [None]:
# Show the per-variant statistics table.
statDF

In [None]:
# Number of variants for each value of numD2, ordered by that value.
statDF.numD2.value_counts().sort_index()

In [None]:
def mpDigitize(mpScore):
    """Assign each MutPred2 score to one of seven ordered bins (0-6).

    Bin boundaries:
        0: s <= .01            4: .737 <= s < .829
        1: .01  < s <= .197    5: .829 <= s < .932
        2: .197 < s <= .391    6: s >= .932
        3: .391 < s < .737

    Parameters
    ----------
    mpScore : iterable of float
        Scores to bin.

    Returns
    -------
    numpy.ndarray of int
        One bin number per input score, in input order.

    Raises
    ------
    ValueError
        If a score satisfies none of the comparisons (e.g. NaN).
    """
    def _binFor(s):
        # Lower bins are closed on the right ...
        for binNum,upper in enumerate((.01,.197,.391)):
            if s <= upper:
                return binNum
        # ... upper bins are open on the right.
        for binNum,upper in enumerate((.737,.829,.932),start=3):
            if s < upper:
                return binNum
        if s >= .932:
            return 6
        raise ValueError(f"Couldn't bin value {s}")

    return np.array([_binFor(s) for s in mpScore],dtype=int)

In [None]:
# Restrict to variants whose degree lies in [MINDEGREE, MAXDEGREE) and that
# have no missing statistic, then bin each score column.
NBINS=5
MINDEGREE = 1
MAXDEGREE = np.inf
degreeOk = (statDF.db_degree >= MINDEGREE) & (statDF.db_degree < MAXDEGREE)
suffDf = statDF[degreeOk].dropna()

# Interior percentile cut points that split the data into NBINS quantile bins.
quantileCuts = np.arange(100/NBINS,100,100/NBINS)
mpPercentiles = np.percentile(suffDf["MutPred2 score"],quantileCuts)
altResPercentiles = np.nanpercentile(suffDf["AlteredPPIResiduePr"],quantileCuts)
altHotPercentiles = np.nanpercentile(suffDf["AlteredPPIHotspotPr"],quantileCuts)
altMoRFPercentiles = np.nanpercentile(suffDf["AlteredMoRFPr"],quantileCuts)

# Quantile bin per score column, plus the fixed-threshold MutPred2 bin.
suffDf = suffDf.assign(
    MutPred2ScoreBin=np.digitize(suffDf["MutPred2 score"],mpPercentiles,right=True),
    MutPred2PaperBin=mpDigitize(suffDf["MutPred2 score"]),
    alteredResidueBin=np.digitize(suffDf["AlteredPPIResiduePr"],altResPercentiles,right=True),
    alteredHotspotBin=np.digitize(suffDf["AlteredPPIHotspotPr"],altHotPercentiles,right=True),
    alteredMoRFBin=np.digitize(suffDf["AlteredMoRFPr"],altMoRFPercentiles,right=True))

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
# Four-colour discrete colormap built from seaborn's "GnBu" palette; makePlot
# uses one colour per bar series.
cm = ListedColormap(sns.color_palette("GnBu", 4))

## Plot

In [None]:
def makePlot(gb,ax,xlab,ticks,lgndLoc="upper left",width=.75,byCount=False,lgnd=False,yLab=False):
    """Draw a clustered bar chart of variant counts per score bin on ``ax``.

    Each group of ``gb`` becomes one cluster, placed at the x position given
    by its group key. Within a cluster there are four bars, coloured with the
    module-level colormap ``cm``, counting how many of the group's values
    fall into each value bucket.

    Parameters
    ----------
    gb : grouped pandas Series
        Values grouped by an integer bin number.
    ax : matplotlib Axes
        Axes to draw on.
    xlab : str
        X-axis label.
    ticks : sequence
        Tick labels; ticks are placed at 0 .. len(ticks)-1.
    lgndLoc : str
        Unused; kept so existing calls keep working.
    width : float
        Total width of one bar cluster.
    byCount : bool
        True: values are counts, bucketed as <=1, (1,5], (5,10], >10.
        False: values are fractions, bucketed into quarters of [0, 1].
    lgnd : bool
        Shrink the axes and draw the legend to the right of them.
    yLab : bool
        Label the y axis.
    """
    if byCount:
        numDCutoffs = [1,5,10]
        nSeries = len(numDCutoffs) + 1
        counts = np.zeros((len(gb),nSeries))
        indices = []
        for i,(idx,g) in enumerate(gb):
            indices.append(idx)
            # right=True makes every bucket closed on the right, so bucket 0
            # holds values <= 1 and the last bucket holds values > 10.
            groupBins = np.digitize(g,numDCutoffs,right=True)
            binNum,binCount = np.unique(groupBins,return_counts=True)
            counts[i,binNum] += binCount
        positions = np.array(indices)
        for binNum,binHeights in enumerate(counts.T):
            # Labels state what each bucket actually contains.
            if binNum == 0:
                lbl = f"[0,{numDCutoffs[0]}]"
            elif binNum == len(numDCutoffs):
                lbl = f">{numDCutoffs[-1]}"
            else:
                lbl = f"({numDCutoffs[binNum-1]},{numDCutoffs[binNum]}]"
            # align="edge" treats x as the bar's left edge, so the cluster
            # spans exactly [-width/2, width/2] around its tick.
            ax.bar(positions + ((binNum/nSeries) - .5) * width,
                   height=binHeights,width=width/nSeries,
                   color=cm(binNum),label=lbl,align="edge")
    else:
        lowCut,highCut = np.arange(0,1,.25),np.arange(.25,1.25,.25)
        # Raise the last upper bound so a fraction of exactly 1.0 is counted.
        highCut[-1] = 1.01
        counts = [gb.aggregate(lambda fracs: ((fracs >= c[0]) & (fracs < c[1])).sum()) for c in zip(lowCut,highCut)]
        nSeries = len(counts)
        for i,height in enumerate(counts):
            if i == nSeries-1:
                lbl = "[0.75,1.0]"
            else:
                lbl = f"[{lowCut[i]},{highCut[i]})"
            ax.bar(height.index + ((i/nSeries) - .5) * width,
                   height=height.values,width=width/nSeries,
                   color=cm(i),label=lbl,align="edge")
    ax.set_xlabel(xlab)
    if yLab:
        ax.set_ylabel("Number Mutations")

    ax.set_xticks(np.arange(len(ticks)).astype(int))
    ax.set_xticklabels(ticks)
    ax.set_xlim(-1,len(ticks))
    if lgnd:
        handles, labels = ax.get_legend_handles_labels()
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(handles[::-1], labels[::-1],loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
# One figure per delta column, five panels each: the number of affected PPIs
# per variant (numD{d}) broken down by each binning of the predictor scores.
# Passing the fD{d} columns with byCount=False to makePlot would plot the
# fraction of affected PPIs instead.
quantileTicks = np.arange(NBINS)
panelSpecs = [
    # (bin column, x-axis label, tick labels, extra makePlot keyword arguments)
    ("MutPred2ScoreBin","MutPred2 Score Quantile",
     quantileTicks,dict(lgndLoc="upper right",yLab=True)),
    ("MutPred2PaperBin","MutPred2 Score ClinGen SVI WG Recommendation",
     ["B3","B2","B1","-","P1","P2","P3"],{}),
    ("alteredResidueBin","Altered PPI Residue Quantile",quantileTicks,{}),
    ("alteredHotspotBin","Altered PPI Hotspot Quantile",quantileTicks,{}),
    ("alteredMoRFBin","Altered MoRF Quantile",quantileTicks,dict(lgnd=True)),
]
for d in range(5):
    fig,ax = plt.subplots(1,5,figsize=(24,6),sharey=True)
    for panel,(binCol,xlab,ticks,extra) in zip(ax,panelSpecs):
        makePlot(suffDf.groupby(binCol)[f"numD{d}"],panel,xlab,ticks,
                 byCount=True,**extra)
    title = (f"Level {d+1} - Number of PPIs Affected |WT-MT| >= 2"
             if GAIN_OR_LOSS else
             f"Level {d+1} - Number of PPIs Affected (WT-MT) >= 2")
    fig.suptitle(title)
#     plt.savefig(f"figs/mutPredCorrelation_lvl_{d+1}.pdf",format="pdf")
#     plt.savefig(f"figs/mutPredCorrelation_lvl_{d+1}.jpg",format="jpg")
    plt.show()
    

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of the MutPred2 score as a predictor of whether a variant has at
# least one affected PPI according to numD2. The numD columns are 0-indexed
# while the delta columns are 1-indexed, so numD2 is derived from delta3.
roc_auc_score((suffDf.numD2 > 0).values,
              suffDf["MutPred2 score"].values)

In [None]:
# Same ROC AUC, using the mean "Altered PPI_residue" probability as predictor.
roc_auc_score((suffDf.numD2 > 0).values,
              suffDf["AlteredPPIResiduePr"].values)

In [None]:
# Same ROC AUC, using the mean "Altered PPI_hotspot" probability as predictor.
roc_auc_score((suffDf.numD2 > 0).values,
              suffDf["AlteredPPIHotspotPr"].values)

In [None]:
# Same ROC AUC, using the mean "Altered MoRF" probability as predictor.
roc_auc_score((suffDf.numD2 > 0).values,
              suffDf["AlteredMoRFPr"].values)

In [None]:
# Show the filtered, binned statistics table used for the plots and AUCs.
suffDf