Notes:

- Z-score normalize the WT interaction scores, then apply the same normalization to the MT scores
- graph-partitioned cross-validation
- bootstrap MT for PPI pairs
- estimate distributions in a projected space (PCA, autoencoder, etc.)
  - plot the projected space



Evaluation:
- Does normalization affect the estimated prior?
- Is there consistency across feature representations?

In [None]:
from itertools import chain

In [None]:
import pandas as pd
# Load the edgotyping table; the first CSV column is used as the row index.
edgotype_df = pd.read_csv("data/y2hEdgotyping/qY2H_edgotyping_data.csv",index_col=0)

In [None]:
# Show how many rows carry each clinical_significance value.
edgotype_df.clinical_significance.value_counts()

In [None]:
# Split the table on aa_change: rows labelled "WT" versus every other row.
wt_df = edgotype_df.loc[edgotype_df["aa_change"] == "WT"]
mt_df = edgotype_df.loc[edgotype_df["aa_change"] != "WT"]

In [None]:
# Preview the gene-id, symbol, aa_change and clinical_significance columns
# of the WT rows.
wt_df[["db_ensembl_gene_id","db_symbol","ad_ensembl_gene_id","ad_symbol","aa_change","clinical_significance"]]

In [None]:
def getScores(wt, mutants=None):
    """Collect the scores of one WT row and of the mutant rows for the same pair.

    Mutant rows are matched to ``wt`` on both ``db_orf_id`` and ``ad_orf_id``.
    Mutant rows with a missing value in any score column are dropped.

    Parameters
    ----------
    wt : pandas.Series
        One wild-type row. It must provide the five score columns, the four
        name columns, ``db_orf_id`` and ``ad_orf_id``.
    mutants : pandas.DataFrame, optional
        Table of mutant rows to search. Defaults to the notebook-level
        ``mt_df`` so existing one-argument calls keep working.

    Returns
    -------
    score_wt : numpy.ndarray
        Shape ``(1, 5)``; shape ``(0, 5)`` when any WT score is NaN.
    score_mts : numpy.ndarray
        Shape ``(n, 5)``, one row per retained mutant; ``(0, 5)`` when the
        WT row is rejected or no mutant qualifies.
    name_wt : list
        ``[wt[nameColumns]]``, or ``[]`` when the WT row is rejected.
    name_mts : list
        One name Series per row of ``score_mts``, in the same order.
    """
    scoreColumns = ['LWH1_f', 'LWH10_f', 'LWH25_f', 'LWA_f', 'LWAH1_f']
    nameColumns = ["db_symbol", "ad_symbol", "aa_change", "clinical_significance"]
    if mutants is None:
        mutants = mt_df

    score_wt = wt[scoreColumns].values.astype(float).reshape((1, -1))
    if np.isnan(score_wt).any():
        # A WT row with a missing score contributes nothing, mutants included.
        n_scores = len(scoreColumns)
        return np.zeros((0, n_scores)), np.zeros((0, n_scores)), [], []

    same_pair = ((mutants.db_orf_id == wt["db_orf_id"])
                 & (mutants.ad_orf_id == wt["ad_orf_id"]))
    mts = mutants[same_pair]

    # Select complete rows with a positional boolean mask rather than by index
    # label: a label lookup returns every row sharing a duplicated label, which
    # would make the names and the scores differ in length.
    complete = mts[scoreColumns].notna().all(axis=1).values
    score_mts = mts.loc[complete, scoreColumns].values.astype(float)
    name_mts = [row for _, row in mts.loc[complete, nameColumns].iterrows()]
    return score_wt, score_mts, [wt[nameColumns]], name_mts

In [None]:
import numpy as np

In [None]:
def encode_scores(scores, n_levels=5):
    """One-hot encode integer-valued score levels, column by column.

    Column ``j`` of ``scores`` owns output slots ``n_levels * j`` through
    ``n_levels * j + n_levels - 1``; the slot at offset ``int(level)`` is set
    to 1. With the default ``n_levels=5`` and five input columns the output
    has 25 columns.

    Parameters
    ----------
    scores : array-like, shape (n_rows, n_cols)
        Score levels. Values are truncated toward zero, as ``int()`` does.
    n_levels : int, optional
        Number of distinct levels per column (default 5).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, n_cols * n_levels)``.

    Raises
    ------
    ValueError
        If ``scores`` is not 2-D, contains NaN/inf, or holds a level outside
        ``[0, n_levels)``. Such a level would otherwise set a bit inside a
        neighbouring column's slot range without any error.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.ndim != 2:
        raise ValueError("scores must be a 2-D array")
    n_rows, n_cols = scores.shape
    encoded = np.zeros((n_rows, n_cols * n_levels))
    if scores.size == 0:
        return encoded
    if not np.isfinite(scores).all():
        raise ValueError("scores must not contain NaN or infinite values")

    levels = scores.astype(int)
    if (levels < 0).any() or (levels >= n_levels).any():
        raise ValueError(
            "score levels must lie in [0, {})".format(n_levels))

    # Offset each column's level into that column's own slot range, then set
    # all bits in a single indexed assignment.
    slots = levels + n_levels * np.arange(n_cols)
    encoded[np.arange(n_rows)[:, None], slots] = 1
    return encoded

In [None]:
# Build the WT and MT score matrices, plus the per-WT name lists, by calling
# getScores on every WT row. ENCODE switches between one-hot encoded scores
# (dim 25) and raw scores (dim 5).
ENCODE = False
if ENCODE:
    dim = 25
    penalty='l2'
else:
    dim = 5
    penalty=None
# NOTE(review): `penalty` is assigned here, but the cross-validation cell
# constructs LogisticRegression(penalty="l2") directly -- confirm the intent.

# Per-WT chunks are gathered in lists and stacked once after the loop.
# Concatenating inside the loop re-copies the growing arrays on every
# iteration. The empty seed chunk keeps the (0, dim) result when wt_df has
# no rows.
wt_chunks = [np.zeros((0,dim))]
mt_chunks = [np.zeros((0,dim))]
names_wt = []
names_mt = []
for wt_id, wt in wt_df.iterrows():
    score_wt_i, score_mt_i,name_wt,name_mts = getScores(wt)
    # Every retained mutant score row must have exactly one name record.
    assert len(score_mt_i) == len(name_mts)
    if ENCODE:
        score_wt_i = encode_scores(score_wt_i)
        score_mt_i = encode_scores(score_mt_i)
    wt_chunks.append(score_wt_i)
    mt_chunks.append(score_mt_i)
    # Names are kept as one list per WT row; they are flattened later.
    names_wt.append(name_wt)
    names_mt.append(name_mts)
score_wt = np.concatenate(wt_chunks)
score_mt = np.concatenate(mt_chunks)

In [None]:
# Check the shape of the stacked WT score matrix.
score_wt.shape

In [None]:
# Display the stacked WT score matrix.
score_wt

In [None]:
# Mean of each WT row, taken across its score columns (axis 1).
wt_means = np.mean(score_wt, axis=1)

In [None]:
# Histogram of the per-row WT means, using 25 bins.
# NOTE(review): `plt` is imported in a later cell (import matplotlib.pyplot
# as plt); that cell must have been run before this one.
plt.hist(wt_means,bins=25)

In [None]:
# Display the WT score matrix again for inspection.
score_wt

In [None]:
# Check the shape of the stacked MT score matrix.
score_mt.shape

In [None]:
# Feature matrix: WT rows stacked on top of MT rows.
X = np.vstack([score_wt, score_mt])
# Labels in the same order: 1.0 for every WT row, 0.0 for every MT row.
y = np.repeat([1.0, 0.0], [score_wt.shape[0], score_mt.shape[0]])

In [None]:
# Flatten the per-WT name lists into a single list of records, WT groups first
# and MT groups second, then tabulate them with a fresh 0..n-1 index.
namerecords = [record for group in names_wt + names_mt for record in group]
names = pd.DataFrame(namerecords, index=range(len(namerecords)))

In [None]:
# Spot-check two name records by index label.
# NOTE(review): the labels 4484 and 4485 are hard-coded and depend on the
# loaded dataset -- presumably the boundary between WT and MT records; confirm.
names.loc[[4484,4485]]

In [None]:
# Display the full table of name records.
names

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [None]:
import matplotlib.pyplot as plt

In [None]:
import dist_curve
from dist_curve.curve_constructor import makeCurve, plotCurve
from dist_curve.model import getTrainedEstimator

In [None]:
# Load the trained estimator from dist_curve. It is used later through
# model.predict(...) to produce the per-fold prior estimate.
# NOTE(review): the path is absolute and machine-specific.
model = getTrainedEstimator("/data/dzeiberg/ClassPriorEstimation/model.hdf5")

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
from ppi.nnpu import getPosterior

In [None]:
# K-fold evaluation. For every fold:
#   1. fit a logistic regression separating WT rows (y == 1) from MT rows (y == 0),
#   2. histogram its test scores for the two groups and record the ROC AUC,
#   3. build a curve from the two score sets and pass it to `model`, whose
#      output is recorded as the fold's prior estimate,
#   4. call getPosterior with that estimate and histogram the returned
#      network's predictions on the test rows.
# Each fold gets one column in both figures, so the number of folds is set once
# here and passed explicitly to KFold rather than relying on its default
# (which differs between scikit-learn versions).
N_SPLITS = 5
clfs = []
aucs = []
priors = []
fig,ax = plt.subplots(2,N_SPLITS,figsize=(12,4),sharex=True,sharey=True)
fig2,ax2 = plt.subplots(2,N_SPLITS,figsize=(16,3))
# NOTE(review): shuffle=True without random_state means the folds differ
# between runs.
for i,(trainInd,testInd) in enumerate(KFold(n_splits=N_SPLITS,shuffle=True).split(X,y)):
    print(f"~~~~~~~~~~ Fold {i} ~~~~~~~~~~")
    XTr,yTr = X[trainInd],y[trainInd]
    XTe, yTe = X[testInd],y[testInd]
    names_Te = names.iloc[testInd]
    # NOTE(review): the `penalty` variable set alongside ENCODE is not used here.
    clf_i = LogisticRegression(penalty="l2")
    clf_i.fit(XTr,yTr)
    # Probability of class 1 for every test row, split by label into column vectors.
    scores = clf_i.predict_proba(XTe)[:,1]
    posScores = scores[yTe == 1].reshape((-1,1))
    mixScores = scores[yTe == 0].reshape((-1,1))
    auc = roc_auc_score(yTe,scores)
    aucs.append(auc)
    print("AUC: {:.3f}".format(auc))
    # The second histogram reuses the first one's bin edges so the panels are comparable.
    n,bins,patches = ax[0,i].hist(posScores,
               bins=10,density=True)
    ax[1,i].hist(mixScores,
                 bins=bins,density=True,alpha=.5)
    ax[0,i].set_title(f"Fold-{i+1} Positive")
    ax[1,i].set_title(f"Fold-{i+1} Unlabeled")
    clfs.append(clf_i)
    curve_i = makeCurve(posScores,mixScores)
    # Plot the curve min-max scaled to [0, 1].
    # NOTE(review): the x grid has 100 points; this assumes curve_i has 100 values.
    ax2[0,i].plot(np.arange(0,1,.01),
                (curve_i - curve_i.min()) / (curve_i.max() - curve_i.min()))
    # The estimator is fed the curve normalised to sum to 1, as a single row.
    alpha_i = model.predict(curve_i.reshape((1,-1)) / curve_i.sum(),
                           verbose=0)[0,0]
    print(f"prior est: {alpha_i:.3f}")
    ax2[0,i].axvline(alpha_i,0,1)
    ax2[0,i].set_title(f"Fold {i+1}")
    priors.append(alpha_i)
    # getPosterior returns training predictions and a fitted network
    # (presumably a non-negative PU learner, judging by the module name -- confirm).
    train_preds,net_i = getPosterior(XTr,yTr.reshape((-1,1)),alpha_i)
    test_preds = net_i.predict(XTe)

    # Show the five test records with the lowest predicted values.
    print(names_Te.iloc[np.argsort(test_preds.ravel())][:5])
    ax2[1,i].hist(test_preds)
print(f"Average AUC_PU: {np.mean(aucs):.2f}")
print(f"Average Prior: {np.mean(priors):.2f}")