In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# One-letter -> three-letter codes for the 20 standard amino acids.
aaPairs = {
    "A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys",
    "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile",
    "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro",
    "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val",
}

# Reverse lookup: three-letter -> one-letter code.
aaTable = {threeLetter: oneLetter for oneLetter, threeLetter in aaPairs.items()}

# Vidal Lab Data

In [None]:
# Load the quantitative Y2H edgotyping table; the first CSV column becomes the index.
edgotypes = pd.read_csv(
    "data/y2hEdgotyping/qY2H_edgotyping_data.csv",
    index_col=0,
)

- ad_orf_id : ID of the ORF fused on Activation Domain (AD)
- db_orf_id : ID of the ORF fused on DNA-binding Domain (DB)
- db_mut_id : ID of the variant
- standard_batch : Name of the experiment
- assay_id : Type of Y2H experiment (details to be explained at our next meeting)
- LW : Selective media score to control for the presence of both plasmids (AD and DB); scores range from 0-4 and should be 3+ for most
- LWH1_f : Selective media score to test for interaction, for yeast spotted on SC -LW -histidine +1 mM 3AT (“level 1”); scores range from 0-4
- LWH10_f : Selective media score to test for interaction, for yeast spotted on SC -LW -histidine +10 mM 3AT (“level 2”); scores range from 0-4
- LWH25_f : Selective media score to test for interaction, for yeast spotted on SC -LW -histidine +25 mM 3AT (“level 3”); scores range from 0-4
- LWA_f : Selective media score to test for interaction, for yeast spotted on SC -LW -adenine (“level 4”); scores range from 0-4
- LWAH1_f : Selective media score to test for interaction, for yeast spotted on SC -LW -adenine -histidine +1 mM 3AT (“level 5”); scores range from 0-4
- n_condition : number of valid conditions (some conditions might be ignored if contamination for instance)
- score : sum of positive levels; a level is considered positive if its score > 1
- score_norm : score/n_condition
- ad_symbol : HGNC symbol of the ORF fused on Activation Domain (AD)
- ad_ensembl_gene_id : Ensembl gene ID of the ORF fused on Activation Domain (AD)
- db_symbol : HGNC symbol of the ORF fused on the DNA-binding Domain (DB)
- db_ensembl_gene_id : Ensembl gene ID of the ORF fused on the DNA-binding Domain (DB)
- nt_change : nt_change
- aa_change : aa_change
- clinical_significance : clinical_significance from ClinVar
- allele_score : score_norm
- wt_score : score_norm of the respective wild-type (WT score_norm if the row is WT)

In [None]:
# Split rows by ClinVar label: everything not labelled "WT" is treated as a variant row.
mtSub = edgotypes.loc[edgotypes["clinical_significance"].ne("WT")]
wtSub = edgotypes.loc[edgotypes["clinical_significance"].eq("WT")]

In [None]:
# Show every column whose name contains "LW", followed by the summed score.
mtSub.loc[:, [*mtSub.filter(like="LW").columns, "score"]]

In [None]:
# Wild-type rows x "LW"-named columns as a plain array; rows with any missing value are dropped.
wtTraj = wtSub.filter(like="LW").dropna().values

In [None]:
# Variant rows x "LW"-named columns as a plain array; rows with any missing value are dropped.
mtTraj = mtSub.filter(like="LW").dropna().values

In [None]:
def _plotScoreBand(traj, color, label):
    """Draw the per-condition mean of `traj` with a mean +/- 1.96*std band.

    traj  : 2-D array, one row per observation and one column per condition
            (statistics are taken along axis 0).
    color : matplotlib color used for both the band and the mean line.
    label : legend entry for the mean line.
    """
    center = traj.mean(0)
    halfWidth = 1.96 * traj.std(0)
    # Derive the x positions from the data instead of hard-coding six conditions.
    conditions = range(traj.shape[1])
    # Semi-transparent band so the mean line and the other group stay visible.
    plt.fill_between(conditions, center - halfWidth, center + halfWidth,
                     alpha=.5, color=color)
    plt.plot(conditions, center, color=color, label=label)


_plotScoreBand(wtTraj, "blue", "WT")
_plotScoreBand(mtTraj, "orange", "MT")
plt.xlabel("Selective condition (LW* column index)")
plt.ylabel("Score (mean ± 1.96 SD)")
plt.legend()

In [None]:
# Attach the wild-type measurements to every variant row that shares the same
# DB/AD ORF pair. Overlapping column names get "_mt" (variant) / "_wt" (wild type).
# NOTE(review): if a pair has several WT rows, each variant row is repeated once
# per WT row -- confirm that is intended.
mergedEdgoTypes = mtSub.merge(
    wtSub,
    how="left",
    on=["db_orf_id", "ad_orf_id"],
    suffixes=("_mt", "_wt"),
)

In [None]:
# Load the ORF sequence table; the first CSV column becomes the index.
orf_seqs = pd.read_csv(
    "data/y2hEdgotyping/ORF_sequence.csv",
    index_col=0,
)

In [None]:
# Join the ORF sequence table twice: first for the DB ORF (left join), then for
# the AD ORF. The second join uses pandas' default join type (inner), and the
# sequence-table columns that collide are suffixed "_db" / "_ad".
y2HMerged = (
    mergedEdgoTypes
    .merge(orf_seqs, left_on="db_orf_id", right_on="orf_id", how="left")
    .merge(orf_seqs, left_on="ad_orf_id", right_on="orf_id", suffixes=["_db", "_ad"])
)

In [None]:
# Display the merged variant / wild-type table with the ORF sequence columns joined on.
y2HMerged

In [None]:
def validateSeqs(r):
    """Return True when a row's amino-acid change is consistent with its DB sequence.

    The change string `r.aa_change_mt` is parsed as
    <three-letter ref><1-based position><three-letter alt>. The row is valid when
    the residue of `r.p_seq_db` at that position equals the reference residue and
    the change is not synonymous (ref != alt).

    Rows that cannot be checked return False instead of raising, so the function
    is safe to use as a `DataFrame.apply(..., axis=1)` filter: missing/non-string
    change or sequence, unknown residue codes, non-integer positions, and
    positions outside 1..len(sequence) (position 0 would otherwise wrap around to
    the last residue through negative indexing).
    """
    change, seq = r.aa_change_mt, r.p_seq_db
    if not isinstance(change, str) or not isinstance(seq, str):
        return False
    try:
        ref, alt = aaTable[change[:3]], aaTable[change[-3:]]
        loc = int(change[3:-3])
    except (KeyError, ValueError):
        return False
    if not 1 <= loc <= len(seq):
        return False
    return (seq[loc-1] == ref) and (ref != alt)

In [None]:
# Keep only the rows whose amino-acid change passes validateSeqs.
y2HMerged.loc[y2HMerged.apply(validateSeqs, axis="columns")]

In [None]:
# Export the rows that pass validateSeqs to CSV.
(
    y2HMerged
    .loc[y2HMerged.apply(validateSeqs, axis="columns")]
    .to_csv("data/y2H_edgotyping_10_7_22/y2HMerged.csv")
)

# Look into Overlap with MaveDB

In [None]:
# Every gene symbol that appears on either the DB or the AD side of the merged table.
y2HSymbols = set(y2HMerged["db_symbol_mt"]) | set(y2HMerged["ad_symbol_mt"])

In [None]:
import requests

In [None]:
# Fetch the MaveDB target list. A timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of as a confusing JSON/KeyError failure in a later cell.
r = requests.get("https://www.mavedb.org/api/target/", timeout=30)
r.raise_for_status()

In [None]:
# Names of all targets returned by the MaveDB request.
maveSymbols = {target["name"] for target in r.json()}

In [None]:
# Symbols present both in MaveDB and in the Y2H table.
maveSymbols & y2HSymbols

In [None]:
# Rows whose DB-side symbol is a MaveDB target name (the AD side is not considered here).
maveIntersection = y2HMerged.loc[y2HMerged["db_symbol_mt"].isin(maveSymbols)]

In [None]:
# Distinct (variant id, DB ORF id) combinations in the overlap.
maveIntersection.loc[:, ["db_mut_id_mt", "db_orf_id"]].drop_duplicates()

In [None]:
# Build the "p."-prefixed protein change used as the join key against the MaveDB
# score tables. Vectorised concatenation leaves missing aa_change_mt values as
# NaN instead of raising TypeError the way a row-wise `"p." + s` lambda does.
maveIntersection = maveIntersection.assign(hgvs_pro="p." + maveIntersection["aa_change_mt"])

In [None]:
# Number of overlap rows per DB-side symbol.
maveIntersection["db_symbol_mt"].value_counts()

In [None]:
# Load the downloaded MaveDB score files; the column header is on the fifth
# line of each file (header=4), earlier lines are skipped.
maves = [
    pd.read_csv(path, header=4)
    for path in (
        "data/maveDB/urn_mavedb_00000096-a-1_scores_GATK.csv",
        "data/maveDB/urn_mavedb_00000001-d-1_scores_TPK1.csv",
    )
]

In [None]:
# Inspect the second loaded MaveDB score table (the TPK1 file).
maves[1]

In [None]:
# Inner-join the overlap rows with the first MaveDB score table on the protein change.
# NOTE(review): the join key is hgvs_pro only; rows from a different gene that
# happen to share the same protein-change string would also match -- confirm
# whether maveIntersection should first be restricted to this table's gene.
maveIntersectionJoined = maveIntersection.merge(maves[0], on="hgvs_pro")

In [None]:
# The unsuffixed "score" column of the joined table.
maveIntersectionJoined["score"]

In [None]:
# Compare the joined "score" column against the WT-minus-variant difference of
# the LWH25_f score.
plt.scatter(
    x=maveIntersectionJoined["score"],
    y=maveIntersectionJoined["LWH25_f_wt"] - maveIntersectionJoined["LWH25_f_mt"],
)
plt.xlabel("Mave")
plt.ylabel(r"$\Delta$ PPI (WT-MT)")

In [None]:
import seaborn as sns

import scipy.stats as ss

# Score columns to compare: the plasmid control first, then interaction levels 1-5.
# Each exists in y2HMerged with an "_mt" (variant) and a "_wt" (wild type) suffix.
scoreColumns = ["LW", "LWH1_f", "LWH10_f", "LWH25_f", "LWA_f", "LWAH1_f"]
# Bin edges 0..5 give five unit-wide bins; the last bin is closed on the right.
scoreEdges = np.arange(6)

# Top row: joint MT-vs-WT score distribution (percent of rows) per condition.
# Bottom row: marginal histograms of MT and WT scores for the same condition.
fig,ax = plt.subplots(2,len(scoreColumns),figsize=(24,12))
for lvl,(axi,col) in enumerate(zip(ax.T,scoreColumns)):
    mtScores = y2HMerged[f"{col}_mt"]
    wtScores = y2HMerged[f"{col}_wt"]
    # Only rows with both scores present can be placed in the joint table.
    mask = mtScores.notna() & wtScores.notna()

    # histogram2d counts directly (rows = MT bin, columns = WT bin) and ignores
    # values outside the edges rather than producing an out-of-range bin index.
    cnts,_,_ = np.histogram2d(mtScores[mask],wtScores[mask],bins=[scoreEdges,scoreEdges])
    total = cnts.sum()
    if total > 0:
        # Convert counts to percentages; skipped for an empty panel to avoid 0/0.
        cnts = 100 * cnts / total
    # Draw the colour bar on the last column only.
    sns.heatmap(cnts,ax=axi[0],annot=True,cmap="rocket_r",vmin=0,vmax=100,cbar=lvl == ax.shape[1]-1)
    axi[0].set_xlabel("WT Score")
    axi[0].set_title("Control" if lvl == 0 else f"Level {lvl:d}")

    axi[1].hist(mtScores.dropna(),bins=scoreEdges,label="mt")
    axi[1].hist(wtScores.dropna(),bins=scoreEdges,alpha=.5,label="wt")
    axi[1].set_xlabel("Score")

    # Y-axis labels on the first column only.
    if lvl == 0:
        axi[0].set_ylabel("MT Score")
        axi[1].set_ylabel("Count")
# A single legend, placed on the last histogram panel.
axi[1].legend()

In [None]:
# Save the current `fig` as a PDF.
fig.savefig(
    "data/y2H_edgotyping_10_7_22/figs/scoreChanges.pdf",
    format="pdf",
)

In [None]:
# Write one FASTA-style record per distinct DB protein sequence. The header is
# ">db_orf_<id>" followed by the space-separated variants in one-letter form
# (e.g. A2G), and the next line is the sequence itself.
# `tot` accumulates the number of unique variants written, summed over sequences.
validRows = y2HMerged[y2HMerged.apply(validateSeqs,axis=1)]
tot = 0
with open("data/y2hEdgotyping/mutpred2Results/variants.faa","w") as f:
    for seq,group in validRows.groupby("p_seq_db"):
        # Only the first ORF id seen for this sequence goes into the header;
        # separators are stripped so the id is a single token.
        orf_id = str(group.db_orf_id.unique()[0]).replace(" ","").replace(";","").replace(",","")
        uniqueVariants = group.aa_change_mt.unique()
        tot += len(uniqueVariants)
        for v in uniqueVariants:
            # The position between the two three-letter codes must be an integer.
            try:
                int(v[3:-3])
            except ValueError as err:
                raise ValueError(f"non-integer position in variant {v!r} (db_orf {orf_id})") from err
        varstr = " ".join(aaTable[v[:3]]+v[3:-3]+aaTable[v[-3:]] for v in uniqueVariants)
        # Named fastaRecord (not `r`) so the HTTP response `r` from the MaveDB
        # request is not overwritten and earlier cells stay re-runnable.
        fastaRecord = f">db_orf_{orf_id} {varstr}\n{seq}\n"
        print(fastaRecord)
        f.write(fastaRecord)

In [None]:
# Total number of unique variants written to the FASTA file (summed over sequences).
tot

# Nature Paper

In [None]:
# List the files of the Nature-paper data directory (explicit line magic instead of automagic).
%ls data/natureExtensiveDisruption/

In [None]:
# Preview the supplementary spreadsheet (first sheet, pandas defaults).
pd.read_excel(
    io="data/natureExtensiveDisruption/41467_2019_11959_MOESM10_ESM.xlsx",
)

# HuRI Data

In [None]:
# Load the header-less, tab-separated HuRI pair list.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
huriT = pd.read_csv(
    "/data/dzeiberg/ppi/HuRI.tsv",
    sep="\t",
    header=None,
)

In [None]:
# Name the two columns of the pair list.
huriT.columns = pd.Index(["A", "B"])

In [None]:
# Display the HuRI pair table with its columns named A and B.
huriT

In [None]:
# Load the header-less, tab-separated HuRI .psi file.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
huri = pd.read_csv(
    "/data/dzeiberg/ppi/HuRI.psi",
    sep="\t",
    header=None,
)

In [None]:
# (rows, columns) of the table loaded from HuRI.psi.
huri.shape

In [None]:
# Row labelled 0 of the HuRI.psi table, shown as a Series of its fields.
huri.loc[0]

In [None]:
# Single value at row label 0, column label 3.
huri.at[0, 3]

In [None]:
# Pairs whose column A is ENSG00000130518.
huriT.loc[huriT["A"].eq("ENSG00000130518")]

In [None]:
# Pairs whose column A is ENSG00000160014.
huriT.loc[huriT["A"].eq("ENSG00000160014")]

In [None]:
# Display the HuRI pair table again.
huriT