In [1]:
from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder


from bio_embeddings.project import tsne_reduce,umap_reduce
from bio_embeddings.visualize import render_3D_scatter_plotly

from Bio import SeqIO
import matplotlib.pyplot as plt 

import pandas as pd 
import re 
import seaborn as sns
import os.path as osp 
import numpy as np 
import os 

  from .autonotebook import tqdm as notebook_tqdm


查看不同的embedding的方式，对SLF和RNase的区分度，以及
- mean 
- concat
- sum
- minus

四种简单的模态融合的方式能否区分自交亲和与不亲和

In [3]:


def embed_sis(sisFilePath, embedder):
    # if hasattr(embedder, "name"):
    #     embedderName = getattr(embedder, "name")
    # else:
    #     embedderName = 'unk'
    

    total = pd.read_csv(sisFilePath)
    for i in ["SLF_Seq", "SRnase_Seq"]:
        embeddings = [embedder.reduce_per_protein(embedding) for embedding in embedder.embed_many(total[i])]
        # total[f"{i}_{embedderName}"] = pd.Series(embeddings)
        total[f"{i}_embedding"] = pd.Series(embeddings)

    return total 


def fusion_slf_RNase(data):
    inter_data = pd.DataFrame()
    inter_data["name"] = data["SLF"]+"_" + data["SRnase"]
    inter_data["SLF"] = data["SLF"]
    inter_data["RNase"] = data["SRnase"]
    inter_data["label"] = data["label"]

    inter_data["mean"] =( data["SLF_Seq_embedding"] + data["SRnase_Seq_embedding"] )/ 2
    inter_data["sum"] =data["SLF_Seq_embedding"] + data["SRnase_Seq_embedding"] 
    inter_data["minus"] =data["SLF_Seq_embedding"] - data["SRnase_Seq_embedding"] 

    def concat(x, cols):
        concat_data = []
        for col in cols:
            concat_data.append(x[col])
        return np.concatenate(concat_data)
    inter_data["concat"] = data.apply(lambda x: concat(x, ["SLF_Seq_embedding", "SRnase_Seq_embedding"]), axis=1)
    return inter_data



def plot_SLF_RNase(SLF=None, RNase=None, ax=None, **kwargs):
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 10))
    if SLF is not None:
            
        SLF["SLF_num"] = SLF["name"].apply(lambda x: re.findall(r"SLF\d+", x)[0])
        sns.scatterplot(SLF[SLF["SLF_num"].isin([f"SLF{i}" for i in range(1, 6)])], x="compoent_0", y="compoent_1", hue="SLF_num", ax = ax )
    if RNase is not None:
        sns.scatterplot(RNase, x="compoent_0", y="compoent_1", hue="name", marker="*", s=100, ax = ax)
    
    
def mkdirs(path):
    try:
        os.makedirs(path)
    except:
        pass



def mainFunc(embedder, filePath, saveRootDir):
    if hasattr(embedder, "name"):
        embedderName = getattr(embedder, "name")
    else:
        embedderName = "unk"

    saveRootDir_embedder = osp.join(saveRootDir, embedderName)
    mkdirs(saveRootDir_embedder)

    o = embed_sis(filePath, embedder)
    # save embedding 
    o.to_csv(osp.join(saveRootDir_embedder, f"{embedderName}_embedding.csv"))

    # plot slf and RNase
    options = {
        'n_components': 2,
        "n_jobs":10
    }
    SLF_embedding = o.loc[:, ["SLF", "SLF_Seq", "SLF_Seq_embedding"]].rename(columns={"SLF":"name", "SLF_Seq":"seq", f"SLF_Seq_embedding":"embedding"}).drop_duplicates(["name", "seq"]).reset_index(drop=True)
    RNase_embedding = o.loc[:, ["SRnase", "SRnase_Seq", "SRnase_Seq_embedding"]].rename(columns={"SRnase":"name", "SRnase_Seq":"seq", f"SRnase_Seq_embedding":"embedding"}).drop_duplicates(["name", "seq"]).reset_index(drop=True)
    # tsne SLF
    embedding_tsne_SLF = tsne_reduce(SLF_embedding["embedding"].to_list(),  **options)
    for i in range(embedding_tsne_SLF.shape[1]):
        SLF_embedding[f"compoent_{i}"] = embedding_tsne_SLF[:, i]
    # tsne RNase
    embedding_tsne_RNase = tsne_reduce(RNase_embedding["embedding"].to_list(),  **options)
    for i in range(embedding_tsne_RNase.shape[1]):
        RNase_embedding[f"compoent_{i}"] = embedding_tsne_RNase[:, i]

    fig, ax = plt.subplots(figsize=(10, 10))
    plot_SLF_RNase(SLF= SLF_embedding,  RNase = RNase_embedding,ax=ax)
    fig.savefig(osp.join(saveRootDir_embedder, "SLF_RNase_tsne.png"), dpi=400)


    # fusion slf and SRnase
    fusion_data = fusion_slf_RNase(o)
    # tsne each fusion type at 2d 

    options = {
        'n_components': 2,
        "n_jobs":10
    }

    fusion_data_tsne_dict = {}

    for col in fusion_data.columns[4:]:
        fusion_data_tsne = fusion_data.iloc[:, :4]
        embedding_tsne = tsne_reduce(fusion_data[col].to_list(),  **options)

        for i in range(embedding_tsne.shape[1]):
            fusion_data_tsne[f"compoent_{i}"] = embedding_tsne[:, i]
            
        fusion_data_tsne_dict[col] = fusion_data_tsne


    # plot 2d 

    length = len(fusion_data_tsne_dict)
    fig, axes = plt.subplots(length, 3, figsize=(10*3, 10*length))
    for row, (key, df) in enumerate(fusion_data_tsne_dict.items()):
        tmp_df = df[df["label"] != -1].copy()
        ax1 =  axes[row, 0]
        sns.scatterplot(tmp_df, x="compoent_0", y="compoent_1", hue="label",ax =ax1)
        ax1.set_title(f"{key} and hue by label")

        ax2=axes[row, 1]
        sns.scatterplot(tmp_df, x="compoent_0", y="compoent_1", hue="RNase",ax = ax2)
        ax2.set_title(f"{key} and hue by RNase")

        ax3 = axes[row, 2]
        tmp_df["SLF_num"] = tmp_df["SLF"].apply(lambda x: re.findall(r"SLF\d+", x)[0])
        sns.scatterplot(tmp_df, x="compoent_0", y="compoent_1", hue="SLF_num", ax = ax3)
        ax3.set_title(f"{key} and hue by SLF_num")
    fig.savefig(osp.join(saveRootDir_embedder, "SLF_RNase_pair_tsne.png"), dpi=400)



In [3]:
from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, ProtTransT5UniRef50Embedder, ProtTransXLNetUniRef100Embedder,ProtTransT5BFDEmbedder, ProtTransAlbertBFDEmbedder, ProtTransT5XLU50Embedder, BeplerEmbedder, ESM1bEmbedder, ESM1vEmbedder, ESMEmbedder,CPCProtEmbedder, PLUSRNNEmbedder

saveRootDir = "./embeddingSave"
mkdirs(saveRootDir)

filePath = "/p300s/wangmx_group/xutingfeng/SIS/data/SLF1.csv"


for embedder in [SeqVecEmbedder, ProtTransBertBFDEmbedder, ProtTransT5UniRef50Embedder, ProtTransXLNetUniRef100Embedder,ProtTransT5BFDEmbedder, ProtTransAlbertBFDEmbedder, ProtTransT5XLU50Embedder, BeplerEmbedder, ESM1bEmbedder, ESM1vEmbedder, ESMEmbedder,CPCProtEmbedder, PLUSRNNEmbedder]:
    try:    
        EMBERDER = embedder()
        mainFunc(EMBERDER, filePath, saveRootDir)
        print(f"success{embedder.name}")
        del EMBERDER
    except:
        print(f"failure: {embedder.name}")
        pass 



[t-SNE] Computing 15 nearest neighbors...
[t-SNE] Indexed 16 samples in 0.000s...
[t-SNE] Computed neighbors for 16 samples in 1.133s...
[t-SNE] Computed conditional probabilities for sample 16 / 16
[t-SNE] Mean sigma: 0.019026
[t-SNE] KL divergence after 250 iterations with early exaggeration: 63.008144
[t-SNE] KL divergence after 1850 iterations: 0.264278
[t-SNE] Computing 6 nearest neighbors...
[t-SNE] Indexed 7 samples in 0.000s...
[t-SNE] Computed neighbors for 7 samples in 0.014s...
[t-SNE] Computed conditional probabilities for sample 7 / 7
[t-SNE] Mean sigma: 2.075498




[t-SNE] KL divergence after 250 iterations with early exaggeration: 41.140308
[t-SNE] KL divergence after 450 iterations: 0.157692




[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.001s...
[t-SNE] Computed neighbors for 112 samples in 0.111s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.012950
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.547482
[t-SNE] KL divergence after 800 iterations: 0.752811
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.024s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.012950




[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.547485
[t-SNE] KL divergence after 800 iterations: 0.752811
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.001s...
[t-SNE] Computed neighbors for 112 samples in 0.100s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.021341




[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.924969
[t-SNE] KL divergence after 850 iterations: 0.789910
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.035s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.015702




[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.824490
[t-SNE] KL divergence after 1900 iterations: 0.151092
successseqvec


Some weights of the model checkpoint at /home/xutingfeng/.cache/bio_embeddings/prottrans_bert_bfd/model_directory were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[t-SNE] Computing 15 nearest neighbors...
[t-SNE] Indexed 16 samples in 0.000s...
[t-SNE] Computed neighbors for 16 samples in 0.060s...
[t-SNE] Computed conditional probabilities for sample 16 / 16
[t-SNE] Mean sigma: 0.014526
[t-SNE] KL divergence after 250 iterations with early exaggeration: 66.066093
[t-SNE] KL divergence after 1150 iterations: 0.160901
[t-SNE] Computing 6 nearest neighbors...
[t-SNE] Indexed 7 samples in 0.000s...
[t-SNE] Computed neighbors for 7 samples in 0.010s...
[t-SNE] Computed conditional probabilities for sample 7 / 7
[t-SNE] Mean sigma: 1.595448
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.277561




[t-SNE] KL divergence after 5750 iterations: 0.162500




[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.001s...
[t-SNE] Computed neighbors for 112 samples in 0.108s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.009562
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.783512
[t-SNE] KL divergence after 2850 iterations: 0.526232
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.054s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.009562




[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.783512
[t-SNE] KL divergence after 2850 iterations: 0.526232
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.040s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.016125




[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.227654
[t-SNE] KL divergence after 1300 iterations: 0.396517
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.057s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.011663




[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.420292
[t-SNE] KL divergence after 5200 iterations: 0.129709
successprottrans_bert_bfd
failure: prottrans_t5_uniref50


Some weights of the model checkpoint at /home/xutingfeng/.cache/bio_embeddings/prottrans_xlnet_uniref100/model_directory were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


failure: prottrans_xlnet_uniref100
failure: prottrans_t5_bfd


Some weights of the model checkpoint at /home/xutingfeng/.cache/bio_embeddings/prottrans_albert_bfd/model_directory were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'sop_classifier.classifier.bias', 'predictions.decoder.bias', 'sop_classifier.classifier.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


failure: prottrans_albert_bfd
failure: prottrans_t5_xl_u50




[t-SNE] Computing 15 nearest neighbors...
[t-SNE] Indexed 16 samples in 0.000s...
[t-SNE] Computed neighbors for 16 samples in 0.019s...
[t-SNE] Computed conditional probabilities for sample 16 / 16
[t-SNE] Mean sigma: 0.134365
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.900291
[t-SNE] KL divergence after 1750 iterations: 0.110886
[t-SNE] Computing 6 nearest neighbors...
[t-SNE] Indexed 7 samples in 0.000s...
[t-SNE] Computed neighbors for 7 samples in 0.011s...
[t-SNE] Computed conditional probabilities for sample 7 / 7
[t-SNE] Mean sigma: 2.828427
[t-SNE] KL divergence after 250 iterations with early exaggeration: 43.422234




[t-SNE] KL divergence after 550 iterations: 0.158711




[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.019s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.053976
[t-SNE] KL divergence after 250 iterations with early exaggeration: 68.756012
[t-SNE] KL divergence after 1000 iterations: 0.978538
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.020s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.053976




[t-SNE] KL divergence after 250 iterations with early exaggeration: 68.756012
[t-SNE] KL divergence after 1000 iterations: 0.978537
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.016s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.069420




[t-SNE] KL divergence after 250 iterations with early exaggeration: 63.877544
[t-SNE] KL divergence after 2250 iterations: 0.646616
[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 112 samples in 0.000s...
[t-SNE] Computed neighbors for 112 samples in 0.028s...
[t-SNE] Computed conditional probabilities for sample 112 / 112
[t-SNE] Mean sigma: 0.067022




[t-SNE] KL divergence after 250 iterations with early exaggeration: 67.913353
[t-SNE] KL divergence after 1700 iterations: 0.681234
successbepler


## raw test code

In [5]:
saveRootDir = "./embeddingSave"
mkdirs(saveRootDir)

filePath = "/p300s/wangmx_group/xutingfeng/project_development/mmodel/pipline/SLF1.csv"

In [6]:
embedder = SeqVecEmbedder()

if hasattr(embedder, "name"):
    embedderName = getattr(embedderName, "name")
else:
    embedderName = "unk"

saveRootDir_embedder = osp.join(saveRootDir, embedderName)
mkdirs(saveRootDir_embedder)

o = embed_sis(filePath, embedder)
# save embedding 
o.to_csv(osp.join(saveRootDir_embedder, f"{embedderName}_embedding.csv"))


NameError: name 'embedderName' is not defined

In [None]:
# plot slf and RNase
options = {
    'n_components': 2,
    "n_jobs":10
}
SLF_embedding = o.loc[:, ["SLF", "SLF_Seq", "SLF_Seq_embedding"]].rename(columns={"SLF":"name", "SLF_Seq":"seq", f"SLF_Seq_embedding":"embedding"}).drop_duplicates(["name", "seq"]).reset_index(drop=True)
RNase_embedding = o.loc[:, ["SRnase", "SRnase_Seq", "SRnase_Seq_embedding"]].rename(columns={"SRnase":"name", "SRnase_Seq":"seq", f"SRnase_Seq_embedding":"embedding"}).drop_duplicates(["name", "seq"]).reset_index(drop=True)
# tsne SLF
embedding_tsne_SLF = tsne_reduce(SLF_embedding["embedding"].to_list(),  **options)
for i in range(embedding_tsne_SLF.shape[1]):
    SLF_embedding[f"compoent_{i}"] = embedding_tsne_SLF[:, i]
# tsne RNase
embedding_tsne_RNase = tsne_reduce(RNase_embedding["embedding"].to_list(),  **options)
for i in range(embedding_tsne_RNase.shape[1]):
    RNase_embedding[f"compoent_{i}"] = embedding_tsne_RNase[:, i]

fig, ax = plt.subplots(figsize=(10, 10))
plot_SLF_RNase(SLF= SLF_embedding,  RNase = RNase_embedding,ax=ax)
fig.savefig(osp.join(saveRootDir_embedder, "SLF_RNase_tsne.png"), dpi=400)


In [None]:
# fusion slf and SRnase
fusion_data = fusion_slf_RNase(o)
# tsne each fusion type at 2d 

options = {
    'n_components': 2,
    "n_jobs":10
}

fusion_data_tsne_dict = {}

for col in fusion_data.columns[4:]:
    fusion_data_tsne = fusion_data.iloc[:, :4]
    embedding_tsne = tsne_reduce(fusion_data[col].to_list(),  **options)

    for i in range(embedding_tsne.shape[1]):
        fusion_data_tsne[f"compoent_{i}"] = embedding_tsne[:, i]
        
    fusion_data_tsne_dict[col] = fusion_data_tsne


# plot 2d 

length = len(fusion_data_tsne_dict)
fig, axes = plt.subplots(length, 3, figsize=(10*3, 10*length))
for row, (key, df) in enumerate(fusion_data_tsne_dict.items()):
    tmp_df = df[df["label"] != -1].copy()
    ax1 =  axes[row, 0]
    sns.scatterplot(tmp_df, x="compoent_0", y="compoent_1", hue="label",ax =ax1)
    ax1.set_title(f"{key} and hue by label")

    ax2=axes[row, 1]
    sns.scatterplot(tmp_df, x="compoent_0", y="compoent_1", hue="RNase",ax = ax2)
    ax2.set_title(f"{key} and hue by RNase")

    ax3 = axes[row, 2]
    tmp_df["SLF_num"] = tmp_df["SLF"].apply(lambda x: re.findall(r"SLF\d+", x)[0])
    sns.scatterplot(tmp_df, x="compoent_0", y="compoent_1", hue="SLF_num", ax = ax3)
    ax3.set_title(f"{key} and hue by SLF_num")
fig.savefig(osp.join(saveRootDir_embedder, "SLF_RNase_pair_tsne.png"), dpi=400)


In [None]:
# tsne each fusion type at 3d 
options = {
    'n_components': 3,
    "n_jobs":10
}

fusion_data_tsne_dict3d = {}

for col in fusion_data.columns[4:]:
    fusion_data_tsne = fusion_data.iloc[:, :4]
    embedding_tsne = tsne_reduce(fusion_data[col].to_list(),  **options)

    for i in range(embedding_tsne.shape[1]):
        fusion_data_tsne[f"compoent_{i}"] = embedding_tsne[:, i]
        
    fusion_data_tsne_dict3d[col] = fusion_data_tsne

In [None]:
fusion_data_tsne_dict["mean"]

In [None]:
import plotly.express as px



In [None]:
df = fusion_data_tsne_dict3d["mean"][fusion_data_tsne_dict3d["minus"]["label"]!=-1]

df["label"] = df["label"].astype("str")
fig = px.scatter_3d(df, x='compoent_0', y='compoent_1', z='compoent_2',
              color='label')
fig.show()

## 废弃

In [None]:


filePathList=["/p300s/wangmx_group/xutingfeng/SIS/data/fasta/single/SLF.fasta", "/p300s/wangmx_group/xutingfeng/SIS/data/fasta/single/SRnase.fasta"]

embedding_tsne_df_List = []


for filePath in filePathList:
    for embedder_name in embedder_dict.keys():
        embedder =embedder_dict[embedder_name]

        df = pd.DataFrame([[i.name,str(i.seq)] for i in SeqIO.parse(filePath, "fasta")], columns = ["name", "seq"])

        # Generate embeddings for heay and light CDR3 AA sequences
        embeddings = [embedder.reduce_per_protein(embedding) for embedding in embedder.embed_many(df["seq"])]

        o = pd.DataFrame([df["name"], pd.Series(embeddings, name="embedding")]).T
        o["embedderName"] = embedder_name
        embedding_tsne_df_List.append(o)
        

    # options = {
    #     'min_dist': .1,
    #     'spread': 8,
    #     'n_neighbors': 160,
    #     'metric': 'euclidean',
    #     'n_components': 2,
    #     'random_state': 10
    # }

        # options = {
        #     'n_components': 2,
        #     "n_jobs":10
        # }

        # embedding_tsne = tsne_reduce(embeddings, **options)
        # embedding_tsne_df =  pd.DataFrame(embedding_tsne, columns=["component_0", "component_1"])

        # embedding_tsne_df["label"] =  df["name"]
        # embedding_tsne_df_List.append(embedding_tsne_df)



# sns.scatterplot(data = embedding_tsne_df[embedding_tsne_df["label"].isin([f"SLF{i}" for i in range(1, 6)])], x= "component_0", y ="component_1", hue="label")

In [None]:
data_embedding = pd.concat(embedding_tsne_df_List)

In [None]:
data_embedding

In [None]:
def func(x):
    if "SLF" in x:
        return re.findall(r"SLF\d+", x)[0]
    if "RNase" in x:
        return x 


data_embedding["label"] = data_embedding["name"].apply(func)


In [None]:
options = {
    'n_components': 2,
    "n_jobs":10
}

embedding_tsne = tsne_reduce(data_embedding["embedding"].to_list(), **options)
embedding_tsne_df =  pd.DataFrame(embedding_tsne, columns=["component_0", "component_1"])
    
data_embedding = pd.concat([data_embedding.reset_index(drop=True), embedding_tsne_df], axis=1)

In [None]:
fig, axes = plt.subplots(1, 2,figsize=(20, 10))
axes = axes.flatten()
for embedder_name,ax in zip(data_embedding["embedderName"].unique(), axes):
    data_embedde_plt = data_embedding[data_embedding["embedderName"] == embedder_name]

    sns.scatterplot(data = data_embedde_plt[data_embedde_plt["label"].isin([f"SLF{i}" for i in range(1, 6)])], x= "component_0", y ="component_1", hue="label", ax = ax )
    sns.scatterplot(data = data_embedde_plt[data_embedde_plt["label"].isin([f"S{i}-RNase" for i in range(1,20)])], x= "component_0", y ="component_1", hue="label", marker="*", s=100, ax = ax)
    ax.set_title(f"{embedder_name}")

# sns.scatterplot(data_embedde_plt = slf_embedding_tsne_df[slf_embedding_tsne_df["label"].isin([f"SLF{i}" for i in range(1, 6)])], x= "component_0", y ="component_1", hue="label")
# data_embedde_plt[data_embedde_plt["label"].isin([*[f"S{i}-RNase" for i in range(1,20)], *[f"SLF{i}" for i in range(1, 6)]])]

In [None]:
SLF_embedding = data_embedding[data_embedding["name"].apply(lambda x: "SLF" in x)].loc[:, ["name", "embedding", "embedderName"]].rename(columns = {"embedding":"SLF_embedding"})
RNase_embedding = data_embedding[data_embedding["name"].apply(lambda x: "RNase" in x)].loc[:, ["name", "embedding", "embedderName"]].rename(columns = {"embedding":"RNase_embedding"})




In [None]:
total = pd.read_csv("/p300s/wangmx_group/xutingfeng/SIS/data/total_haplotype_combination_data.csv")
total

In [None]:
total_embedding = total.merge(SLF_embedding, left_on="SLF", right_on="name", how="left").drop("name", axis=1).merge(RNase_embedding, left_on=["SRnase", "embedderName"], right_on=["name", "embedderName"], how="left").drop("name", axis=1)

total_embedding

In [None]:


# total = pd.read_csv("/p300s/wangmx_group/xutingfeng/SIS/data/total_data.csv")
# total = total[total["label"] != -1]

# for i in ["SLF_Seq", "SRnase_Seq"]:

#     embeddings = [embedder.reduce_per_protein(embedding) for embedding in embedder.embed_many(total[i])]

#     total[i] = pd.Series(embeddings)
    

In [None]:
def func(x):
    x["SLF_RNase_concat"] = np.concatenate([x["SLF_embedding"], x["RNase_embedding"]])
    x["SLF_RNase_mean"] = np.stack([x["SLF_embedding"], x["RNase_embedding"]], axis=1).mean(axis=1)
    return x 

total_inter = total_embedding.apply(func, axis=1)
total_inter

In [None]:
test_data = total_inter[total_inter["embedderName"] == "SeqVec"]["SLF_RNase_concat"].to_list()



In [None]:

options = {
    'n_components': 2,
    "n_jobs":10
}


embedder_res = {}
for embedderName, embedder_df in total_inter.groupby("embedderName"):
    
    res = []
    for col in ["SLF_RNase_concat", "SLF_RNase_mean"]:
        embedding_tsne = tsne_reduce(embedder_df[col].to_list(), **options)

        embedding_tsne_df =  pd.DataFrame(embedding_tsne, columns=["component_0", "component_1"])
        embedding_tsne_df["name"] =  embedder_df["SLF"] + "_" + embedder_df["SRnase"]
        embedding_tsne_df["label"] =  embedder_df["label"]

        res.append(embedding_tsne_df)
    embedder_res[embedderName] = res

    

In [None]:
embedding_tsne.shape

In [None]:
import matplotlib.pyplot as plt 

res = embedder_res["ProtTansBertBFD"]
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes = axes.flatten()

for idx,(name, ax )in enumerate(zip(["SLF_RNase_concat", "SLF_RNase_mean"], axes)):
    sns.scatterplot(data =res[idx], x= "component_0", y ="component_1", hue="label", ax = ax )
    ax.set_title(f"{name}") 

In [None]:
res[0]

In [None]:
for i in res:
    print(i["name"])
    i["SLF_name"] = i["name"].apply(lambda x: re.findall(r"SLF\d+", x)[0])
    i["RNase_name"] = i["name"].apply(lambda x: re.findall(r"S\d+-RNase", x)[0])
    #  data[data["label"].isin([f"S{i}-RNase" for i in range(1,20)])]

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes = axes.flatten()

for idx,(name, ax )in enumerate(zip(["SLF_RNase_concat", "SLF_RNase_mean"], axes)):
    sns.scatterplot(data =res[idx], x= "component_0", y ="component_1", hue="SLF_name", ax = ax )
    ax.set_title(f"{name}")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes = axes.flatten()

for idx,(name, ax )in enumerate(zip(["SLF_RNase_concat", "SLF_RNase_mean"], axes)):
    sns.scatterplot(data =res[idx], x= "component_0", y ="component_1", hue="RNase_name", ax = ax )
    ax.set_title(f"{name}")

In [None]:
t = res[0]
fig, ax = plt.subplots(figsize=(20, 20))
sns.scatterplot(data =t, x= "component_0", y ="component_1", hue="label", ax =ax)




# for _, df in t.iterrows():
#     x= df["component_0"]
#     y = df["component_1"]
#     s = df["name"] + "_" + str(df["label"])
#     ax.annotate(text = s,xy=(x,y), xycoords='data')
    
# ax.set_xlim(-100, 50)
# ax.set_ylim(70, 120)
# plt.xlim(-53, -45)
# plt.ylim(-5, 10)

In [None]:
t