<h1>药物</h1>

In [1]:
# Read Davis.txt and parse every well-formed line into a record dict.
# The parsing treats a line as: compound_id protein_name smiles sequence label
# (space-separated, label being the last token on the line).
with open('Davis.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

data = []
for line in lines:
    # Split off the three leading columns; the remainder holds the sequence
    # followed by the label. Lines without both parts are skipped.
    fields = line.strip().split(' ', 3)
    if len(fields) == 4 and ' ' in fields[3]:
        sequence_text, label_text = fields[3].rsplit(' ', 1)
        data.append({
            'compound_id': fields[0],
            'protein_name': fields[1],
            # Taken straight from the split so this cell never (re)assigns a
            # notebook-level variable named `smiles`, which later cells use
            # for the collection of unique SMILES strings.
            'smiles': fields[2],
            'sequence': sequence_text,
            'label': int(label_text)
        })

In [2]:
# Number of records parsed from Davis.txt.
print(len(data))

In [3]:
# Preview only the first few parsed records; displaying the whole list
# floods the notebook output and bloats the saved file.
data[:5]

In [4]:
# Collect the distinct SMILES strings that occur in the parsed records.
smiles = {record['smiles'] for record in data}

In [5]:
# Print every unique SMILES string, one per line.
for smile in smiles:
    print(smile)

In [6]:
# Freeze the unique SMILES strings into a list with a deterministic order.
# Iteration order of a set of str changes between interpreter runs (string
# hashing is randomised), so a plain list(...) would make the row order of the
# embeddings computed from this list impossible to reproduce or map back.
smiles = sorted(smiles)

In [7]:
# Number of unique SMILES strings.
print(len(smiles))

In [8]:
import torch

# Select the compute device: Apple MPS first, then CUDA, otherwise CPU.
# The CUDA branch is guarded so the notebook still runs on hosts that have
# neither accelerator, instead of failing later when tensors are moved.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [9]:
# Earlier experiment, kept for reference only: the
# "DeepChem/ChemBERTa-10M-MTR" checkpoint loaded through RobertaModel.
#
# Load the tokenizer and encoder for the ChemBERTa-77M-MLM checkpoint, move
# the encoder to `device` and switch it to evaluation mode.
# NOTE(review): this is an MLM checkpoint loaded via AutoModel; the pooler
# head may be freshly initialised rather than pretrained. Check the load-time
# warning before relying on `pooler_output`.

from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
model = AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM").to(device)
model.eval()

tokenizer 的输出包括 input_ids 和 attention_mask

In [10]:
# Tokenize all unique SMILES strings as one batch: pad to the longest entry,
# truncate anything beyond 512 tokens, and move the tensors to `device`.
# NOTE: the name `input` shadows Python's built-in input(); it is kept
# because the following cells refer to it.
input = tokenizer(
    smiles,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
).to(device)

In [11]:
# Display the tokenizer output that will be fed to the model.
input

In [12]:
# Run the encoder on the tokenized batch without tracking gradients
# (inference only).
with torch.no_grad():
    output = model(**input)

In [13]:
# Display the pooled output of the model.
# NOTE(review): if the pooler head was not part of the loaded checkpoint,
# these values come from untrained weights. Confirm before using them.
output.pooler_output

In [14]:
# Shape of the pooled output; presumably (num_smiles, hidden_size). TODO confirm.
(output.pooler_output).shape

In [15]:
# Shape of the per-token hidden states; presumably
# (num_smiles, padded_length, hidden_size). TODO confirm.
output.last_hidden_state.shape

In [16]:
# Mean-pool the token embeddings into one vector per SMILES string.
# Padded positions are excluded through the attention mask; a plain
# .mean(dim=1) would average padding-token states into every molecule that is
# shorter than the longest one in the batch.
token_mask = input['attention_mask'].unsqueeze(-1).to(output.last_hidden_state.dtype)
(output.last_hidden_state * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

In [17]:
# List the fields available on the model output object.
output.keys()

先把张量移到 CPU 再保存：保存在 GPU/MPS 上的张量，在没有对应设备的机器上可能无法直接加载，所以统一保存为 CPU 张量。

In [18]:
# Copy the per-token hidden states to CPU and write them to ligands_davis.pt.
# NOTE(review): the file stores only the tensor. Neither the SMILES strings
# nor the attention mask are saved, so rows can be matched to compounds only
# through the order of `smiles`. Confirm that order is reproducible.
features = output.last_hidden_state.cpu()
torch.save(features, 'ligands_davis.pt')

In [19]:
import torch

# Reload the saved ligand features from disk and print their shape as a
# sanity check.
features = torch.load('ligands_davis.pt')
print(features.shape)

<h1>蛋白质</h1>

In [20]:
# Read Davis.txt and parse every well-formed line into a record dict.
# The parsing treats a line as: compound_id protein_name smiles sequence label
# (space-separated, label being the last token on the line).
with open('Davis.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

data = []
for line in lines:
    # Split off the three leading columns; the remainder holds the sequence
    # followed by the label. Lines without both parts are skipped.
    fields = line.strip().split(' ', 3)
    if len(fields) == 4 and ' ' in fields[3]:
        sequence_text, label_text = fields[3].rsplit(' ', 1)
        data.append({
            'compound_id': fields[0],
            'protein_name': fields[1],
            # Taken straight from the split: assigning a loop variable named
            # `smiles` here would overwrite the list of unique SMILES strings
            # built in the drug section with a single string.
            'smiles': fields[2],
            'sequence': sequence_text,
            'label': int(label_text)
        })

# Distinct protein sequences occurring in the parsed records.
proteins = {record['sequence'] for record in data}

In [21]:
# Freeze the unique sequences into a list with a deterministic order.
# Set iteration order of str changes between interpreter runs, which would
# make the row order of any embeddings computed from this list irreproducible.
proteins = sorted(proteins)
for protein in proteins:
    print(protein)

In [22]:
import re

import torch
# Load model directly
from transformers import T5Tokenizer, T5EncoderModel

# Select the compute device: Apple MPS first, then CUDA, otherwise CPU.
# The CUDA branch is guarded so the cell does not fail on hosts without an
# accelerator.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the ProtT5 tokenizer and encoder from the local checkpoint directory.
local_path_model = "prot_t5_xl_uniref50/"
tokenizer = T5Tokenizer.from_pretrained(local_path_model)
model = T5EncoderModel.from_pretrained(local_path_model).to(device)
model.eval()

sequence_examples = proteins
# this will replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]

# tokenize sequences and pad up to the longest sequence in the batch
# NOTE(review): every sequence is encoded in one padded batch; with many or
# long proteins this may exhaust device memory. Consider batching.
ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

# generate embeddings
with torch.no_grad():
    embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask)

# Residue count per sequence = attended tokens minus the one end-of-sequence
# token the tokenizer appends (assumed from the upstream ProtT5 example, which
# slices a 7-residue sequence as [:7]). The upstream example's literal 7 / 8
# only fit its toy sequences and would truncate real proteins.
residue_count_0 = sum(ids['attention_mask'][0]) - 1
residue_count_1 = sum(ids['attention_mask'][1]) - 1

# extract embeddings for the first ([0,:]) sequence in the batch while removing padded & special tokens
emb_0 = embedding_repr.last_hidden_state[0, :residue_count_0] # shape (residue_count_0 x hidden_size)
print(f"Shape of per-residue embedding of first sequences: {emb_0.shape}")
# do the same for the second ([1,:]) sequence in the batch while taking into account different sequence lengths
emb_1 = embedding_repr.last_hidden_state[1, :residue_count_1] # shape (residue_count_1 x hidden_size)

# if you want to derive a single representation (per-protein embedding) for the whole protein
emb_0_per_protein = emb_0.mean(dim=0) # shape (hidden_size)

print(f"Shape of per-protein embedding of first sequences: {emb_0_per_protein.shape}")

In [None]:

# Read Davis.txt and parse every well-formed line into a record dict.
# The parsing treats a line as: compound_id protein_name smiles sequence label
# (space-separated, label being the last token on the line).
with open('Davis.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

data = []
for line in lines:
    # Split off the three leading columns; the remainder holds the sequence
    # followed by the label. Lines without both parts are skipped.
    fields = line.strip().split(' ', 3)
    if len(fields) == 4 and ' ' in fields[3]:
        sequence_text, label_text = fields[3].rsplit(' ', 1)
        data.append({
            'compound_id': fields[0],
            'protein_name': fields[1],
            # Taken straight from the split: assigning a loop variable named
            # `smiles` here would overwrite the notebook-level list of unique
            # SMILES strings with a single string.
            'smiles': fields[2],
            'sequence': sequence_text,
            'label': int(label_text)
        })

# Unique protein sequences in a deterministic (sorted) order; iterating a set
# of str directly gives a different order on every interpreter run.
proteins = sorted({record['sequence'] for record in data})
print(len(proteins))

In [None]:
# Reload the ChemBERTa-77M-MLM tokenizer and encoder into `tokenizer` / `model`
# and switch the encoder to evaluation mode. `device` must already have been
# defined by an earlier cell.
from transformers import AutoTokenizer, AutoModel
# Offline alternative used previously: a local copy of the checkpoint at
# "/Volumes/PASSPORT/FinalYearProject/ChemBERTa-77M-MLM".
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
model = AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM").to(device)
model.eval()

分子相似性

In [None]:
import os

import torch
from torch.nn.functional import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Select the compute device: Apple MPS first, then CUDA, otherwise CPU, so the
# cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Prefer the local copy of the checkpoint when that directory exists;
# otherwise fall back to the hub identifier of the same checkpoint.
local_path = "/Volumes/PASSPORT/FinalYearProject/ChemBERTa-77M-MLM"
model_source = local_path if os.path.isdir(local_path) else "DeepChem/ChemBERTa-77M-MLM"
tokenizer = AutoTokenizer.from_pretrained(model_source)
model = AutoModel.from_pretrained(model_source).to(device)
model.eval()


def _masked_mean(hidden_states, attention_mask):
    """Average token embeddings over the non-padding positions.

    Args:
        hidden_states: tensor indexed as (batch, tokens, hidden).
        attention_mask: tensor indexed as (batch, tokens); non-zero marks a
            real token, zero marks padding.

    Returns:
        Tensor indexed as (batch, hidden) holding one mean vector per input.
    """
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    # clamp guards against division by zero for an all-padding row.
    return (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)


smiles_1 = ["CCO"]
smiles_2 = ["CCN"]

tokens_1 = tokenizer(smiles_1, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
tokens_2 = tokenizer(smiles_2, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)

# Embed each molecule as the masked mean of its token states. `pooler_output`
# is avoided: for an MLM checkpoint loaded via AutoModel the pooler head is
# typically not part of the checkpoint, so its output would come from
# untrained weights and the similarity would not be meaningful.
with torch.no_grad():
    emb_1 = _masked_mean(model(**tokens_1).last_hidden_state, tokens_1["attention_mask"])  # (1, hidden_size)
    emb_2 = _masked_mean(model(**tokens_2).last_hidden_state, tokens_2["attention_mask"])

similarity = cosine_similarity(emb_1, emb_2)
print(f"SMILES 相似度: {similarity.item()}")
