In [1]:
import torch
from transformers import AlbertModel, AlbertTokenizer
import re
import os
import requests
from tqdm.auto import tqdm

In [2]:
tokenizer = AlbertTokenizer.from_pretrained("Rostlab/prot_albert", do_lower_case=False)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

In [3]:
model = AlbertModel.from_pretrained("Rostlab/prot_albert")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/897M [00:00<?, ?B/s]

Some weights of the model checkpoint at Rostlab/prot_albert were not used when initializing AlbertModel: ['predictions.decoder.bias', 'sop_classifier.classifier.weight', 'predictions.dense.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [5]:
model = model.to(device)
model = model.eval()

In [7]:
import networkx as nx
edgotype = nx.read_gexf("/data2/edgotype/edgotype.gefx")

In [9]:
import os
import pandas as pd
seqFiles = [pd.read_csv(f"/data2/edgotype/uniprotScan/sequence_{i}.tsv",delimiter="\t") for i in range(6)]

uniprotMatches = pd.concat(seqFiles)
def mergeWithUniprot(graph):
    for node in graph.nodes(data=True):
        seq = node[1]["seq"]
        up = uniprotMatches[(uniprotMatches.Sequence == seq) & \
                            (uniprotMatches.Reviewed == "reviewed") & \
                           (uniprotMatches.Organism == "Homo sapiens (Human)")]
        graph.nodes[node[0]]["uniprotMatches"] = up
        alphafoldStructures = []
        for uniprot_id in graph.nodes[node[0]]["uniprotMatches"]["Entry"]:
            fp = f"/data/dzeiberg/alphafold/predictions/AF-{uniprot_id}-F1-model_v4.pdb.gz"
            if os.path.isfile(fp):
                alphafoldStructures.append(fp)
        graph.nodes[node[0]]["alphafoldStructures"] = alphafoldStructures
    return graph
edgotype_x = mergeWithUniprot(edgotype)

In [37]:
sequences,ensg_ids = list(zip(*[(" ".join(list(n["seq"])),ensg) for ensg,n in edgotype_x.nodes(data=True)]))

In [38]:
sequences = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences]

In [39]:
len(sequences)

578

In [40]:
sequences[0]

'M A S H K L L V T P P K A L L K P L S I P N Q L L L G P G P S N L P P R I M A A G G L Q M I G S M S K D M Y Q I M D E I K E G I Q Y V F Q T R N P L T L V I S G S G H C A L E A A L V N V L E P G D S F L V G A N G I W G Q R A V D I G E R I G A R V H P M T K D P G G H Y T L Q E V E E G L A Q H K P V L L F L T H G E S S T G V L Q P L D G F G E L C H R Y K C L L L V D S V A S L G G T P L Y M D R Q G I D I L Y S G S Q K A L N A P P G T S L I S F S D K A K K K M Y S R K T K P F S F Y L D I K W L A N F W G C D D Q P R M Y H H T I P V I S L Y S L R E S L A L I A E Q G L E N S W R Q H R E A A A Y L H G R L Q A L G L Q L F V K D P A L R L P T V T T V A V P A G Y D W R D I V S Y V I D H F D I E I M G G L G P S T G K V L R I G L L G C N A T R E N V D R V T E A L R A A L Q H C P K K K L'

In [41]:
ids = tokenizer.batch_encode_plus(sequences, add_special_tokens=True, padding='longest')

In [42]:
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

In [54]:
import torch.utils.data as data_utils

ds = data_utils.TensorDataset(input_ids,attention_mask)
loader = data_utils.DataLoader(ds, batch_size=1, shuffle=False)

In [55]:
embeddings = []
with torch.no_grad():
    for (inp_id, inp_att_mask) in tqdm(loader):
        embeddings.append(model(input_ids=inp_id,attention_mask=inp_att_mask)[0].cpu().numpy())

  0%|          | 0/578 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.40 GiB (GPU 0; 10.76 GiB total capacity; 8.89 GiB already allocated; 1.01 GiB free; 8.92 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [29]:
embedding = embedding.cpu().numpy()

In [32]:
features = [] 
for seq_num in range(len(embedding)):
    seq_len = (attention_mask[seq_num] == 1).sum()
    seq_emd = embedding[seq_num][1:seq_len-1]
    features.append(seq_emd)

In [36]:
sequences[0]

'ENSG00000172482'