In [1]:
# Move to the parent directory (task_1 in the recorded run) so that the
# `model_utils` import in the next cell resolves.
# Explicit `%cd` magic: the bare `cd` form relies on IPython automagic, which
# stops working if a variable named `cd` exists or automagic is disabled.
%cd ..

/home/anqi/lab/llps/1st/UDSM-LLPS-Syn/task_1


In [2]:
import random
from collections import defaultdict

import numpy as np
import pandas as pd

# model_utils is the helper module taken from the original UDSMProt repository
from model_utils import *
# Seed Python's built-in `random` module so any use of it is repeatable.
# NOTE(review): only `random` is seeded in this cell; NumPy / torch RNGs are not.
random.seed(42)

<h3><span style='background: yellow'>1. Load the DrLLPS protein sequences and tokenize them</span></h3>

In [3]:
# Enter the application/ folder; every relative path below ("./", "../") is
# resolved from here.
# Explicit `%cd` magic: the bare `cd` form relies on IPython automagic, which
# stops working if a variable named `cd` exists or automagic is disabled.
%cd application/

/home/anqi/lab/llps/1st/UDSM-LLPS-Syn/task_1/application


In [4]:
# Read the DrLLPS table from the current working directory (application/).
DrLLPS = pd.read_csv("DrLLPS_data.csv")
# Display the frame. The columns used later in this notebook are
# 'UniProt Accession', 'Sequence' and 'LLPS_labels'.
DrLLPS

Unnamed: 0,Status,DrLLPS ID,Ensemble Gene ID,UniProt Accession,Gene Name,Organism,Sequence,LLPS_labels
0,reviewed,LLPS-Hos-3149,ENSG00000097007.17,P00519,ABL1,Homo,MGQQPGKVLGDQRRPSLPALHFIKGAGKKESSRHGGPHCNVFVEHE...,scaffold
1,reviewed,LLPS-Hos-1686,ENSG00000123908.11,Q9UKV8,AGO2,Homo,MYSGAGPALAPPAPPPPIQGYAFKPPPRPDFGTSGRTIKLQANFFE...,scaffold
2,reviewed,LLPS-Hos-2530,ENSG00000142192.20,P05067,APP,Homo,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,scaffold
3,reviewed,LLPS-Hos-2173,ENSG00000204842.15,Q99700,ATXN2,Homo,MRSAAAAPRSPAVATESRRFAAARWPGWRSLQRPARRSGRGGGGAA...,scaffold
4,reviewed,LLPS-Hos-0662,ENSG00000168488.18,Q8WWM7,ATXN2L,Homo,MLKPQPLQQPSQPQQPPPTQQAVARRPPGGTSPPNGGLPGPLATSA...,scaffold
...,...,...,...,...,...,...,...,...
3622,reviewed,LLPS-Hos-4071,ENSG00000103994.17,Q9H2Y7,ZNF106,Homo,MVRERKCILCHIVYSSKKEMDEHMRSMLHHRELENLKGRDISHECR...,regulator
3623,reviewed,LLPS-Hos-1898,ENSG00000112200.16,Q9Y4E5,ZNF451,Homo,MGDPGSEIIESVPPAGPEASESTTDENEDDIQFVSEGPLRPVLEYI...,regulator
3624,reviewed,LLPS-Hos-0399,ENSG00000167962.13,Q86UK7,ZNF598,Homo,MRVLCEQRYCAVCREELRQVVFGKKLPAFATIPIHQLQHEKKYDIY...,regulator
3625,reviewed,LLPS-Hos-0365,ENSG00000066379.14,Q9P1U0,ZNRD1,Homo,MSVMDLANTCSSFQSDLDFCSDCGSVLPLPGAQDTVTCIRCGFNIN...,regulator


In [5]:
from tqdm import tqdm as tqdm
from pathlib import Path

In [6]:
# Rename the accession column to the identifier used in the rest of the notebook.
DrLLPS = DrLLPS.rename({'UniProt Accession': 'Uniprot_ID'}, axis='columns')
# Show how many proteins fall into each DrLLPS category.
DrLLPS['LLPS_labels'].value_counts()

client       2998
regulator     529
scaffold      100
Name: LLPS_labels, dtype: int64

In [7]:
# Folder used for every file this notebook reads or writes: the current
# working directory (application/ after the earlier cd).
path = Path("./")

In [8]:
# Make sure the folder exists. With path == "./" and exist_ok=True this has no
# effect; it only matters if `path` is pointed at a new folder.
path.mkdir(parents=True, exist_ok=True) 

In [9]:
# Alias of `path`; used below as the TextList path and as the folder for the
# saved ID/label arrays and the prediction .npz files.
WORKING_FOLDER = path

In [10]:
# Persist the UniProt accessions and the DrLLPS category of every protein,
# one .npy file per column, in the same row order as the DrLLPS table.
for npy_name, column_name in (
    ('UniProt_IDs.npy', 'Uniprot_ID'),
    ('DrLLPS_labels.npy', 'LLPS_labels'),
):
    np.save(path/npy_name, DrLLPS[column_name])

In [11]:
# Tokenization / numericalization settings.
pad_idx=0          # id of the padding token; also the fallback id for unknown tokens when insert_oov is False
mask_idx=1         # id of the mask token (not referenced in the cells of this notebook)
insert_bos=True    # prepend token_bos to every tokenized sequence
insert_eos=False   # do not append token_eos
insert_oov=False   # unknown tokens fall back to pad_idx instead of getting their own id
max_entries=0      # not referenced in the cells of this notebook
max_vocab=60000    # not referenced in the cells of this notebook
min_freq=2         # not referenced in the cells of this notebook

In [12]:
# Special annotation labels (not referenced in the cells of this notebook).
label_none= "_none_" #special label: none (i.e. irrelevant) token for annotation labels e.g. for padding/eos but also for irrelevant phosphorylation site predictions
label_bg = "_bg_" #special label: background token for annotation labels

# Special sequence tokens. Only token_bos and token_eos are referenced by the
# tokenization loop below; '_pad_', '_mask_' and '_bos_' also appear in the
# loaded vocabulary (tok_itos.npy).
token_oov="_oov_"
token_pad="_pad_"
token_bos="_bos_"
token_eos="_eos_"
token_mask="_mask_"

In [13]:
# tokenizer
def list_tokenizer(seq):
    '''Split `seq` into a list holding one entry per element (one token per character for a string).'''
    return list(seq)

In [14]:
# Tokenizer applied to every protein sequence: one token per residue letter.
tokenizer=list_tokenizer

In [15]:
# Load the id -> token vocabulary (position in the array is the token id).
# The stored array is a plain unicode array (dtype '<U6' in the recorded
# output), so pickle support is not needed; leaving allow_pickle at its safe
# default avoids executing pickled content from the file.
tok_itos_in = np.load(path/"tok_itos.npy")
tok_itos_in

array(['_pad_', '_mask_', 'L', 'S', 'A', 'E', 'G', 'P', 'V', 'K', 'R', 'T', 'D', 'Q', 'I', 'N', 'F', 'Y', 'H', 'C',
       'M', 'W', '_bos_', 'X', 'U'], dtype='<U6')

In [16]:
# Tokenize every protein sequence into a list of single-residue tokens.
# Sequences are independent of each other, so this loop could be parallelized.
pred_tok = []

# Iterate over the Sequence column directly (the row index is not needed);
# `total` lets tqdm render a real progress bar instead of a bare counter.
for sequence in tqdm(DrLLPS['Sequence'], total=len(DrLLPS)):
    item_tok = tokenizer(sequence)
    if insert_bos:
        item_tok = [token_bos] + item_tok
    if insert_eos:
        item_tok = item_tok + [token_eos]
    pred_tok.append(item_tok)

# Turn tokens into integer ids using the vocabulary loaded from tok_itos.npy.
tok_itos = tok_itos_in
print("tok_itos (", len(tok_itos), "items):",list(tok_itos))
# Tokens missing from the vocabulary map to one id past the last vocabulary
# entry when insert_oov is set, otherwise to pad_idx.
tok_stoi = defaultdict(
    lambda: (len(tok_itos) if insert_oov else pad_idx),
    {v: k for k, v in enumerate(tok_itos)},
)
# The sequences have different lengths, so the id lists are ragged.
# dtype=object must be requested explicitly: implicit creation of ragged
# arrays is deprecated in NumPy and raises ValueError from NumPy 1.24 on.
tok_num = np.array([[tok_stoi[o] for o in p] for p in pred_tok], dtype=object)
# Object arrays are stored with pickle, so loading this file later requires
# allow_pickle=True.
np.save(path/"pred_tok.npy", tok_num)

3627it [00:00, 19064.35it/s]


tok_itos ( 25 items): ['_pad_', '_mask_', 'L', 'S', 'A', 'E', 'G', 'P', 'V', 'K', 'R', 'T', 'D', 'Q', 'I', 'N', 'F', 'Y', 'H', 'C', 'M', 'W', '_bos_', 'X', 'U']


  tok_num = np.array([[tok_stoi[o] for o in p] for p in pred_tok])


<h3><span style='background :yellow' >2. Load the fine-tuned UDSM-LLPS(Random) model</span></h3>

In [17]:
# Folder holding the exported UDSM-LLPS(Random) learner: the parent of the
# current working directory.
PRETRAINED_FOLDER1 = Path("../")

In [18]:
# Load the exported learner 'UDSM-LLPS_Random.pkl' from PRETRAINED_FOLDER1.
# NOTE(review): load_learner is not imported explicitly in this notebook; it
# presumably comes from `from model_utils import *`. Loading a .pkl presumably
# unpickles it, so only load files from a trusted source — confirm.
learn = load_learner(PRETRAINED_FOLDER1, 'UDSM-LLPS_Random.pkl')

In [19]:
# Build the id -> token mapping as a dict keyed by the position of each token
# in the vocabulary array, then display it.
itos = dict(enumerate(tok_itos))
itos

{0: '_pad_',
 1: '_mask_',
 2: 'L',
 3: 'S',
 4: 'A',
 5: 'E',
 6: 'G',
 7: 'P',
 8: 'V',
 9: 'K',
 10: 'R',
 11: 'T',
 12: 'D',
 13: 'Q',
 14: 'I',
 15: 'N',
 16: 'F',
 17: 'Y',
 18: 'H',
 19: 'C',
 20: 'M',
 21: 'W',
 22: '_bos_',
 23: 'X',
 24: 'U'}

In [20]:
# Wrap the id -> token mapping in a Vocab object, which is handed to TextList below.
# NOTE(review): Vocab is presumably provided by `from model_utils import *` — confirm.
vocab=Vocab(itos)

In [21]:
# Wrap the already-numericalized sequences in a databunch for inference:
# no processors (the items are ids already), no train/valid split, empty labels.
# NOTE(review): bs=9 divides the 3627 items exactly (9 * 403); if the training
# DataLoader drops an incomplete last batch, a different batch size could
# silently drop sequences — confirm before changing it.
text_pred = (
    TextList(items=tok_num, vocab=vocab, pad_idx=pad_idx, path=WORKING_FOLDER, processor=[])
    .split_none()
    .label_empty()
    .databunch(bs=9, pad_idx=pad_idx)
)
text_pred

TextClasDataBunch;

Train: LabelList (3627 items)
x: TextList
_bos_ M G Q Q P G K V L G D Q R R P S L P A L H F I K G A G K K E S S R H G G P H C N V F V E H E A L Q R P V A S D F E P Q G L S E A A R W N S K E N L L A G P S E N D P N L F V A L Y D F V A S G D N T L S I T K G E K L R V L G Y N H N G E W C E A Q T K N G Q G W V P S N Y I T P V N S L E K H S W Y H G P V S R N A A E Y L L S S G I N G S F L V R E S E S S P G Q R S I S L R Y E G R V Y H Y R I N T A S D G K L Y V S S E S R F N T L A E L V H H H S T V A D G L I T T L H Y P A P K R N K P T V Y G V S P N Y D K W E M E R T D I T M K H K L G G G Q Y G E V Y E G V W K K Y S L T V A V K T L K E D T M E V E E F L K E A A V M K E I K H P N L V Q L L G V C T R E P P F Y I I T E F M T Y G N L L D Y L R E C N R Q E V N A V V L L Y M A T Q I S S A M E Y L E K K N F I H R D L A A R N C L V G E N H L V K V A D F G L S R L M T G D T Y T A H A G A K F P I K W T A P E S L A Y N K F S I K S D V W A F G V L L W E I A T Y G M S P Y P G I D L S Q 

In [22]:
# Inspect the data attached to the freshly loaded learner: the recorded output
# shows empty Train and Valid lists.
learn.data

TextClasDataBunch;

Train: LabelList (0 items)
x: TextList

y: CategoryList

Path: ..;

Valid: LabelList (0 items)
x: TextList

y: CategoryList

Path: ..;

Test: None

In [23]:
# Attach the DrLLPS databunch to the loaded learner so that get_preds can run
# on it. All 3627 sequences sit in the Train split because text_pred was built
# with split_none().
learn.data = text_pred

In [24]:
# Confirm the assignment: the Train split should now list the 3627 DrLLPS sequences.
learn.data

TextClasDataBunch;

Train: LabelList (3627 items)
x: TextList
_bos_ M G Q Q P G K V L G D Q R R P S L P A L H F I K G A G K K E S S R H G G P H C N V F V E H E A L Q R P V A S D F E P Q G L S E A A R W N S K E N L L A G P S E N D P N L F V A L Y D F V A S G D N T L S I T K G E K L R V L G Y N H N G E W C E A Q T K N G Q G W V P S N Y I T P V N S L E K H S W Y H G P V S R N A A E Y L L S S G I N G S F L V R E S E S S P G Q R S I S L R Y E G R V Y H Y R I N T A S D G K L Y V S S E S R F N T L A E L V H H H S T V A D G L I T T L H Y P A P K R N K P T V Y G V S P N Y D K W E M E R T D I T M K H K L G G G Q Y G E V Y E G V W K K Y S L T V A V K T L K E D T M E V E E F L K E A A V M K E I K H P N L V Q L L G V C T R E P P F Y I I T E F M T Y G N L L D Y L R E C N R Q E V N A V V L L Y M A T Q I S S A M E Y L E K K N F I H R D L A A R N C L V G E N H L V K V A D F G L S R L M T G D T Y T A H A G A K F P I K W T A P E S L A Y N K F S I K S D V W A F G V L L W E I A T Y G M S P Y P G I D L S Q 

```
text_pred is now attached to the loaded learner as its training split (DatasetType.Train).
```

In [26]:
# Predict on the Train split, which is where the DrLLPS sequences live.
# The second return value (targets) is not used in the cells of this notebook.
# NOTE(review): ordered=True presumably returns predictions in the original
# item order; the saving cell relies on that when pairing predictions with
# uniprot_ids and labels — confirm.
probs1, unlabeled_targs1_ = learn.get_preds(ds_type=DatasetType.Train,ordered = True)

In [27]:
# Load the Uniprot ids and DrLLPS labels of each instance in the text_pred
uniprot_ids = np.load(WORKING_FOLDER/'UniProt_IDs.npy', allow_pickle = True)
labels = np.load(WORKING_FOLDER/'DrLLPS_labels.npy', allow_pickle = True)

In [28]:
# Save IDs, predicted propensities and DrLLPS labels of the UDSM-LLPS(Random)
# model side by side in one .npz archive inside WORKING_FOLDER.
filename_output1 = "UDSM-LLPS-Random_predictions.npz"
# The three arrays are paired purely by position, so refuse to write the file
# if the number of predictions does not match the number of IDs / labels
# (np.savez itself would store mismatched arrays without complaint).
assert len(probs1) == len(uniprot_ids) == len(labels), (
    "predictions, UniProt IDs and labels must have the same length"
)
np.savez(WORKING_FOLDER/filename_output1,UniProt_IDs=uniprot_ids,LLPS_Propensity=probs1,DrLLPS_labels=labels)

<h3><span style='background :yellow' >3. Load the fine-tuned UDSM-LLPS(UniRef) model</span></h3>

In [30]:
# Folder holding the exported UDSM-LLPS(UniRef) learner (same folder as the Random model).
PRETRAINED_FOLDER2 = Path("../")

In [31]:
# Load the exported learner 'UDSM-LLPS_UniRef.pkl' from PRETRAINED_FOLDER2.
# NOTE(review): loading a .pkl presumably unpickles it, so only load files
# from a trusted source — confirm.
learn2 = load_learner(PRETRAINED_FOLDER2, 'UDSM-LLPS_UniRef.pkl')

In [32]:
# Attach the same DrLLPS databunch object that was given to `learn`, so both
# models are evaluated on identical inputs.
learn2.data = text_pred

In [33]:
# Predict on the Train split with the UniRef model, mirroring the call made
# for the Random model. The returned targets are not used in this notebook.
# NOTE(review): ordered=True presumably restores the original item order,
# which the saving cell relies on — confirm.
probs2, unlabeled_targs2_ = learn2.get_preds(ds_type=DatasetType.Train,ordered = True)

In [34]:
# Save IDs, predicted propensities and DrLLPS labels of the UDSM-LLPS(UniRef)
# model side by side in one .npz archive inside WORKING_FOLDER.
filename_output2 = "UDSM-LLPS-UniRef_predictions.npz"
# The three arrays are paired purely by position, so refuse to write the file
# if the number of predictions does not match the number of IDs / labels
# (np.savez itself would store mismatched arrays without complaint).
assert len(probs2) == len(uniprot_ids) == len(labels), (
    "predictions, UniProt IDs and labels must have the same length"
)
np.savez(WORKING_FOLDER/filename_output2,UniProt_IDs=uniprot_ids,LLPS_Propensity=probs2,DrLLPS_labels=labels)