## Recommendations for Efficient Usage:

Prioritize using CUDA-enabled devices (NVIDIA GPUs) over Google TPUs for better compatibility.

Due to network speed limitations:
* Uploading large FASTA files containing numerous sequences may fail.
* Downloading ZIP archives with extensive results might also fail.

Mitigation strategies:
* Split large files into smaller batches for processing.
* Mount your Google Drive to Colab's working directory and perform uploads/downloads directly from Google Drive.

In [None]:
#@title Install dependencies

# Clone the ProtRAP-LM repository; model checkpoints are later read from ProtRAP-LM/models/.
!git clone https://github.com/ComputBiophys/ProtRAP-LM.git
import torch
import numpy as np
import argparse,csv,sys
import os,requests
import torch.nn as nn
import torch.nn.functional as nnF
import os # used to detect the Colab TPU environment
from google.colab import drive,files
# Biopython must be installed before the SeqIO import on the next line can succeed.
!pip install biopython
from Bio import SeqIO
from tqdm import tqdm
# --- Device detection ---
# Preference order: TPU (only when Colab exposes one and torch_xla imports),
# then a CUDA GPU, and finally the CPU.
device = None
device_type = None

if os.environ.get('COLAB_TPU_ADDR'):
    try:
        import torch_xla
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
        device_type = "TPU"
        print("TPU detected. Using TPU.")
    except ImportError:
        # Leave `device` unset so the GPU/CPU checks below still run.
        print("TPU environment detected, but torch_xla is not installed.")
        print("Falling back to CPU/GPU check.")

if device is None and torch.cuda.is_available():
    device, device_type = torch.device("cuda"), "GPU"
    print(f"GPU detected. Using {torch.cuda.get_device_name(0)}")  # show the GPU name
elif device is None:
    device, device_type = torch.device("cpu"), "CPU"
    print("No TPU or GPU detected. Using CPU.")

def model_path(x):
    """Return the local checkpoint path for model number ``x``."""
    return f'ProtRAP-LM/models/model_{x!s}.pts'


def github_url(x):
    """Return the GitHub release download URL for model number ``x``."""
    return "https://github.com/ComputBiophys/ProtRAP-LM/releases/download/Version1.0/model_" + str(x) + ".pts"

def download_file(url, output_path):
    """Download ``url`` and save the response body to ``output_path``.

    The body is streamed in chunks to a temporary ``.part`` file next to the
    target and moved into place only after the transfer has finished, so an
    interrupted download never leaves a truncated file at ``output_path``
    (callers use the existence of that file to decide whether to download).

    Args:
        url: HTTP(S) address to fetch.
        output_path: Destination file path; missing parent directories are
            created.

    Returns:
        None. Failures are reported on stdout instead of being raised, so a
        single failed download does not abort the calling cell.
    """
    output_path = os.fspath(output_path)
    partial_path = output_path + '.part'
    try:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # (connect, read) timeouts: without them a stalled connection blocks forever.
        with requests.get(url, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                # Stream in 1 MiB chunks instead of holding the whole file in memory.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_path, output_path)
        print(f"Downloaded file from {url} to {output_path}")
    except Exception as e:
        # Best-effort cleanup of the incomplete temporary file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Error downloading file: {e}, You may manually download this one")

# Fetch each of the ten model checkpoints that is not already on disk.
for model_idx in range(10):
    checkpoint = model_path(model_idx)
    if os.path.exists(checkpoint):
        continue
    print(f'Downloading model_{model_idx}')
    download_file(github_url(model_idx), checkpoint)

def fasta_load(fasta_dir):
    """Read a FASTA file and return its sequence as one string.

    The first line is treated as the header and skipped. Every following
    non-blank line contributes its first whitespace-delimited token; blank
    or whitespace-only lines are ignored. Later header lines are NOT treated
    specially, so the file is expected to hold a single record.

    Args:
        fasta_dir: Path of the FASTA file to read.

    Returns:
        The concatenated sequence, or ``''`` when the file has no sequence
        lines.
    """
    with open(fasta_dir, 'r') as fp:
        next(fp, None)  # drop the header line (no-op for an empty file)
        # `tokens` is empty for blank lines; indexing it would raise IndexError.
        return ''.join(tokens[0] for tokens in map(str.split, fp) if tokens)
def weight_MSE_loss(labels,logits,weights=1):
    """Return the sum of squared differences between ``labels`` and ``logits``, scaled element-wise by ``weights``."""
    residual = labels - logits
    weighted_sq_err = residual ** 2 * weights
    return torch.sum(weighted_sq_err)
def focal_loss_softmax(labels,logits):
    """Return the summed focal-style loss ``-labels * log(p + 1e-8) * (1 - p)**2``.

    ``logits`` is used directly as ``p``; no softmax is applied here.
    """
    log_term = torch.log(logits + 1e-8)
    modulation = (1 - logits) ** 2
    per_element = -labels * log_term * modulation
    return torch.sum(per_element)

class MultiScaleCNN(nn.Module):
    """Four parallel 1-D convolutions (kernel sizes 3, 5, 7, 9) whose outputs are concatenated channel-wise.

    Each convolution is padded by ``kernel_size // 2`` so the length of the
    convolved axis is preserved.
    """

    def __init__(self, input_dim=1280, output_dim=256):
        super().__init__()
        self.cnn1 = nn.Conv1d(input_dim, output_dim, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(input_dim, output_dim, kernel_size=5, padding=2)
        self.cnn3 = nn.Conv1d(input_dim, output_dim, kernel_size=7, padding=3)
        self.cnn4 = nn.Conv1d(input_dim, output_dim, kernel_size=9, padding=4)

    def forward(self, x):
        """Apply all four branches to ``x`` and concatenate the results.

        ``x`` is permuted with ``(0, 2, 1)`` before the convolutions and the
        concatenated result is permuted back, so the last axis of the input
        is the convolution channel axis and the last axis of the output has
        ``4 * output_dim`` entries.
        """
        channels_first = x.permute(0, 2, 1)
        branches = (self.cnn1, self.cnn2, self.cnn3, self.cnn4)
        merged = torch.cat([conv(channels_first) for conv in branches], dim=-2)
        return merged.permute(0, 2, 1)

class ProtRAP_LM:
    """Predictor that feeds ESM-2 layer-33 representations to ten TorchScript models and aggregates their outputs."""

    def __init__(self, device_name='cpu'):
        """Load the ESM-2 model and the ten TorchScript models onto ``device_name``.

        Args:
            device_name: Anything accepted by ``torch.device`` (a device
                string or an existing device object).
        """
        self.device = torch.device(device_name)

        esm_model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
        self.batch_converter = alphabet.get_batch_converter()
        self.esm_model = esm_model.eval().to(self.device)
        # One TorchScript checkpoint per model index, all in eval mode.
        self.models = [
            torch.jit.load(model_path(idx)).to(self.device).eval()
            for idx in range(10)
        ]

    def predict(self, seq):
        """Predict per-position values for one sequence.

        Args:
            seq: Sequence string handed to the ESM batch converter.

        Returns:
            A NumPy array whose rows are sequence positions. The columns are
            the mean over the loaded models followed by the standard
            deviation over the loaded models, concatenated on the last axis.
        """
        _, _, tokens = self.batch_converter([('prot', seq)])
        tokens = tokens.to(self.device)
        cpu = torch.device("cpu")
        with torch.no_grad():
            embedding = self.esm_model(tokens, repr_layers=[33])["representations"][33]
            # [0, 1:-1, :] keeps the single batch item and drops the first and last
            # positions (presumably the special tokens added by the converter — TODO confirm).
            per_model = [
                np.array(member(embedding).to(cpu)[0, 1:-1, :])
                for member in self.models
            ]
        stacked = np.array(per_model)
        return np.concatenate(
            (np.mean(stacked, axis=0), np.std(stacked, axis=0)),
            axis=-1,
        )
# Build the predictor once on the device selected above.
ProtRAP_LM_model = ProtRAP_LM(device)

# State filled in by the input cells: uploaded record ids / sequences and
# the optional single-input sequence.
fasta_str = sequence1 = ''
heads, seqs = [], []

Cloning into 'ProtRAP-LM'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 61 (delta 29), reused 53 (delta 24), pack-reused 1 (from 1)[K
Receiving objects: 100% (61/61), 73.99 KiB | 18.50 MiB/s, done.
Resolving deltas: 100% (29/29), done.
Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
GPU detected. Using Tesla T4
Downloading model_0
Downloaded file from https://github.com/ComputBiophys/ProtRAP-LM/releases/download/Version1.0/model_0.pts to ProtRAP-LM/models/model_0.pts
Downloading model_1
Downloaded file fr

Downloading: "https://github.com/facebookresearch/esm/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


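There are currently two input options: **single input** and **multiple input**. You can use either one, or both — the **Run** cell processes every uploaded sequence plus the single-input sequence, if one was entered.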

**Single Input:** Requires you to enter a sequence in the text box.

**Multiple Input:** Requires you to upload a FASTA file. This file can contain multiple sequences. The program will process them together and return the results in a compressed file.

In [None]:
#@title Single Input


jobname = 'test' #@param {type:"string"}
sequence1 = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK' #@param {type:"string"}
result_name=jobname+'.zip'
# Remove a stale archive from a previous run. Done in Python rather than via
# `rm` in a shell so that a job name containing spaces or shell metacharacters
# is safe, a missing file is not an error, and no exit code is echoed as the
# cell output.
if os.path.isfile(result_name):
  os.remove(result_name)

256

In [None]:
#@title Multiple Input

#@markdown To avoid potential network issues, split very large FASTA files into smaller parts before uploading.

jobname = 'test' #@param {type:"string"}
uploaded = files.upload()

# Parse every uploaded file and collect the record ids and sequences.
for k in uploaded:
  for record in SeqIO.parse(k, "fasta"):
    heads.append(record.id)
    seqs.append(str(record.seq))
  # Delete only the file that was just parsed, whatever its extension,
  # instead of every *.fasta file in the working directory.
  if os.path.isfile(k):
    os.remove(k)
print(f'Successfully loaded a fasta file contains {len(seqs)} sequences.')
result_name=jobname+'.zip'

In [None]:
#@title Run
# Work on copies of the uploaded lists: binding `run_seqs`/`run_heads` directly
# to `seqs`/`heads` would make the append below grow the uploaded lists, so the
# single-input sequence would be duplicated every time this cell is re-run.
run_seqs=list(seqs)
run_heads=list(heads)
if len(sequence1)>0:
  run_seqs.append(sequence1)
  run_heads.append('single_input')
results=[]
infos=zip(run_seqs,run_heads)
# `zip` has no length, so pass the total explicitly to get a real progress bar.
for seq,head in tqdm(infos,total=len(run_seqs)):
  result=ProtRAP_LM_model.predict(seq)
  # One CSV per sequence, named after its record id.
  np.savetxt(head+'_result.csv',result,  header='MCP,RASA,MCP_std,RASA_std',delimiter=',')

In [None]:
#@title Package and download results

import glob
import zipfile

# Name the archive after the job name entered in an input cell; fall back to
# 'test' when no input cell has defined `jobname` yet.
result_name = globals().get('jobname', 'test') + '.zip'
# Only the per-sequence files written by the Run cell are packaged and removed,
# so unrelated CSV files in the working directory are left alone.
result_files = sorted(glob.glob('*_result.csv'))
if result_files:
  with zipfile.ZipFile(result_name, 'w', zipfile.ZIP_DEFLATED) as archive:
    for csv_path in result_files:
      archive.write(csv_path)
  for csv_path in result_files:
    os.remove(csv_path)
  files.download(result_name)
else:
  print('No result files found. Run the prediction cell before packaging.')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>