### Dataset Generator

In [None]:
!pip install -q SentencePiece transformers

#### Import Statements

In [None]:
import torch
from transformers import T5EncoderModel, T5Tokenizer

import pandas as pd
import numpy as np

import re
import gc
from tqdm import tqdm
import os
import shutil
import time

import warnings
warnings.simplefilter("ignore")

#### Features Generator Definitions

**`generate_aa`**
- *Protein Amino Acids* **:** https://www.cryst.bbk.ac.uk/education/AminoAcid/the_twenty.html

**`generate_pcp` & `generate_psp`**
- *`inference_utils.py`* **:** https://github.com/thuxugang/opus_tass/blob/master/inference_utils.py

**`generate_prottrans`**
- *ProtTrans* **:** https://github.com/agemagician/ProtTrans
- *`ProtT5-XL-UniRef50.ipynb`* **:** https://github.com/agemagician/ProtTrans/blob/master/Embedding/PyTorch/Advanced/ProtT5-XL-UniRef50.ipynb
- *`generate_prottrans.py`* **:** https://github.com/jas-preet/SPOT-1D-LM/blob/main/generate_prottrans.py

**`generate_esm`**
- *Meta Fundamental AI Research (FAIR) Evolutionary Scale Modeling (ESM)* **:** https://github.com/facebookresearch/esm
- *`generate_esm.py`* **:** https://github.com/jas-preet/SPOT-1D-LM/blob/main/generate_esm.py

In [None]:
def generate_pssm(pssm_file_path, pseq, output_file_path):
    """Read a PSSM text file, standardise its score columns and save them as ``.npy``.

    The file is parsed as a whitespace-delimited table with 44 columns. Rows that
    contain missing values (for example lines with fewer than 44 fields) are
    dropped, and columns 2-21 (20 values per row) of the remaining rows are kept
    as ``float32``. Every column is then shifted to zero mean and divided by its
    standard deviation, both computed across the rows.

    Parameters
    ----------
    pssm_file_path : str
        Path of the PSSM text file to parse.
    pseq : str
        Protein sequence; only its length is used, to validate the row count.
    output_file_path : str
        Path the standardised array is written to with ``np.save``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(pseq), 20)``.

    Raises
    ------
    AssertionError
        If the number of complete rows differs from ``len(pseq)``.
    """
    num_pssm_columns = 44
    pssm_column_names = [str(index) for index in range(num_pssm_columns)]

    # `sep=r"\s+"` is the documented replacement for the deprecated
    # `delim_whitespace=True` keyword of `pd.read_csv`.
    with open(pssm_file_path, 'r') as pssm_file:
        pssm_table = pd.read_csv(pssm_file, names=pssm_column_names, sep=r"\s+")

    pssm = pssm_table.dropna().values[:, 2:22].astype(np.float32)

    assert pssm.shape[0] == len(pseq), "PSSM file is in wrong format!"

    # NOTE(review): a column that is constant over the whole protein has a standard
    # deviation of zero, so this division yields NaN for that column.
    pssm = (pssm - np.mean(pssm, axis=0, keepdims=True)) / np.std(pssm, axis=0, keepdims=True)

    with open(output_file_path, 'wb') as pssm_file:
        np.save(file=pssm_file, arr=pssm)

    return pssm

def generate_hhm(hhm_file_path, pseq, output_file_path):
    """Read an HHM text file, standardise 30 values per record and save them as ``.npy``.

    The file is parsed as a whitespace-delimited table with 22 columns. Parsing of
    the profile starts three rows after the first row whose first field is
    ``"HMM"`` and stops before the last row of the table. Every two consecutive
    rows are folded into one 44-wide record, ``'*'`` entries are replaced by
    ``9999``, and fields 2-31 of each record are kept as ``float32``. Every column
    is then shifted to zero mean and divided by its standard deviation.

    Parameters
    ----------
    hhm_file_path : str
        Path of the HHM text file to parse.
    pseq : str
        Protein sequence; only its length is used, to validate the record count.
    output_file_path : str
        Path the standardised array is written to with ``np.save``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(pseq), 30)``.

    Raises
    ------
    AssertionError
        If the number of parsed records differs from ``len(pseq)``.
    """
    num_hhm_columns = 22
    hhm_column_names = [str(index) for index in range(num_hhm_columns)]

    # `sep=r"\s+"` is the documented replacement for the deprecated
    # `delim_whitespace=True` keyword of `pd.read_csv`.
    with open(hhm_file_path, 'r') as hhm_file:
        hhm = pd.read_csv(hhm_file, names=hhm_column_names, sep=r"\s+")

    # Skip the "HMM" row itself and the two rows that follow it.
    pos1 = (hhm["0"] == "HMM").idxmax() + 3

    # Two table rows make up one record, hence the 2 * 22 = 44 wide reshape.
    hhm = hhm[pos1:-1].values[:, :num_hhm_columns].reshape((-1, 44))
    hhm[hhm == '*'] = "9999"

    # Drop the two leading fields and the twelve trailing (padding) fields.
    hhm = hhm[:, 2:-12].astype(np.float32)

    assert hhm.shape[0] == len(pseq), "HHM file is in wrong format!"

    # NOTE(review): a column that is constant over the whole protein has a standard
    # deviation of zero, so this division yields NaN for that column.
    hhm = (hhm - np.mean(hhm, axis=0, keepdims=True)) / np.std(hhm, axis=0, keepdims=True)

    with open(output_file_path, 'wb') as hhm_file:
        np.save(file=hhm_file, arr=hhm)

    return hhm

def get_pcp_dictionary():
    """Return the residue lookup table consumed by ``generate_pcp``.

    Each of the 20 one-letter amino-acid codes maps to a list of seven floats.
    A fresh dictionary (with fresh lists) is built on every call.
    """
    rows = (
        ("A", -0.350, -0.680, -0.677, -0.171, -0.170, 0.900, -0.476),
        ("R", 0.105, 0.373, 0.466, -0.900, 0.900, 0.528, -0.371),
        ("N", -0.213, -0.329, -0.243, -0.674, -0.075, -0.403, -0.529),
        ("D", -0.213, -0.417, -0.281, -0.767, -0.900, -0.155, -0.635),
        ("C", -0.140, -0.329, -0.359, 0.508, -0.114, -0.652, 0.476),
        ("Q", -0.230, -0.110, -0.020, -0.464, -0.276, 0.528, -0.371),
        ("E", -0.230, -0.241, -0.058, -0.696, -0.868, 0.900, -0.582),
        ("G", -0.900, -0.900, -0.900, -0.342, -0.179, -0.900, -0.900),
        ("H", 0.384, 0.110, 0.138, -0.271, 0.195, -0.031, -0.106),
        ("I", 0.900, -0.066, -0.009, 0.652, -0.186, 0.155, 0.688),
        ("L", 0.213, -0.066, -0.009, 0.596, -0.186, 0.714, -0.053),
        ("K", -0.088, 0.066, 0.163, -0.889, 0.727, 0.279, -0.265),
        ("M", 0.110, 0.066, 0.087, 0.337, -0.262, 0.652, -0.001),
        ("F", 0.363, 0.373, 0.412, 0.646, -0.272, 0.155, 0.318),
        ("P", 0.247, -0.900, -0.294, 0.055, -0.010, -0.900, 0.106),
        ("S", -0.337, -0.637, -0.544, -0.364, -0.265, -0.466, -0.212),
        ("T", 0.402, -0.417, -0.321, -0.199, -0.288, -0.403, 0.212),
        ("W", 0.479, 0.900, 0.900, 0.900, -0.209, 0.279, 0.529),
        ("Y", 0.363, 0.417, 0.541, 0.188, -0.274, -0.155, 0.476),
        ("V", 0.677, -0.285, -0.232, 0.331, -0.191, -0.031, 0.900),
    )
    # Starred unpacking yields a new list of the seven values for every residue.
    return {residue: values for residue, *values in rows}

def generate_pcp(pseq, output_file_path):
    """Encode a sequence with the seven-value table from ``get_pcp_dictionary``.

    Residues missing from the table are encoded as seven zeros. The resulting
    ``(len(pseq), 7)`` ``float32`` matrix is standardised per column (zero mean,
    divided by the column standard deviation), written to ``output_file_path``
    with ``np.save`` and returned.
    """
    lookup = get_pcp_dictionary()
    unknown_row = [0] * 7

    rows = []
    for residue in pseq:
        rows.append(lookup.get(residue, unknown_row))
    pcp = np.array(rows, dtype=np.float32)

    # NOTE(review): a column that is constant over the sequence has zero standard
    # deviation, so the division below yields NaN for it.
    column_mean = np.mean(pcp, axis=0, keepdims=True)
    column_std = np.std(pcp, axis=0, keepdims=True)
    pcp = (pcp - column_mean) / column_std

    with open(output_file_path, 'wb') as pcp_file:
        np.save(file=pcp_file, arr=pcp)

    return pcp

def generate_aa(pseq, output_file_path):
    """One-hot encode a protein sequence over the 20 standard amino acids.

    Column order is ``A R N D C Q E G H I L K M F P S T W Y V``. Residues outside
    that alphabet produce an all-zero row. The ``(len(pseq), 20)`` ``float32``
    matrix is written to ``output_file_path`` with ``np.save`` and returned.
    """
    alphabet = "ARNDCQEGHILKMFPSTWYV"
    column_of = {residue: column for column, residue in enumerate(alphabet)}

    aa = np.zeros(shape=(len(pseq), 20), dtype=np.float32)

    for row, residue in enumerate(pseq):
        column = column_of.get(residue)
        if column is not None:
            aa[row, column] = 1

    with open(output_file_path, 'wb') as aa_file:
        np.save(file=aa_file, arr=aa)

    return aa

def get_psp_dictionary():
    """Return the residue lookup table consumed by ``generate_psp``.

    Each of the 20 one-letter amino-acid codes maps to a list of 1-based column
    indices (values between 1 and 19). A fresh dictionary is built on every call.
    """
    spec = {
        "A": (1, 3, 7),
        "R": (1, 5, 6, 7, 13),
        "N": (1, 5, 7, 14),
        "D": (1, 5, 7, 11),
        "C": (1, 7, 8),
        "Q": (1, 6, 7, 14),
        "E": (1, 6, 7, 11),
        "G": (1, 4, 7),
        "H": (1, 5, 7, 17),
        "I": (1, 3, 7, 12),
        "L": (1, 5, 7, 12),
        "K": (1, 5, 6, 7, 10),
        "M": (1, 6, 7, 9),
        "F": (1, 5, 7, 16),
        "P": (7, 19),
        "S": (1, 2, 5, 7),
        "T": (1, 7, 15),
        "W": (1, 5, 7, 18),
        "Y": (1, 2, 5, 7, 16),
        "V": (1, 7, 12),
    }
    # Callers receive plain lists, exactly one new list per residue.
    return {residue: list(indices) for residue, indices in spec.items()}

def generate_psp(pseq, output_file_path):
    """Multi-hot encode a sequence with the index table from ``get_psp_dictionary``.

    For every residue found in the table, the columns named by its 1-based
    indices are set to 1; residues missing from the table stay all-zero. The
    ``(len(pseq), 19)`` ``float32`` matrix is written to ``output_file_path``
    with ``np.save`` and returned.
    """
    lookup = get_psp_dictionary()
    psp = np.zeros(shape=(len(pseq), 19), dtype=np.float32)

    for row, residue in enumerate(pseq):
        # An empty tuple for unknown residues leaves the row untouched.
        for one_based_index in lookup.get(residue, ()):
            psp[row, one_based_index - 1] = 1

    with open(output_file_path, 'wb') as psp_file:
        np.save(file=psp_file, arr=psp)

    return psp

def generate_contact(contact_file_path, pseq, output_file_path, window_size, min_sep=3):
    """Build per-residue windowed contact features from a contact-map text file.

    The contact file is first parsed as a five-column whitespace-delimited table
    (positions in the first two columns are treated as 1-based, score in the last
    column). If that yields no complete numeric rows, the file is re-read as a
    three-column table whose positions are used as-is. The scores are written
    into an ``L x L`` map which is symmetrised, and 1 is added to every cell with
    ``|i - j| <= min_sep - 1``. For each residue ``i`` the column ``i`` of that
    map is sampled from ``i - window_size`` to ``i + window_size`` (zero padded
    outside the sequence), the five central offsets ``-2 .. 2`` are removed, and
    every resulting feature column is standardised across residues.

    Parameters
    ----------
    contact_file_path : str
        Path of the contact-map text file.
    pseq : str
        Protein sequence; only its length ``L`` is used.
    output_file_path : str
        Path the feature matrix is written to with ``np.save``.
    window_size : int
        Half-width of the window. Non-positive values make the function return
        ``None`` without reading or writing anything.
    min_sep : int, optional
        Controls the width of the band around the diagonal that receives +1.

    Returns
    -------
    numpy.ndarray or None
        ``float32`` array of shape ``(L, 2 * window_size - 4)``, or ``None`` when
        ``window_size`` is not positive.
    """
    if not window_size > 0:
        return None

    pseq_length = len(pseq)
    contact_feats = np.zeros(shape=(pseq_length, pseq_length, 1))

    # `sep=r"\s+"` is the documented replacement for the deprecated
    # `delim_whitespace=True` keyword of `pd.read_csv`.
    with open(contact_file_path, 'r') as contact_file:
        contact_map = pd.read_csv(contact_file, names=["pos1", "pos2", "idk1", "idk2", "score"], sep=r"\s+")

    # Keep only rows that start with a residue number and have all five fields.
    contact_map = contact_map[contact_map["pos1"].astype(str).str.isdigit()].dropna().values

    if contact_map.shape[0] == 0:
        # Fallback layout: "pos1 pos2 score", positions used without an offset.
        with open(contact_file_path, 'r') as contact_file:
            contact_map = pd.read_csv(contact_file, names=["pos1", "pos2", "score"], sep=r"\s+")

        contact_map = contact_map[contact_map["pos1"].astype(str).str.isdigit()].dropna().values
        pos1 = contact_map[:, 0].astype(int)
        pos2 = contact_map[:, 1].astype(int)
    else:
        # Five-column layout: shift the positions down by one to get array indices.
        pos1 = contact_map[:, 0].astype(int) - 1
        pos2 = contact_map[:, 1].astype(int) - 1

    score = contact_map[:, -1:]
    contact_feats[pos1, pos2] = score

    # Band of ones covering |i - j| <= min_sep - 1 (diagonal and close neighbours).
    near_diagonal = np.tril(m=np.triu(m=np.ones(shape=(pseq_length, pseq_length)), k=(-min_sep + 1)), k=(min_sep - 1))[:, :, None]
    contact_image = contact_feats + np.transpose(contact_feats, axes=(1, 0, 2)) + near_diagonal

    assert pseq_length == contact_image.shape[0]

    features_depth = contact_image.shape[2]
    window_size = int(window_size)

    # Zero-pad `window_size` rows above and below so every residue has a full window.
    padding = np.zeros(shape=(window_size, pseq_length, features_depth))
    resize = np.concatenate([padding, contact_image, padding], axis=0)

    # Row `index` of `contact_array` is column `index` of the map, centred on `index`.
    contact_array = np.concatenate([resize[index:(index + 2 * window_size + 1), index, :features_depth] for index in range(pseq_length)], axis=1).T

    # Remove the window centre and its two neighbours on either side.
    removal_indices = np.array([window_size + index for index in range(-2, 3)])
    contact = np.delete(contact_array, obj=removal_indices, axis=1).astype(np.float32)

    # NOTE(review): feature columns that are constant over the protein (e.g. pure
    # padding when the sequence is shorter than the window) have zero standard
    # deviation, so this division yields NaN for them.
    contact = (contact - np.mean(contact, axis=0, keepdims=True)) / np.std(contact, axis=0, keepdims=True)

    with open(output_file_path, 'wb') as contact_file:
        np.save(file=contact_file, arr=contact)

    return contact

def generate_prottrans(pseq, output_file_path, use_gpu=False):
    """Embed one sequence with the ``Rostlab/prot_t5_xl_half_uniref50-enc`` encoder.

    The tokenizer and model are loaded on every call. Residues ``U``, ``Z``,
    ``O`` and ``B`` are replaced by ``X`` and the residues are space-separated
    before tokenisation. The last attended token position is dropped from the
    encoder output (presumably the end-of-sequence token — TODO confirm), and
    the remaining rows are written to ``output_file_path`` with ``np.save`` and
    returned.

    ``use_gpu`` selects ``cuda:0`` when CUDA is available; otherwise the CPU is used.
    """
    checkpoint = "Rostlab/prot_t5_xl_half_uniref50-enc"
    tokenizer = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False)
    model = T5EncoderModel.from_pretrained(checkpoint)
    gc.collect()

    if use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    model = model.to(device).eval()

    spaced_sequence = ' '.join(pseq)
    batch = [re.sub(r"[UZOB]", 'X', spaced_sequence)]

    encoded = tokenizer.batch_encode_plus(batch, add_special_tokens=True, padding=True)
    input_ids = torch.tensor(encoded["input_ids"]).to(device)
    attention_mask = torch.tensor(encoded["attention_mask"]).to(device)

    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    hidden_states = output.last_hidden_state.cpu().numpy()

    # One entry per batch row; each is trimmed to its attended length minus one.
    trimmed = [
        hidden_states[row][:(attention_mask[row] == 1).sum() - 1]
        for row in range(len(hidden_states))
    ]
    prottrans = trimmed[0]

    with open(output_file_path, 'wb') as prottrans_file:
        np.save(file=prottrans_file, arr=prottrans)

    return prottrans

def generate_esm(protein_name, pseq, output_file_path, use_gpu=False):
    """Embed one sequence with the ``esm1b_t33_650M_UR50S`` model from ``torch.hub``.

    The model is loaded on every call. The layer-33 token representations at
    positions ``1 .. len(pseq)`` (position 0 is skipped) are written to
    ``output_file_path`` with ``np.save`` and returned.

    ``use_gpu`` selects ``cuda:0`` when CUDA is available; otherwise the CPU is used.
    """
    model, alphabet = torch.hub.load(repo_or_dir="facebookresearch/esm", model="esm1b_t33_650M_UR50S", verbose=False)
    to_batch = alphabet.get_batch_converter()

    if use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    model = model.to(device).eval()

    # The converter returns (labels, strings, tokens); only the tokens are needed.
    _, _, batch_tokens = to_batch([(protein_name, pseq)])
    batch_tokens = batch_tokens.to(device)

    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33], return_contacts=True)

    layer_33 = results["representations"][33]

    # Single-sequence batch: row 0, tokens 1..len(pseq).
    esm = layer_33[0, 1:len(pseq) + 1].cpu().numpy()

    with open(output_file_path, 'wb') as esm_file:
        np.save(file=esm_file, arr=esm)

    return esm

In [None]:
def generate_batch_prottrans(proteins_dict, use_gpu=False):
    """Embed many sequences with ProtT5, loading the tokenizer and model only once.

    ``proteins_dict`` maps a protein name to a dict with the keys ``"pseq"``
    (sequence) and ``"output_file_path"`` (path prefix). Each protein is encoded
    on its own and saved to ``<output_file_path>_prottrans.npy``. Residues ``U``,
    ``Z``, ``O`` and ``B`` are replaced by ``X``, and the last attended token
    position is dropped from the encoder output.

    Returns ``True`` once every protein has been processed.
    """
    checkpoint = "Rostlab/prot_t5_xl_half_uniref50-enc"
    tokenizer = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False)
    model = T5EncoderModel.from_pretrained(checkpoint)
    gc.collect()

    if use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    model = model.to(device).eval()

    progress = tqdm(iterable=proteins_dict, desc="Generating ProtTrans Feature...", ncols=100, unit="protein")

    for protein_name in progress:
        entry = proteins_dict[protein_name]
        batch = [re.sub(r"[UZOB]", 'X', ' '.join(entry["pseq"]))]

        encoded = tokenizer.batch_encode_plus(batch, add_special_tokens=True, padding=True)
        input_ids = torch.tensor(encoded["input_ids"]).to(device)
        attention_mask = torch.tensor(encoded["attention_mask"]).to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)

        hidden_states = output.last_hidden_state.cpu().numpy()

        # One entry per batch row; each is trimmed to its attended length minus one.
        trimmed = [
            hidden_states[row][:(attention_mask[row] == 1).sum() - 1]
            for row in range(len(hidden_states))
        ]

        with open(entry["output_file_path"] + "_prottrans.npy", 'wb') as prottrans_file:
            np.save(file=prottrans_file, arr=trimmed[0])

    return True

def generate_batch_esm(proteins_dict, use_gpu=False):
    """Embed many sequences with ESM-1b, loading the model only once.

    ``proteins_dict`` maps a protein name to a dict with the keys ``"pseq"``
    (sequence) and ``"output_file_path"`` (path prefix). Each protein is encoded
    on its own, and the layer-33 representations of tokens ``1 .. len(pseq)`` are
    saved to ``<output_file_path>_esm.npy``.

    Returns ``True`` once every protein has been processed.
    """
    model, alphabet = torch.hub.load(repo_or_dir="facebookresearch/esm", model="esm1b_t33_650M_UR50S", verbose=False)
    to_batch = alphabet.get_batch_converter()

    if use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    model = model.to(device).eval()

    progress = tqdm(iterable=proteins_dict, desc="Generating ESM-1b Feature...", ncols=100, unit="protein")

    for protein_name in progress:
        entry = proteins_dict[protein_name]
        sequence = entry["pseq"]

        # The converter returns (labels, strings, tokens); only the tokens are needed.
        _, _, batch_tokens = to_batch([(protein_name, sequence)])
        batch_tokens = batch_tokens.to(device)

        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33], return_contacts=True)

        layer_33 = results["representations"][33]

        # Single-sequence batch: row 0, tokens 1..len(sequence).
        esm = layer_33[0, 1:len(sequence) + 1].cpu().numpy()

        with open(entry["output_file_path"] + "_esm.npy", 'wb') as esm_file:
            np.save(file=esm_file, arr=esm)

    return True

#### Labels Generator Definitions

**`generate_ss3_ss8_phi_psi`**
- *SPOT-1D-Single Paper* **:** https://doi.org/10.1093/bioinformatics/btab316

In [None]:
def generate_ss3_ss8_phi_psi(protein_name, dssp_file_path, output_dir_path):
    """Turn a five-line DSSP summary file into SS3/SS8/phi/psi label arrays.

    After dropping empty lines, line 1 is read as the sequence, line 2 as the
    secondary-structure string, and lines 3 and 4 as whitespace-separated phi and
    psi angles (``X`` marks a missing angle).

    * ``ss3`` is one-hot over (G/H/I, B/E, S/T/C).
    * ``ss8`` is one-hot over ``G H I B E S T C``.
    * Any other state character leaves both rows all-zero.
    * Angles that are missing or not strictly inside ``(-180, 180)`` are stored as ``-500``.

    The four arrays are saved as ``<protein_name>_{ss3,ss8,phi,psi}.npy`` inside
    ``output_dir_path`` and returned as ``(ss3, ss8, phi, psi)``.

    Raises ``AssertionError`` when the four parsed lines disagree in length.
    """
    missing_angle = -500

    with open(dssp_file_path, 'r') as dssp_file:
        lines = [line for line in dssp_file.read().split('\n') if line != '']

    def parse_angles(angle_line):
        return [missing_angle if token == 'X' else float(token) for token in angle_line.split()]

    secondary_structures = lines[2]
    phi_angles = parse_angles(lines[3])
    psi_angles = parse_angles(lines[4])

    assert len(lines[1]) == len(secondary_structures) == len(phi_angles) == len(psi_angles)

    ss8_states = ['G', 'H', 'I', 'B', 'E', 'S', 'T', 'C']
    ss3_column_of = {'G': 0, 'H': 0, 'I': 0, 'B': 1, 'E': 1, 'S': 2, 'T': 2, 'C': 2}

    num_residues = len(secondary_structures)
    ss3 = np.zeros(shape=(num_residues, 3))
    ss8 = np.zeros(shape=(num_residues, 8))

    for row, state in enumerate(secondary_structures):
        if state in ss3_column_of:
            ss3[row, ss3_column_of[state]] = 1
            ss8[row, ss8_states.index(state)] = 1

    def as_label_column(angles):
        # Anything outside the open interval (-180, 180) becomes the missing marker.
        values = np.array(angles, dtype=np.float64).reshape(-1, 1)
        inside = (values > -180) & (values < 180)
        return np.where(inside, values, missing_angle)

    phi = as_label_column(phi_angles)
    psi = as_label_column(psi_angles)

    for suffix, labels in (("_ss3.npy", ss3), ("_ss8.npy", ss8), ("_phi.npy", phi), ("_psi.npy", psi)):
        with open(output_dir_path + os.sep + protein_name + suffix, 'wb') as label_file:
            np.save(file=label_file, arr=labels)

    return ss3, ss8, phi, psi

#### Datasets Generator Definitions

**`generate_spot_1d_single`**
- *SPOT-1D-Single Paper* **:** https://doi.org/10.1093/bioinformatics/btab316

In [None]:
def generate_proteins_lists(list_path, fasta_dir_path, dssp_dir_path, dst_path, dataset_name, max_length=700, spot_1d=False):
    """Split the proteins named in ``list_path`` into three bookkeeping files.

    For every non-empty line of ``list_path`` the matching ``.fasta`` and
    ``.dssp`` files are looked up (inside a per-protein sub-directory when
    ``spot_1d`` is true). The sequence is taken from the second line of the
    FASTA file. Inside ``<dst_path>/<dataset_name>/`` (created if absent) the
    function writes:

    * ``<dataset_name>_below_<max_length>_proteins.txt`` — ``name,length`` rows
      for sequences of at most ``max_length`` residues,
    * ``<dataset_name>_above_<max_length>_proteins.txt`` — ``name,length`` rows
      for longer sequences,
    * ``<dataset_name>_not_found_proteins.txt`` — names with a missing FASTA or
      DSSP file.

    Returns ``True``.
    """
    with open(list_path, 'r') as proteins_list:
        dataset_proteins = [line for line in proteins_list.read().split('\n') if line != '']

    eligibles, non_eligibles, not_founds = {}, {}, []

    for name in dataset_proteins:
        nested_dir = name + os.sep if spot_1d else ''
        fasta_file_path = fasta_dir_path + os.sep + nested_dir + name + ".fasta"
        dssp_file_path = dssp_dir_path + os.sep + nested_dir + name + ".dssp"

        if not (os.path.isfile(fasta_file_path) and os.path.isfile(dssp_file_path)):
            not_founds.append(name)
            continue

        with open(fasta_file_path, 'r') as fasta_file:
            pseq = fasta_file.read().split('\n')[1]

        bucket = non_eligibles if len(pseq) > max_length else eligibles
        bucket[name] = len(pseq)

    dataset_path = dst_path + os.sep + dataset_name

    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    def as_rows(lengths_by_name):
        return [name + ',' + str(length) for name, length in lengths_by_name.items()]

    outputs = (
        (f"_below_{max_length}_proteins.txt", as_rows(eligibles)),
        (f"_above_{max_length}_proteins.txt", as_rows(non_eligibles)),
        ("_not_found_proteins.txt", not_founds),
    )

    for suffix, rows in outputs:
        with open(dataset_path + os.sep + dataset_name + suffix, 'w') as proteins_file:
            proteins_file.write('\n'.join(rows))

    return True

def generate_protein_features_labels(fasta_dir_path, dssp_dir_path, dataset_path, name, spot_1d=False):
    """Generate every feature and label file for one protein.

    Creates ``<dataset_path>/Rawdata/<name>/`` if needed, copies the protein's
    FASTA file into it and reads the sequence from the second line of that copy.
    It then writes the PCP, one-hot, PSP, ProtTrans and ESM feature files and the
    SS3/SS8/phi/psi label files next to it. When ``spot_1d`` is true the source
    files live in a per-protein sub-directory and the PSSM and HHM features are
    generated as well.

    Returns ``True``.
    """
    data_path = os.sep.join((dataset_path, "Rawdata", name))

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    nested_dir = name + os.sep if spot_1d else ''
    fasta_file_path = fasta_dir_path + os.sep + nested_dir + name + ".fasta"
    dssp_file_path = dssp_dir_path + os.sep + nested_dir + name + ".dssp"

    # Common prefix of every file written for this protein.
    dst_path = data_path + os.sep + name
    copied_fasta_path = dst_path + ".fasta"

    shutil.copyfile(src=fasta_file_path, dst=copied_fasta_path)

    with open(copied_fasta_path, 'r') as fasta_file:
        pseq = fasta_file.read().split('\n')[1]

    if spot_1d:
        # Profile files sit beside the FASTA file in the per-protein directory.
        src_path = os.sep.join((fasta_dir_path, name, name))
        generate_pssm(pssm_file_path=src_path + ".pssm", pseq=pseq, output_file_path=dst_path + "_pssm.npy")
        generate_hhm(hhm_file_path=src_path + ".hhm", pseq=pseq, output_file_path=dst_path + "_hhm.npy")

    generate_pcp(pseq=pseq, output_file_path=dst_path + "_pcp.npy")
    generate_aa(pseq=pseq, output_file_path=dst_path + "_aa.npy")
    generate_psp(pseq=pseq, output_file_path=dst_path + "_psp.npy")
    generate_prottrans(pseq=pseq, output_file_path=dst_path + "_prottrans.npy")
    generate_esm(protein_name=name, pseq=pseq, output_file_path=dst_path + "_esm.npy")
    generate_ss3_ss8_phi_psi(protein_name=name, dssp_file_path=dssp_file_path, output_dir_path=data_path)

    return True

def generate_dataset(list_path, fasta_dir_path, dssp_dir_path, dst_path, ds_name, max_len=700, spot_1d=False):
    """Generate features and labels for every eligible protein of one dataset, resumably.

    Workflow:

    1. If the ``_below_<max_len>`` / ``_above_<max_len>`` list files are missing
       under ``<dst_path>/<ds_name>/``, they are created with
       ``generate_proteins_lists``.
    2. The proteins still to be processed are taken from
       ``<ds_name>_temp_proteins.txt`` when that checkpoint file exists (an
       interrupted run). Otherwise, if no ``Rawdata`` directory exists yet, all
       eligible proteins are scheduled and the checkpoint file is created. If
       ``Rawdata`` exists without a checkpoint file, nothing is scheduled.
    3. Each scheduled protein is processed with
       ``generate_protein_features_labels``; after each one the checkpoint file is
       rewritten with the proteins that remain.
    4. The checkpoint file is removed once nothing is left to do.

    Parameters
    ----------
    list_path, fasta_dir_path, dssp_dir_path : str
        Source locations, forwarded to the helper functions.
    dst_path : str
        Root directory under which ``<ds_name>/`` is created.
    ds_name : str
        Dataset name; used as directory name and file-name prefix.
    max_len : int, optional
        Maximum sequence length of an eligible protein.
    spot_1d : bool, optional
        Forwarded to the helper functions to select the source directory layout.

    Returns
    -------
    bool
        Always ``True``.
    """
    dataset_path = dst_path + os.sep + ds_name
    eligibles_file_path = dataset_path + os.sep + ds_name + f"_below_{max_len}_proteins.txt"
    non_eligibles_file_path = dataset_path + os.sep + ds_name + f"_above_{max_len}_proteins.txt"

    if not os.path.isfile(eligibles_file_path) or not os.path.isfile(non_eligibles_file_path):
        generate_proteins_lists(list_path, fasta_dir_path, dssp_dir_path, dst_path, ds_name, max_len, spot_1d)

    temp_proteins_list_path = dataset_path + os.sep + ds_name + "_temp_proteins.txt"
    raw_data_path = dataset_path + os.sep + "Rawdata"

    if os.path.isfile(temp_proteins_list_path):
        # Resume an interrupted run from its checkpoint.
        with open(temp_proteins_list_path, 'r') as proteins_file:
            ds_proteins = [name for name in proteins_file.read().split('\n') if name != '']
    elif not os.path.exists(raw_data_path):
        # Fresh run: schedule every eligible protein and create the checkpoint.
        with open(eligibles_file_path, 'r') as proteins_file:
            ds_proteins = [row.split(',')[0] for row in proteins_file.read().split('\n') if row != '']

        with open(temp_proteins_list_path, 'w') as proteins_file:
            proteins_file.write('\n'.join(ds_proteins))
    else:
        # Rawdata exists and no checkpoint is left: treated as already complete.
        ds_proteins = []

    # The original test `ds_proteins is []` compared identity with a brand-new list
    # and could never be true; an emptiness check is what was intended.
    if not ds_proteins:
        if os.path.exists(temp_proteins_list_path):
            os.remove(temp_proteins_list_path)
        return True

    progress = tqdm(iterable=ds_proteins, desc=f"{ds_name} Generation Progress", ncols=100, unit="protein")

    for position, name in enumerate(progress):
        generate_protein_features_labels(fasta_dir_path, dssp_dir_path, dataset_path, name, spot_1d)

        # Checkpoint: persist the proteins that have not been processed yet.
        with open(temp_proteins_list_path, 'w') as proteins_file:
            proteins_file.write('\n'.join(ds_proteins[position + 1:]))

    if os.path.exists(temp_proteins_list_path):
        os.remove(temp_proteins_list_path)

    return True

def generate_spot_1d(max_length=700):
    """Run ``generate_dataset`` for the four SPOT-1D splits.

    Accession lists are read from ``../datasets/SPOT-1D/Data/Accessions`` and raw
    files from ``../datasets/SPOT-1D/Data/Rawdata/<split>`` (the same directory
    serves FASTA and DSSP files); results go to ``../datasets/SPOT-1D/Features``.
    Returns ``True``.
    """
    source_root = "../datasets/SPOT-1D/Data"
    features_root = "../datasets/SPOT-1D/Features"

    # (output dataset name, name used in the source tree)
    splits = (
        ("Training", "training"),
        ("Validation", "validation"),
        ("TEST2016", "TEST2016"),
        ("TEST2018", "TEST2018"),
    )

    for ds_name, source_name in splits:
        list_path = os.sep.join((source_root, "Accessions", source_name + "-accessions"))
        raw_dir_path = os.sep.join((source_root, "Rawdata", source_name))
        generate_dataset(list_path, raw_dir_path, raw_dir_path, features_root, ds_name, max_length, spot_1d=True)

    return True

def generate_spot_1d_single(max_length=700):
    """Run ``generate_dataset`` for the ten SPOT-1D-Single datasets.

    Protein lists are read from ``../datasets/SPOT-1D-Single/Data/lists``, FASTA
    and DSSP files from the flat ``fasta`` and ``dssp`` directories beside it;
    results go to ``../datasets/SPOT-1D-Single/Features``. Returns ``True``.
    """
    source_root = "../datasets/SPOT-1D-Single/Data"
    features_root = "../datasets/SPOT-1D-Single/Features"

    # (output dataset name, list file stem)
    datasets = (
        ("Training", "train"),
        ("Validation", "val"),
        ("TEST2018", "TEST2018"),
        ("SPOT-2016", "SPOT-2016"),
        ("SPOT-2016-HQ", "SPOT-2016-HQ"),
        ("SPOT-2018", "SPOT-2018"),
        ("SPOT-2018-HQ", "SPOT-2018-HQ"),
        ("SPOT-2018-Neff1", "neff1-2018"),
        ("CASP12-FM", "casp12"),
        ("CASP13-FM", "casp13"),
    )

    fasta_dir_path = source_root + os.sep + "fasta"
    dssp_dir_path = source_root + os.sep + "dssp"

    for ds_name, list_stem in datasets:
        list_path = os.sep.join((source_root, "lists", list_stem + ".txt"))
        generate_dataset(list_path, fasta_dir_path, dssp_dir_path, features_root, ds_name, max_length, spot_1d=False)

    return True

def generate_casp(max_length=700):
    """Run ``generate_dataset`` for the five CASP datasets.

    Accession lists are read from ``../datasets/CASP/Data/Accessions`` and raw
    files from ``../datasets/CASP/Data/Rawdata/<dataset>`` (the same directory
    serves FASTA and DSSP files); results go to ``../datasets/CASP/Features``.
    Returns ``True``.
    """
    source_root = "../datasets/CASP/Data"
    features_root = "../datasets/CASP/Features"

    # Output name and source name are identical for every CASP dataset.
    dataset_names = ("CASP12(49)", "CASP12(55)", "CASP13(31)", "CASP13(32)", "CASP-FM")

    for ds_name in dataset_names:
        list_path = os.sep.join((source_root, "Accessions", ds_name + "-accessions"))
        raw_dir_path = os.sep.join((source_root, "Rawdata", ds_name))
        generate_dataset(list_path, raw_dir_path, raw_dir_path, features_root, ds_name, max_length, spot_1d=True)

    return True

def generate_casp12_fm(max_length=700):
    """Run ``generate_dataset`` for the CASP12-FM dataset of SPOT-1D-Single.

    Uses the list file ``casp12.txt`` and the flat ``fasta`` / ``dssp``
    directories under ``../datasets/SPOT-1D-Single/Data``; results go to
    ``../datasets/SPOT-1D-Single/Features``. Returns ``True``.
    """
    source_root = "../datasets/SPOT-1D-Single/Data"
    features_root = "../datasets/SPOT-1D-Single/Features"

    fasta_dir_path = source_root + os.sep + "fasta"
    dssp_dir_path = source_root + os.sep + "dssp"

    for ds_name, list_stem in (("CASP12-FM", "casp12"),):
        list_path = os.sep.join((source_root, "lists", list_stem + ".txt"))
        generate_dataset(list_path, fasta_dir_path, dssp_dir_path, features_root, ds_name, max_length, spot_1d=False)

    return True

def generate_spot_1d_lm(max_length=700):
    """Run ``generate_dataset`` for the two SPOT-1D-LM datasets.

    Uses the list files ``Neff1-2020.txt`` and ``casp14.txt`` and the flat
    ``fasta`` / ``dssp`` directories under ``../datasets/SPOT-1D-LM/Data``;
    results go to ``../datasets/SPOT-1D-LM/Features``. Returns ``True``.
    """
    source_root = "../datasets/SPOT-1D-LM/Data"
    features_root = "../datasets/SPOT-1D-LM/Features"

    fasta_dir_path = source_root + os.sep + "fasta"
    dssp_dir_path = source_root + os.sep + "dssp"

    # (output dataset name, list file stem)
    datasets = (("SPOT-2018-Neff1", "Neff1-2020"), ("CASP14-FM", "casp14"))

    for ds_name, list_stem in datasets:
        list_path = os.sep.join((source_root, "lists", list_stem + ".txt"))
        generate_dataset(list_path, fasta_dir_path, dssp_dir_path, features_root, ds_name, max_length, spot_1d=False)

    return True

#### Datasets Generation from SPOT-1D, SPOT-1D-Single, CASP and SPOT-1D-LM Data

In [None]:
# Build features and labels for every SPOT-1D split (default max_length=700).
generate_spot_1d();

In [None]:
# Build features and labels for every SPOT-1D-Single dataset (default max_length=700).
generate_spot_1d_single();

In [None]:
# Build the CASP datasets, then the CASP12-FM dataset kept under SPOT-1D-Single.
generate_casp()
generate_casp12_fm();

In [None]:
# Build features and labels for the SPOT-1D-LM datasets (default max_length=700).
generate_spot_1d_lm();

#### Window Features Generation for SPOT-1D and CASP Proteins

In [None]:
# Windowed contact features (window sizes 10, 20 and 50) for every eligible
# protein of the four SPOT-1D splits. Keys are the feature-directory names,
# values the matching directory names in the raw-data tree.
spot1d_datasets = {
    "Training": "training",
    "Validation": "validation",
    "TEST2016": "TEST2016",
    "TEST2018": "TEST2018",
}

for dataset in spot1d_datasets:
    in_path = "../datasets/SPOT-1D/Data/Rawdata" + os.sep + spot1d_datasets[dataset]
    dataset_path = "../datasets/SPOT-1D/Features" + os.sep + dataset
    out_path = dataset_path + os.sep + "Rawdata"

    # Each row of the list file is "<accession>,<length>"; only the name is needed.
    with open(dataset_path + os.sep + dataset + "_below_700_proteins.txt", 'r') as accessions_file:
        accessions = [row.split(',')[0] for row in accessions_file.read().split('\n') if row != '']

    for accession in tqdm(iterable=accessions, desc=f"{dataset} Running...", ncols=100, unit="protein"):
        in_raw_path = in_path + os.sep + accession
        out_raw_path = out_path + os.sep + accession

        with open(in_raw_path + os.sep + accession + ".fasta", 'r') as fasta_file:
            pseq = fasta_file.read().split('\n')[1]

        # Same contact file, one output file per window size.
        for window_size in (10, 20, 50):
            generate_contact(
                contact_file_path=in_raw_path + os.sep + accession + ".spotcon",
                pseq=pseq,
                output_file_path=out_raw_path + os.sep + accession + f"_win{window_size}.npy",
                window_size=window_size
            )

In [None]:
# Windowed contact features (window sizes 10, 20 and 50) for every eligible
# protein of the five CASP datasets. Keys are the feature-directory names,
# values the matching directory names in the raw-data tree.
casp_datasets = {
    "CASP12(49)": "CASP12(49)",
    "CASP12(55)": "CASP12(55)",
    "CASP13(31)": "CASP13(31)",
    "CASP13(32)": "CASP13(32)",
    "CASP-FM": "CASP-FM",
}

for dataset in casp_datasets:
    in_path = "../datasets/CASP/Data/Rawdata" + os.sep + casp_datasets[dataset]
    dataset_path = "../datasets/CASP/Features" + os.sep + dataset
    out_path = dataset_path + os.sep + "Rawdata"

    # Each row of the list file is "<accession>,<length>"; only the name is needed.
    with open(dataset_path + os.sep + dataset + "_below_700_proteins.txt", 'r') as accessions_file:
        accessions = [row.split(',')[0] for row in accessions_file.read().split('\n') if row != '']

    for accession in tqdm(iterable=accessions, desc=f"{dataset} Running...", ncols=100, unit="protein"):
        # Both are path prefixes of the form <dir>/<accession>/<accession>.
        in_raw_path = in_path + os.sep + accession + os.sep + accession
        out_raw_path = out_path + os.sep + accession + os.sep + accession

        with open(in_raw_path + ".fasta", 'r') as fasta_file:
            pseq = fasta_file.read().split('\n')[1]

        # Same contact file, one output file per window size.
        for window_size in (10, 20, 50):
            generate_contact(
                contact_file_path=in_raw_path + ".spotcon",
                pseq=pseq,
                output_file_path=out_raw_path + f"_win{window_size}.npy",
                window_size=window_size
            )

#### Features Generation Time Measurement for Different Datasets

In [None]:
# Times the generation of the PSSM, HHM, PCP, windowed contact (10/20/50) and
# ProtTrans features for one dataset. The trailing comment on the next line
# records an alternative configuration.
dataset_name, data_ds_name, features_ds_name = "SPOT-1D", "TEST2018", "TEST2018"  # "CASP", "CASP-FM", "CASP-FM"

input_features_path = "../datasets" + os.sep + dataset_name + os.sep + "Data/Rawdata" + os.sep + data_ds_name
dataset_path = "../datasets" + os.sep + dataset_name + os.sep + "Features" + os.sep + features_ds_name
output_features_path = "../temporary" + os.sep + features_ds_name

# Start from an empty scratch directory; anything left from a previous run is deleted.
if os.path.exists(output_features_path):
    shutil.rmtree(output_features_path)

if not os.path.exists(output_features_path):
    os.makedirs(output_features_path)

# Everything from here to the matching `time.time()` call below is timed,
# including reading the accession list and the FASTA files.
features_generation_start_time = time.time()

# Each row of the list file is "<accession>,<length>"; only the name is needed.
with open(dataset_path + os.sep + features_ds_name + "_below_700_proteins.txt", 'r') as accessions_file:
    accessions = [content.split(',')[0] for content in accessions_file.read().split('\n') if content != '']

# accession -> {"pseq": sequence, "output_file_path": output path prefix}
proteins_dict = dict()

for accession in accessions:
    if not os.path.exists(output_features_path + os.sep + accession):
        os.makedirs(output_features_path + os.sep + accession)

    # The sequence is taken from the second line of the FASTA file.
    with open(input_features_path + os.sep + accession + os.sep + accession + ".fasta", 'r') as fasta_file:
        pseq = fasta_file.read().split('\n')[1]

    proteins_dict[accession] = {"pseq": pseq, "output_file_path": output_features_path + os.sep + accession + os.sep + accession}

for accession in tqdm(iterable=accessions, desc=f"{features_ds_name} Running...", ncols=100, unit="protein"):
    # Path prefixes: source files are <prefix>.<ext>, outputs <prefix>_<feature>.npy.
    src_path, dst_path = input_features_path + os.sep + accession + os.sep + accession, proteins_dict[accession]["output_file_path"]

    generate_pssm(pssm_file_path=src_path + ".pssm", pseq=proteins_dict[accession]["pseq"], output_file_path=dst_path + "_pssm.npy")
    generate_hhm(hhm_file_path=src_path + ".hhm", pseq=proteins_dict[accession]["pseq"], output_file_path=dst_path + "_hhm.npy")
    generate_pcp(pseq=proteins_dict[accession]["pseq"], output_file_path=dst_path + "_pcp.npy")
    generate_contact(
        contact_file_path=src_path + ".spotcon", 
        pseq=proteins_dict[accession]["pseq"], 
        output_file_path=dst_path + "_win10.npy", 
        window_size=10
    )
    generate_contact(
        contact_file_path=src_path + ".spotcon", 
        pseq=proteins_dict[accession]["pseq"], 
        output_file_path=dst_path + "_win20.npy", 
        window_size=20
    )
    generate_contact(
        contact_file_path=src_path + ".spotcon", 
        pseq=proteins_dict[accession]["pseq"], 
        output_file_path=dst_path + "_win50.npy", 
        window_size=50
    )

# ProtTrans embeddings for all proteins (`use_gpu` is left at its default of False).
generate_batch_prottrans(proteins_dict=proteins_dict)

features_generation_required_time = time.time() - features_generation_start_time

print(f"Done! ~ Time taken to generate features for SAINT-Evolve is {features_generation_required_time} seconds.")

In [None]:
# Times the generation of the one-hot, ProtTrans and ESM-1b features for one
# dataset. The trailing comment on the next line records an alternative
# configuration.
dataset_name, data_ds_name, features_ds_name = "SPOT-1D-Single", "TEST2018", "TEST2018"  # "SPOT-1D-Single", "SPOT-2018", "SPOT-2018"

input_features_path = "../datasets" + os.sep + dataset_name + os.sep + "Data" + os.sep + "fasta"
dataset_path = "../datasets" + os.sep + dataset_name + os.sep + "Features" + os.sep + features_ds_name
output_features_path = "../temporary" + os.sep + features_ds_name

# Start from an empty scratch directory; anything left from a previous run is deleted.
if os.path.exists(output_features_path):
    shutil.rmtree(output_features_path)

if not os.path.exists(output_features_path):
    os.makedirs(output_features_path)

# Everything from here to the matching `time.time()` call below is timed,
# including reading the accession list and the FASTA files.
features_generation_start_time = time.time()

# Each row of the list file is "<accession>,<length>"; only the name is needed.
with open(dataset_path + os.sep + features_ds_name + "_below_700_proteins.txt", 'r') as accessions_file:
    accessions = [content.split(',')[0] for content in accessions_file.read().split('\n') if content != '']

# accession -> {"pseq": sequence, "output_file_path": output path prefix}
proteins_dict = dict()

for accession in accessions:
    if not os.path.exists(output_features_path + os.sep + accession):
        os.makedirs(output_features_path + os.sep + accession)

    # FASTA files sit directly in the `fasta` directory; the sequence is on line two.
    with open(input_features_path + os.sep + accession + ".fasta", 'r') as fasta_file:
        pseq = fasta_file.read().split('\n')[1]

    proteins_dict[accession] = {"pseq": pseq, "output_file_path": output_features_path + os.sep + accession + os.sep + accession}

for accession in tqdm(iterable=accessions, desc=f"{features_ds_name} Running...", ncols=100, unit="protein"):
    generate_aa(pseq=proteins_dict[accession]["pseq"], output_file_path=proteins_dict[accession]["output_file_path"] + "_aa.npy")

# Language-model embeddings for all proteins (`use_gpu` is left at its default of False).
generate_batch_prottrans(proteins_dict=proteins_dict)
generate_batch_esm(proteins_dict=proteins_dict)

features_generation_required_time = time.time() - features_generation_start_time

print(f"Done! ~ Time taken to generate features for SAINT-Single is {features_generation_required_time} seconds.")