# Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import re 
import urllib.request
import json
import os 
from difflib import SequenceMatcher

# Define paths

The data are available on Google Drive: https://drive.google.com/drive/folders/17z34rgAw2nz4ywlF2G4CTJifcAI66lka?usp=sharing

In [2]:
# Data and figure locations are resolved relative to the parent of the
# current working directory (the directory the notebook is launched from).
data_path = f'{os.path.dirname(os.getcwd())}/data/'
figure_path = f'{os.path.dirname(os.getcwd())}/figures'

# Remote endpoints: PDB file download template ({} is filled with the
# 4-character PDB id) and the XSSP REST service base URL.
url_template = 'http://www.rcsb.org/pdb/files/{}.pdb'
rest_url = 'https://www3.cmbi.umcn.nl/xssp/'

# Import data

In [3]:
# Expression table: one row per PDB entry, with (at least) the columns
# PDB_ID, Expression and Species.
expression_file = pd.read_csv(data_path + 'expression/expression.csv')

# Lookup tables keyed by PDB id.
expression = dict(zip(expression_file.PDB_ID.tolist(), expression_file.Expression.tolist()))
species = dict(zip(expression_file.PDB_ID.tolist(), expression_file.Species.tolist()))

# File names of the raw per-protein patch tables.
patches = os.listdir(data_path + 'patches/raw')

# Amino-acid alphabet; the position of a letter is its index in the first
# 20 feature channels. HYDR is the subset treated as hydrophobic.
AA = list('ACDEFGHIKLMNPQRSTVWY')
HYDR = list('ACFILMVWY')

# The (up to) ten most frequent species, mapped to their frequency rank 0..9.
SPEC = {
    name: rank
    for rank, name in enumerate(expression_file.Species.value_counts().index[:10])
}

# Expression thresholds at the 0th, 10th, ..., 90th percentile of the
# non-missing expression values.
EXP = np.percentile(expression_file.Expression.dropna(), np.arange(0, 100, 10)).tolist()

In [4]:
def retrieve_uniprot(pdbid):
    """Return the UniProt accession referenced by a PDB entry, or ``None``.

    The first four characters of ``pdbid`` (upper-cased) are inserted into
    ``url_template``; the file at that URL is downloaded and searched for
    the first ``UNP <token>`` occurrence.

    Parameters
    ----------
    pdbid : str
        PDB identifier; only its first four characters are used.

    Returns
    -------
    str or None
        The token following ``UNP``, or ``None`` when the download fails,
        the content cannot be decoded, ``pdbid`` is not a string, or no
        ``UNP`` reference is present.
    """
    try:
        url = url_template.format(pdbid.upper()[:4])
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(url) as response:
            pdb = response.read().decode('utf-8')
    except Exception:
        # Best-effort lookup: any ordinary failure yields None, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return None

    # Raw string so the backslashes reach the regex engine untouched.
    m = re.search(r'UNP\ +(\w+)', pdb)
    return m.group(1) if m else None

In [5]:
def get_ASA(data, list_data, agg_data=None):
    """Append surface, patch, species and expression features to ``data``.

    For every protein ``i`` a block of 27 feature columns followed by 2
    all-zero padding columns is concatenated to the right of ``data[i]``.
    With ``c`` the index inside that block (the absolute column is
    ``data.shape[2] + c``):

    * ``c=0``      TASA: sum over residues of ``data[i,:,53] * data[i,:,50]``
                   (row 0 only, rounded to 1 decimal).
    * ``c=1``      THSA: the same sum restricted to residues whose argmax
                   over the first 20 channels is a letter in ``HYDR``
                   (row 0 only, rounded to 1 decimal).
    * ``c=2``      RHSA: THSA / TASA (row 0 only, rounded to 5 decimals).
    * ``c=3``      largest ``patch_size`` value mapped onto the protein
                   (row 0 only).
    * ``c=4``      per-residue flag, 1 where the mapped patch size is > 0.
    * ``c=5``      per-residue flag, 1 where the mapped patch size equals
                   the largest one.
    * ``c=6..15``  one-hot species according to ``SPEC`` (row 0 only).
    * ``c=16``     expression value rounded to 1 decimal (row 0 only).
    * ``c=17..26`` one-hot expression bin: the number of thresholds in
                   ``EXP[1:]`` that the value reaches (row 0 only).
    * ``c=27..28`` zeros.

    Patch information is only used when ``<ID>.csv`` (id upper-cased, dashes
    removed) is listed in ``patches`` and the longest common substring of
    the patch sequence and the protein sequence covers more than 80% of the
    longer of the two.

    Parameters
    ----------
    data : numpy.ndarray
        3-D array indexed as ``(protein, residue, feature)`` with at least
        54 features. Column 50 is used both as a multiplier and, through
        its non-zero count, as the sequence length.
        NOTE(review): the original comments describe the product of columns
        53 and 50 as RSA*ASAmax -- confirm the column semantics.
    list_data : sequence of str
        Protein identifiers, aligned with the first axis of ``data``.
    agg_data : optional
        Unused; kept for interface compatibility.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_proteins, n_residues, n_features + 29)``.
    """
    n_block = 27  # engineered feature columns described above
    n_pad = 2     # trailing all-zero columns

    # Derive the output width from the input instead of hard-coding 97, and
    # read the shape once so an empty batch does not fail on data[0].
    n_proteins, seq_len, n_features = data.shape
    new_data = np.zeros((n_proteins, seq_len, n_features + n_block + n_pad))

    print(new_data.shape, 'initial')
    for i in tqdm(range(n_proteins)):
        pdb_id = list_data[i]
        column = np.zeros((seq_len, n_block))
        # Number of residues with a non-zero entry in column 50.
        mask = np.count_nonzero(data[i,:,50])

        # Block columns 0-2: TASA, THSA and RHSA.
        # Built-in sum is kept on purpose so the accumulated values match
        # the previously generated datasets exactly.
        residue_asa = data[i,:,53]*data[i,:,50]
        hydr = [bool(AA[k] in HYDR) for k in np.argmax(data[i,:,:20],axis=-1)]
        tasa = sum(residue_asa)
        thsa = sum(residue_asa*hydr)
        # NOTE: evaluates to NaN when tasa is zero (e.g. an all-padding entry).
        rhsa = thsa/tasa
        column[0,0] = round(tasa,1)
        column[0,1] = round(thsa,1)
        column[0,2] = round(rhsa,5)

        # Block columns 3-5: patch information.
        id_patch = pdb_id.replace('-','').upper()
        if id_patch+'.csv' in patches:
            patch_info = pd.read_csv(os.path.join(data_path + 'patches','raw',f'{id_patch}.csv'))
            # Check that the patch file and the dataset entry describe the
            # same sequence by looking at their longest common substring.
            data_fasta = ''.join([AA[k] for k in np.argmax(data[i,:mask,:20],axis=-1)])
            patch_fasta = ''.join(list(patch_info.residue))
            s = SequenceMatcher(None, patch_fasta, data_fasta)
            Match = s.find_longest_match(0, len(patch_fasta), 0, len(data_fasta))
            # Guard against two empty sequences (division by zero).
            longest = max(len(data_fasta),len(patch_fasta))
            if longest and Match.size/longest>0.8:
                patch_column = np.nan_to_num(np.array(list(patch_info.patch_size)))
                patch_place = np.zeros(seq_len)
                # Copy the matched stretch onto the protein's own residue positions.
                patch_place[Match.b:Match.b+Match.size] = patch_column[Match.a:Match.a+Match.size]
                column[:,4] = np.where(patch_place>0,1,0)
                if not((patch_place==0).all()):
                    largest = max(patch_place)
                    column[0,3] = round(largest,1)
                    column[:,5] = np.where(patch_place==largest,1,0)

        # Block columns 6-15: one-hot species.
        if pdb_id in species and species[pdb_id] in SPEC:
            column[0,6+SPEC[species[pdb_id]]] = 1

        # Block column 16: expression value; columns 17-26: one-hot bin.
        if pdb_id in expression and not(np.isnan(expression[pdb_id])):
            expression_value = round(expression[pdb_id],1)
            column[0,16] = expression_value
            # Bin index = number of thresholds the (rounded) value reaches.
            rk = sum(expression_value>=k for k in EXP[1:])
            column[0,17+rk] = 1

        padding_zero = np.zeros((seq_len,n_pad))
        new_data[i] = np.c_[data[i],column,padding_zero]

    print(new_data.shape, 'final')
    return new_data

In [6]:
if __name__=="__main__":
    # NOTE(review): these are absolute, machine-specific locations kept
    # unchanged from the original; consider deriving them from `data_path`
    # so the notebook runs on other machines.
    source_dir = '/Users/deagogishvili/Documents/PhD/multitask/data/source_dataset/'
    extended_dir = '/Users/deagogishvili/Documents/PhD/multitask/data/extended/'

    def _load_split(split):
        """Read one `<split>_HHblits.npz` archive once and return (pdbids, data)."""
        # The context manager closes the archive once both arrays are read.
        with np.load(source_dir + f'{split}_HHblits.npz') as archive:
            return archive['pdbids'], archive['data']

    name_train, data_train = _load_split('Train')
    name_casp, data_casp = _load_split('CASP12')
    name_cb, data_cb = _load_split('CB513')
    name_ts, data_ts = _load_split('TS115')

    # Extend every split before writing anything, as in the original order.
    list_train = get_ASA(data_train, name_train)
    list_casp = get_ASA(data_casp, name_casp)
    list_cb = get_ASA(data_cb, name_cb)
    list_ts = get_ASA(data_ts, name_ts)

    np.savez_compressed(extended_dir + 'Train_HHblits_extended.npz', pdbids=name_train, data=list_train)
    np.savez_compressed(extended_dir + 'CASP12_HHblits_extended.npz', pdbids=name_casp, data=list_casp)
    np.savez_compressed(extended_dir + 'CB513_HHblits_extended.npz', pdbids=name_cb, data=list_cb)
    np.savez_compressed(extended_dir + 'TS115_HHblits_extended.npz', pdbids=name_ts, data=list_ts)

(10848, 1632, 97) initial


100%|██████████| 10848/10848 [00:48<00:00, 222.49it/s]


(10848, 1632, 97) final
(21, 1494, 97) initial


100%|██████████| 21/21 [00:00<00:00, 242.01it/s]


(21, 1494, 97) final
(513, 874, 97) initial


100%|██████████| 513/513 [00:01<00:00, 277.96it/s]


(513, 874, 97) final
(115, 1111, 97) initial


100%|██████████| 115/115 [00:00<00:00, 280.59it/s]


(115, 1111, 97) final


In [8]:
# Shapes of the original (non-extended) arrays: train, CASP12, CB513, TS115.
for source_array in (data_train, data_casp, data_cb, data_ts):
    print(source_array.shape)

(10848, 1632, 68)
(21, 1494, 68)
(513, 874, 68)
(115, 1111, 68)


# Save separate CSV files containing only THSA, RHSA, and LHP for a baseline model

In [9]:
import numpy as np
import pandas as pd

def extract_and_save_list_dataset(input_path, output_csv_path,
                                  thsa_col=69, rhsa_col=70, lhp_col=71):
    """Export the per-protein THSA, RHSA and LHP values of a dataset to CSV.

    The values are read from the first residue row (index 0) of every
    protein in the ``data`` array of the ``.npz`` archive.

    Parameters
    ----------
    input_path : str
        Path to an ``.npz`` archive holding a 3-D ``data`` array and a
        ``pdbids`` array.
    output_csv_path : str
        Destination of the CSV file (columns PDB_ID, THSA, RHSA, LHP).
    thsa_col, rhsa_col, lhp_col : int, optional
        Feature-column indices of the three values. The defaults (69, 70,
        71) are the indices this notebook has always used.
        NOTE(review): they assume 68 original feature columns followed by
        TASA at index 68 -- confirm if the input layout changes.

    Returns
    -------
    None
        The result is written to ``output_csv_path``.
    """
    # Load both arrays and close the archive straight away.
    with np.load(input_path) as extended_data:
        data = extended_data['data']
        pdbids = extended_data['pdbids']

    # One value per protein, taken from residue row 0.
    df = pd.DataFrame({
        'PDB_ID': pdbids,
        'THSA': data[:, 0, thsa_col],
        'RHSA': data[:, 0, rhsa_col],
        'LHP': data[:, 0, lhp_col],
    })

    df.to_csv(output_csv_path, index=False)

In [11]:
# (extended dataset archive, CSV destination relative to data_path), in the
# order train, CASP12, CB513, TS115.
lhp_exports = [
    ('/Users/deagogishvili/Documents/PhD/multitask/data/extended/Train_HHblits_extended.npz',
     'patches/Train_LHP.csv'),
    ('/Users/deagogishvili/Documents/PhD/multitask/data/extended/CASP12_HHblits_extended.npz',
     'patches/CASP12_LHP.csv'),
    ('/Users/deagogishvili/Documents/PhD/multitask/data/extended/CB513_HHblits_extended.npz',
     'patches/CB513_LHP.csv'),
    ('/Users/deagogishvili/Documents/PhD/multitask/data/extended/TS115_HHblits_extended.npz',
     'patches/TS115_LHP.csv'),
]

# Write one baseline CSV per dataset split.
for extended_npz, relative_csv in lhp_exports:
    extract_and_save_list_dataset(extended_npz, data_path + relative_csv)