### 1.1 Data Preprocessing (Families)

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import configparser
import json

In [2]:
# Main notebook configuration; later cells look up directories in its
# [path] section (e.g. conf['path']['data_part']).
config_path = "../config/main.conf"
conf = configparser.ConfigParser()
conf.read(config_path)

# The model configuration is a separate file whose location is stored in the
# main config under [path] -> model.
model_conf = configparser.ConfigParser()
# Kept as the cell's last expression: its return value is what the notebook
# displays as this cell's output.
model_conf.read(conf['path']['model'])

['../config/model.conf']

In [3]:
# Root directory holding the dataset partitions, as configured in main.conf.
data_partitions_dirpath = conf['path']['data_part']

# Show what is present in that directory.
available_partitions = os.listdir(data_partitions_dirpath)
print('Available dataset partitions: ', available_partitions)

Available dataset partitions:  ['train', 'dev', 'test', 'download.sh']


In [4]:
%%time
def read_all_shards(partition='dev', data_dir=data_partitions_dirpath):
    """Load every shard file of one dataset partition into a single DataFrame.

    Parameters
    ----------
    partition : str
        Name of the sub-directory of ``data_dir`` to read (default ``'dev'``).
    data_dir : str
        Directory containing the partition sub-directories. The default is
        bound once, when this function is defined.

    Returns
    -------
    pandas.DataFrame
        All shards concatenated row-wise. Each shard keeps its own row labels
        (no ``ignore_index``), so index labels can repeat across shards.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the partition directory has no entries.
    """
    partition_dir = os.path.join(data_dir, partition)
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible from run to run.
    shard_names = sorted(os.listdir(partition_dir))
    shards = [
        pd.read_csv(os.path.join(partition_dir, fn), index_col=None)
        for fn in shard_names
    ]
    return pd.concat(shards)

# Load the three partitions in the order test, dev, train and keep them in a
# name -> DataFrame mapping (insertion order is the load order).
partitions = {split: read_all_shards(split) for split in ('test', 'dev', 'train')}

# Individual handles for the cells that work on a single partition.
test = partitions['test']
dev = partitions['dev']
train = partitions['train']

# One summary line per partition.
for name, df in partitions.items():
    print('Dataset partition "%s" has %d sequences' % (name, len(df)))

Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
CPU times: user 6.48 s, sys: 1.09 s, total: 7.57 s
Wall time: 15.5 s


In [5]:
# Keep one family out of every SAMPLE_RATE.
SAMPLE_RATE = 200

# value_counts() orders families by how many training sequences they have;
# stepping through that ordering picks families across the whole size range.
family_counts = train["family_accession"].value_counts()
fams = np.array(family_counts.index)[::SAMPLE_RATE]

In [6]:
# Collapse the sampled family accessions into a single comma-separated string
# (no spaces after the commas) and print it.
fam_list = ','.join(fams)
print(fam_list)

PF13649.6,PF14310.6,PF00478.25,PF00858.24,PF00396.18,PF10143.9,PF13464.6,PF03987.15,PF03456.18,PF11791.8,PF09320.11,PF12802.7,PF07143.11,PF00079.20,PF04050.14,PF04226.13,PF10322.9,PF12705.7,PF03129.20,PF16369.5,PF07870.11,PF04069.12,PF02089.15,PF06491.11,PF11716.8,PF00642.24,PF17242.2,PF13854.6,PF07575.13,PF04346.12,PF14704.6,PF00017.24,PF06923.11,PF18263.1,PF04732.14,PF07986.12,PF06956.11,PF10232.9,PF11454.8,PF00031.21,PF07120.11,PF16540.5,PF02576.17,PF11256.8,PF00954.20,PF09771.9,PF12684.7,PF03237.15,PF12167.8,PF05586.11,PF10673.9,PF18602.1,PF06970.11,PF05918.11,PF04589.13,PF14904.6,PF12286.8,PF13179.6,PF10018.9,PF16764.5,PF15567.6,PF15164.6,PF09639.10,PF11353.8,PF13846.6,PF16243.5,PF15774.5,PF10843.8,PF08067.11,PF13842.6,PF13016.6,PF08087.11,PF15081.6,PF09715.10,PF17298.2,PF11989.8,PF03043.14,PF15814.5,PF03668.15,PF05418.11,PF07028.11,PF06543.12,PF09499.10,PF11246.8,PF06453.11,PF17470.2,PF04311.13,PF06301.11,PF12212.8,PF06919.11


### 1.2 Contact Maps

In [17]:
import numpy as np
import Bio.PDB
import os 
import scipy.sparse

In [46]:
# Configuration for the contact-map section.
# NOTE(review): this rebinds `config_path` and `conf`, which the section 1.1
# cells use for main.conf. Re-running those cells after this one would make
# them look up their keys in contact_maps.conf instead.
config_path = "../config/contact_maps.conf"
conf = configparser.ConfigParser()
conf.read(config_path)

# Path of the text file listing the PDB codes to process.
filename = conf['path']['pdb_codes']
# Directory the downloaded PDB files are written to.
pdb_dir = conf['path']['pdb_dir']
# Directory the computed contact maps (.npz) are saved in.
contact_map_dir = conf['path']['contact_maps_dir']

In [34]:
def calc_residue_dist(residue_one, residue_two) :
    """Return the C-alpha distance between two residues.

    Parameters
    ----------
    residue_one, residue_two
        Residue objects that ``Bio.PDB.is_aa`` accepts and that can be
        indexed by atom name (``residue["CA"]``).

    Returns
    -------
    The Euclidean distance between the two "CA" atom coordinates, or ``0``
    when no distance can be computed: either residue is not an amino acid, or
    either residue has no "CA" atom.
    """
    for residue in (residue_one, residue_two):
        # Amino-acid residues can lack a "CA" atom (incomplete structures);
        # indexing one would raise KeyError and abort processing of the whole
        # structure. Return the same 0 sentinel used for non-amino-acids; the
        # contact-map cell discards distances below 0.01, so 0 means
        # "no contact".
        if not Bio.PDB.is_aa(residue) or "CA" not in residue:
            return 0
    diff_vector = residue_one["CA"].coord - residue_two["CA"].coord
    return np.sqrt(np.sum(diff_vector * diff_vector))

def calc_dist_matrix(chain) :
    """Return the square matrix of pairwise C-alpha distances within ``chain``.

    Parameters
    ----------
    chain
        Iterable of residues (for example the list produced by
        ``Bio.PDB.Selection.unfold_entities(model, 'R')``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, n)`` where entry ``[i, j]`` is
        ``calc_residue_dist`` of the i-th and j-th residue. The diagonal is
        left at 0 (a residue's distance to itself).
    """
    residues = list(chain)
    size = len(residues)
    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    answer = np.zeros((size, size), dtype=float)
    for row in range(size):
        # The distance is symmetric, so compute each pair once and mirror it.
        for col in range(row + 1, size):
            dist = calc_residue_dist(residues[row], residues[col])
            answer[row, col] = dist
            answer[col, row] = dist
    return answer

In [45]:
%%time
# Read the comma-separated list of PDB codes. The context manager closes the
# file as soon as its contents are in memory.
with open(filename, 'r') as pdb_file:
    pdb_list = pdb_file.read()
# Strip whitespace/newlines around each code (e.g. the file's trailing newline)
# and drop empty entries, so no code carries stray characters into the
# download request or the output file name.
pdb_codes = [code.strip() for code in pdb_list.split(',') if code.strip()]

# Two residues are "in contact" when their C-alpha distance lies strictly
# between these bounds. The lower bound drops the 0 entries that
# calc_residue_dist returns when no distance is available, plus the diagonal.
# NOTE(review): presumably in Angstrom (PDB coordinate units) - confirm.
MIN_CONTACT_DIST = 0.01
MAX_CONTACT_DIST = 8.0

# Neither helper is configured per code, so one instance serves the whole loop.
pdbl = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser(QUIET = True)

# Make sure the output directory exists; otherwise every save below would fail.
os.makedirs(contact_map_dir, exist_ok=True)

for pdb_code in pdb_codes[0:1]: #change to all codes when deployed
    try:
        pdb_path = pdbl.retrieve_pdb_file(pdb_code, pdir = pdb_dir, file_format = 'pdb', overwrite = True)
        structure = pdb_parser.get_structure(pdb_code, pdb_path)
        # Only the first model of the structure is used.
        model = structure[0]
        # Flatten the model into a list of all its residues.
        sequence = Bio.PDB.Selection.unfold_entities(model, 'R')
        dist_matrix = calc_dist_matrix(sequence)
        in_contact = (dist_matrix < MAX_CONTACT_DIST) & (dist_matrix > MIN_CONTACT_DIST)
        contact_map = in_contact.astype(int)
        # Stored sparse (COO) as <contact_map_dir>/<pdb_code>.npz.
        sparse_contact_map = scipy.sparse.coo_matrix(contact_map)
        scipy.sparse.save_npz(os.path.join(contact_map_dir, pdb_code + '.npz'), sparse_contact_map)
    except Exception as err:
        # Best effort: report the failing code together with the reason and
        # continue. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop the run.
        print('Failed for PDB code {}: {!r}'.format(pdb_code, err)) #this should probably use a logger?


Downloading PDB structure '1A28'...
CPU times: user 3.31 s, sys: 91.3 ms, total: 3.4 s
Wall time: 3.6 s
