In [1]:
from Bio.PDB import *
import numpy as np
import csv
import matplotlib.pyplot as plt
from operator import itemgetter

In [2]:
def calc_residue_dist(residue_one, residue_two, method = "heavy"):
    """Return the minimum inter-atomic distance between two residues.

    Parameters
    ----------
    residue_one, residue_two : iterable of atoms
        Every atom must expose ``coord``. ``id`` is read only for the
        ``'calpha'`` method and ``mass`` only for the heavy-atom method.
    method : str
        ``'allatom'`` uses every atom; ``'calpha'`` keeps only atoms whose
        ``id`` is ``'CA'`` or ``'OW'``; any other value (the default is
        ``'heavy'``) keeps atoms with ``mass > 12``.

    Returns
    -------
    The smallest ``np.linalg.norm(a.coord - b.coord)`` over all pairs of
    selected atoms, one taken from each residue.

    Raises
    ------
    ValueError
        If ``method`` selects no atom in at least one of the residues, so
        there is no pair to measure.
    """
    if method == 'allatom':
        keep = lambda atom: True
    elif method == 'calpha':
        keep = lambda atom: atom.id in ('CA', 'OW')
    else:
        # heavy atoms
        keep = lambda atom: atom.mass > 12

    # Filter each residue once instead of re-testing both atoms for every pair.
    atoms_one = [atom for atom in residue_one if keep(atom)]
    atoms_two = [atom for atom in residue_two if keep(atom)]

    if not atoms_one or not atoms_two:
        # Same exception type a bare min() on an empty list would raise,
        # but with a message that points at the actual cause.
        raise ValueError(
            "no atom pair to measure: method %r selected %d atom(s) in "
            "residue_one and %d in residue_two"
            % (method, len(atoms_one), len(atoms_two))
        )

    return min(
        np.linalg.norm(i.coord - j.coord)
        for i in atoms_one
        for j in atoms_two
    )

def calc_dist_matrix(chain_one, chain_two, method = "heavy") :
    """Return the distances from each ' CA' residue to all other residues.

    Rows correspond to the residues of ``chain_one`` whose ``resname`` is
    ' CA'; columns correspond to the residues of ``chain_two`` whose
    ``resname`` is not ' CA'. Both follow the order produced by
    ``get_residues()``. Entry ``[row, col]`` is
    ``calc_residue_dist(row_residue, col_residue, method)``.

    Parameters
    ----------
    chain_one, chain_two : objects exposing ``get_residues()``
        Each yielded residue must expose ``resname``.
    method : str
        Forwarded unchanged to ``calc_residue_dist``.

    Returns
    -------
    numpy.ndarray of float, shape ``(n_ca_in_one, n_non_ca_in_two)``.
        With exactly one ' CA' residue per input this is the familiar
        ``(1, n_residues - 1)`` matrix. With no ' CA' residue in
        ``chain_one`` the result has zero rows rather than a row of
        zero distances that would read as contacts.
    """
    def _is_ca(residue):
        # Compare the whitespace-stripped name so ' CA' and 'CA' both match.
        # NOTE(review): whether the parser keeps the padding of the
        # 3-character residue-name field appears to depend on the Biopython
        # version -- confirm against the installed release.
        return residue.resname.strip() == 'CA'

    row_residues = [res for res in chain_one.get_residues() if _is_ca(res)]
    col_residues = [res for res in chain_two.get_residues() if not _is_ca(res)]

    # Size the matrix from what was actually selected instead of assuming
    # exactly one ' CA' residue. The builtin float replaces the np.float
    # alias, which NumPy no longer provides.
    answer = np.zeros((len(row_residues), len(col_residues)), float)
    for row, residue_one in enumerate(row_residues):
        for col, residue_two in enumerate(col_residues):
            answer[row, col] = calc_residue_dist(residue_one, residue_two, method)
    return answer

In [3]:
import glob
import os
import sys

# Distance threshold: values below this are counted as contacts.
# NOTE(review): units are whatever the structure coordinates use
# (presumably angstroms for PDB files) -- confirm.
contact_cutoff = 6
# Atom-selection method forwarded to calc_dist_matrix.
contact_method = "heavy"
counter = 0
# Maps structure code (file name without extension) -> distance matrix slice.
dist_matrix = dict()

# glob returns files in arbitrary, filesystem-dependent order; sort so the
# structures are processed in the same order on every run.
allpdb = sorted(glob.glob("LOOP_STRUCTURES/*.pdb"))
for counter, struct in enumerate(allpdb, start=1):
    # Derive the code from the file name itself so this works regardless of
    # the path separator or directory depth.
    pdb_code = os.path.splitext(os.path.basename(struct))[0]
    structure = PDBParser().get_structure(pdb_code, struct)
    # Keep columns 1..12 of the distance matrix only.
    # NOTE(review): presumably these are the residue positions of interest
    # within each structure -- confirm the intended window.
    dist_matrix[pdb_code] = calc_dist_matrix(structure, structure, contact_method)[:,1:13]

    if counter % 100 == 0:
        # '\r' rewrites the same line so the progress message updates in place.
        print(counter, "structures processed ...", end = '\r')
        sys.stdout.flush()

7800 structures processed ...

In [4]:
# Freeze the key order once so that row k of X belongs to pdb_name[k].
pdb_name = list(dist_matrix)
# First row of every stored matrix, stacked into one array.
X = np.array([dist_matrix[code][0] for code in pdb_name])

# Same rows as X, restricted to columns 0, 2, 4, 6, 9 and 11.
newX = np.column_stack([X[:, k] for k in (0, 2, 4, 6, 9, 11)])

In [5]:
# For every row of X, count the entries that lie below the cutoff.
contact_ca_residue = (X < contact_cutoff).sum(axis=1)

# Tally the rows by that count: at most 3, exactly 4 or 5, more than 5.
asym = (contact_ca_residue <= 3).sum()
hemi = (contact_ca_residue == 4).sum() + (contact_ca_residue == 5).sum()
holo = (contact_ca_residue > 5).sum()

In [6]:
total = len(pdb_name)
# Print the three class fractions only when the tallies account for every
# structure exactly once.
if total == asym + hemi + holo:
    print(*(n_class / total for n_class in (asym, hemi, holo)))

0.11666879877190738 0.12882179864398108 0.7545094025841116
