In [82]:
# Data / numerical libraries plus the stdlib math module.
import pandas as pd
import numpy as np
import math
from Bio.PDB.PDBParser import PDBParser
# Single shared parser instance (created with PERMISSIVE=1) used by the
# structure-loading functions in this notebook.
parser = PDBParser(PERMISSIVE=1)
import warnings
from Bio.PDB.PDBExceptions import PDBConstructionWarning
# Suppress Biopython's PDBConstructionWarning messages.
warnings.simplefilter('ignore', PDBConstructionWarning)

In [83]:
def get_characteristics_of_structure(name):
    """Print and return basic statistics of a PDB structure.

    The file ``<name>.pdb`` (name lower-cased) is parsed with the shared
    module-level ``parser``.  Every residue entry of every chain of every
    model is visited; the hetero flag (first element of the residue id)
    decides how the entry is classified:

    * ``'W'``   -> counted as a water molecule,
    * ``' '``   -> a standard residue,
    * otherwise -> a hetero residue; its flag is recorded in the ligand set.

    The per-chain residue count includes waters and hetero entries.

    Parameters
    ----------
    name : str
        PDB identifier, also the file stem (case-insensitive).

    Returns
    -------
    tuple
        ``(residues_per_chain, water_count, ligand_flags)`` where
        ``residues_per_chain`` maps chain id -> number of entries in the chain.
    """
    name = name.lower()
    structure = parser.get_structure(name, name+'.pdb')

    n_waters = 0
    n_chains = 0
    hetero_flags = set()
    residues_per_chain = {}

    for model in structure:
        for chain in model:
            n_chains += 1
            n_entries = 0
            for residue in chain:
                n_entries += 1
                flag = residue.get_id()[0]
                if flag == 'W':
                    n_waters += 1
                elif flag != ' ':
                    # Neither water nor standard residue: heteroatom (ligand).
                    hetero_flags.add(residue.id[0])
            residues_per_chain[chain.get_id()] = n_entries

    print(f'--------- {name.upper()} ---------')
    print(f'The total number of chains is {n_chains}')
    print('The number of residues per chain is:')
    for chain_id, count in residues_per_chain.items():
        print(f'Chain {chain_id} contains {count} residues.')
    print(f'Number of water molecules is {n_waters}')
    print(f'The ligands that present in the structure are {hetero_flags}')

    return residues_per_chain, n_waters, hetero_flags

In [84]:
# Summarise entry 7NEH; the result is (residues per chain, water count, ligand flags).
struc_7neh = get_characteristics_of_structure(name='7neh')

--------- 7NEH ---------
The total number of chains is 4
The number of residues per chain is:
Chain H contains 468 residues.
Chain L contains 387 residues.
Chain E contains 296 residues.
Chain A contains 3 residues.
Number of water molecules is 496
The ligands that present in the structure are {'H_NO3', 'H_EDO', 'H_ CL', 'H_SO4', 'H_PEG', 'H_FUC', 'H_NAG'}


In [85]:
# Summarise entry 7NEG; the result is (residues per chain, water count, ligand flags).
struc_7neg = get_characteristics_of_structure(name='7neg')

--------- 7NEG ---------
The total number of chains is 4
The number of residues per chain is:
Chain H contains 285 residues.
Chain L contains 258 residues.
Chain E contains 213 residues.
Chain A contains 3 residues.
Number of water molecules is 134
The ligands that present in the structure are {'H_SO4', 'H_GOL', 'H_FUC', 'H_NAG'}


### Determine the R.M.S.D. between the receptor binding domain of the SARS-CoV-2 Spike glycoprotein complex and that of its mutant

In [86]:
def get_RBD_atoms_of_structure(name, start_res_name, start_res_code, stop_res_name, stop_res_code, chain_id='E'):
    """Collect the atoms of a residue range (e.g. the RBD) from one chain.

    The file ``<name>.pdb`` (name lower-cased) is parsed with the shared
    module-level ``parser``.  Residues of the selected chain are scanned in
    file order; collection starts at the residue matching
    ``(start_res_name, start_res_code)`` and ends with the residue matching
    ``(stop_res_name, stop_res_code)``, both inclusive.

    Parameters
    ----------
    name : str
        PDB identifier, also the file stem (case-insensitive).
    start_res_name, stop_res_name : str
        Residue names (as returned by ``residue.get_resname()``) of the first
        and last residue of the range.
    start_res_code, stop_res_code : int
        Sequence numbers of the first and last residue of the range.
    chain_id : str, optional
        Chain to scan.  Defaults to ``'E'`` (the Spike glycoprotein chain in
        the structures analysed in this notebook).

    Returns
    -------
    tuple
        ``(atoms, residues)`` where ``atoms`` is a list of
        ``(res_name, res_code, atom_name, coordinates)`` tuples and
        ``residues`` is the list of ``(res_name, res_code)`` pairs collected.

    Notes
    -----
    A residue matching the stop criteria is collected even when the start
    residue has not been encountered.
    """
    pdb_id = name.lower()
    structure = parser.get_structure(pdb_id, pdb_id + '.pdb')

    receptor_binding_domain = []
    residues_list = []
    inside_range = False

    for model in structure:
        for chain in model:
            if chain.id != chain_id:
                continue
            for residue in chain:
                res_name = residue.get_resname()
                # Residue ids are (hetero flag, sequence number, insertion code);
                # read the number directly instead of parsing repr(residue).
                res_code = int(residue.get_id()[1])

                if res_name == start_res_name and res_code == start_res_code:
                    inside_range = True
                is_stop = res_name == stop_res_name and res_code == stop_res_code

                if inside_range or is_stop:
                    residues_list.append((res_name, res_code))
                    for atom in residue:
                        receptor_binding_domain.append(
                            (res_name, res_code, atom.get_name(), atom.get_coord())
                        )
                if is_stop:
                    # The stop residue is inclusive; nothing after it is collected.
                    inside_range = False

    return receptor_binding_domain, residues_list

In [87]:
# Collect the chain-E atoms of 7NEH from THR 333 through GLY 526 (both inclusive),
# then show the atom count and the collected (residue name, number) pairs.
rbd_7neh, residues_7neh = get_RBD_atoms_of_structure(name='7neh', start_res_name='THR', start_res_code=333, stop_res_name='GLY', stop_res_code=526)
print(len(rbd_7neh))
print(residues_7neh)

hello
1536
[('THR', 333), ('ASN', 334), ('LEU', 335), ('CYS', 336), ('PRO', 337), ('PHE', 338), ('GLY', 339), ('GLU', 340), ('VAL', 341), ('PHE', 342), ('ASN', 343), ('ALA', 344), ('THR', 345), ('ARG', 346), ('PHE', 347), ('ALA', 348), ('SER', 349), ('VAL', 350), ('TYR', 351), ('ALA', 352), ('TRP', 353), ('ASN', 354), ('ARG', 355), ('LYS', 356), ('ARG', 357), ('ILE', 358), ('SER', 359), ('ASN', 360), ('CYS', 361), ('VAL', 362), ('ALA', 363), ('ASP', 364), ('TYR', 365), ('SER', 366), ('VAL', 367), ('LEU', 368), ('TYR', 369), ('ASN', 370), ('SER', 371), ('ALA', 372), ('SER', 373), ('PHE', 374), ('SER', 375), ('THR', 376), ('PHE', 377), ('LYS', 378), ('CYS', 379), ('TYR', 380), ('GLY', 381), ('VAL', 382), ('SER', 383), ('PRO', 384), ('THR', 385), ('LYS', 386), ('LEU', 387), ('ASN', 388), ('ASP', 389), ('LEU', 390), ('CYS', 391), ('PHE', 392), ('THR', 393), ('ASN', 394), ('VAL', 395), ('TYR', 396), ('ALA', 397), ('ASP', 398), ('SER', 399), ('PHE', 400), ('VAL', 401), ('ILE', 402), ('ARG', 

In [88]:
# Collect the chain-E atoms of 7NEG from ASN 334 through GLU 516 (both inclusive),
# then show the atom count and the collected (residue name, number) pairs.
rbd_7neg, residues_7neg = get_RBD_atoms_of_structure(name='7neg', start_res_name='ASN', start_res_code=334, stop_res_name='GLU', stop_res_code=516)
print(len(rbd_7neg))
print(residues_7neg)

hello
1466
[('ASN', 334), ('LEU', 335), ('CYS', 336), ('PRO', 337), ('PHE', 338), ('GLY', 339), ('GLU', 340), ('VAL', 341), ('PHE', 342), ('ASN', 343), ('ALA', 344), ('THR', 345), ('ARG', 346), ('PHE', 347), ('ALA', 348), ('SER', 349), ('VAL', 350), ('TYR', 351), ('ALA', 352), ('TRP', 353), ('ASN', 354), ('ARG', 355), ('LYS', 356), ('ARG', 357), ('ILE', 358), ('SER', 359), ('ASN', 360), ('CYS', 361), ('VAL', 362), ('ALA', 363), ('ASP', 364), ('TYR', 365), ('SER', 366), ('VAL', 367), ('LEU', 368), ('TYR', 369), ('ASN', 370), ('SER', 371), ('ALA', 372), ('SER', 373), ('PHE', 374), ('SER', 375), ('THR', 376), ('PHE', 377), ('LYS', 378), ('CYS', 379), ('TYR', 380), ('GLY', 381), ('VAL', 382), ('SER', 383), ('PRO', 384), ('THR', 385), ('LYS', 386), ('LEU', 387), ('ASN', 388), ('ASP', 389), ('LEU', 390), ('CYS', 391), ('PHE', 392), ('THR', 393), ('ASN', 394), ('VAL', 395), ('TYR', 396), ('ALA', 397), ('ASP', 398), ('SER', 399), ('PHE', 400), ('VAL', 401), ('ILE', 402), ('ARG', 403), ('GLY', 

In [89]:
def valid_residues(residues1, residues2, name1, name2, n_leading=1, n_trailing=10):
    """Trim the ends of ``residues1`` and report mutations against ``residues2``.

    The first ``n_leading`` and the last ``n_trailing`` entries of
    ``residues1`` are removed so that the remainder lines up with
    ``residues2``.  The kept list and the ``deleted`` list are built from the
    same slice bounds, so every entry of ``residues1`` ends up in exactly one
    of them.

    The two lists are then compared position by position (stopping at the
    shorter one) and every differing pair is printed as a mutation.

    Parameters
    ----------
    residues1, residues2 : list
        Lists of ``(residue_name, residue_code)`` tuples.
    name1, name2 : str
        Labels used in the printed mutation messages.
    n_leading : int, optional
        Number of entries dropped from the start of ``residues1`` (default 1).
    n_trailing : int, optional
        Number of entries dropped from the end of ``residues1`` (default 10).

    Returns
    -------
    tuple
        ``(trimmed_residues1, residues2, deleted)`` where ``deleted`` holds
        the entries removed from ``residues1`` in their original order.
    """
    total = len(residues1)
    # Clamp the bounds so short (or empty) inputs never index out of range
    # and the leading/trailing regions never overlap.
    start = min(max(n_leading, 0), total)
    stop = max(start, total - max(n_trailing, 0))

    deleted = list(residues1[:start]) + list(residues1[stop:])
    residues1 = residues1[start:stop]

    for res1, res2 in zip(residues1, residues2):
        if res1 != res2:  # mutations
            print(f'The residue {res1} of the {name1} has become {res2} in {name2}')
    return residues1, residues2, deleted

In [90]:
# Trim the 7NEH residue list against the 7NEG one (mutations are printed),
# then display the residues that were trimmed off.
residues_7neh_final, residues_7neg_final, deleted = valid_residues(residues_7neh, residues_7neg, '7NEH', '7NEG')
deleted

The residue ('ASN', 501) of the 7NEH has become ('TYR', 501) in 7NEG


[('THR', 333),
 ('LEU', 517),
 ('LEU', 518),
 ('HIS', 519),
 ('ALA', 520),
 ('PRO', 521),
 ('ALA', 522),
 ('THR', 523),
 ('VAL', 524),
 ('CYS', 525),
 ('GLY', 526)]

In [91]:
def clean_rbd(rbd_list, deleted):
    """Remove the atoms that belong to deleted residues.

    Parameters
    ----------
    rbd_list : list
        Atom records whose first two fields are ``(residue_name,
        residue_code)``; any further fields are carried along untouched.
    deleted : iterable
        ``(residue_name, residue_code)`` tuples of the residues to drop.

    Returns
    -------
    list
        The same ``rbd_list`` object, modified in place, containing only the
        records whose residue is not in ``deleted`` (order preserved).
    """
    deleted_keys = set(deleted)
    # Filter in one pass and write back through a slice so callers holding a
    # reference to the list see the change.  This avoids list.remove(), which
    # rescans the list for every item and compares whole records with ==.
    rbd_list[:] = [
        item for item in rbd_list
        if (item[0], item[1]) not in deleted_keys
    ]
    return rbd_list

In [92]:
# Drop the atoms of the trimmed-off residues from the 7NEH atom list and
# print the list length before and after.
print(len(rbd_7neh))
rbd_7neh = clean_rbd(rbd_7neh, deleted)
print(len(rbd_7neh))

1536
1462


In [93]:
def write_CA_at_txt(rbd_list, filename):
    """Write the coordinates of all ``'CA'`` atoms to a text file.

    Parameters
    ----------
    rbd_list : list
        Atom records laid out as ``(residue_name, residue_code, atom_name,
        coordinates)``; ``coordinates`` must be indexable at 0, 1 and 2.
    filename : str
        Output path; the file is overwritten.

    One line of three space-separated values is written per ``'CA'`` atom,
    lines are joined with ``'\\n'`` and there is no trailing newline.
    """
    alpha_carbon_rows = []
    for atom_name, xyz in [(record[2], record[3]) for record in rbd_list]:
        if atom_name != 'CA':
            continue
        alpha_carbon_rows.append((xyz[0], xyz[1], xyz[2]))

    with open(filename, 'w') as out_file:
        out_file.write('\n'.join('%s %s %s' % row for row in alpha_carbon_rows))

In [94]:
# Export the 'CA' atom coordinates of both atom lists, one "x y z" line per atom.
write_CA_at_txt(rbd_7neh, 'CA_7NEH.txt')
write_CA_at_txt(rbd_7neg, 'CA_7NEG.txt')

In [99]:
def write_atoms_at_txt(rbd_list, filename):
    """Write the coordinates of every atom record to a text file.

    Parameters
    ----------
    rbd_list : list
        Atom records laid out as ``(residue_name, residue_code, atom_name,
        coordinates)``; ``coordinates`` must be indexable at 0, 1 and 2.
    filename : str
        Output path; the file is overwritten.

    One line of three space-separated values is written per record, lines
    are joined with ``'\\n'`` and there is no trailing newline.
    """
    coordinate_rows = [
        (xyz[0], xyz[1], xyz[2])
        for xyz in (record[3] for record in rbd_list)
    ]

    with open(filename, 'w') as out_file:
        out_file.write('\n'.join('%s %s %s' % row for row in coordinate_rows))
In [102]:
# Export the coordinates of every atom in both atom lists, one "x y z" line per atom.
write_atoms_at_txt(rbd_7neh, 'ATOMS_7NEH.txt')
write_atoms_at_txt(rbd_7neg, 'ATOMS_7NEG.txt')

In [97]:
class cRMSD:
    """Coordinate RMSD between two conformations after optimal superposition.

    Both conformations are read from space-delimited text files with one
    ``x y z`` line per atom.  They are centred on their centroids, the
    rotation ``Q`` minimising ``||X Q - Y||`` is obtained from the SVD of
    ``X^T Y`` (with a correction that excludes reflections), and the RMSD of
    the superposed coordinates is reported.

    Parameters
    ----------
    filename1, filename2 : str
        Paths of the two coordinate files.
    """

    def __init__(self, filename1, filename2):
        self.filename1 = filename1
        self.filename2 = filename2
        # conformations: {0: DataFrame of file 1, 1: DataFrame of file 2}
        # number_of_atoms: row count of the first file
        self.conformations, self.number_of_atoms = self.read_conformations()
        # Centroids of the most recently compared pair.
        self.centroid_1 = []
        self.centroid_2 = []
        # SVD factors and rotation of the most recently compared pair.
        self.U = np.empty((3,3))
        self.Sigma = np.empty((0,3))
        self.VT = np.empty((3,3))
        self.Q = np.empty((3,3))


    # read the conformations from the txt file
    def read_conformations(self):
        """Read both coordinate files.

        Returns
        -------
        tuple
            ``(conformations, n_rows_of_first_file)`` where ``conformations``
            maps ``0``/``1`` to the DataFrame read from
            ``filename1``/``filename2``.
        """
        first = pd.read_csv(self.filename1, delimiter = " ", header=None)
        second = pd.read_csv(self.filename2, delimiter = " ", header=None)
        return {0: first, 1: second}, first.shape[0]


    # calculate the centroid
    def find_centroid(self, conformation1, conformation2):
        """Store the per-column mean of each conformation in ``centroid_1/2``.

        Each mean is taken over the rows of the conformation it belongs to.
        """
        self.centroid_1 = conformation1.mean(axis=0).tolist()
        self.centroid_2 = conformation2.mean(axis=0).tolist()


    # move the conformations to the origin
    def move_to_origin(self, conformation1, conformation2):
        """Return copies of both conformations shifted by the stored centroids.

        The inputs are left unmodified.
        """
        centred_1 = conformation1.sub(np.asarray(self.centroid_1), axis='columns')
        centred_2 = conformation2.sub(np.asarray(self.centroid_2), axis='columns')
        return centred_1, centred_2


    # find best tranformation of one conformation, using SVD
    def SVD_process(self, conformation1, conformation2):
        """Compute the optimal rotation ``Q`` for centred conformations.

        With ``X^T Y = U Sigma V^T`` the minimiser of ``||X Q - Y||`` over
        orthogonal matrices is ``U V^T``.  When that matrix is a reflection
        (negative determinant) the best proper rotation is obtained by
        negating the third COLUMN of ``U``, i.e. the direction paired with the
        smallest singular value.
        """
        XT_Y = np.matmul(conformation1.T.to_numpy(), conformation2.to_numpy())
        self.U, self.Sigma, self.VT = np.linalg.svd(XT_Y)
        self.Q = np.matmul(self.U, self.VT)
        if np.linalg.det(self.Q) < 0:
            self.U[:, 2] = -self.U[:, 2]
            self.Q = np.matmul(self.U, self.VT)


    # calculate cRMSD
    def cRMSD_distance(self, conformation1, conformation2):
        """Return the RMSD between ``conformation1 @ Q`` and ``conformation2``."""
        diff = np.matmul(conformation1.to_numpy(), self.Q) - conformation2.to_numpy()
        # Sum of squared row norms == sum of all squared entries.
        return math.sqrt(float(np.sum(diff ** 2)) / diff.shape[0])


    # compare two conformations
    def compare(self, conformation1, conformation2, print_flag=False):
        """Superpose two conformations and return their cRMSD.

        Parameters
        ----------
        conformation1, conformation2 : pandas.DataFrame
            Coordinates with one atom per row; both must have the same shape.
        print_flag : bool, optional
            When true, also print the result.

        Raises
        ------
        ValueError
            If the conformations are empty or their shapes differ.
        """
        if conformation1.shape != conformation2.shape:
            raise ValueError(
                f'Conformations must have the same shape, got '
                f'{conformation1.shape} and {conformation2.shape}'
            )
        if conformation1.shape[0] == 0:
            raise ValueError('Conformations must contain at least one atom')

        self.find_centroid(conformation1, conformation2)
        conformation1, conformation2 = self.move_to_origin(conformation1, conformation2)
        self.SVD_process(conformation1, conformation2)
        c_rmsd = self.cRMSD_distance(conformation1, conformation2)
        if print_flag:
            print(f'cRMSD = {c_rmsd}')
        return c_rmsd


    def pipeline(self):
        """Compare the two conformations loaded at construction and print the cRMSD."""
        self.compare(conformation1=self.conformations[0], conformation2=self.conformations[1], print_flag=True)

In [101]:
# cRMSD computed from the all-atom coordinate files (ATOMS_7NEH.txt / ATOMS_7NEG.txt),
# which must already exist on disk.
crmsd = cRMSD(filename1='ATOMS_7NEH.txt', filename2='ATOMS_7NEG.txt')
crmsd.pipeline()

cRMSD = 0.29239686356608174


In [98]:
# cRMSD computed from the 'CA'-only coordinate files (CA_7NEH.txt / CA_7NEG.txt),
# which must already exist on disk.
crmsd = cRMSD(filename1='CA_7NEH.txt', filename2='CA_7NEG.txt')
crmsd.pipeline()

cRMSD = 0.29239686356608174


In [15]:
# def get_atoms_of_structure(name):
#     name = name.lower()
#     structure = parser.get_structure(name, name+'.pdb')

#     receptor_binding_domain = []

#     for model in structure:
#         for chain in model:
#             for residue in chain:
#                 name = residue.get_resname() 
#                 code = int(str(residue).split('=')[2].split(' ')[0])

#                 for atom in residue:
#                     receptor_binding_domain.append((name, code, atom.get_name(), atom.get_coord()))
                    
#     return receptor_binding_domain

In [16]:
# atoms_7neh = get_atoms_of_structure(name='7neh')
# print(len(atoms_7neh))

In [17]:
# atoms_7neg = get_atoms_of_structure(name='7neg')
# print(len(atoms_7neg))