In [32]:
#!/usr/bin/env python

import pandas as pd
import numpy as np

import glob
from ase.io import read
from dscribe.descriptors import ACSF

import random
import sys

from sklearn import preprocessing

In [33]:
class Structure:
    '''
    One PDB snapshot plus the feature vector computed for its calcium atom.

    The file stem is expected to consist of four dot-separated fields,
    <system>.<loop>.<water>.<mdframe>.  The first five characters of the
    water field are dropped before it is converted to an integer
    (presumably a "water" prefix -- confirm against the data set).

    Attributes set by __init__:
        name      file stem (file name without directory and ".pdb")
        system    first field of the stem
        loop      second field of the stem
        water     integer parsed from the third field
        mdframe   fourth field of the stem
        feat_atom dict mapping an element symbol to its 1-D feature vector;
                  empty until calc_sym() is called
    '''

    # Species list shared by every ACSF descriptor built in calc_sym().
    _SPECIES = ["C", "H", "O", "N", "Ca"]

    def __init__(self, pdb_path):
        '''
        Parse the metadata encoded in the PDB file name.

        pdb_path: path to the PDB file, e.g. "pdb/<name>.pdb".  Only the
                  last path component is inspected, so any directory depth
                  (and either slash style) is accepted.
        Raises IndexError if the stem has fewer than four fields and
        ValueError if the water field does not end in an integer.
        '''
        # Take the last path component instead of assuming exactly one
        # directory level in front of the file name.
        file_name = pdb_path.replace('\\', '/').rsplit('/', 1)[-1]
        self.name = file_name.split('.pdb')[0]
        fields = self.name.split('.')
        self.water = int(fields[2][5:])
        self.system = fields[0]
        self.loop = fields[1]
        self.mdframe = fields[3]
        self.feat_atom = dict()

    @staticmethod
    def _locate_calcium(chg):
        '''
        Return (row_index, charge) for the first calcium row of a charge table.

        chg: array whose first column is matched against 20 to find calcium
             and whose second column holds the charge.  A 1-D array (what
             np.genfromtxt yields for a single-row file) is treated as one row.
        Raises ValueError if no row has 20 in its first column.
        '''
        chg = np.atleast_2d(chg)
        # Match on the first column only, so a value of 20 elsewhere in the
        # table cannot be mistaken for the calcium row.
        ca_rows = np.flatnonzero(chg[:, 0] == 20)
        if ca_rows.size == 0:
            raise ValueError("no calcium row (first column == 20) in charge table")
        first = ca_rows[0]
        return first, chg[first, 1]

    @staticmethod
    def _acsf_block(structure, centers, **acsf_kwargs):
        '''
        Build one ACSF descriptor, evaluate it at centers and return the
        output with its first five columns removed.
        '''
        # NOTE(review): rcut= / positions= are the keyword names this notebook
        # was written with; later dscribe releases may have renamed them --
        # confirm against the installed version.
        descriptor = ACSF(species=Structure._SPECIES, **acsf_kwargs)
        values = descriptor.create(structure, positions=centers, n_jobs=4)
        # skipping g1 here
        # NOTE(review): this drops the first five columns.  If dscribe orders
        # the two-body output per species as [G1, G2, ...], then for the g2
        # descriptors this does not remove exactly the G1 terms -- confirm
        # the column layout.
        return values[:, 5:]

    def calc_sym(self):
        '''
        Compute the symmetry-function features of the calcium atom and store
        them, followed by the calcium charge, as a 1-D array in
        self.feat_atom["Ca"].

        Reads "charge/<name>.chg" and "pdb/<name>.pdb" relative to the
        current working directory.  The row index of the calcium entry in
        the charge file is used as the atom index in the structure
        (assumes both files list atoms in the same order -- TODO confirm).
        Raises ValueError if the charge file has no calcium row.
        '''
        chg = np.genfromtxt("charge/" + self.name + '.chg')
        structure = read("pdb/" + self.name + '.pdb')

        ca_index, cachg = self._locate_calcium(chg)
        atomPos = [ca_index]

        feat_lst = []

        # Radial (g2) descriptors, one per cutoff radius.
        # Original author's count: 5 x 8 = 40 features
        for r_cut in [2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0]:
            feat_lst.append(
                self._acsf_block(structure, atomPos,
                                 rcut=r_cut, g2_params=[[0.0001,0]])
            )

        # Angular (g4) descriptors at a short and a long cutoff.
        # Original author's count: 15 features each
        for r_cut in [3.0, 6.0]:
            feat_lst.append(
                self._acsf_block(structure, atomPos,
                                 rcut=r_cut, g4_params=[[0.0001,0.5,-1]])
            )

        # combine the above functions (2d arrays) into one 2d array
        feat = np.hstack(tuple(feat_lst))
        # np.append without an axis flattens both inputs, so the stored
        # vector is 1-D with the charge as its last entry
        self.feat_atom["Ca"] = np.append(feat, np.array(cachg,ndmin=2))

    # crudely add the network parameters
    def get_network(self):
        '''
        Read "network/net.<name>.csv" and return every column except the
        first, flattened row by row into a 1-D array.
        '''
        network_file = 'network/net.'+self.name+'.csv'
        net = pd.read_csv(network_file)
        return np.array(net.iloc[:,1:]).flatten()

In [34]:
# Use all the pdb files in the data directory ./pdb/
# glob returns paths in arbitrary order; sorting makes the processing order,
# and therefore the row order of anything written out from `structures`
# later on, reproducible between runs and machines.
loops = sorted(glob.glob("pdb/*.pdb"))
structures = dict()
counter = 0
count_total = len(loops)
for loop in loops:
    entry = Structure(loop)
    # Structure already parses the file stem; reuse it as the dictionary key.
    name = entry.name
    entry.calc_sym()

    # adding the 14x4 network parameters to the beginning of the feat matrix
    net = entry.get_network()
    entry.feat_atom['Ca'] = np.hstack((net, entry.feat_atom['Ca']))

    # Register the structure only once its feature vector is complete, so a
    # failure above never leaves a half-built entry behind in `structures`.
    structures[name] = entry

    counter = counter + 1
    # Report every 100 structures and once more at the very end, so the last
    # progress line shows the true total.
    if counter % 100 == 0 or counter == count_total:
        print(counter, "structures out of ", count_total, " processed ...", end = '\r')
        sys.stdout.flush()

7800 structures out of  7817  processed ...

In [35]:
# Sanity check: print the name of every structure whose assembled Ca feature
# vector does not have exactly 127 entries.  No output means all vectors
# have the expected length.
for loop in loops:
    name = loop.split('/')[1].split('.pdb')[0]
    n_features = structures[name].feat_atom['Ca'].shape[0]
    if n_features == 127:
        continue
    print(name)

In [36]:
# Stack the per-structure feature vectors into one matrix per element (one
# row per structure, in the iteration order of `structures`) and write it to
# "<element>_all_data.txt" as comma-separated text.
for ele in ["Ca"]:
    #col = len(structures[name].feat_atom[ele])
    # Expected feature-vector length; the same value the preceding sanity
    # check compares against.
    col=127
    data = np.empty([0,col])
    print(ele,col)
    print(data.shape)
    # Collect the rows first and stack once: calling np.vstack inside the
    # loop recopies the whole growing matrix on every iteration.  The empty
    # (0, col) array stays in front so the result is identical, including
    # when `structures` is empty, and a row of the wrong length still raises
    # ValueError.
    rows = [structures[loop_name].feat_atom[ele] for loop_name in structures.keys()]
    data = np.vstack([data] + rows)
    np.savetxt(ele+'_all_data.txt', data, delimiter=',')

Ca 127
(0, 127)
