In [1]:
cd /home/dmoi/projects/foldtree2

/home/dmoi/projects/foldtree2


# Protein Structure Graph Construction and Analysis

This notebook demonstrates the process of converting protein structures (PDB files) into graph representations suitable for geometric deep learning. The workflow includes:

- Downloading and parsing PDB files using Bio.PDB.
- Extracting residue-level features, backbone and contact maps, bond angles, and other structural properties.
- Converting structures into PyTorch Geometric HeteroData objects for downstream machine learning tasks.
- Visualizing structural features such as Ramachandran plots.
- Preparing large datasets of protein structures for training graph neural networks.

The notebook leverages custom modules (e.g., `src.pdbgraph.PDB2PyG`) for feature extraction and graph construction, and supports efficient data loading and batching for large-scale experiments.

In [2]:
import wget
#download an example pdb file
filename = './1eei (1).pdb'
url = 'https://files.rcsb.org/download/1EEI.pdb'
#filename = wget.download(url)
datadir = '../../datasets/foldtree2/'

In [3]:
#use autoreload to reload modules
%load_ext autoreload
%autoreload 2


In [4]:
from foldtree2.src import pdbgraph
import importlib
importlib.reload(pdbgraph)
converter = pdbgraph.PDB2PyG( 
aapropcsv = 'foldtree2/config/aaindex1.csv'
)

In [5]:
from Bio import PDB
import warnings
from matplotlib import pyplot as plt
import numpy as np
import pydssp
import tqdm
from Bio.PDB import PDBParser   
import numpy as np

testing = False

In [6]:

if testing == True:
	#filename = './foldtree2/config/1eei.pdb'
	filename = './tmp/A0A0C2GTF1.pdb'
	res  = converter.create_features(filename, distance = 10, verbose = False , add_prody = True)
	print(len(res))
	angles, contact_points, ss , hbond_mat, backbone , backbone_rev , positional_encoding , plddt , aa , bondangles , foldxvals , coords , window, windowrev, prodymats = res
	print( len(angles) )

	#read the pdb file
	parser = PDBParser()
	structure = parser.get_structure('1eei', filename)
	model = structure[0]

	aa_dict = {
	'ALA': 'A',
	'ARG': 'R',
	'ASN': 'N',
	'ASP': 'D',
	'CYS': 'C',
	'GLN': 'Q',
	'GLU': 'E',
	'GLY': 'G',
	'HIS': 'H',
	'ILE': 'I',
	'LEU': 'L',
	'LYS': 'K',
	'MET': 'M',
	'PHE': 'F',
	'PRO': 'P',
	'SER': 'S',
	'THR': 'T',
	'TRP': 'W',
	'TYR': 'Y',
	'VAL': 'V'

	}
	#extract the sequence
	seq = ''
	chains = model.get_chains()
	print('chains:', [chain.id for chain in chains])
	chain = model['A']  # Assuming you want the first chain
	for residue in chain:
		if PDB.is_aa(residue) and residue.get_resname() in aa_dict:
			seq += aa_dict[residue.get_resname()]
	print('sequence:' , seq)
	print('length of sequence:', len(seq))
	print('df:', angles)
	angles[['Phi_Angle', 'Psi_Angle' , 'Omega_Angle']].plot()
	plt.show()

In [7]:
#show prody matrices
if testing == True:
	for key in prodymats:
		print('prody matrix key:', key)
		print(prodymats[key])
		print('matrix shape:', prodymats[key].shape)
		plt.spy(prodymats[key].toarray() , markersize=1)
		plt.title(f'ProDy Matrix: {key}')
		plt.show()

In [8]:
if testing == True:
	plt.figure( figsize=(10,10) )
	plt.title('Secondary Structure Matrix')

	plt.spy(ss.T)
	plt.show()
	print('ss shape:', ss.shape)

In [9]:
#plot ramachandran
if testing == True:
	import matplotlib.pyplot as plt
	import seaborn as sns
	sns.set(style="whitegrid")
	plt.figure(figsize=(8, 6))
	plt.scatter(angles['Phi_Angle'], angles['Psi_Angle'], c='blue', alpha=0.5)
	plt.xlim(-np.pi, np.pi)
	plt.ylim(-np.pi, np.pi)
	plt.axhline(0, color='black', lw=1)
	plt.axvline(0, color='black', lw=1)
	plt.title('Ramachandran Plot')
	plt.xlabel('Phi Angle (degrees)')
	plt.ylabel('Psi Angle (degrees)')
	plt.grid()
	plt.show()

In [10]:
if testing == True:
	print('angles shape:', angles.shape)
	print(angles.head())
	data = converter.struct2pyg( filename , verbose = True , add_prody = True )
	print('data:', data)

In [11]:
import multiprocessing as mp
import tqdm
import os
import numpy as np
import wget 
import pandas as pd

datadir = '/mnt/data2/datasets/'
cols = 'repId_isDark_nMem_repLen_avgLen_repPlddt_avgPlddt_LCAtaxId'.split('_')
repdf = pd.read_table(datadir+ 'afdbclusters/2-repId_isDark_nMem_repLen_avgLen_repPlddt_avgPlddt_LCAtaxId.tsv')
repdf.columns = cols
print(repdf.head() , len(repdf))

def download_pdb(rep ,structdir = datadir+'structs/'):
	url = f'https://alphafold.ebi.ac.uk/files/AF-{rep}-F1-model_v4.pdb'
	#check if file exists
	if os.path.exists( structdir + rep + '.pdb'):
		return structdir + rep + '.pdb'
	filename = wget.download(url, out=structdir + rep + '.pdb')
	return filename

def download(repdf , nreps = 100 , structdir = datadir +'structs/'):
	if not os.path.exists(structdir):
		os.makedirs(structdir)
	reps = repdf.repId.unique()
	if nreps:
		#select a random sample of representatives
		reps = np.random.choice(reps, nreps)
	with mp.Pool(20) as p:
		filenames = p.map(download_pdb, tqdm.tqdm(reps))
		return filenames
		


        repId  isDark  nMem  repLen   avgLen  repPlddt  avgPlddt  LCAtaxId
0  A0A6M1CKG1       0     3     122  122.667     96.06   88.1467     91347
1  A0A4Q3Q6P1       1     3     117  113.333     71.94   67.8333     80864
2  A0A4V3EFQ4       0     3     153  156.000     68.56   58.8533    131567
3  A0A4V3EKB1       1     3      51   51.000     62.62   62.7700      1883
4  A0A4V3EP89       0     2      96   94.500     70.88   72.1900   2593676 2302907


In [12]:
#download(repdf, nreps = 10000 , structdir = '../datasets/foldtree2/structs/' )

In [13]:
import torch
import os
import numpy as np
# Setting the seed for everything
torch.manual_seed(0)
np.random.seed(0)
# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [14]:
if testing == True:
	data_sample =converter.struct2pyg( filename, verbose=False , add_prody = True )
	print(data_sample)
	ndim = data_sample['res'].x.shape[1]
	ndim_godnode = data_sample['godnode'].x.shape[1]
	ss = data_sample['ss'].x
	print('ss shape:', ss.shape)
	print(data_sample['res'].x.shape)

In [15]:
import glob
pdbfiles = glob.glob(datadir+'structs/*.pdb')
print(len(pdbfiles))

20090


In [16]:

data_sample =converter.struct2pyg( pdbfiles[0], foldxdir=None) #)'./foldx/',  verbose=False)
print(data_sample)

KeyboardInterrupt: 

In [None]:
#shuffle the data
np.random.shuffle(pdbfiles)
#converter.store_pyg(pdbfiles[0:10000], filename='structs_train_final_big.h5', foldxdir = None, verbose = False , add_prody = False , distance=15)

In [None]:
'''
converter.store_pyg_mp(
    pdbfiles[:100],
    filename="structs_mptest.h5",
    ncpu=30,
    distance=15,
    add_prody=False,
    compute_hbonds=False,
    start_method="spawn",
    chunksize=5,
	maxtasksperchild=10,
)

'''


#converter.store_pyg_mp_polling( pdbfiles[:100], filename="structs_mptest.h5", ncpu=10, verbose=False, chunksize=5,
#								  start_method='spawn', maxtasksperchild=10, distance=15, add_prody=False, compute_hbonds=True )
                                  
                    

{'distance': 15, 'add_prody': False, 'compute_hbonds': False, 'chunksize': 5, 'maxtasksperchild': 10}


Processing and storing PDB files:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
'''
from torch_geometric.data import DataLoader
from torch_geometric.data import Batch

#load the dataset
struct_dat = pdbgraph.StructureDataset('structs_traininffttest.h5')

train_loader = DataLoader(struct_dat, batch_size=20, shuffle=True , worker_init_fn = np.random.seed(0) , num_workers=4 )
#test out the dataloader
for i, data in enumerate(train_loader):
	print(i)
	print(data)
	if i > 10:
		break
'''

"\nfrom torch_geometric.data import DataLoader\nfrom torch_geometric.data import Batch\n\n#load the dataset\nstruct_dat = pdbgraph.StructureDataset('structs_traininffttest.h5')\n\ntrain_loader = DataLoader(struct_dat, batch_size=20, shuffle=True , worker_init_fn = np.random.seed(0) , num_workers=4 )\n#test out the dataloader\nfor i, data in enumerate(train_loader):\n\tprint(i)\n\tprint(data)\n\tif i > 10:\n\t\tbreak\n"

In [None]:
import glob
pdbfiles_structalign = glob.glob(datadir + 'struct_align/*/structs/*.pdb')
print(len(pdbfiles_structalign))


NameError: name 'datadir' is not defined

In [20]:
from foldtree2.src import foldseek2tree
import os
import tqdm
import glob

reps = glob.glob(datadir + 'struct_align/*/')
for rep in tqdm.tqdm(reps):
		if not os.path.exists( os.path.join(datadir, 'struct_align', rep, 'allvall.csv') ):
			#check more than 1 pdb file exists
			pdbfiles = glob.glob( os.path.join(datadir, 'struct_align', rep, 'structs', '*.pdb') )
			if len(pdbfiles) < 2:
				continue
			foldseek2tree.runFoldseek_allvall_EZsearch(
				infolder=os.path.join(datadir, 'struct_align', rep, 'structs'),
				outpath=os.path.join(datadir, 'struct_align', rep, 'allvall.csv')
			)
			#clear cell output to save memory
			import IPython
			IPython.display.clear_output(wait=True)

100%|██████████| 20350/20350 [11:18:53<00:00,  2.00s/it]


In [None]:
#converter.store_pyg(pdbfiles_structalign, filename='structalignFFT.h5', verbose = False)
converter.store_pyg_mp(
    pdbfiles_structalign,
    filename="structalignFFT_mp.h5",
    ncpu=30,
    distance=15,
    add_prody=False,
    compute_hbonds=True,
    start_method="spawn",
)

{'distance': 15, 'add_prody': False, 'compute_hbonds': True}


Processing and storing PDB files:   4%|▍         | 2004/45853 [20:44<11:48:36,  1.03it/s] Exception in thread Thread-10:
Traceback (most recent call last):
  File "/home/dmoi/miniforge3/envs/foldtree2/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/dmoi/miniforge3/envs/foldtree2/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "/home/dmoi/miniforge3/envs/foldtree2/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dmoi/miniforge3/envs/foldtree2/lib/python3.9/site-packages/pebble/pool/process.py", line 193, in message_manager_loop
    pool_manager.process_next_message(CONSTS.sleep_unit)
  File "/home/dmoi/miniforge3/envs/foldtree2/lib/python3.9/site-packages/pebble/pool/process.py", line 225, in process_next_message
    message = self.worker_manager.receive(timeout)
  File "/home/dmoi/miniforge3/envs/foldtree2/lib/python3.9/sit