In [1]:
# Silence every Python warning from this point on: the "ignore" filter is installed
# without any category or module restriction, so it matches all warnings.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the CATH 4.2 chain-set split definitions from JSON into `data`.
import json
splits_path = '../datasets/cath-4.2/chain_set_splits.json'
with open(splits_path) as splits_file:
    data = json.load(splits_file)


In [3]:
# Pull the three splits out of the loaded JSON in one pass.
data_train, data_validation, data_test = (
    data[split_name] for split_name in ('train', 'validation', 'test')
)
# Held-out pool: test entries followed by validation entries.
combined_data = data_test + data_validation

In [4]:
# Sizes of the held-out pool (test + validation) and of the training split.
len(combined_data), len(data_train)

(1728, 18024)

In [7]:
# Display the first 5 entries to verify the content; the loops further down
# split each entry on "." into a PDB code and a chain identifier.
combined_data[:5]

['3fkf.A', '2d9e.A', '2lkl.A', '1ud9.A', '2rem.B']

In [9]:
import requests
def download_pdb(pdb_code, save_path=None):
    """Fetch a PDB entry from files.rcsb.org and write it to disk.

    The code is lower-cased before it is used. When the server answers with
    HTTP 200, the response text is written to ``save_path`` (or to
    ``<pdb_code>.pdb`` as a relative path when ``save_path`` is None) and a
    confirmation is printed. Any other status only prints a failure message.
    Nothing is returned in either case.
    """
    code = pdb_code.lower()
    reply = requests.get(f"https://files.rcsb.org/download/{code}.pdb")

    # Guard clause: bail out early on anything other than a successful response.
    if reply.status_code != 200:
        print(f"Failed to download PDB file for code: {code}")
        return

    target = f"{code}.pdb" if save_path is None else save_path
    with open(target, 'w') as out:
        out.write(reply.text)
    print(f"PDB file saved as {target}")

# Example usage: requests entry 3fkf over the network and, since no save_path is
# given, writes it to the relative path 3fkf.pdb.
download_pdb("3fkf")


PDB file saved as 3fkf.pdb


In [None]:
# Smoke-test the extraction on a single entry. extract_chain_from_pdb takes
# (pdb_code, chain) and derives its own download/output paths, so file paths are
# not passed here (the previous three-path-argument call raised TypeError).
# The function is defined in a later cell; run that cell first.
extract_chain_from_pdb("3fkf", "A")

In [None]:
import requests

from Bio import PDB
import os

# Module-level Biopython parser instance.
# NOTE(review): extract_chain_from_pdb constructs its own PDBParser and PDBIO on
# every call, so this and `io` look unused in the visible cells - confirm before removing.
parser = PDB.PDBParser()

# Module-level Biopython PDB writer instance (see note above).
io = PDB.PDBIO()



# wrap that in a function
def extract_chain_from_pdb(pdb_code, chain, *, data_dir="../datasets", timeout=30):
    """
    Download a PDB entry from RCSB and save one of its chains as its own PDB file.

    The full entry is written to ``{data_dir}/test_val_full_pdbs/{pdb_code}.pdb``
    and the extracted chain to
    ``{data_dir}/test_val_chain_pdbs/{pdb_code}_chain{chain}.pdb``.
    If the chain file already exists, nothing is downloaded or rewritten.

    Parameters:
    - pdb_code: PDB identifier (e.g., '3fkf'). Used as given in the file names and
      lower-cased for the download URL.
    - chain: The chain identifier to keep (e.g., 'A').
    - data_dir: Root directory that holds the two output folders (keyword-only).
    - timeout: Seconds to wait for the RCSB server before giving up (keyword-only).

    Returns:
    - 3 when the chain file was written or was already present.
    - The lower-cased pdb_code when the download or the chain extraction failed
      (a message describing the failure is printed).
    """
    # Resolve every path up front; the data directory must be known before the
    # cache check below can run.
    full_pdb_dir = f"{data_dir}/test_val_full_pdbs"
    chain_pdb_dir = f"{data_dir}/test_val_chain_pdbs"
    full_pdb_path = f"{full_pdb_dir}/{pdb_code}.pdb"
    chain_pdb_path = f"{chain_pdb_dir}/{pdb_code}_chain{chain}.pdb"

    # if the path exists, don't do anything
    if os.path.exists(chain_pdb_path):
        return 3

    # Make sure both output folders exist so a fresh checkout does not fail on open().
    os.makedirs(full_pdb_dir, exist_ok=True)
    os.makedirs(chain_pdb_dir, exist_ok=True)

    pdb_code = pdb_code.lower()
    url = f"https://files.rcsb.org/download/{pdb_code}.pdb"
    try:
        # Without a timeout a stalled connection would block the whole batch.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Report network problems as a failed entry instead of aborting the caller's loop.
        print(f"Failed to download PDB file for code: {pdb_code} ({e})")
        return pdb_code

    if response.status_code != 200:
        print(f"Failed to download PDB file for code: {pdb_code}")
        return pdb_code

    with open(full_pdb_path, 'w') as file:
        file.write(response.text)

    try:
        parser = PDB.PDBParser()
        structure = parser.get_structure("protein", full_pdb_path)

        io = PDB.PDBIO()
        # First model only; use the `chain` argument rather than any global variable.
        io.set_structure(structure[0][chain])
        io.save(chain_pdb_path)  # Save to new file
    except Exception as e:
        print(f"Error processing {pdb_code}: {e}")
        return pdb_code
    return 3

In [27]:
from tqdm import tqdm
# Download and extract every test/validation chain, remembering the entries
# for which extract_chain_from_pdb did not return its success value (3).
failed_entries = []
for entry in tqdm(combined_data):
    pdb_code, chain_id = entry.split(".")
    status = extract_chain_from_pdb(pdb_code, chain_id)
    if status == 3:
        continue
    failed_entries.append(entry)

100%|██████████| 1728/1728 [23:05<00:00,  1.25it/s]


In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

def process_entry(entry):
    """Run extract_chain_from_pdb for one "<pdb_code>.<chain_id>" entry.

    Returns the entry itself when the extraction did not return the success
    value 3, and None otherwise.
    """
    pdb_code, chain_id = entry.split(".")
    if extract_chain_from_pdb(pdb_code, chain_id) == 3:
        return None
    return entry

# Fan the training entries out over a process pool and collect the ones that failed.
# Results are appended as they complete, so an interrupted run still leaves the
# failures seen so far in `failed_entries`.
# NOTE(review): the ProcessPoolExecutor docs warn that worker processes must be able
# to import __main__, which may not hold for notebook-defined functions on every
# platform/start method - confirm this cell runs in the target environment.
failed_entries = []
with ProcessPoolExecutor() as pool:
    pending = [pool.submit(process_entry, item) for item in data_train]
    for finished in tqdm(as_completed(pending), total=len(pending)):
        outcome = finished.result()
        if outcome is not None:
            failed_entries.append(outcome)

In [9]:
from tqdm import tqdm
# Sequential pass over the training split, recording entries that did not succeed.
# NOTE(review): extract_chain_from_pdb writes under the test_val_* folders, so
# training chains end up there too - confirm that is intended.
failed_entries = []
for entry in tqdm(data_train):
    pdb_code, chain_id = entry.split(".")
    succeeded = extract_chain_from_pdb(pdb_code, chain_id) == 3
    if not succeeded:
        failed_entries.append(entry)

 12%|█▏        | 2101/18024 [27:13<3:26:18,  1.29it/s]


KeyboardInterrupt: 

In [8]:
# Entries for which extract_chain_from_pdb did not return its success value;
# an empty list means no failure was recorded by the loop that last filled it.
failed_entries

[]

In [1]:
import os

In [2]:
# File names currently on disk: full downloaded entries and per-chain extracts.
full_pdb_dir = "../datasets/test_val_full_pdbs"
chain_pdb_dir = "../datasets/test_val_chain_pdbs"
list_full_chains = os.listdir(full_pdb_dir)
list_chains = os.listdir(chain_pdb_dir)

In [3]:
# PDB code of every chain file: the part of the file name before the first "_".
list_chains_pdb = [chain_file.split("_")[0] for chain_file in list_chains]



In [4]:
# Report every downloaded entry (name up to the first ".") that has no chain file
# sharing its PDB code.
for pdb_id in (full_file.split(".")[0] for full_file in list_full_chains):
    if pdb_id in list_chains_pdb:
        continue
    print(f"Missing chain file for {pdb_id}")