In [42]:
import xml.etree.ElementTree as ET
import gzip
from pathlib import Path
import pandas as pd
import numpy as np


# Let pandas display up to 5000 rows before truncating a rendered frame.
pd.set_option('display.max_rows', 5000)
# Offset added to residue numbers that cannot be taken from UniProt (see determine_new_num).
DEFAULT_MMCIF_NUM = 50000

# Path to the gzipped SIFTS XML file for the entry being processed.
sifts_file_path = Path('./mypath/SIFTS/2aa3.xml.gz')  # Update this to your actual file path

def SIFTS_tree_parser(file):
    """Flatten a SIFTS XML document into one DataFrame row per PDBe residue.

    Args:
        file: A path or an open file object accepted by ``ET.parse``
            (``try_file_parser`` passes a text handle from ``gzip.open``).

    Returns:
        pandas.DataFrame with the columns ``entityId``, ``PDBe_resNum``,
        ``PDBe_resName``, ``PDB_resNum``, ``PDB_resName``, ``PDB_chainId``,
        ``UniProt_resNum``, ``UniProt_resName``, ``dbAccessionId``,
        ``UniProt_secondaryId`` and the boolean ``chains_to_change``.
        Cross-reference fields are ``None`` when the residue has no matching
        ``crossRefDb`` element. A document without protein residues yields an
        empty frame that still carries every column.

    Raises:
        xml.etree.ElementTree.ParseError: if the document is not well-formed XML.
        KeyError: if a residue or cross-reference lacks an expected attribute.
    """
    namespace = {'sifts': 'http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd'}
    columns = [
        "entityId",
        "PDBe_resNum",
        "PDBe_resName",
        "PDB_resNum",
        "PDB_resName",
        "PDB_chainId",
        "UniProt_resNum",
        "UniProt_resName",
        "dbAccessionId",
        "UniProt_secondaryId",
    ]

    root = ET.parse(file).getroot()

    data = []
    for protein in root.findall('.//sifts:entity[@type="protein"]', namespace):
        entityId = protein.attrib["entityId"]

        # The secondary id is taken from the first UniProt dbDetail anywhere in the
        # entity, so it is the same for every residue; look it up once per entity
        # instead of rescanning the whole entity subtree for each residue.
        # NOTE(review): the element's "property" attribute is not checked and an
        # entity mapped to several UniProt entries still gets a single value.
        dbDetails = protein.find('.//sifts:dbDetail[@dbSource="UniProt"]', namespace)
        secondary_id = None if dbDetails is None else dbDetails.text

        for residue in protein.findall('.//sifts:residue[@dbSource="PDBe"]', namespace):
            entry = dict.fromkeys(columns)
            entry["entityId"] = entityId
            entry["PDBe_resNum"] = residue.attrib["dbResNum"]
            entry["PDBe_resName"] = residue.attrib["dbResName"]

            for crossRef in residue.findall('.//sifts:crossRefDb', namespace):
                db_source = crossRef.attrib["dbSource"]
                if db_source == "PDB":
                    entry["PDB_resNum"] = crossRef.attrib["dbResNum"]
                    entry["PDB_resName"] = crossRef.attrib["dbResName"]
                    entry["PDB_chainId"] = crossRef.attrib["dbChainId"]
                elif db_source == "UniProt":
                    entry["UniProt_resNum"] = crossRef.attrib["dbResNum"]
                    entry["UniProt_resName"] = crossRef.attrib["dbResName"]
                    entry["dbAccessionId"] = crossRef.attrib["dbAccessionId"]
                    entry["UniProt_secondaryId"] = secondary_id

            data.append(entry)

    # Passing the column list keeps the schema intact when no residue was found;
    # without it the column lookups below raise KeyError on an empty frame.
    df = pd.DataFrame(data, columns=columns)

    # Row-wise flag: the residue has both a PDB and a UniProt number, they differ,
    # and the PDB number is not the literal string "null".
    condition_mask = (df['PDB_resNum'].notnull() & df['UniProt_resNum'].notnull() &
                      (df['PDB_resNum'] != df['UniProt_resNum']) & (df['PDB_resNum'] != "null"))

    df['chains_to_change'] = condition_mask
    return df



In [79]:
import gzip
import os
from pathlib import Path
from typing import Any, Callable, Optional, TextIO, Union

import Bio.PDB.MMCIF2Dict

from src.download.downloadwithThreadPool import run_downloads_with_ThreadPool, url_formation_for_pool
from src.download.downloadwithThreadPool import download_file

def download_file_general(file_type: str, file_name: str, input_path: Path) -> bool:
    """Download a single file of the given type into ``input_path``.

    The download target is built by ``url_formation_for_pool`` for a one-element
    list of file names, and the first (only) target is handed to ``download_file``.

    Returns:
        Whatever ``download_file`` returns for that target.
        NOTE(review): callers treat a truthy result as success -- confirm
        against ``download_file``.
    """
    download_targets = url_formation_for_pool(file_type, [file_name], str(input_path))
    only_target = download_targets[0]
    return download_file(only_target)


def try_file_parser(default_path: Path, file_name: str,
                    parser_function: Callable[[TextIO], Any],
                    file_type: str, max_redownloads: int = 3) -> Optional[Any]:
    """Parse a gzipped file, redownloading it when it is missing or unreadable.

    The file is looked up at ``default_path / file_type / file_name`` (the
    directory is created if needed). It is opened with ``gzip.open`` in text
    mode and the open handle is passed to ``parser_function``. When opening or
    parsing raises ``EOFError``, ``ValueError`` or ``OSError`` (which includes a
    missing file), the file is deleted, downloaded again and parsed again.

    Args:
        default_path: Root directory holding one sub-directory per file type.
        file_name: Name of the gzipped file inside that sub-directory.
        parser_function: Callable that receives the open text handle and
            returns the parsed object.
        file_type: Sub-directory name; also forwarded to the downloader.
        max_redownloads: How many times the file may be fetched again after a
            failed parse. Every download is followed by a parse attempt.

    Returns:
        The value returned by ``parser_function``, or ``None`` if the file could
        not be parsed after the initial attempt and all redownloads.
    """
    input_path = default_path / file_type
    input_path.mkdir(parents=True, exist_ok=True)
    file_path = input_path / file_name
    print(file_path)

    # One initial parse plus one parse after each redownload, so that the last
    # downloaded copy is never left untried.
    for attempt in range(max_redownloads + 1):
        try:
            with gzip.open(file_path, 'rt') as file:
                return parser_function(file)
        except (EOFError, ValueError, OSError) as e:
            if attempt == max_redownloads:
                print(f"Error processing {file_name}: {e}. Giving up.")
                break
            print(f"Error processing {file_name}: {e}. Attempting to redownload.")
            file_path.unlink(missing_ok=True)  # Remove potentially corrupt file
            if not download_file_general(file_type, file_name, input_path):
                # Keep looping: the next parse attempt fails on the missing file
                # and triggers another download while attempts remain.
                print(f"Failed to download {file_name}.")

    return None



print("Starting PDBrenum...")
DEFAULT_PATH = Path("mypath")
mmCIF_files = "2aa3.cif.gz"
SIFTS_files = "2aa3.xml.gz"

# Coordinates: assembly files live in their own sub-directory, chosen from the file name.
mmcif_dict = try_file_parser(
    default_path=DEFAULT_PATH,
    file_name=mmCIF_files,
    parser_function=Bio.PDB.MMCIF2Dict.MMCIF2Dict,
    file_type="mmCIF" if "assembly" not in mmCIF_files else "mmCIF_assembly",
)

# Residue-level PDBe / PDB / UniProt cross references for the same entry.
SIFTS_data = try_file_parser(
    default_path=DEFAULT_PATH,
    file_name=SIFTS_files,
    parser_function=SIFTS_tree_parser,
    file_type="SIFTS",
)

Starting PDBrenum...
mypath/mmCIF/2aa3.cif.gz
mypath/SIFTS/2aa3.xml.gz


In [56]:
import re
# Splits a key of the form "<table>.<prefix>auth_seq_id<suffix>" (or ...auth_seq_num...)
# into the table name and whatever surrounds the numbering column name.
pattern_auth_seq_num = re.compile(r'([^\.]+)\.((?:[^\.]+)?)((?:auth_seq_id|auth_seq_num))((?:[^\.]+)?)')

mmcif_dict_keys = mmcif_dict.keys()

# One (table, prefix, column, suffix) tuple for every key the pattern matches.
pattern_split = [found.groups()
                 for found in map(pattern_auth_seq_num.match, mmcif_dict_keys)
                 if found]

priority_list = []
pattern_list = []
# Tables that are skipped here.
except_list = ("_pdbx_unobs_or_zero_occ_residues", "_pdbx_nonpoly_scheme", 
               "_pdbx_poly_seq_scheme", "_ndb_struct_na_base_pair", "_struct_ref_seq_dif")

# Rank of each companion column: 1 = residue number, 2 = chain, 3 = insertion code,
# 4 = residue name. Undecorated columns are matched on the exact column name;
# decorated ones (prefix and/or suffix present) by substring of the whole key.
exact_column_ranks = (
    (1, ("auth_seq_id", "auth_seq_num")),
    (2, ("strand_id", "auth_asym_id")),
    (3, ("ins_code", "pdbx_PDB_ins_code", "PDB_ins_code", 'pdbx_auth_ins_code')),
    (4, ("auth_comp_id", "mon_id", "pdbx_mon_id")),
)
substring_ranks = (
    (1, ("auth_seq_id", "auth_seq_num")),
    (2, ("strand_id", "auth_asym_id")),
    (3, ("ins_code", "pdbx_PDB_ins_code")),
    (4, ("auth_comp_id", "mon_id")),
)

for table_name, prefix, num, suffix in pattern_split:
    priority_list = []
    for key in mmcif_dict_keys:
        key_parts = key.split(".")
        if key_parts[0] in except_list or key_parts[0] != table_name:
            continue
        if prefix not in key or suffix not in key:
            continue

        if prefix == "" and suffix == "":
            column = key_parts[1]
            rank = next((level for level, names in exact_column_ranks if column in names), None)
        else:
            rank = next((level for level, names in substring_ranks
                         if any(name in key for name in names)), None)

        if rank is not None:
            priority_list.append((key, rank))

    # Stable sort: keys of equal rank keep the order they have in the dictionary.
    priority_list.sort(key=lambda ranked: ranked[1])
    temp_list = [ranked_key for ranked_key, _rank in priority_list]

    # Patterns that matched nothing (e.g. from a skipped table) contribute no group.
    if temp_list:
        pattern_list.append(temp_list)

In [97]:
pattern_list

[['_pdbx_unobs_or_zero_occ_atoms.auth_seq_id',
  '_pdbx_unobs_or_zero_occ_atoms.auth_asym_id',
  '_pdbx_unobs_or_zero_occ_atoms.PDB_ins_code',
  '_pdbx_unobs_or_zero_occ_atoms.auth_comp_id'],
 ['_struct_conf.beg_auth_seq_id',
  '_struct_conf.beg_auth_asym_id',
  '_struct_conf.pdbx_beg_PDB_ins_code',
  '_struct_conf.beg_auth_comp_id'],
 ['_struct_conf.end_auth_seq_id',
  '_struct_conf.end_auth_asym_id',
  '_struct_conf.pdbx_end_PDB_ins_code',
  '_struct_conf.end_auth_comp_id'],
 ['_struct_sheet_range.beg_auth_seq_id',
  '_struct_sheet_range.beg_auth_asym_id',
  '_struct_sheet_range.pdbx_beg_PDB_ins_code',
  '_struct_sheet_range.beg_auth_comp_id'],
 ['_struct_sheet_range.end_auth_seq_id',
  '_struct_sheet_range.end_auth_asym_id',
  '_struct_sheet_range.pdbx_end_PDB_ins_code',
  '_struct_sheet_range.end_auth_comp_id'],
 ['_pdbx_struct_sheet_hbond.range_1_auth_seq_id',
  '_pdbx_struct_sheet_hbond.range_1_auth_asym_id',
  '_pdbx_struct_sheet_hbond.range_1_PDB_ins_code',
  '_pdbx_struct_shee

In [93]:
# One row per _atom_site record, keeping both the label_* and the auth_* identifiers.
atom_site_columns = {
    "label_seq_id": "_atom_site.label_seq_id",
    "label_comp_id": "_atom_site.label_comp_id",
    "label_asym_id": "_atom_site.label_asym_id",
    "auth_seq_id": "_atom_site.auth_seq_id",
    "auth_comp_id": "_atom_site.auth_comp_id",
    "auth_asym_id": "_atom_site.auth_asym_id",
    "ins_code": "_atom_site.pdbx_PDB_ins_code",
}
df_mmCIF = pd.DataFrame({column: mmcif_dict[key] for column, key in atom_site_columns.items()})

# Residue number with the insertion code appended, unless the code is a '?' / '.' placeholder.
df_mmCIF['merged_seq_id'] = [
    str(seq_id) if ins_code in ['?', '.'] else str(seq_id) + ins_code
    for seq_id, ins_code in zip(df_mmCIF['auth_seq_id'], df_mmCIF['ins_code'])
]

# Attach the SIFTS row whose PDB residue number and chain match the atom's author numbering.
merged_df = df_mmCIF.merge(
    SIFTS_data,
    how='left',
    left_on=['merged_seq_id', 'auth_asym_id'],
    right_on=['PDB_resNum', 'PDB_chainId'],
)

# Atoms without a SIFTS match get NaN from the left join; treat them as "no change".
merged_df['chains_to_change'] = merged_df['chains_to_change'].fillna(False)

# Author chain ids that contain at least one flagged residue.
flagged_chain_ids = merged_df.loc[merged_df['chains_to_change'], 'auth_asym_id']
unique_auth_asym_ids_to_change = flagged_chain_ids.unique()

# Set form for constant-time membership tests in the row-wise functions below.
auth_asym_ids_to_change_set = set(unique_auth_asym_ids_to_change)

def determine_new_num(row):
    """Choose the new residue number for one row of ``merged_df``.

    Rows whose ``auth_asym_id`` is not in ``auth_asym_ids_to_change_set`` keep
    their ``auth_seq_id`` unchanged. For the other rows the first usable value
    of the following is returned:

    1. ``UniProt_resNum`` converted to ``int``;
    2. ``label_seq_id`` converted to ``int`` plus ``DEFAULT_MMCIF_NUM``;
    3. ``auth_seq_id`` plus ``DEFAULT_MMCIF_NUM`` plus 10000, as a ``str``.

    Note that the return type differs by branch: cases 1 and 2 return ``int``,
    case 3 returns ``str``, and untouched rows return ``auth_seq_id`` as stored.
    """
    if row['auth_asym_id'] not in auth_asym_ids_to_change_set:
        return row['auth_seq_id']

    # Try each numeric source in turn; a value that is not a number falls through.
    for column, offset in (('UniProt_resNum', 0), ('label_seq_id', DEFAULT_MMCIF_NUM)):
        try:
            return int(row[column]) + offset
        except (ValueError, TypeError):
            continue

    # Last resort; auth_seq_id is expected to be numeric here, otherwise int() raises.
    return str(int(row['auth_seq_id']) + DEFAULT_MMCIF_NUM + 10000)

# Apply the function to each row
merged_df['new_num'] = merged_df.apply(determine_new_num, axis=1)


# Placeholder written as the insertion code of renumbered residues: whichever of
# the candidate placeholders occurs most often in the 'ins_code' column. A tie
# goes to the candidate listed first, and the first candidate is also used when
# neither placeholder occurs at all.
default_candidates = ['?', '.']
default_ins_code_counts = merged_df['ins_code'].value_counts()
default_ins_code = max(
    (code for code in default_candidates if code in default_ins_code_counts),
    key=lambda code: default_ins_code_counts[code],
    default=default_candidates[0],
)

# Define the function to determine the new ins_code
def determine_new_ins_code(row, default_ins_code, change_set):
    """Return the insertion code to store for one row.

    Args:
        row: Mapping with at least ``auth_asym_id`` and ``ins_code``.
        default_ins_code: Value returned for rows of a chain in ``change_set``.
        change_set: Collection of ``auth_asym_id`` values being renumbered.

    Returns:
        ``default_ins_code`` when the row's chain is in ``change_set``,
        otherwise the row's own ``ins_code``.
    """
    chain_is_renumbered = row['auth_asym_id'] in change_set
    return default_ins_code if chain_is_renumbered else row['ins_code']


# Row-wise apply; the extra keyword arguments are forwarded to determine_new_ins_code
merged_df['new_ins_code'] = merged_df.apply(
    determine_new_ins_code,
    axis=1,
    default_ins_code=default_ins_code,
    change_set=auth_asym_ids_to_change_set,
)


In [98]:
merged_df

Unnamed: 0,label_seq_id,label_comp_id,label_asym_id,auth_seq_id,auth_comp_id,auth_asym_id,ins_code,merged_seq_id,entityId,PDBe_resNum,...,PDB_resNum,PDB_resName,PDB_chainId,UniProt_resNum,UniProt_resName,dbAccessionId,UniProt_secondaryId,chains_to_change,new_num,new_ins_code
0,1,THR,A,18,THR,A,?,18,A,1,...,18,THR,A,2,T,Q4PRK9,Q4PRK9_PLAVI,True,2,?
1,1,THR,A,18,THR,A,?,18,A,1,...,18,THR,A,2,T,Q4PRK9,Q4PRK9_PLAVI,True,2,?
2,1,THR,A,18,THR,A,?,18,A,1,...,18,THR,A,2,T,Q4PRK9,Q4PRK9_PLAVI,True,2,?
3,1,THR,A,18,THR,A,?,18,A,1,...,18,THR,A,2,T,Q4PRK9,Q4PRK9_PLAVI,True,2,?
4,1,THR,A,18,THR,A,?,18,A,1,...,18,THR,A,2,T,Q4PRK9,Q4PRK9_PLAVI,True,2,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10297,.,HOH,N,1487,HOH,D,?,1487,,,...,,,,,,,,False,61487,?
10298,.,HOH,N,1488,HOH,D,?,1488,,,...,,,,,,,,False,61488,?
10299,.,HOH,N,1489,HOH,D,?,1489,,,...,,,,,,,,False,61489,?
10300,.,HOH,N,1490,HOH,D,?,1490,,,...,,,,,,,,False,61490,?


In [99]:
# Keep the first row for every (residue number + insertion code, author chain) pair.
merged_df_unique = merged_df.drop_duplicates(
    subset=['merged_seq_id', 'auth_asym_id'],
    keep='first',
)

In [100]:
merged_df_unique

Unnamed: 0,label_seq_id,label_comp_id,label_asym_id,auth_seq_id,auth_comp_id,auth_asym_id,ins_code,merged_seq_id,entityId,PDBe_resNum,...,PDB_resNum,PDB_resName,PDB_chainId,UniProt_resNum,UniProt_resName,dbAccessionId,UniProt_secondaryId,chains_to_change,new_num,new_ins_code
0,1,THR,A,18,THR,A,?,18,A,1.0,...,18,THR,A,2.0,T,Q4PRK9,Q4PRK9_PLAVI,True,2,?
7,2,PRO,A,19,PRO,A,?,19,A,2.0,...,19,PRO,A,3.0,P,Q4PRK9,Q4PRK9_PLAVI,True,3,?
14,3,LYS,A,20,LYS,A,?,20,A,3.0,...,20,LYS,A,4.0,K,Q4PRK9,Q4PRK9_PLAVI,True,4,?
23,4,PRO,A,21,PRO,A,?,21,A,4.0,...,21,PRO,A,5.0,P,Q4PRK9,Q4PRK9_PLAVI,True,5,?
30,5,LYS,A,22,LYS,A,?,22,A,5.0,...,22,LYS,A,6.0,K,Q4PRK9,Q4PRK9_PLAVI,True,6,?
39,6,ILE,A,23,ILE,A,?,23,A,6.0,...,23,ILE,A,7.0,I,Q4PRK9,Q4PRK9_PLAVI,True,7,?
47,7,VAL,A,24,VAL,A,?,24,A,7.0,...,24,VAL,A,8.0,V,Q4PRK9,Q4PRK9_PLAVI,True,8,?
54,8,LEU,A,25,LEU,A,?,25,A,8.0,...,25,LEU,A,9.0,L,Q4PRK9,Q4PRK9_PLAVI,True,9,?
62,9,VAL,A,26,VAL,A,?,26,A,9.0,...,26,VAL,A,10.0,V,Q4PRK9,Q4PRK9_PLAVI,True,10,?
69,10,GLY,A,27,GLY,A,?,27,A,10.0,...,27,GLY,A,11.0,G,Q4PRK9,Q4PRK9_PLAVI,True,11,?


In [96]:
def _read_scheme_columns(mmcif_dict, table_names, column_names):
    """Read ``column_names`` from the first table in ``table_names`` that has all of them.

    Returns a ``{column: list_of_values}`` dict, or ``None`` when no table
    provides every column. A value stored as a bare string (the single-row
    case) is wrapped in a one-element list so that columns can be zipped.
    """
    for table_name in table_names:
        try:
            raw_columns = {column: mmcif_dict[f"{table_name}.{column}"] for column in column_names}
        except KeyError:
            continue
        return {column: [value] if isinstance(value, str) else value
                for column, value in raw_columns.items()}
    return None


def poly_nonpoly_renum(mmcif_dict, df_PDBe_PDB_UniProt, chains_to_change, default_mmCIF_num):
    """Renumber the polymer / non-polymer sequence scheme tables of ``mmcif_dict`` in place.

    Polymer residues are read from ``_pdbx_poly_seq_scheme`` (falling back to
    ``_pdbe_orig_poly_seq_scheme``) and left-joined to ``df_PDBe_PDB_UniProt`` on
    the ``(seq_id, mon_id, asym_id)`` tuple. For residues whose strand id is in
    ``chains_to_change`` the new ``pdb_seq_num`` is the joined ``UniProt_50k``
    value; for all others it is the old ``pdb_seq_num`` with any non-numeric
    characters stripped from both ends. Non-polymer residues
    (``_pdbx_nonpoly_scheme`` / ``_pdbe_orig_nonpoly_scheme``, if present) of a
    changed chain get ``pdb_seq_num + default_mmCIF_num + 10000``. In both tables
    ``auth_seq_num`` is overwritten with the previous ``pdb_seq_num`` values.

    Args:
        mmcif_dict: Mapping of mmCIF keys to a list of strings (or one string
            for a single-row table). Modified in place.
        df_PDBe_PDB_UniProt: Frame with a ``PDBe`` column of
            ``(seq_id, mon_id, asym_id)`` tuples plus ``PDB``, ``UniProt``,
            ``AccessionID`` and ``UniProt_50k`` columns.
        chains_to_change: Collection of strand ids to renumber.
        default_mmCIF_num: Offset used for non-polymer residues.

    Returns:
        DataFrame with the columns ``PDBe``, ``PDB``, ``UniProt``,
        ``PDBe_num_and_chain``, ``PDB_num_and_chain``, ``AccessionID`` and
        ``Uni_or_50k`` (polymer rows followed by non-polymer rows), or ``0``
        when neither polymer scheme table is available.
    """
    result_columns = ["PDBe", "PDB", "UniProt", "PDBe_num_and_chain",
                      "PDB_num_and_chain", "AccessionID", "Uni_or_50k"]

    poly = _read_scheme_columns(
        mmcif_dict,
        ("_pdbx_poly_seq_scheme", "_pdbe_orig_poly_seq_scheme"),
        ("seq_id", "asym_id", "mon_id", "pdb_seq_num", "auth_seq_num",
         "pdb_mon_id", "auth_mon_id", "pdb_strand_id", "pdb_ins_code"))
    if poly is None:
        return 0

    mmCIF_pdbx_poly_seq_scheme_label = list(zip(poly["seq_id"], poly["mon_id"], poly["asym_id"]))
    mmCIF_pdbx_poly_seq_scheme_pdb = list(zip(poly["pdb_seq_num"], poly["pdb_mon_id"],
                                              poly["pdb_strand_id"]))
    mmCIF_pdbx_poly_seq_scheme_auth = list(zip(poly["auth_seq_num"], poly["auth_mon_id"],
                                               poly["pdb_strand_id"]))

    df_mmCIF_pdbx_poly_seq_scheme = pd.DataFrame(zip(mmCIF_pdbx_poly_seq_scheme_label,
                                                     mmCIF_pdbx_poly_seq_scheme_pdb,
                                                     mmCIF_pdbx_poly_seq_scheme_auth,
                                                     poly["pdb_ins_code"]))
    df_mmCIF_pdbx_poly_seq_scheme = df_mmCIF_pdbx_poly_seq_scheme.rename(
        columns={0: "_pdbx_poly_seq_scheme_label", 1: "pdbx_poly_seq_scheme_pdb",
                 2: "pdbx_poly_seq_scheme_auth", 3: "pdbx_poly_seq_scheme_pdb_ins_code"})

    df_pdbx_poly_seq_scheme_pdb_final = df_mmCIF_pdbx_poly_seq_scheme.merge(
        df_PDBe_PDB_UniProt, left_on="_pdbx_poly_seq_scheme_label", right_on="PDBe", how='left')
    df_pdbx_poly_seq_scheme_pdb_final["PDBe_num_and_chain"] = df_pdbx_poly_seq_scheme_pdb_final[
        "_pdbx_poly_seq_scheme_label"].apply(lambda x: (x[0], x[2]))

    # (number, chain) tuple; when an insertion code is present it is appended to
    # the number. The "number+code,chain" string is split back into a tuple below.
    pdb_triplets = df_pdbx_poly_seq_scheme_pdb_final["pdbx_poly_seq_scheme_pdb"]
    ins_codes = df_pdbx_poly_seq_scheme_pdb_final["pdbx_poly_seq_scheme_pdb_ins_code"]
    df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"] = np.where(
        ins_codes.apply(lambda x: x == "."),
        pdb_triplets.apply(lambda x: (x[0], x[2])),
        pdb_triplets.apply(lambda x: x[0]) + ins_codes + "," + pdb_triplets.apply(lambda x: x[2]))
    df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"] = df_pdbx_poly_seq_scheme_pdb_final[
        "PDB_num_and_chain"].apply(lambda x: tuple(x.split(",")) if type(x) == str else x)

    # Changed chains take the UniProt-derived number; other chains keep their PDB
    # number with any non-numeric characters stripped from its ends.
    df_pdbx_poly_seq_scheme_pdb_final["Uni_or_50k"] = np.where(
        df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"].apply(lambda x: x[1] in chains_to_change),
        df_pdbx_poly_seq_scheme_pdb_final["UniProt_50k"],
        df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"].apply(
            lambda x: x[0].strip(re.sub(r'[0-9\-\?\.]+', '', x[0]))))

    # Write back to the primary table when it has pdb_seq_num, else to the PDBe copy.
    if "_pdbx_poly_seq_scheme.pdb_seq_num" in mmcif_dict:
        poly_table = "_pdbx_poly_seq_scheme"
    else:
        poly_table = "_pdbe_orig_poly_seq_scheme"
    mmcif_dict[f"{poly_table}.pdb_seq_num"] = list(df_pdbx_poly_seq_scheme_pdb_final["Uni_or_50k"].values)
    mmcif_dict[f"{poly_table}.auth_seq_num"] = poly["pdb_seq_num"]

    nonpoly = _read_scheme_columns(
        mmcif_dict,
        ("_pdbx_nonpoly_scheme", "_pdbe_orig_nonpoly_scheme"),
        ("pdb_seq_num", "auth_seq_num", "pdb_mon_id", "auth_mon_id", "pdb_strand_id", "asym_id"))
    if nonpoly is None:
        return df_pdbx_poly_seq_scheme_pdb_final[result_columns]

    # Non-polymer rows have no label sequence number; "." stands in for it.
    dots_for_label = ["."] * len(nonpoly["asym_id"])

    mmCIF_pdbx_nonpoly_scheme_pdb = list(zip(nonpoly["pdb_seq_num"], nonpoly["pdb_mon_id"],
                                             nonpoly["pdb_strand_id"]))
    mmCIF_pdbx_nonpoly_scheme_auth = list(zip(nonpoly["auth_seq_num"], nonpoly["auth_mon_id"],
                                              nonpoly["pdb_strand_id"]))
    mmCIF_pdbx_nonpoly_scheme_label = list(zip(dots_for_label, nonpoly["pdb_mon_id"],
                                               nonpoly["asym_id"]))

    df_mmCIF_pdbx_nonpoly_scheme = pd.DataFrame(zip(mmCIF_pdbx_nonpoly_scheme_pdb,
                                                    mmCIF_pdbx_nonpoly_scheme_auth,
                                                    mmCIF_pdbx_nonpoly_scheme_label))
    df_mmCIF_pdbx_nonpoly_scheme = df_mmCIF_pdbx_nonpoly_scheme.rename(
        columns={0: "pdbx_nonpoly_scheme_pdb",
                 1: "pdbx_nonpoly_scheme_auth",
                 2: "pdbx_nonpoly_scheme_label"})

    df_mmCIF_pdbx_nonpoly_scheme["PDB"] = df_mmCIF_pdbx_nonpoly_scheme["pdbx_nonpoly_scheme_pdb"]
    df_mmCIF_pdbx_nonpoly_scheme["PDB_num_and_chain"] = df_mmCIF_pdbx_nonpoly_scheme[
        "pdbx_nonpoly_scheme_pdb"].apply(lambda x: (x[0], x[2]))
    df_mmCIF_pdbx_nonpoly_scheme["Uni_or_50k"] = df_mmCIF_pdbx_nonpoly_scheme["pdbx_nonpoly_scheme_pdb"].apply(
        lambda x: str(int(x[0]) + default_mmCIF_num + 10000) if x[2] in chains_to_change else x[0])

    if "_pdbx_nonpoly_scheme.pdb_seq_num" in mmcif_dict:
        nonpoly_table = "_pdbx_nonpoly_scheme"
    else:
        nonpoly_table = "_pdbe_orig_nonpoly_scheme"
    mmcif_dict[f"{nonpoly_table}.pdb_seq_num"] = list(df_mmCIF_pdbx_nonpoly_scheme["Uni_or_50k"].values)
    mmcif_dict[f"{nonpoly_table}.auth_seq_num"] = nonpoly["pdb_seq_num"]

    # DataFrame.append no longer exists in pandas 2.x; concat keeps the same row
    # order, the original indexes and the union of columns.
    poly_nonpoly_append = pd.concat([df_pdbx_poly_seq_scheme_pdb_final, df_mmCIF_pdbx_nonpoly_scheme])
    return poly_nonpoly_append[result_columns]

In [114]:

def get_mmcif_values(mmcif_dict, keys, default_value=None):
    """Return the values stored under the first of ``keys`` present in ``mmcif_dict``.

    Args:
        mmcif_dict: Mapping to look the keys up in.
        keys: Candidate keys in order of preference.
        default_value: Returned unchanged when none of the keys is present.

    Returns:
        The stored value as a list (a non-list value is wrapped in a
        one-element list), or ``default_value`` if every lookup raised
        ``KeyError``.
    """
    for candidate in keys:
        try:
            found = mmcif_dict[candidate]
        except KeyError:
            continue
        return found if isinstance(found, list) else [found]
    return default_value

# Columns read from the polymer scheme table. Each entry of keys_mapping pairs the
# primary key and its _pdbe_orig_ fallback with the resulting column name.
poly_scheme_columns = ("seq_id", "asym_id", "mon_id", "pdb_seq_num", "auth_seq_num",
                       "pdb_mon_id", "auth_mon_id", "pdb_strand_id", "pdb_ins_code")
keys_mapping = [
    ([f"_pdbx_poly_seq_scheme.{column}", f"_pdbe_orig_poly_seq_scheme.{column}"], column)
    for column in poly_scheme_columns
]

# Look every column up (primary key first, then fallback) and build the frame.
extracted_data = {column: get_mmcif_values(mmcif_dict, candidates)
                  for candidates, column in keys_mapping}
poly_seq_scheme_df = pd.DataFrame(extracted_data)


# Same for the non-polymer scheme table, which is read without a seq_id column.
nonpoly_scheme_columns = ("asym_id", "mon_id", "pdb_seq_num", "auth_seq_num",
                          "pdb_mon_id", "auth_mon_id", "pdb_strand_id", "pdb_ins_code")
keys_mapping = [
    ([f"_pdbx_nonpoly_scheme.{column}", f"_pdbe_orig_nonpoly_scheme.{column}"], column)
    for column in nonpoly_scheme_columns
]

extracted_data = {column: get_mmcif_values(mmcif_dict, candidates)
                  for candidates, column in keys_mapping}
nonpoly_seq_scheme_df = pd.DataFrame(extracted_data)

In [115]:
poly_seq_scheme_df

Unnamed: 0,seq_id,asym_id,mon_id,pdb_seq_num,auth_seq_num,pdb_mon_id,auth_mon_id,pdb_strand_id,pdb_ins_code
0,1,A,THR,18,18,THR,THR,A,.
1,2,A,PRO,19,19,PRO,PRO,A,.
2,3,A,LYS,20,20,LYS,LYS,A,.
3,4,A,PRO,21,21,PRO,PRO,A,.
4,5,A,LYS,22,22,LYS,LYS,A,.
5,6,A,ILE,23,23,ILE,ILE,A,.
6,7,A,VAL,24,24,VAL,VAL,A,.
7,8,A,LEU,25,25,LEU,LEU,A,.
8,9,A,VAL,26,26,VAL,VAL,A,.
9,10,A,GLY,27,27,GLY,GLY,A,.


In [116]:
nonpoly_seq_scheme_df

Unnamed: 0,asym_id,mon_id,pdb_seq_num,auth_seq_num,pdb_mon_id,auth_mon_id,pdb_strand_id,pdb_ins_code
0,E,SO4,1001,1001,SO4,SO4,A,.
1,F,AP0,1401,1401,AP0,AP0,A,.
2,G,AP0,1407,1407,AP0,AP0,B,.
3,H,SO4,1002,1002,SO4,SO4,C,.
4,I,AP0,1405,1405,AP0,AP0,C,.
5,J,AP0,1403,1403,AP0,AP0,D,.
6,K,HOH,1402,2,HOH,HOH,A,.
7,K,HOH,1403,3,HOH,HOH,A,.
8,K,HOH,1404,4,HOH,HOH,A,.
9,K,HOH,1405,5,HOH,HOH,A,.
