In [1]:
# Import every module that the job needs.
# They are expected to be installed in the active conda environment.
import os

# import all required modules
import Bio
from Bio.PDB import *
import pandas as pd
import numpy as np
import re  # Regular expression operations
import argparse
import time
import timeit
import signal
import sys
from os import listdir
from os.path import isfile, join
import gzip
import xml.etree.ElementTree as ET
import urllib
from datetime import date
import math
from string import punctuation
import platform
from platform import python_version
from pathlib import Path
import tqdm


import multiprocessing
from multiprocessing import Process
from multiprocessing import Pool
from lxml import html
import shutil

# Plain module-level flag; none of the visible cells read it.
# NOTE(review): looks like a leftover pandas reader option - confirm before removing.
low_memory=False
# Let pandas print up to 10000 rows; the next two statements set the same option twice.
pd.set_option('display.max_rows', 10000)
pd.options.display.max_rows = 10000
# Numeric offset passed around as `default_mmCIF_num`; the settings cell assigns the same value again.
default_mmCIF_num = 50000
# The CSS injected below widens the cells of the jupyter-notebook page.
from IPython.display import display, HTML

display(HTML(data="""
 <style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
 </style>
 """))


from multiprocessing.pool import ThreadPool
import requests
from functools import partial,reduce, cmp_to_key
from concurrent.futures import as_completed, ProcessPoolExecutor, ThreadPoolExecutor


In [2]:
# Directory the notebook was started from; every default folder lives below it.
current_directory = os.getcwd()


def default_path():
    """Return the nine default working folders as absolute path strings.

    The list order is fixed and is relied upon by positional indexing:
    input mmCIF, input PDB, input SIFTS, output mmCIF, output PDB,
    input mmCIF assembly, input PDB assembly, output mmCIF assembly,
    output PDB assembly.

    Each entry is ``current_directory`` followed by the folder suffix;
    nothing is created on disk here.
    """
    folder_suffixes = (
        "/mmCIF",
        "/PDB",
        "/SIFTS",
        "/output_mmCIF",
        "/output_PDB",
        "/mmCIF_assembly",
        "/PDB_assembly",
        "/output_mmCIF_assembly",
        "/output_PDB_assembly",
    )
    return [current_directory + suffix for suffix in folder_suffixes]


# Resolve the default folders once and expose each one under its own name.
# The unpacking order mirrors the order of the list returned by default_path().
default_paths = default_path()
(default_input_path_to_mmCIF,
 default_input_path_to_PDB,
 default_input_path_to_SIFTS,
 default_output_path_to_mmCIF,
 default_output_path_to_PDB,
 default_input_path_to_mmCIF_assembly,
 default_input_path_to_PDB_assembly,
 default_output_path_to_mmCIF_assembly,
 default_output_path_to_PDB_assembly) = default_paths

# "on" makes the output writer gzip its result and delete the uncompressed file.
gzip_mode = "on"
# UniProt accessions that are only picked as a chain's main accession when
# no other accession is available (see handling_chain_numbering_clashes).
exception_AccessionIDs = [
    "P42212", "Q17104", "Q27903", "Q93125", "P03069", "D3DLN9",
    "Q96UT3", "P0ABE7", "P00192", "P76805", "Q8XCE3", "P00720",
    "Q38170", "Q94N07", "P0AEX9", "P02928", "Q2M6S0",
]
# Not read by any visible cell. NOTE(review): presumably a worker-count setting - confirm.
nproc = None
# Numbering offsets for the mmCIF and PDB output formats.
default_mmCIF_num = 50000
default_PDB_num = 5000

In [3]:
def look_what_is_inside(format_to_look_at,
                        default_input_path_to_mmCIF=current_directory + "/mmCIF",
                        default_input_path_to_PDB=current_directory + "/PDB",
                        default_input_path_to_SIFTS=current_directory + "/SIFTS",
                        default_output_path_to_mmCIF=current_directory + "/output_mmCIF",
                        default_output_path_to_PDB=current_directory + "/output_PDB",
                        default_input_path_to_mmCIF_assembly=current_directory + "/mmCIF_assembly",
                        default_input_path_to_PDB_assembly=current_directory + "/PDB_assembly",
                        default_output_path_to_mmCIF_assembly=current_directory + "/output_mmCIF_assembly",
                        default_output_path_to_PDB_assembly=current_directory + "/output_PDB_assembly"):
    """List the regular files stored in the folder selected by ``format_to_look_at``.

    The folder is created first when it does not exist yet, so a fresh
    working directory yields an empty list instead of an error.

    Parameters
    ----------
    format_to_look_at : str
        One of "SIFTS", "mmCIF", "PDB", "output_mmCIF", "output_PDB",
        "mmCIF_assembly", "PDB_assembly", "output_mmCIF_assembly",
        "output_PDB_assembly".
    default_*_path_* : path
        Folder used for the corresponding format name.

    Returns
    -------
    list of str or None
        File names (not full paths) of the regular files in the folder, in
        ``os.listdir`` order. ``None`` for an unrecognised format name.
    """
    folder_by_format = {
        "SIFTS": default_input_path_to_SIFTS,
        "mmCIF": default_input_path_to_mmCIF,
        "PDB": default_input_path_to_PDB,
        "output_mmCIF": default_output_path_to_mmCIF,
        "output_PDB": default_output_path_to_PDB,
        "mmCIF_assembly": default_input_path_to_mmCIF_assembly,
        "PDB_assembly": default_input_path_to_PDB_assembly,
        "output_mmCIF_assembly": default_output_path_to_mmCIF_assembly,
        "output_PDB_assembly": default_output_path_to_PDB_assembly,
    }
    if format_to_look_at not in folder_by_format:
        # Unknown names historically fell through every branch and returned None.
        return None

    folder = folder_by_format[format_to_look_at]
    # exist_ok avoids the exists()/makedirs() race when several workers
    # look into the same folder at the same time.
    os.makedirs(folder, exist_ok=True)
    return [f for f in listdir(folder) if isfile(join(folder, f))]

In [4]:
from src.download.modules import *
from src.download import compressor
from src.download.lookfilesinside import look_what_is_inside
from src.download.downloadwithThreadPool import url_formation_for_pool, download_with_pool, download_pdb_assemblies_list_with_lxml

In [5]:
# Text lines of an mmCIF `_database_PDB_remark` loop that describe how PDBrenum
# renumbered the file. Each element already ends with "\n".
# NOTE(review): no visible cell references this list; presumably it is written
# into the renumbered output files - confirm against the caller.
# NOTE(review): the name looks like a misspelling of "REMARK"; it is left as is
# because code outside this view may refer to it.
# NOTE(review): the parenthesis opened at "(where label_seq_id ..." is never
# closed in the remark text.
RENARK_mmCIF = ["#\n",
                "loop_\n",
                "_database_PDB_remark.id       1\n",
                "_database_PDB_remark.text\n",
                ";File processed by PDBrenum: http://dunbrack3.fccc.edu/PDBrenum\n",
                "Author sequence numbering is replaced with UniProt numbering according to\n",
                "alignment by SIFTS (https://www.ebi.ac.uk/pdbe/docs/sifts/).\n",
                "Only chains with UniProt sequences in SIFTS are renumbered.\n",
                "Residues in UniProt chains without UniProt residue numbers in SIFTS\n",
                "(e.g., sequence tags) are given residue numbers 50000+label_seq_id\n",
                "(where label_seq_id is the 1-to-N residue numbering of each chain.\n",
                "Ligands are numbered 50000+their residue number in the original file.\n",
                "The _poly_seq_scheme table contains a correspondence between the\n",
                "1-to-N sequence (seq_id), the new numbering based on UniProt (pdb_seq_num =\n",
                "auth_seq_id in the _atom_site records), and the author numbering\n",
                "in the original mmCIF file from the PDB (auth_seq_num).\n",
                ";\n",
                "#\n"] 

In [6]:
def try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name):
    """Parse a gzipped mmCIF file into a dictionary, re-downloading it when parsing fails.

    Up to three attempts are made. After a failed attempt the file is
    downloaded again through ``download_with_pool``; the download format is
    "mmCIF_assembly" when the file name contains "assembly", otherwise "mmCIF".

    Parameters
    ----------
    default_input_path_to_mmCIF : path
        Folder that holds the input file.
    mmCIF_name : str
        File name of the gzipped mmCIF file inside that folder.

    Returns
    -------
    The object returned by ``Bio.PDB.MMCIF2Dict.MMCIF2Dict``, or the integer
    ``0`` when all three attempts failed.
    """
    file_path = Path(str(default_input_path_to_mmCIF) + "/" + mmCIF_name)
    download_format = "mmCIF_assembly" if "assembly" in mmCIF_name else "mmCIF"

    mmcif_dict = 0
    for _ in range(3):
        try:
            # The context manager closes the gzip handle on success and on
            # failure; an open handle would leak and can block os.remove below.
            with gzip.open(file_path, 'rt') as handle:
                mmcif_dict = Bio.PDB.MMCIF2Dict.MMCIF2Dict(handle)
            break
        except (EOFError, ValueError):
            # The file was readable but broken: drop it before fetching a new copy.
            os.remove(file_path)
            download_with_pool(url_formation_for_pool(download_format, [mmCIF_name])[0])
        except OSError:
            # Covers a missing or unreadable file; there may be nothing to
            # delete, so only the download is retried.
            download_with_pool(url_formation_for_pool(download_format, [mmCIF_name])[0])
    return mmcif_dict

In [7]:
def try_SIFTS_tree_parser(default_input_path_to_SIFTS, SIFTS_name):
    """Parse a gzipped SIFTS file, re-downloading it when parsing fails.

    Up to three attempts are made. After a failed attempt the file is
    downloaded again through ``download_with_pool`` using the "SIFTS" format.

    Parameters
    ----------
    default_input_path_to_SIFTS : path
        Folder that holds the SIFTS file.
    SIFTS_name : str
        File name of the gzipped SIFTS file inside that folder.

    Returns
    -------
    Whatever ``SIFTS_tree_parser`` returns for the opened file, or the
    integer ``0`` when all three attempts failed.
    """
    file_path = Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name)

    product_tree_SIFTS = 0
    for _ in range(3):
        try:
            # Close the handle as soon as parsing is over instead of leaking it.
            # NOTE(review): assumes SIFTS_tree_parser consumes the handle fully
            # before returning - confirm against its implementation.
            with gzip.open(file_path, 'rt') as handle_SIFTS:
                product_tree_SIFTS = SIFTS_tree_parser(handle_SIFTS)
            break
        except (EOFError, ValueError):
            # Broken content: delete the local copy, then fetch it again.
            os.remove(file_path)
            download_with_pool(url_formation_for_pool("SIFTS", [SIFTS_name])[0])
        except OSError:
            # Missing or unreadable file: nothing is deleted, only re-downloaded.
            download_with_pool(url_formation_for_pool("SIFTS", [SIFTS_name])[0])
    return product_tree_SIFTS


In [8]:
def output_with_this_name_ending(name_ending, path, mmcif_dict, mmCIF_name, gzip_mode=gzip_mode, current_directory=current_directory):
    """Write ``mmcif_dict`` as an mmCIF file into ``path``.

    The output file name is ``mmCIF_name`` with everything from the last
    ".cif.gz" onwards removed, followed by ``name_ending``. When
    ``gzip_mode`` is "on" the file is handed to
    ``compressor.compress_output_files`` and the uncompressed file is deleted.

    The process working directory is switched to ``path`` while writing and
    is always switched to ``current_directory`` afterwards, also when
    writing or compressing raises.

    Parameters
    ----------
    name_ending : str
        Suffix appended to the base name.
    path : path
        Output folder.
    mmcif_dict : dict
        Data passed to ``MMCIFIO.set_dict``.
    mmCIF_name : str
        Name of the input file the data came from.
    gzip_mode : str
        "on" to compress the written file.
    current_directory : path
        Directory to return to when done.
    """
    suffix_start = mmCIF_name.rfind(".cif.gz")
    if suffix_start != -1:
        # Without this guard a name lacking ".cif.gz" would lose its last
        # character (rfind returns -1).
        mmCIF_name = mmCIF_name[:suffix_start]
    output_name = mmCIF_name + name_ending

    os.chdir(path)
    try:
        io = MMCIFIO()
        io.set_dict(mmcif_dict)
        io.save(output_name)
        if gzip_mode == "on":
            compressor.compress_output_files(output_name, gzip_mode)
            os.remove(output_name)
    finally:
        # Never leave the process stranded in the output folder.
        os.chdir(current_directory)

In [9]:
def copy_file(inpath, file_name, outpath, postfix, gzip_mode):
    """Copy a gzipped input file to the output folder under a new name.

    The output name is ``file_name`` with everything from the last ".cif.gz"
    onwards removed, followed by ``postfix``.

    * ``gzip_mode == "off"``: the input is decompressed while copying and a
      trailing ".gz" of the output name is dropped.
    * any other value: the file is copied byte for byte.

    Parameters
    ----------
    inpath, outpath : str or os.PathLike
        Source and destination folders.
    file_name : str
        Name of the gzipped file inside ``inpath``.
    postfix : str
        Text appended to the base name, e.g. an ending such as "_x.cif.gz".
    gzip_mode : str
        "off" to write an uncompressed copy.
    """
    suffix_start = file_name.rfind(".cif.gz")
    # rfind returns -1 when the suffix is absent; keep the full name then
    # instead of silently dropping its last character.
    mmCIF_name = file_name[:suffix_start] if suffix_start != -1 else file_name

    # os.fspath lets callers pass pathlib.Path folders as well as strings.
    absolute_path_in = os.fspath(inpath) + "/" + file_name
    absolute_path_out = os.fspath(outpath) + "/" + mmCIF_name + postfix

    if gzip_mode == "off":
        # Only strip an actual ".gz" ending; a postfix without it stays intact.
        if absolute_path_out.endswith(".gz"):
            absolute_path_out = absolute_path_out[:-3]
        with gzip.open(absolute_path_in, 'rb') as f_in, open(absolute_path_out, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    else:
        shutil.copyfile(absolute_path_in, absolute_path_out)

In [10]:
def if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message):
    """Append one log row per chain for an entry that has no SIFTS data.

    Chain ids are taken from the first of these keys that exists in
    ``mmcif_dict``: "_pdbx_poly_seq_scheme.pdb_strand_id",
    "_pdbe_orig_poly_seq_scheme.pdb_strand_id", "_atom_site.auth_asym_id".

    Each appended row is
    ``[mmCIF_name[:4], chain, "-", "-", total_rows, "-", rows_in_chain, "0", "0"]``
    and rows are added in sorted chain order.

    Parameters
    ----------
    mmCIF_name : str
        File name; its first four characters are written to the log.
    mmcif_dict : dict
        Parsed mmCIF data.
    log_message : list
        Log to extend; it is modified in place.

    Returns
    -------
    list
        The same ``log_message`` object.

    Raises
    ------
    KeyError
        When none of the three keys is present.
    """
    try:
        strand_ids = mmcif_dict["_pdbx_poly_seq_scheme.pdb_strand_id"]
    except KeyError:
        try:
            strand_ids = mmcif_dict["_pdbe_orig_poly_seq_scheme.pdb_strand_id"]
        except KeyError:
            strand_ids = mmcif_dict["_atom_site.auth_asym_id"]

    if isinstance(strand_ids, str):
        # A single-row category can arrive as a bare string; iterating it
        # would split a chain id such as "AA" into separate characters.
        strand_ids = [strand_ids]

    # Count every chain in one pass instead of rescanning the column per chain.
    rows_per_strand = {}
    for strand in strand_ids:
        rows_per_strand[strand] = rows_per_strand.get(strand, 0) + 1

    total_rows = len(strand_ids)
    for strand in sorted(rows_per_strand):
        log_message.append(
            [mmCIF_name[:4], strand, "-", "-", total_rows, "-", rows_per_strand[strand], "0", "0"])
    return log_message

In [11]:
def handling_chain_numbering_clashes(df_PDBe_PDB_UniProt, exception_AccessionIDs):
    """Decide, chain by chain, which accession's rows are kept for renumbering.

    Every element of ``df_PDBe_PDB_UniProt["Three_Rows_CIF_Num_Uni"]`` is
    indexed as a sequence: items 0 and 2 are copied into the result, item 4 is
    the accession (a float, i.e. NaN, when there is none), and ``[3][2]`` /
    ``[2][2]`` are used as chain ids.
    NOTE(review): judging by the variable names the items are presumably
    (PDBe, number, UniProt, PDB, AccessionID) - confirm against the code that
    builds this column.

    For a chain with more than one accession the accession with the most rows
    is selected; accessions listed in ``exception_AccessionIDs`` are only
    selected while no accession has been selected yet. The clash flag is set
    when an accession's item-2 entries share their first element with those of
    another accession of the same chain.

    Returns
    -------
    list
        ``[chains_to_change, combined_tuple_PDBe_UniProt_AccessionID,
        AccessionIDs, longest_AccessionID_list, chains_to_change_one_to_end]``
        where the combined list holds ``(row[0], row[2], row[4])`` tuples.
    """
    chains_to_change = set()
    chains_to_change_one_to_end = set()
    AccessionIDs = set()
    chain_AccessionID_dict = dict()

    # Pass 1: collect chain ids and accessions from every row that has an accession.
    for PDBe_num_UniProt_PDB_accession in df_PDBe_PDB_UniProt["Three_Rows_CIF_Num_Uni"]:
        # A float accession means NaN, i.e. the row has no accession.
        if type(PDBe_num_UniProt_PDB_accession[4]) == float:
            continue
        chains_to_change.add(PDBe_num_UniProt_PDB_accession[3][2])
        chains_to_change_one_to_end.add(PDBe_num_UniProt_PDB_accession[2][2])
        AccessionIDs.add(PDBe_num_UniProt_PDB_accession[4])

    # Pass 2: map every chain to the set of accessions found on its rows.
    for chains in chains_to_change:
        accessions_in_chain = set()
        for PDBe_num_UniProt_PDB_accession in df_PDBe_PDB_UniProt["Three_Rows_CIF_Num_Uni"]:
            if chains == PDBe_num_UniProt_PDB_accession[3][2]:
                # Identity test: only the np.nan singleton itself is filtered out here.
                if PDBe_num_UniProt_PDB_accession[4] is not np.nan:
                    accessions_in_chain.add(PDBe_num_UniProt_PDB_accession[4])
        chain_AccessionID_dict[chains] = accessions_in_chain

    tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID = list()
    combined_tuple_PDBe_UniProt_AccessionID = list()
    longest_AccessionID_list = list()
    # NOTE(review): this flag is set to 1 below but never reset, so a clash found
    # in one chain also affects every chain processed after it - confirm intent.
    clash = 0

    # Pass 3: chain_accession is a (chain id, set of accessions) pair.
    for chain_accession in chain_AccessionID_dict.items():
        # Appended to below but never read inside this function.
        chains_to_change_for_AccessionID = list()
        longest_AccessionID = None
        longest_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID = list()
        if len(chain_accession[1]) > 1:
            # Several accessions map to this chain.
            for accessions in chain_accession[1]:
                tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID = list()
                target_UniProt_numbers_in_chain = list()
                diff_another_UniProt_numbers_in_same_chain = list()
                diff_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID = list()

                # Split the chain's rows into "this accession" and "any other accession".
                for PDBe_num_UniProt_PDB_accession in df_PDBe_PDB_UniProt["Three_Rows_CIF_Num_Uni"]:
                    if PDBe_num_UniProt_PDB_accession[4] == accessions and PDBe_num_UniProt_PDB_accession[3][2] == chain_accession[0] and \
                            PDBe_num_UniProt_PDB_accession[4] is not np.nan:
                        tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID.append(
                            (PDBe_num_UniProt_PDB_accession[0], PDBe_num_UniProt_PDB_accession[2], PDBe_num_UniProt_PDB_accession[4]))
                        target_UniProt_numbers_in_chain.append(PDBe_num_UniProt_PDB_accession[2])
                        chains_to_change_for_AccessionID.append(PDBe_num_UniProt_PDB_accession[3][2])
                    if PDBe_num_UniProt_PDB_accession[4] != accessions and PDBe_num_UniProt_PDB_accession[3][2] == chain_accession[0] and \
                            PDBe_num_UniProt_PDB_accession[4] is not np.nan:
                        diff_another_UniProt_numbers_in_same_chain.append(PDBe_num_UniProt_PDB_accession[2])
                        diff_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID.append(
                            (PDBe_num_UniProt_PDB_accession[0], PDBe_num_UniProt_PDB_accession[2], PDBe_num_UniProt_PDB_accession[4]))

                # A clash is the same first element used by this accession and by another one.
                for target_Uni in target_UniProt_numbers_in_chain:
                    for diff_Uni in diff_another_UniProt_numbers_in_same_chain:
                        if target_Uni[0] == diff_Uni[0]:
                            clash = 1

                # Prefer the accession with the most rows that is not on the exception list.
                if accessions not in exception_AccessionIDs:
                    if len(longest_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID) < len(
                            tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID):
                        longest_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID = tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID
                        longest_AccessionID = accessions

                # Nothing selected so far: accept this accession even if it is an exception.
                if longest_AccessionID is None:
                    if len(longest_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID) < len(
                            tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID):
                        longest_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID = tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID
                        longest_AccessionID = accessions

            if clash == 1:
                # With a clash only the selected accession's rows are kept.
                combined_tuple_PDBe_UniProt_AccessionID.extend(longest_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID)
                longest_AccessionID_list.append(longest_AccessionID)
            else:
                combined_tuple_PDBe_UniProt_AccessionID.extend(longest_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID)
                # NOTE(review): the diff list holds the rows that differ from the LAST
                # accession iterated (set order), not from the selected one - confirm
                # this cannot duplicate or drop rows.
                combined_tuple_PDBe_UniProt_AccessionID.extend(diff_tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID)
        else:
            # At most one accession for this chain: keep all of its rows.
            for accessions in chain_accession[1]:
                tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID = list()
                target_UniProt_numbers_in_chain = list()

                for PDBe_num_UniProt_PDB_accession in df_PDBe_PDB_UniProt["Three_Rows_CIF_Num_Uni"]:
                    if PDBe_num_UniProt_PDB_accession[4] == accessions and PDBe_num_UniProt_PDB_accession[3][2] == chain_accession[0] and PDBe_num_UniProt_PDB_accession[4] is not np.nan:
                        tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID.append((PDBe_num_UniProt_PDB_accession[0], PDBe_num_UniProt_PDB_accession[2], PDBe_num_UniProt_PDB_accession[4]))
                        target_UniProt_numbers_in_chain.append(PDBe_num_UniProt_PDB_accession[2])
                        chains_to_change_for_AccessionID.append(PDBe_num_UniProt_PDB_accession[3][2])
            combined_tuple_PDBe_UniProt_AccessionID.extend(tuple_PDBe_for_UniProt_and_tuple_UniProt_for_AccessionID)

    return [chains_to_change, combined_tuple_PDBe_UniProt_AccessionID, AccessionIDs, longest_AccessionID_list, chains_to_change_one_to_end]

In [12]:
def renumbered_count_in_chains(chains_to_change_one_to_end, df_PDBe_PDB_UniProt_without_null_index_PDBe, mmCIF_name,
                               UniProt_conversion_dict, longest_AccessionID_list):
    """Build per-chain, per-accession renumbering statistics for the log.

    Every element of the "Three_Rows_CIF_Num_Uni" column is indexed as a
    sequence: ``[2][2]`` is the chain that is matched, ``[3][2]`` the chain id
    reported next to it, ``[1]`` the number compared with ``[3][0]`` and ``[4]``
    the accession.

    One row is produced per (chain, accession) pair, chains in sorted order:
    ``[code, chain, row[3][2] chain, accession, converted accession name,
    rows in the column, rows with this accession, rows in this chain,
    rows of this chain and accession whose [1] differs from [3][0],
    rows of this chain whose int([1]) exceeds 5000]``.
    ``code`` is ``mmCIF_name[:4]``, followed by "*" when the accession is in
    ``longest_AccessionID_list``.

    Returns
    -------
    list
        ``[rows, flag]`` where ``flag`` is 0 when no row of any chain counted
        as renumbered and 1 otherwise.
    """
    rows = df_PDBe_PDB_UniProt_without_null_index_PDBe["Three_Rows_CIF_Num_Uni"]
    rows_in_column = len(rows)

    report = []
    reported_chain_for = {}
    changed_numbers_total = 0
    above_5000_total = 0

    for chain in sorted(chains_to_change_one_to_end):
        rows_in_chain = 0
        above_5000_in_chain = 0
        accessions_in_chain = set()

        # First scan: totals for the chain and the accessions it carries.
        for row in rows:
            if not chain == row[2][2]:
                continue
            reported_chain_for[chain] = row[3][2]
            accessions_in_chain.add(row[4])
            rows_in_chain += 1
            if int(row[1]) > 5000:
                above_5000_in_chain += 1
                above_5000_total += 1
            elif row[1] != row[3][0]:
                changed_numbers_total += 1

        # Second scan: one report row per accession seen on this chain.
        for accession in accessions_in_chain:
            rows_with_accession = 0
            changed_for_accession = 0
            for row in rows:
                same_accession = accession == row[4]
                if same_accession:
                    rows_with_accession += 1
                if chain == row[2][2] and same_accession and row[1] != row[3][0]:
                    changed_for_accession += 1

            code = mmCIF_name[:4]
            if accession in longest_AccessionID_list:
                # Mark entries whose accession was selected after a numbering clash.
                code = code + "*"
            report.append([code, chain, reported_chain_for[chain], accession,
                           UniProt_conversion_dict.get(accession), rows_in_column,
                           rows_with_accession, rows_in_chain, changed_for_accession,
                           above_5000_in_chain])

    # The flag keeps its historical meaning: 0 means nothing was renumbered.
    nothing_changed = 0 if (changed_numbers_total == 0 and above_5000_total == 0) else 1
    return [report, nothing_changed]

In [247]:
def renum_struct_ref_seq_pdbx_auth_seq_align(mmcif_dict):
    """Overwrite the author alignment range of ``_struct_ref_seq`` with the database range.

    In place, ``pdbx_auth_seq_align_beg`` / ``_end`` receive the values of
    ``db_align_beg`` / ``_end`` and both insertion-code items are blanked:
    every entry becomes "." when the original item contained ".", otherwise "?".
    For list-valued items the begin item decides the filler and the length,
    and both keys end up holding the same list object.

    The keys are looked up in a fixed order; the first missing key stops the
    function silently, leaving the changes made up to that point in place.
    Returns ``None``.
    """
    def _filler(ins_codes):
        return "." if "." in ins_codes else "?"

    try:
        # Pure lookup: without the strand-id item nothing at all is changed.
        mmcif_dict["_struct_ref_seq.pdbx_strand_id"]

        beg_ins_codes = mmcif_dict["_struct_ref_seq.pdbx_seq_align_beg_ins_code"]
        mmcif_dict["_struct_ref_seq.pdbx_auth_seq_align_beg"]  # must exist before it is replaced
        mmcif_dict["_struct_ref_seq.pdbx_auth_seq_align_beg"] = mmcif_dict["_struct_ref_seq.db_align_beg"]

        end_ins_codes = mmcif_dict["_struct_ref_seq.pdbx_seq_align_end_ins_code"]
        mmcif_dict["_struct_ref_seq.pdbx_auth_seq_align_end"]  # must exist before it is replaced
        mmcif_dict["_struct_ref_seq.pdbx_auth_seq_align_end"] = mmcif_dict["_struct_ref_seq.db_align_end"]

        # Scalar items: each one is judged on its own content.
        if type(beg_ins_codes) is str:
            mmcif_dict["_struct_ref_seq.pdbx_seq_align_beg_ins_code"] = _filler(beg_ins_codes)
        if type(end_ins_codes) is str:
            mmcif_dict["_struct_ref_seq.pdbx_seq_align_end_ins_code"] = _filler(end_ins_codes)

        # List items: one filler list, derived from the begin item, shared by both keys.
        if type(beg_ins_codes) is not str:
            blank_ins_codes = [_filler(beg_ins_codes)] * len(beg_ins_codes)
            mmcif_dict["_struct_ref_seq.pdbx_seq_align_beg_ins_code"] = blank_ins_codes
            mmcif_dict["_struct_ref_seq.pdbx_seq_align_end_ins_code"] = blank_ins_codes
    except KeyError:
        return


def poly_nonpoly_renum(mmcif_dict, df_PDBe_PDB_UniProt, chains_to_change, default_mmCIF_num):
    """Renumber the polymer and non-polymer scheme tables and return their numbering map.

    Polymer residues are read from ``_pdbx_poly_seq_scheme`` (falling back to
    ``_pdbe_orig_poly_seq_scheme``), joined with ``df_PDBe_PDB_UniProt`` on the
    ``(seq_id, mon_id, asym_id)`` triple and given a new number:

    * chain in ``chains_to_change``: the value of the "UniProt_50k" column;
    * any other chain: the existing ``pdb_seq_num`` with insertion-code
      characters stripped from both ends.

    Non-polymer residues, when ``_pdbx_nonpoly_scheme`` (or
    ``_pdbe_orig_nonpoly_scheme``) is present, get
    ``int(pdb_seq_num) + default_mmCIF_num + 10000`` in chains to change and
    keep ``pdb_seq_num`` elsewhere.

    Side effects: the ``auth_seq_num`` item of the polymer table and, when
    present, of the non-polymer table is overwritten in ``mmcif_dict``.

    Parameters
    ----------
    mmcif_dict : dict
        Parsed mmCIF data; values may be lists or, for single-row categories,
        bare strings.
    df_PDBe_PDB_UniProt : pandas.DataFrame
        Must provide the columns "PDBe", "PDB", "UniProt", "AccessionID" and
        "UniProt_50k".
    chains_to_change : container
        Chain ids (``pdb_strand_id``) that are renumbered.
    default_mmCIF_num : int
        Offset used for non-polymer residues.

    Returns
    -------
    pandas.DataFrame or int
        Frame with the columns "PDBe", "PDB", "UniProt", "PDBe_num_and_chain",
        "PDB_num_and_chain", "AccessionID", "Uni_or_50k" (polymer rows first,
        then non-polymer rows), or ``0`` when neither polymer category is
        complete.
    """
    def _as_list(value):
        # Single-row categories can be parsed as a bare string instead of a list.
        return [value] if isinstance(value, str) else value

    def _read_category(categories, fields):
        # Return {field: values} from the first category that has every field.
        for category in categories:
            try:
                return {field: _as_list(mmcif_dict[category + "." + field]) for field in fields}
            except KeyError:
                continue
        return None

    def _auth_seq_num_key(primary, fallback):
        # The primary table is updated when it carries the item, else the fallback.
        primary_key = primary + ".auth_seq_num"
        return primary_key if primary_key in mmcif_dict else fallback + ".auth_seq_num"

    final_columns = ["PDBe", "PDB", "UniProt", "PDBe_num_and_chain", "PDB_num_and_chain", "AccessionID", "Uni_or_50k"]

    # ---- polymer residues -------------------------------------------------
    # The fallback reads every item from _pdbe_orig_poly_seq_scheme; the label
    # items used to be looked up in the non-polymer table, which has no seq_id.
    poly = _read_category(
        ("_pdbx_poly_seq_scheme", "_pdbe_orig_poly_seq_scheme"),
        ("seq_id", "asym_id", "mon_id", "pdb_seq_num", "auth_seq_num",
         "pdb_mon_id", "auth_mon_id", "pdb_strand_id", "pdb_ins_code"))
    if poly is None:
        return 0

    mmCIF_pdbx_poly_seq_scheme_label = list(zip(poly["seq_id"], poly["mon_id"], poly["asym_id"]))
    mmCIF_pdbx_poly_seq_scheme_pdb = list(zip(poly["pdb_seq_num"], poly["pdb_mon_id"], poly["pdb_strand_id"]))
    mmCIF_pdbx_poly_seq_scheme_auth = list(zip(poly["auth_seq_num"], poly["auth_mon_id"], poly["pdb_strand_id"]))

    df_mmCIF_pdbx_poly_seq_scheme = pd.DataFrame(zip(mmCIF_pdbx_poly_seq_scheme_label,
                                                     mmCIF_pdbx_poly_seq_scheme_pdb,
                                                     mmCIF_pdbx_poly_seq_scheme_auth,
                                                     poly["pdb_ins_code"]))
    df_mmCIF_pdbx_poly_seq_scheme = df_mmCIF_pdbx_poly_seq_scheme.rename(
        columns={0: "_pdbx_poly_seq_scheme_label", 1: "pdbx_poly_seq_scheme_pdb",
                 2: "pdbx_poly_seq_scheme_auth", 3: "pdbx_poly_seq_scheme_pdb_ins_code"})

    df_pdbx_poly_seq_scheme_pdb_final = df_mmCIF_pdbx_poly_seq_scheme.merge(
        df_PDBe_PDB_UniProt, left_on="_pdbx_poly_seq_scheme_label", right_on="PDBe", how='left')
    df_pdbx_poly_seq_scheme_pdb_final["PDBe_num_and_chain"] = df_pdbx_poly_seq_scheme_pdb_final[
        "_pdbx_poly_seq_scheme_label"].apply(lambda x: (x[0], x[2]))

    ins_codes = df_pdbx_poly_seq_scheme_pdb_final["pdbx_poly_seq_scheme_pdb_ins_code"]
    pdb_triples = df_pdbx_poly_seq_scheme_pdb_final["pdbx_poly_seq_scheme_pdb"]
    # Without an insertion code (".") the key is (number, chain); otherwise the
    # code is glued to the number and the pair is rebuilt from "number+code,chain".
    df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"] = np.where(
        ins_codes.apply(lambda x: x == "."),
        pdb_triples.apply(lambda x: (x[0], x[2])),
        pdb_triples.apply(lambda x: x[0]) + ins_codes + "," + pdb_triples.apply(lambda x: x[2]))
    df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"] = df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"].apply(
        lambda x: tuple(x.split(",")) if type(x) == str else x)

    # Raw string: the same pattern as before without invalid escape sequences.
    numbering_chars = re.compile(r'[0-9\-\?\.]+')
    df_pdbx_poly_seq_scheme_pdb_final["Uni_or_50k"] = np.where(
        df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"].apply(lambda x: x[1] in chains_to_change),
        df_pdbx_poly_seq_scheme_pdb_final["UniProt_50k"],
        # Whatever is left after removing the numbering characters is the
        # insertion code; strip those characters off the number.
        df_pdbx_poly_seq_scheme_pdb_final["PDB_num_and_chain"].apply(
            lambda x: x[0].strip(numbering_chars.sub('', x[0]))))

    poly_key = _auth_seq_num_key("_pdbx_poly_seq_scheme", "_pdbe_orig_poly_seq_scheme")
    mmcif_dict[poly_key] = list(df_pdbx_poly_seq_scheme_pdb_final["Uni_or_50k"].values)

    # ---- non-polymer residues --------------------------------------------
    nonpoly = _read_category(
        ("_pdbx_nonpoly_scheme", "_pdbe_orig_nonpoly_scheme"),
        ("pdb_seq_num", "auth_seq_num", "pdb_mon_id", "auth_mon_id", "pdb_strand_id", "asym_id"))
    if nonpoly is None:
        return df_pdbx_poly_seq_scheme_pdb_final[final_columns]

    # Non-polymer residues have no label sequence number; "." stands in for it.
    dots_for_label = ["."] * len(nonpoly["asym_id"])

    mmCIF_pdbx_nonpoly_scheme_pdb = list(zip(nonpoly["pdb_seq_num"], nonpoly["pdb_mon_id"], nonpoly["pdb_strand_id"]))
    mmCIF_pdbx_nonpoly_scheme_auth = list(zip(nonpoly["auth_seq_num"], nonpoly["auth_mon_id"], nonpoly["pdb_strand_id"]))
    mmCIF_pdbx_nonpoly_scheme_label = list(zip(dots_for_label, nonpoly["pdb_mon_id"], nonpoly["asym_id"]))

    df_mmCIF_pdbx_nonpoly_scheme = pd.DataFrame(zip(mmCIF_pdbx_nonpoly_scheme_pdb,
                                                    mmCIF_pdbx_nonpoly_scheme_auth,
                                                    mmCIF_pdbx_nonpoly_scheme_label))
    df_mmCIF_pdbx_nonpoly_scheme = df_mmCIF_pdbx_nonpoly_scheme.rename(columns={0: "pdbx_nonpoly_scheme_pdb",
                                                                                1: "pdbx_nonpoly_scheme_auth",
                                                                                2: "pdbx_nonpoly_scheme_label"})

    df_mmCIF_pdbx_nonpoly_scheme["PDB"] = df_mmCIF_pdbx_nonpoly_scheme["pdbx_nonpoly_scheme_pdb"]
    df_mmCIF_pdbx_nonpoly_scheme["PDB_num_and_chain"] = df_mmCIF_pdbx_nonpoly_scheme["pdbx_nonpoly_scheme_pdb"].apply(
        lambda x: (x[0], x[2]))
    df_mmCIF_pdbx_nonpoly_scheme["Uni_or_50k"] = df_mmCIF_pdbx_nonpoly_scheme["pdbx_nonpoly_scheme_pdb"].apply(
        lambda x: str(int(x[0]) + default_mmCIF_num + 10000) if x[2] in chains_to_change else x[0])

    nonpoly_key = _auth_seq_num_key("_pdbx_nonpoly_scheme", "_pdbe_orig_nonpoly_scheme")
    mmcif_dict[nonpoly_key] = list(df_mmCIF_pdbx_nonpoly_scheme["Uni_or_50k"].values)

    # DataFrame.append was removed in pandas 2.0; concat stacks the rows the same way.
    poly_nonpoly_append = pd.concat([df_pdbx_poly_seq_scheme_pdb_final, df_mmCIF_pdbx_nonpoly_scheme])
    return poly_nonpoly_append[final_columns]


def renumber_tables(formed_columns, mmcif_dict, poly_nonpoly_atom_site, chains_to_change, default_mmCIF_num):
    """Rewrite the author residue numbers of the tables listed in ``formed_columns``.

    For every entry of ``formed_columns`` the residue-number and chain columns are
    read from ``mmcif_dict``, paired into ``(number[+ins_code], chain)`` keys and
    left-merged against ``poly_nonpoly_atom_site`` on ``PDB_num_and_chain``.  The
    matching ``Uni_or_50k`` value becomes the new residue number.  Keys without a
    match fall back to the digits of the old number, shifted by
    ``default_mmCIF_num`` when the chain is in ``chains_to_change``.  "." and "?"
    placeholders in the number column are kept as they are.

    Args:
        formed_columns: iterable of column-name lists laid out as
            ``[seq_id, asym_id, ins_code, comp_id]``; the last two are optional.
        mmcif_dict: mapping of mmCIF item names to column values; modified in place.
        poly_nonpoly_atom_site: DataFrame providing the ``PDB_num_and_chain`` and
            ``Uni_or_50k`` columns used for the lookup.
        chains_to_change: container of chain ids whose unmatched residues are shifted.
        default_mmCIF_num: offset added to unmatched residue numbers of those chains.

    Returns:
        ``mmcif_dict`` once every table was processed, or ``None`` as soon as a
        table holds a residue number without any digit (a message is printed;
        tables handled before that point stay modified).
    """
    dot_or_question_tuple = (".", "?")
    for n in formed_columns:
        auth_comp_id = 0
        auth_seq_id = n[0]
        auth_asym_id = n[1]
        try:
            PDB_ins_code = n[2]
            if "ins_code" not in PDB_ins_code:
                # the third entry is not an insertion-code column, treat it as the residue-name column
                auth_comp_id = PDB_ins_code
                PDB_ins_code = 0
        except IndexError:
            PDB_ins_code = 0
        try:
            if auth_comp_id == 0:
                auth_comp_id = n[3]
        except IndexError:
            auth_comp_id = 0

        if "_pdbx_branch_scheme" in auth_seq_id:
            # branch scheme: numbers are read from the pdb_* columns and written to auth_seq_num below
            auth_seq_id = "_pdbx_branch_scheme.pdb_seq_num"
            auth_asym_id = "_pdbx_branch_scheme.pdb_asym_id"

        # auth_comp_id_list = mmcif_dict[auth_comp_id] #for debug only
        auth_seq_id_list = mmcif_dict[auth_seq_id]
        auth_asym_id_list = mmcif_dict[auth_asym_id]

        # columns may be plain strings instead of lists (presumably a one-row table)
        if type(auth_asym_id_list) == str:
            # auth_comp_id_list = [auth_comp_id_list] for debug only
            auth_seq_id_list = [auth_seq_id_list]
            auth_asym_id_list = [auth_asym_id_list]

        if PDB_ins_code == 0:
            PDB_ins_code_list = ["?"] * len(auth_seq_id_list)
        else:
            raw_ins_codes = mmcif_dict[PDB_ins_code]
            if type(raw_ins_codes) == str:
                # wrap the VALUE of the column (the original wrapped the column name here)
                raw_ins_codes = [raw_ins_codes]
            # "." and "?" both mean "no insertion code"; use "?" internally
            PDB_ins_code_list = ["?" if ins_code == "." else ins_code for ins_code in raw_ins_codes]

        auth_seq_id_list_zip = list(zip(auth_seq_id_list, auth_asym_id_list))
        df_mmCIF_auth_seq_id_list_zip = pd.DataFrame(zip(auth_seq_id_list_zip, PDB_ins_code_list))
        df_mmCIF_auth_seq_id_list_zip = df_mmCIF_auth_seq_id_list_zip.rename(columns={0: "auth_seq_id_list_zip", 1: "ins_code"})

        # build "<number><ins_code>,<chain>" for rows with an insertion code, "?" otherwise
        df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code"] = np.where(df_mmCIF_auth_seq_id_list_zip['ins_code'] != "?",
                                                                      (df_mmCIF_auth_seq_id_list_zip['auth_seq_id_list_zip'].apply(lambda x: x[0])
                                                                       + df_mmCIF_auth_seq_id_list_zip['ins_code'].apply(lambda y: y[0]) + ","
                                                                       + df_mmCIF_auth_seq_id_list_zip['auth_seq_id_list_zip'].apply(lambda x: x[1])),
                                                                      df_mmCIF_auth_seq_id_list_zip['ins_code'])

        # turn that string back into a (number+ins_code, chain) tuple; rows without ins code keep their tuple
        df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code_cor"] = np.where(df_mmCIF_auth_seq_id_list_zip['PDB_with_ins_code'] != "?",
                                                                          df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code"].apply(
                                                                              lambda x: tuple(x.split(","))),
                                                                          df_mmCIF_auth_seq_id_list_zip["auth_seq_id_list_zip"])

        df_mmCIF_auth_seq_id_list_zip["auth_seq_id_list_zip"] = df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code_cor"]
        df_mmCIF_auth_seq_id_list_zip = df_mmCIF_auth_seq_id_list_zip.drop(columns=["PDB_with_ins_code_cor", "ins_code", "PDB_with_ins_code"])

        df_auth_seq_id_list_zip_final = df_mmCIF_auth_seq_id_list_zip.merge(poly_nonpoly_atom_site, left_on="auth_seq_id_list_zip",
                                                                            right_on="PDB_num_and_chain", how='left')

        # keep "." / "?" placeholders, otherwise take the looked-up number (NaN when the merge found nothing)
        df_auth_seq_id_list_zip_final["question_mark"] = np.where(
            df_auth_seq_id_list_zip_final["auth_seq_id_list_zip"].apply(lambda x: x[0] in dot_or_question_tuple),
            df_auth_seq_id_list_zip_final["auth_seq_id_list_zip"].apply(lambda x: x[0]),
            df_auth_seq_id_list_zip_final["Uni_or_50k"].apply(lambda x: x))
        try:
            # NaN (a float) marks an unmatched row: fall back to the digits of the old number.
            # The fallback lambda runs for EVERY row, so a number without digits raises ValueError
            # even when that row itself was matched.
            df_auth_seq_id_list_zip_final["final"] = np.where(df_auth_seq_id_list_zip_final["question_mark"].apply(lambda x: type(x) == float),
                                                              df_auth_seq_id_list_zip_final["auth_seq_id_list_zip"].apply(
                                                                  lambda x: "." if x[0] == "." else
                                                                  "?" if x[0] == "?" else str(
                                                                      int(''.join(filter(str.isdigit, str(x[0])))) + default_mmCIF_num)
                                                                  if x[1] in chains_to_change else str(int(''.join(filter(str.isdigit, str(x[0])))))),
                                                              df_auth_seq_id_list_zip_final["question_mark"].apply(lambda x: x))
        except ValueError:
            # NOTE(review): tables renumbered before this point stay modified in mmcif_dict
            print("ValueError in table " + auth_seq_id + " has non-numeric value point in file" + mmcif_dict["data_"])
            return None

        # whatever is left of the new number after removing digits, "-", "." and "?" is the new insertion code
        df_auth_seq_id_list_zip_final["ins_code"] = df_auth_seq_id_list_zip_final["final"].apply(lambda x: "?"
        if re.sub('[0-9]+', '', x).strip("-").strip(".").strip('?') == ""
        else re.sub('[0-9]+', '', x).strip("-").strip(".").strip('?'))
        # raw string: "\-", "\?" and "\." are regex escapes, not (invalid) Python string escapes
        df_auth_seq_id_list_zip_final["final"] = df_auth_seq_id_list_zip_final["final"].apply(lambda x: x.strip(re.sub(r'[0-9\-\?\.]+', '', x)))

        for num in df_auth_seq_id_list_zip_final["final"]:
            if num == "":
                print("Empty str")
            if type(num) == float:
                print("Float or npNAN")

        # actual replacing auth_num with UniProt_num and of ins_code with '?'

        PDB_ins_code_list = list()
        if PDB_ins_code != 0:
            if "." in mmcif_dict[PDB_ins_code]:
                # the table used "." for "no insertion code": keep that convention
                for ins in df_auth_seq_id_list_zip_final["ins_code"].values:
                    if "?" == ins:
                        PDB_ins_code_list.append(".")
                    else:
                        PDB_ins_code_list.append(ins)
                mmcif_dict[PDB_ins_code] = PDB_ins_code_list
            else:
                mmcif_dict[PDB_ins_code] = list(df_auth_seq_id_list_zip_final["ins_code"].values)

        if "_pdbx_branch_scheme" in auth_seq_id:
            mmcif_dict["_pdbx_branch_scheme.auth_seq_num"] = list(df_auth_seq_id_list_zip_final["final"].values)
        else:
            mmcif_dict[auth_seq_id] = list(df_auth_seq_id_list_zip_final["final"].values)

    return mmcif_dict


def column_formation(mmcif_dict):
    """Group, per author-residue-number column, the companion columns of its table.

    Every key of ``mmcif_dict`` whose column name contains ``auth_seq_id`` or
    ``auth_seq_num`` is split into ``[table, prefix, suffix]``.  For each of
    these, the keys of the same table that carry the same prefix and suffix are
    collected and ordered as ``[seq column, chain column, ins_code column,
    residue-name column]`` (missing ones are simply absent).

    Args:
        mmcif_dict: mapping whose keys are mmCIF item names (``_table.column``).

    Returns:
        A list of ordered column-name lists.  Groups whose first column name
        contains ``nonpoly_scheme``, ``poly_seq_scheme`` or
        ``ndb_struct_na_base`` are left out.
    """
    mmcif_dict_keys = mmcif_dict.keys()
    aut_seq_all_splited = list()
    for key in mmcif_dict_keys:
        key_dot_splited = key.split(".")
        for tab_name_col_name in key_dot_splited:
            if "auth_seq" in tab_name_col_name:
                if "auth_seq_id" in key:
                    aut_seq_all_splited.append(key_dot_splited[:1] + key_dot_splited[1].split("auth_seq_id"))
                if "auth_seq_num" in key:
                    aut_seq_all_splited.append(key_dot_splited[:1] + key_dot_splited[1].split("auth_seq_num"))

    totaling_combinations = list()
    for table_name_prefix_suffix in aut_seq_all_splited:
        combinations = list()
        for key in mmcif_dict_keys:
            if table_name_prefix_suffix[0] != key.split(".")[0]:
                continue

            # The prefix/suffix test has to guard BOTH alternatives of every check below.
            # Without the parentheses `a and b and c or d` is read as `(a and b and c) or d`,
            # which let any "auth_seq_num" / "strand_id" / "auth_mon_id" column of the table through.
            same_affixes = table_name_prefix_suffix[1] in key and table_name_prefix_suffix[2] in key

            # res_num auth_seq_id or auth_seq_num
            if same_affixes and ("auth_seq_id" in key or "auth_seq_num" in key):
                combinations.append(key)
            # chain auth_asym_id or strand_id
            if same_affixes and ("auth_asym_id" in key or "strand_id" in key):
                combinations.append(key)
            # ins_code
            if same_affixes and "ins_code" in key:
                combinations.append(key)
            # monomer_type or auth_comp_id or auth_mon_id or mon_id for _struct_ref_seq_dif
            if same_affixes and ("auth_comp_id" in key or "auth_mon_id" in key):
                combinations.append(key)
            elif table_name_prefix_suffix[0] == "_struct_ref_seq_dif" \
                    and "mon_id" in key and "db_mon_id" not in key:
                combinations.append(key)

        # work assuming all the elements in right order
        # and they are not crossing each other
        if len(combinations) > 4:
            combinations = combinations[:4]

        # bring the collected names into the fixed seq / chain / ins_code / residue-name layout
        ordered_combination = list()
        for name in combinations:
            if "auth_seq" in name:
                ordered_combination.insert(0, name)
        for name in combinations:
            if "auth_asym_id" in name or "strand_id" in name:
                ordered_combination.insert(1, name)
        for name in combinations:
            if "ins_code" in name:
                ordered_combination.insert(2, name)
        for name in combinations:
            if "auth_comp_id" in name or "mon_id" in name:
                ordered_combination.insert(3, name)

        # exceptions
        if (  # "pdbx_unobs_or_zero_occ_residues" not in ordered_combination[0]
                "nonpoly_scheme" not in ordered_combination[0]
                and "poly_seq_scheme" not in ordered_combination[0]
                and "ndb_struct_na_base" not in ordered_combination[0]):
            totaling_combinations.append(ordered_combination)

    return totaling_combinations


def mmCIF_parser(mmCIF_name, default_input_path_to_mmCIF, df_PDBe_PDB_UniProt_without_null_index_PDBe, default_mmCIF_num, chains_to_change,
                 chains_to_change_one_to_end):
    """Build the per-residue renumbering table for the ``_atom_site`` records of one mmCIF file.

    The file is read with ``try_MMCIF2Dict``; its ``_atom_site`` label and author
    identifiers are paired per atom, merged with the SIFTS frame on the label
    identifiers and reduced to one row per author residue.

    Args:
        mmCIF_name: name of the mmCIF file, passed on to ``try_MMCIF2Dict``.
        default_input_path_to_mmCIF: folder passed on to ``try_MMCIF2Dict``.
        df_PDBe_PDB_UniProt_without_null_index_PDBe: SIFTS frame; its index is
            reset here and the resulting ``PDBe`` column is used as merge key.
            It also has to provide ``PDB``, ``UniProt``, ``AccessionID`` and
            ``UniProt_50k``.
        default_mmCIF_num: offset used for residues without a SIFTS number.
        chains_to_change: author chain ids (``auth_asym_id``) whose residues with
            label_seq_id "." get ``default_mmCIF_num + 10000`` added.
        chains_to_change_one_to_end: label chain ids (``label_asym_id``) whose
            unmatched residues get ``default_mmCIF_num`` added.

    Returns:
        ``None`` when ``try_MMCIF2Dict`` returned 0, otherwise the list
        ``[df_final_atom_site, mmcif_dict]``.

    Raises:
        KeyError: when one of the ``_atom_site`` items read below, or both
            poly_seq_scheme ``auth_seq_num`` items, are missing from the file.
    """
    mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
    if mmcif_dict == 0:
        return None

    # NOTE(review): this value is not used anywhere below; the lookup only has the
    # effect of raising KeyError when neither item exists.
    try:
        _pdbx_poly_seq_scheme_auth_seq_num_before_change = mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"]
    except KeyError:
        _pdbx_poly_seq_scheme_auth_seq_num_before_change = mmcif_dict["_pdbe_orig_poly_seq_scheme.auth_seq_num"]
        pass

    _atom_site_label_comp_id_list = mmcif_dict["_atom_site.label_comp_id"]
    _atom_site_label_seq_id_list = mmcif_dict["_atom_site.label_seq_id"]
    _atom_site_label_asym_id = mmcif_dict["_atom_site.label_asym_id"]
    _atom_site_pdbx_PDB_ins_code = mmcif_dict["_atom_site.pdbx_PDB_ins_code"]

    _atom_site_auth_comp_id = mmcif_dict["_atom_site.auth_comp_id"]
    _atom_site_auth_seq_id = mmcif_dict["_atom_site.auth_seq_id"]
    _atom_site_auth_asym_id = mmcif_dict["_atom_site.auth_asym_id"]
    # NOTE(review): read but not used below
    _atom_site_pdbx_formal_charge = mmcif_dict["_atom_site.pdbx_formal_charge"]

    # one (number, residue name, chain) tuple per atom, once with label ids and once with author ids
    final_mmCIF_data_list_of_tuples_just_pdb = list(zip(_atom_site_label_seq_id_list, _atom_site_label_comp_id_list, _atom_site_label_asym_id))
    final_mmCIF_data_list_of_tuples_with_auth = list(zip(_atom_site_auth_seq_id, _atom_site_auth_comp_id, _atom_site_auth_asym_id))
    final_mmCIF_data_list_of_tuples_for_df = list(
        zip(final_mmCIF_data_list_of_tuples_just_pdb, final_mmCIF_data_list_of_tuples_with_auth, _atom_site_pdbx_PDB_ins_code))

    df_mmCIF = pd.DataFrame(final_mmCIF_data_list_of_tuples_for_df)
    df_mmCIF = df_mmCIF.rename(columns={0: "One_to_N_mmCIF", 1: "auth_mmCIF", 2: "ins_code"})

    # NOTE(review): bare column lookup, the result is discarded
    df_mmCIF["One_to_N_mmCIF"]

    # rows with an insertion code get "<auth number><ins code>,<residue name>,<chain>", all others "?"
    df_mmCIF["PDBnum_inc_code"] = np.where(df_mmCIF['ins_code'] != "?",
                                           (df_mmCIF['auth_mmCIF'].apply(lambda x: x[0]) + df_mmCIF["ins_code"].apply(lambda y: y[0]) + ","
                                            + df_mmCIF['auth_mmCIF'].apply(lambda x: x[1]) + "," + df_mmCIF['auth_mmCIF'].apply(lambda x: x[2])),
                                           df_mmCIF["ins_code"])
    # ... which is split back into a tuple; rows without insertion code keep their original author tuple
    df_mmCIF["PDBnum_inc_code_cor"] = np.where(df_mmCIF["PDBnum_inc_code"] != "?", df_mmCIF["PDBnum_inc_code"].apply(lambda x: tuple(x.split(","))),
                                               df_mmCIF["auth_mmCIF"])

    df_mmCIF["auth_mmCIF"] = df_mmCIF["PDBnum_inc_code_cor"]
    df_mmCIF = df_mmCIF.drop(columns=["PDBnum_inc_code_cor", "ins_code", "PDBnum_inc_code"])

    # attach the SIFTS columns to every atom via its label identifiers
    df_PDBe_PDB_UniProt_without_null_index_PDBe = df_PDBe_PDB_UniProt_without_null_index_PDBe.reset_index()
    df_final = df_mmCIF.merge(df_PDBe_PDB_UniProt_without_null_index_PDBe, left_on="One_to_N_mmCIF", right_on="PDBe", how='left')
    df_final = df_final.rename(columns={"PDBe_copy": "PDBe"})
    # keep the first atom of every author residue
    df_final = df_final.drop_duplicates(subset="auth_mmCIF", keep='first')
    df_final["PDB_num_and_chain"] = df_final["auth_mmCIF"].apply(lambda x: (x[0], x[2]))
    df_final["PDBe_num_and_chain"] = df_final["One_to_N_mmCIF"].apply(lambda x: (x[0], x[2]))

    # label_seq_id != ".": take the SIFTS-derived number (NaN when the merge found nothing);
    # label_seq_id == ".": digits of the author number, plus default_mmCIF_num + 10000 for chains_to_change
    df_final["Uni_or_50k_NAN"] = np.where(df_final["One_to_N_mmCIF"].apply(lambda x: x[0] != "."),
                                          df_final["UniProt_50k"].apply(lambda x: x),
                                          df_final["PDB_num_and_chain"].apply(
                                              lambda x: str(int(''.join(filter(str.isdigit, x[0]))) + default_mmCIF_num + 10000)
                                              if x[1] in chains_to_change else str(int(''.join(filter(str.isdigit, x[0]))))))
    # remaining NaN (float) values: fall back to the label number, shifted for chains_to_change_one_to_end
    df_final["Uni_or_50k"] = np.where(df_final["Uni_or_50k_NAN"].apply(lambda x: type(x) == float),
                                      df_final["PDBe_num_and_chain"].apply(
                                          lambda x: "." if x[0] == "." else str(int(''.join(filter(str.isdigit, x[0]))) + default_mmCIF_num)
                                          if x[1] in chains_to_change_one_to_end else str(int(''.join(filter(str.isdigit, x[0]))))),
                                      df_final["Uni_or_50k_NAN"].apply(lambda x: x))

    df_final_atom_site = df_final[["PDBe", "PDB", "UniProt", "PDBe_num_and_chain", "PDB_num_and_chain", "AccessionID", "Uni_or_50k"]]

    return [df_final_atom_site, mmcif_dict]


def SIFTS_tree_parser(handle_SIFTS):
    """Extract the residue mappings of all protein entities from a SIFTS XML document.

    Attributes are addressed by their position in the element, exactly as they
    appear in the document.

    Args:
        handle_SIFTS: file name or file object accepted by ``ET.parse``.

    Returns:
        A four-element list:
        ``[PDBe->PDB pairs, PDBe->UniProt triples, {accession: db text}, annotations]``
        where a PDBe key is ``(residue attr 3, residue attr 4, entity attr 2)`` and
        each annotation is ``(("PDBid", id), ("Annotation:", text), PDBe key)``.
    """
    root = ET.parse(handle_SIFTS).getroot()

    pdbe_keys_with_pdb = []
    pdb_residues = []
    pdbe_keys_with_uniprot = []
    uniprot_residues = []
    uniprot_accessions = []
    UniProt_conversion_dict = {}
    details_list = []

    for entity in root:
        if not entity.tag.endswith("entity"):
            continue
        entity_attrs = list(entity.attrib.items())
        if entity_attrs[0] != ("type", "protein"):
            continue

        for segment in entity:
            for group in segment:
                # map regions: an element without text announces the accession,
                # an element with text supplies the value stored for that accession
                if group.tag.endswith("listMapRegion"):
                    for map_region in group:
                        for db in map_region:
                            db_attrs = list(db.attrib.items())
                            if db_attrs[0] == ("dbSource", "UniProt"):
                                if db.text is None:
                                    uniprot_accession = db_attrs[2][1]
                                else:
                                    UniProt_conversion_dict[uniprot_accession] = db.text

                # every child group is scanned for residues, map regions included
                for residue in group:
                    residue_attrs = list(residue.attrib.items())
                    if residue_attrs[0] != ("dbSource", "PDBe"):
                        continue
                    pdbe_key = (residue_attrs[2][1], residue_attrs[3][1], entity_attrs[1][1])

                    for cross_ref in residue:
                        if cross_ref.tag.endswith("residueDetail") and cross_ref.text != "Not_Observed":
                            details_list.append((("PDBid", root.get("dbAccessionId")),
                                                 ("Annotation:", cross_ref.text),
                                                 pdbe_key))

                        cross_attrs = list(cross_ref.attrib.items())
                        source = cross_attrs[0]

                        if source == ("dbSource", "PDB"):
                            pdb_residues.append((cross_attrs[3][1], cross_attrs[4][1], cross_attrs[5][1]))
                            pdbe_keys_with_pdb.append(pdbe_key)

                        if source == ("dbSource", "UniProt"):
                            uniprot_residues.append((cross_attrs[3][1], cross_attrs[4][1], entity_attrs[1][1]))
                            pdbe_keys_with_uniprot.append(pdbe_key)
                            uniprot_accessions.append(cross_attrs[2][1])

    return [list(zip(pdbe_keys_with_pdb, pdb_residues)),
            list(zip(pdbe_keys_with_uniprot, uniprot_residues, uniprot_accessions)),
            UniProt_conversion_dict,
            details_list]


def SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, tuple_PDBe_for_UniProt_and_tuple_UniProt, default_mmCIF_num,
                                chains_to_change="all"):
    """Combine the PDBe->PDB and PDBe->UniProt pairs into one renumbering frame.

    Args:
        tuple_PDBe_for_PDB_and_tuple_PDB: ``(PDBe key, PDB residue)`` pairs.
        tuple_PDBe_for_UniProt_and_tuple_UniProt: ``(PDBe key, UniProt residue,
            accession)`` triples.
        default_mmCIF_num: offset added to the PDBe number of residues that have
            no UniProt counterpart.
        chains_to_change: ``"all"`` or a container of PDB chain ids; residues of
            other chains keep their PDB number in ``Three_Rows_CIF_Num_Uni``.

    Returns:
        ``[frame without "null" PDB residues indexed by PDBe, complete frame]``.
    """
    uniprot_frame = pd.DataFrame(tuple_PDBe_for_UniProt_and_tuple_UniProt, columns=['PDBe', 'UniProt', "AccessionID"])
    uniprot_frame = uniprot_frame.drop_duplicates(subset="PDBe", keep='first')
    pdb_frame = pd.DataFrame(tuple_PDBe_for_PDB_and_tuple_PDB, columns=['PDBe', 'PDB'])
    pdb_frame = pdb_frame.drop_duplicates(subset="PDBe", keep='first')

    df_PDBe_PDB_UniProt = pdb_frame.merge(uniprot_frame, left_on="PDBe", right_on="PDBe", how='left')

    # "50000" is only a marker for "no UniProt residue"; it is never used as a number
    df_PDBe_PDB_UniProt['UniProt'] = df_PDBe_PDB_UniProt['UniProt'].replace(np.nan, "50000")
    has_uniprot = df_PDBe_PDB_UniProt['UniProt'] != "50000"

    def shifted(number):
        # string numbers are offset, anything else is only converted to text
        if type(number) == str:
            return str(int(number) + default_mmCIF_num)
        return str(number)

    df_PDBe_PDB_UniProt["Uni_moD"] = np.where(has_uniprot, df_PDBe_PDB_UniProt['UniProt'], df_PDBe_PDB_UniProt["PDBe"])
    df_PDBe_PDB_UniProt.loc[:, 'new_col_Uni'] = df_PDBe_PDB_UniProt["Uni_moD"].map(lambda residue: residue[0])
    df_PDBe_PDB_UniProt["UniProt_50k"] = df_PDBe_PDB_UniProt["new_col_Uni"].apply(shifted)
    # residues that do have a UniProt counterpart keep the plain UniProt number
    df_PDBe_PDB_UniProt.loc[has_uniprot, 'UniProt_50k'] = df_PDBe_PDB_UniProt['new_col_Uni']

    renumber_every_chain = chains_to_change == "all"
    Three_Rows_CIF_Num_Uni = []
    row_values = zip(df_PDBe_PDB_UniProt["PDBe"], df_PDBe_PDB_UniProt["UniProt_50k"], df_PDBe_PDB_UniProt["Uni_moD"],
                     df_PDBe_PDB_UniProt["PDB"], df_PDBe_PDB_UniProt["AccessionID"])
    for pdbe_key, uniprot_50k, uni_mod, pdb_residue, accession in row_values:
        if renumber_every_chain or pdb_residue[2].strip() in chains_to_change:
            new_number = uniprot_50k
        else:
            new_number = pdb_residue[0]
        Three_Rows_CIF_Num_Uni.append([pdbe_key, new_number, uni_mod, pdb_residue, accession])

    df_PDBe_PDB_UniProt["Three_Rows_CIF_Num_Uni"] = Three_Rows_CIF_Num_Uni

    is_real_pdb_residue = df_PDBe_PDB_UniProt.PDB.map(lambda residue: residue[0]) != "null"
    df_PDBe_PDB_UniProt_without_null_index_PDBe = df_PDBe_PDB_UniProt[is_real_pdb_residue].set_index("PDBe")

    return [df_PDBe_PDB_UniProt_without_null_index_PDBe, df_PDBe_PDB_UniProt]


def master_mmCIF_renumber_function(input_mmCIF_file_were_found, default_input_path_to_mmCIF,
                                   default_input_path_to_SIFTS, default_output_path_to_mmCIF,
                                   default_mmCIF_num, gzip_mode, exception_AccessionIDs):
    """Renumber one mmCIF file with the help of its SIFTS file and write the result.

    The SIFTS file name is the first four characters of the mmCIF name plus
    ``.xml.gz``.  When the SIFTS file is missing, empty, or has no UniProt
    mapping, the mmCIF file is copied unchanged and the log produced by
    ``if_no_SIFTS_data_log`` is returned.  When nothing needs renumbering the
    file is copied unchanged as well.  Otherwise the parsed file is renumbered
    and written with ``output_with_this_name_ending``; an ``IndexError`` raised
    while writing falls back to copying the original file.

    Args:
        input_mmCIF_file_were_found: name of the mmCIF file to process.
        default_input_path_to_mmCIF: folder holding the input mmCIF files.
        default_input_path_to_SIFTS: folder holding the SIFTS ``.xml.gz`` files.
        default_output_path_to_mmCIF: folder receiving the output.
        default_mmCIF_num: offset for residues without a UniProt number.
        gzip_mode: passed through to ``copy_file`` / ``output_with_this_name_ending``.
        exception_AccessionIDs: passed through to ``handling_chain_numbering_clashes``.

    Returns:
        The list built by ``if_no_SIFTS_data_log`` for the three "no usable SIFTS
        data" cases, otherwise ``None``.
    """
    input_mmCIF_assembly_files_were_found_list = list()
    input_mmCIF_assembly_files_were_found_list.append(input_mmCIF_file_were_found)

    for mmCIF_name in input_mmCIF_assembly_files_were_found_list:
        log_message = list()
        SIFTS_name = mmCIF_name[:4] + ".xml.gz"
        SIFTS_path = Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name)

        # for no SIFTS _no_SIFTS_out.cif.gz
        try:
            # existence probe only; the context manager closes the handle that used to leak
            with gzip.open(SIFTS_path, 'rt'):
                pass
        except FileNotFoundError:
            mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
            if mmcif_dict == 0:
                continue
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
            # a stray `continue` used to sit in front of this return and dropped the log
            return log_message

        # for zerobyte SIFTS _zerobyte_SIFTS_out.cif.gz
        if os.path.getsize(SIFTS_path) == 0:
            mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
            if mmcif_dict == 0:
                continue
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
            return log_message

        product_tree_SIFTS = try_SIFTS_tree_parser(default_input_path_to_SIFTS, SIFTS_name)
        if product_tree_SIFTS == 0:
            continue

        tuple_PDBe_for_PDB_and_tuple_PDB = product_tree_SIFTS[0]
        tuple_PDBe_for_UniProt_and_tuple_UniProt = product_tree_SIFTS[1]
        UniProt_conversion_dict = product_tree_SIFTS[2]

        # _no UniProt in SIFTS _no_UniProt_in_SIFTS_out.cif.gz
        if tuple_PDBe_for_UniProt_and_tuple_UniProt == list():
            mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
            if mmcif_dict == 0:
                continue
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
            return log_message

        # first pass over all chains, only needed to detect numbering clashes
        product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, tuple_PDBe_for_UniProt_and_tuple_UniProt,
                                                                   default_mmCIF_num, 'all')
        df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

        # all good till here
        handling_chain_numbering = handling_chain_numbering_clashes(df_PDBe_PDB_UniProt, exception_AccessionIDs)
        chains_to_change = handling_chain_numbering[0]
        combined_tuple_PDBe_UniProt_AccessionID = handling_chain_numbering[1]
        longest_AccessionID_list = handling_chain_numbering[3]
        chains_to_change_one_to_end = handling_chain_numbering[4]

        # second pass restricted to the chains that are actually renumbered
        product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, combined_tuple_PDBe_UniProt_AccessionID,
                                                                   default_mmCIF_num, chains_to_change)
        df_PDBe_PDB_UniProt_without_null_index_PDBe = product_of_SIFTS_data_parser[0]
        df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

        renumbered_count = renumbered_count_in_chains(chains_to_change_one_to_end, df_PDBe_PDB_UniProt_without_null_index_PDBe,
                                                      mmCIF_name, UniProt_conversion_dict, longest_AccessionID_list)
        chain_total_renum = renumbered_count[0]
        nothing_changed = renumbered_count[1]

        chain_total_renum.append(nothing_changed)
        # NOTE(review): mod_log_message is built but not returned on any path — confirm whether
        # callers expect it for the "no change" and "renumbered" cases
        mod_log_message = chain_total_renum

        # for no change needed _no_change_out.cif.gz
        if nothing_changed == 0:
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            continue
            # return mod_log_message

        product_of_mmCIF_parser = mmCIF_parser(mmCIF_name, default_input_path_to_mmCIF, df_PDBe_PDB_UniProt_without_null_index_PDBe,
                                               default_mmCIF_num, chains_to_change, chains_to_change_one_to_end)
        df_final_atom_site = product_of_mmCIF_parser[0]
        mmcif_dict = product_of_mmCIF_parser[1]

        poly_nonpoly_append = poly_nonpoly_renum(mmcif_dict, df_PDBe_PDB_UniProt, chains_to_change, default_mmCIF_num)
        # pd.concat instead of DataFrame.append (removed in pandas 2.0); poly/nonpoly rows win on duplicates
        poly_nonpoly_atom_site = pd.concat([poly_nonpoly_append, df_final_atom_site]).drop_duplicates(subset="PDB_num_and_chain", keep='first')

        formed_columns = column_formation(mmcif_dict)
        renumber_tables(formed_columns, mmcif_dict, poly_nonpoly_atom_site, chains_to_change, default_mmCIF_num)

        try:
            output_with_this_name_ending(".cif", default_output_path_to_mmCIF, mmcif_dict, mmCIF_name=mmCIF_name,
                                         gzip_mode=gzip_mode, current_directory=current_directory)
        except IndexError:
            # 5olg data swapped columns
            print("IndexError Warning this file is not renumbered:", mmCIF_name)
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)

In [242]:
# Collect the input files via look_what_is_inside("mmCIF") and show the position of
# 5olg.cif.gz in that collection; the troubleshooting cell below slices by position.
input_mmCIF_files_were_found = look_what_is_inside("mmCIF")
input_mmCIF_files_were_found.index("5olg.cif.gz")

2

In [248]:
# # troubleshooting
some_err_list = list()
# # #############################################################################################################################################################################
# Inline copy of the body of master_mmCIF_renumber_function (with `continue` instead of `return`),
# presumably kept inline so that intermediate variables such as mmcif_dict can be inspected afterwards.
# Relies on default_input_path_to_mmCIF, default_input_path_to_SIFTS, default_output_path_to_mmCIF,
# gzip_mode and exception_AccessionIDs already being defined in the kernel.
files_to_check = input_mmCIF_files_were_found[2:3]  # position 2 is where .index("5olg.cif.gz") pointed
# total now comes from the slice that is iterated (it used to be taken from an unrelated slice)
for input_mmCIF_file_were_found in tqdm.tqdm(files_to_check, total=len(files_to_check), position=0, leave=True, desc="Checking files"):

    input_mmCIF_assembly_files_were_found_list = list()
    input_mmCIF_assembly_files_were_found_list.append(input_mmCIF_file_were_found)

    for mmCIF_name in input_mmCIF_assembly_files_were_found_list:
        log_message = list()
        SIFTS_name = mmCIF_name[:4] + ".xml.gz"
        SIFTS_path = Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name)

        # for no SIFTS _no_SIFTS_out.cif.gz
        try:
            # existence probe only; the context manager closes the handle that used to leak
            with gzip.open(SIFTS_path, 'rt'):
                pass
        except FileNotFoundError:
            mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
            if mmcif_dict == 0:
                continue
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
            continue
            # return log_message

        # for zerobyte SIFTS _zerobyte_SIFTS_out.cif.gz
        if os.path.getsize(SIFTS_path) == 0:
            mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
            if mmcif_dict == 0:
                continue
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
            continue
            #return log_message

        product_tree_SIFTS = try_SIFTS_tree_parser(default_input_path_to_SIFTS, SIFTS_name)
        if product_tree_SIFTS == 0:
            continue

        tuple_PDBe_for_PDB_and_tuple_PDB = product_tree_SIFTS[0]
        tuple_PDBe_for_UniProt_and_tuple_UniProt = product_tree_SIFTS[1]
        UniProt_conversion_dict = product_tree_SIFTS[2]

        # _no UniProt in SIFTS _no_UniProt_in_SIFTS_out.cif.gz
        if tuple_PDBe_for_UniProt_and_tuple_UniProt == list():
            mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
            if mmcif_dict == 0:
                continue
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
            continue
            # return log_message

        product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, tuple_PDBe_for_UniProt_and_tuple_UniProt,
                                                                   default_mmCIF_num, 'all')
        df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

        # all good till here
        handling_chain_numbering = handling_chain_numbering_clashes(df_PDBe_PDB_UniProt, exception_AccessionIDs)
        chains_to_change = handling_chain_numbering[0]
        combined_tuple_PDBe_UniProt_AccessionID = handling_chain_numbering[1]
        longest_AccessionID_list = handling_chain_numbering[3]
        chains_to_change_one_to_end = handling_chain_numbering[4]

        product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, combined_tuple_PDBe_UniProt_AccessionID,
                                                                   default_mmCIF_num, chains_to_change)
        df_PDBe_PDB_UniProt_without_null_index_PDBe = product_of_SIFTS_data_parser[0]
        df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

        renumbered_count = renumbered_count_in_chains(chains_to_change_one_to_end, df_PDBe_PDB_UniProt_without_null_index_PDBe,
                                                      mmCIF_name, UniProt_conversion_dict, longest_AccessionID_list)
        chain_total_renum = renumbered_count[0]
        nothing_changed = renumbered_count[1]

        chain_total_renum.append(nothing_changed)
        mod_log_message = chain_total_renum

        # for no change needed _no_change_out.cif.gz
        if nothing_changed == 0:
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            continue
            # return mod_log_message

        product_of_mmCIF_parser = mmCIF_parser(mmCIF_name, default_input_path_to_mmCIF, df_PDBe_PDB_UniProt_without_null_index_PDBe,
                                               default_mmCIF_num, chains_to_change, chains_to_change_one_to_end)
        df_final_atom_site = product_of_mmCIF_parser[0]
        mmcif_dict = product_of_mmCIF_parser[1]

        poly_nonpoly_append = poly_nonpoly_renum(mmcif_dict, df_PDBe_PDB_UniProt, chains_to_change, default_mmCIF_num)
        # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
        poly_nonpoly_atom_site = pd.concat([poly_nonpoly_append, df_final_atom_site]).drop_duplicates(subset="PDB_num_and_chain", keep='first')

        formed_columns = column_formation(mmcif_dict)
        renumber_tables(formed_columns, mmcif_dict, poly_nonpoly_atom_site, chains_to_change, default_mmCIF_num)

        try:
            output_with_this_name_ending(".cif", default_output_path_to_mmCIF, mmcif_dict, mmCIF_name=mmCIF_name,
                                         gzip_mode=gzip_mode, current_directory=current_directory)
        except IndexError:
            # 5olg data swapped columns
            print("IndexError Warning this file is not renumbered:", mmCIF_name)
            copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
            
#         except TypeError:
#             print("TypeError Warning this file is not renumbered:", mmCIF_name)
#             copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
#         except ValueError:
#             print("ValueError Warning this file is not renumbered:", mmCIF_name)
#             copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
        
        
         

Checking files: 0it [00:00, ?it/s]

ValueError in table _pdbx_refine_tls_group.end_auth_seq_idhas non-numeric value point5OLG


Checking files: 1it [00:00,  1.12it/s]


In [246]:
# Debug probe: display the value stored under the "data_" key of mmcif_dict.
# NOTE(review): relies on mmcif_dict still being bound in the kernel by an
# earlier cell — confirm this cell survives Restart & Run All.
mmcif_dict["data_"]

'5OLG'

In [214]:
# Debug probe: print the current file name once for every key of mmcif_dict
# whose name contains "_pdbx_branch_scheme".
for n in mmcif_dict:
    if "_pdbx_branch_scheme" not in n:
        continue
    print(mmCIF_name)

_pdbx_poly_seq_scheme.pdb_seq_num
_pdbx_nonpoly_scheme.pdb_seq_num
_pdbx_branch_scheme.pdb_seq_num


In [None]:
# print(mmCIF_name)
# for n in formed_columns[:2]:
#     print(n[0])
#     print(len(mmcif_dict[n[0]]))
#     print(mmcif_dict[n[0]])
#     print(n[2])
#     print(len(mmcif_dict[n[2]]))
#     print(mmcif_dict[n[2]])
#     if len(mmcif_dict[n[0]]) != len(mmcif_dict[n[2]]):
#         print(n[0])
#     try:
#         print(n[2])
#         print(mmcif_dict[n[2]])
#         for a in mmcif_dict[n[2]]:
#             if a != "?":
#                 print(a)
#                 mmcif_dict[n[2]]
#     except IndexError:
#         pass

In [108]:
# for n in formed_columns:
#     print(n[0])
#     print(mmcif_dict[n[0]])


# # # troubleshooting
# input_mmCIF_files_were_found =look_what_is_inside("mmCIF")
# # input_list = input_mmCIF_files_were_found[60216:60217]

# # #############################################################################################################################################################################

# for input_file in tqdm.tqdm(input_mmCIF_files_were_found[60216:60217], total=len(input_mmCIF_files_were_found[:100]), position=0, leave=True, desc="Checking files"):

#     input_mmCIF_assembly_files_were_found_list = list()
#     input_mmCIF_assembly_files_were_found_list.append(input_file)

#     for mmCIF_name in input_mmCIF_assembly_files_were_found_list:
#         print(mmCIF_name)
#         log_message = list()
#         SIFTS_name = mmCIF_name[:4] + ".xml.gz"
        
#         # for no SIFTS _no_SIFTS_out.cif.gz
#         try:
#             gzip.open(Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name), 'rt')
#         except FileNotFoundError:
#             mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
#             if mmcif_dict == 0:
#                 continue
#             copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
#             log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
#             continue
#             # return log_message
        
#         # for zerobyte SIFTS _zerobyte_SIFTS_out.cif.gz
#         if os.path.getsize(Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name)) == 0:
#             mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
#             if mmcif_dict == 0:
#                 continue
#             copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
#             log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
#             continue
#             # return log_message

#         product_tree_SIFTS = try_SIFTS_tree_parser(default_input_path_to_SIFTS, SIFTS_name)
#         if product_tree_SIFTS == 0:
#             continue

#         tuple_PDBe_for_PDB_and_tuple_PDB = product_tree_SIFTS[0]
#         tuple_PDBe_for_UniProt_and_tuple_UniProt = product_tree_SIFTS[1]
#         UniProt_conversion_dict = product_tree_SIFTS[2]
        
#         # _no UniProt in SIFTS _no_UniProt_in_SIFTS_out.cif.gz
#         if tuple_PDBe_for_UniProt_and_tuple_UniProt == list():
#             mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
#             if mmcif_dict == 0:
#                 continue
#             copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
#             log_message = if_no_SIFTS_data_log(mmCIF_name, mmcif_dict, log_message)
#             continue
#             # return log_message

#         product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, tuple_PDBe_for_UniProt_and_tuple_UniProt,
#                                                                    default_mmCIF_num, 'all')
#         df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

#         # all good till here
#         handling_chain_numbering = handling_chain_numbering_clashes(df_PDBe_PDB_UniProt, exception_AccessionIDs)
#         chains_to_change = handling_chain_numbering[0]
#         combined_tuple_PDBe_UniProt_AccessionID = handling_chain_numbering[1]
#         longest_AccessionID_list = handling_chain_numbering[3]
#         chains_to_change_one_to_end = handling_chain_numbering[4]

#         product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, combined_tuple_PDBe_UniProt_AccessionID,
#                                                                    default_mmCIF_num, chains_to_change)
#         df_PDBe_PDB_UniProt_without_null_index_PDBe = product_of_SIFTS_data_parser[0]
#         df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

#         renumbered_count = renumbered_count_in_chains(chains_to_change_one_to_end, df_PDBe_PDB_UniProt_without_null_index_PDBe,
#                                                       mmCIF_name, UniProt_conversion_dict, longest_AccessionID_list)
#         chain_total_renum = renumbered_count[0]
#         nothing_changed = renumbered_count[1]

#         chain_total_renum.append(nothing_changed)
#         mod_log_message = chain_total_renum
        
#         # for no change needed _no_change_out.cif.gz
#         if nothing_changed == 0:
#             copy_file(default_input_path_to_mmCIF, mmCIF_name, default_output_path_to_mmCIF, ".cif.gz", gzip_mode)
#             continue
#             # return mod_log_message

#         product_of_mmCIF_parser = mmCIF_parser(mmCIF_name, default_input_path_to_mmCIF, df_PDBe_PDB_UniProt_without_null_index_PDBe,
#                                                default_mmCIF_num, chains_to_change)
        
#         df_final_dropped_dup = product_of_mmCIF_parser[0]
#         mmcif_dict = product_of_mmCIF_parser[1]
#         _pdbx_poly_seq_scheme_auth_seq_num_before_change = product_of_mmCIF_parser[2]
#         _atom_site_label_comp_id_list = product_of_mmCIF_parser[3]

#         formed_columns = column_formation(mmcif_dict)
#         for n in formed_columns:
#             auth_comp_id = 0
#             auth_seq_id = n[0]
#             auth_asym_id = n[1]
#             try:
#                 PDB_ins_code = n[2]
#                 if "ins_code" not in PDB_ins_code:
#                     auth_comp_id = PDB_ins_code
#                     PDB_ins_code = 0 
#             except IndexError:
#                 PDB_ins_code = 0

#             try:
#                 if auth_comp_id == 0:
#                     auth_comp_id = n[3]
#             except IndexError:
#                 auth_comp_id = 0
                
# #             renumber_tables(auth_seq_id, auth_asym_id, auth_comp_id, PDB_ins_code, mmcif_dict,
# #                             df_final_dropped_dup, default_mmCIF_num, chains_to_change)
        
# #         renum_pdbx_unobs_or_zero_occ_residues_auth_seq_id(mmcif_dict, df_PDBe_PDB_UniProt, default_mmCIF_num)
        
# # #       renum_pdbx_poly_seq_scheme_auth_seq_num(mmcif_dict, df_final_dropped_dup, default_mmCIF_num)
# #         renum_pdbx_nonpoly_scheme_auth_seq_num(mmcif_dict, df_final_dropped_dup, default_mmCIF_num)
#         renum_struct_ref_seq_pdbx_auth_seq_align(mmcif_dict)
        
#         #just out out.cif
 
#         output_with_this_name_ending(".cif.gz", default_output_path_to_mmCIF, mmcif_dict, mmCIF_name=mmCIF_name, gzip_mode=gzip_mode,
#                                          current_directory=current_directory)
            


# #         return mod_log_message


# #########################################################################################################################################

# all_num_list = list()
# for n in formed_columns[13:14]:
#     auth_comp_id = 0
#     auth_seq_id = n[0]
#     auth_asym_id = n[1]
#     if "_pdbx_poly_seq_scheme" in auth_seq_id:
#         auth_seq_id = "_pdbx_poly_seq_scheme.pdb_seq_num"
        
#     try:
#         PDB_ins_code = n[2]
#         if "ins_code" not in PDB_ins_code:
#             auth_comp_id = PDB_ins_code
#             PDB_ins_code = 0 
#     except IndexError:
#         PDB_ins_code = 0

#     try:
#         if auth_comp_id == 0:
#             auth_comp_id = n[3]
#     except IndexError:
#         auth_comp_id = 0
        
        
#     try:
#         PDB_ins_code_list = list()
#         # auth_comp_id_list = mmcif_dict[auth_comp_id] for debug only
#         auth_seq_id_list = mmcif_dict[auth_seq_id]
#         auth_asym_id_list = mmcif_dict[auth_asym_id]
#         print(auth_seq_id)
#         print(len(auth_seq_id_list))

#         if PDB_ins_code == 0:
#             for _ in range(len(auth_seq_id_list)):
#                 PDB_ins_code_list.append("?")
#         else:
#             PDB_ins_code_list = mmcif_dict[PDB_ins_code]

#         if type(auth_asym_id_list) == str:
#             # auth_comp_id_list = [auth_comp_id_list] for debug only
#             auth_seq_id_list = [auth_seq_id_list]
#             auth_asym_id_list = [auth_asym_id_list]
#             PDB_ins_code_list = [PDB_ins_code]
#             if PDB_ins_code == 0:
#                 PDB_ins_code_list = ["?"]
#             else:
#                 PDB_ins_code_list = [PDB_ins_code]
        
#         dot_to_question = list()
#         for ins_code in PDB_ins_code_list:
#             if ins_code == ".":
#                 dot_to_question.append("?")
#             else:
#                 dot_to_question.append(ins_code)
        
#         PDB_ins_code_list = dote_to_question

#         auth_seq_id_list_zip = list(zip(auth_seq_id_list, auth_asym_id_list))
#         df_mmCIF_auth_seq_id_list_zip = pd.DataFrame(zip(auth_seq_id_list_zip, PDB_ins_code_list))
#         df_mmCIF_auth_seq_id_list_zip = df_mmCIF_auth_seq_id_list_zip.rename(columns={0: "auth_seq_id_list_zip", 1: "ins_code"})

#         df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code"] = np.where(df_mmCIF_auth_seq_id_list_zip['ins_code'] != "?", 
#                                                                       (df_mmCIF_auth_seq_id_list_zip['auth_seq_id_list_zip'].apply(lambda x: x[0])                        
#                                                                        + df_mmCIF_auth_seq_id_list_zip['ins_code'].apply(lambda y: y[0]) + ", "
#                                                                        + df_mmCIF_auth_seq_id_list_zip['auth_seq_id_list_zip'].apply(lambda x: x[1])), 
#                                                                       df_mmCIF_auth_seq_id_list_zip['ins_code'])

#         df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code_cor"] = np.where(df_mmCIF_auth_seq_id_list_zip['PDB_with_ins_code'] != "?", 
#                                                                           df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code"].apply(lambda x: tuple(x.split(","))),
#                                                                           df_mmCIF_auth_seq_id_list_zip["auth_seq_id_list_zip"])

#         df_mmCIF_auth_seq_id_list_zip["auth_seq_id_list_zip"] = df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code_cor"]
#         df_mmCIF_auth_seq_id_list_zip = df_mmCIF_auth_seq_id_list_zip.drop(columns=["PDB_with_ins_code_cor", "ins_code", "PDB_with_ins_code"])
        
#         df_PDBe_PDB_UniProt["auth_num_chain"] = df_PDBe_PDB_UniProt["PDB"].apply(lambda x: (x[0], x[2]))
        
#         # drop_duplicates auth_num_chain column
#         df_PDBe_PDB_UniProt = df_PDBe_PDB_UniProt.drop_duplicates(subset="auth_num_chain", keep='first')
        
#         # merging
#         df_auth_seq_id_list_zip_final = df_mmCIF_auth_seq_id_list_zip.merge(df_PDBe_PDB_UniProt, left_on="auth_seq_id_list_zip", right_on="auth_num_chain", how='left')
        
#         # masterpiece function
#         df_auth_seq_id_list_zip_final["final"] = np.where(df_auth_seq_id_list_zip_final["Three_Rows_CIF_Num_Uni"].apply(lambda x: x is np.nan), 
#                                                       df_auth_seq_id_list_zip_final["auth_seq_id_list_zip"].apply(
#                                                           lambda x: "?" if x[0] == "?" 
#                                                           else ("." if x[0] == "." 
#                                                            else (str(int(''.join(filter(str.isdigit, str(x[0])))) + default_mmCIF_num) if x[1] in chains_to_change 
#                                                             else (''.join(filter(str.isdigit, str(x[0]))))))), 
#                                                       df_auth_seq_id_list_zip_final["Three_Rows_CIF_Num_Uni"].apply(
#                                                           lambda x: (str(int(''.join(filter(str.isdigit, x[1]))) + default_mmCIF_num + 10000) if (x[0][0] == "." and x[2][2].strip() in chains_to_change)
#                                                                      else ''.join(filter(str.isdigit, str(x[1])))) if x is not np.nan else x))
# #         print(len(df_auth_seq_id_list_zip_final["final"]))

        
# #         df_auth_seq_id_list_zip_final = df_auth_seq_id_list_zip_final["final"]
# #         final_list=list()
# #         for value in df_auth_seq_id_list_zip_final:
# #             final_list.append(value)
        
# #         all_num_list.append(final_list)
# #         # actual replacing auth_num with UniProt_num and of ins_code with '?'
# #         PDB_ins_code_list = list()
# #         if PDB_ins_code != 0:
# #             if "." in mmcif_dict[PDB_ins_code]:
# #                 for _ in range(len(final_list)):
# #                     PDB_ins_code_list.append(".")
# #             else:
# #                 for _ in range(len(final_list)):
# #                     PDB_ins_code_list.append("?")
# #             mmcif_dict[PDB_ins_code] = PDB_ins_code_list
# #         mmcif_dict[auth_seq_id] = final_list
        
#     except KeyError:
#         pass

In [14]:
# all_num_list = list()
# for n in formed_columns[13:14]:
#     lost_ins_code_list = [".", "?"]
#     auth_comp_id = 0
#     auth_seq_id = n[0]
#     auth_asym_id = n[1]
#     try:
#         PDB_ins_code = n[2]
#         if "ins_code" not in PDB_ins_code:
#             auth_comp_id = PDB_ins_code
#             PDB_ins_code = 0 
#     except IndexError:
#         PDB_ins_code = 0

#     try:
#         if auth_comp_id == 0:
#             auth_comp_id = n[3]
#     except IndexError:
#         auth_comp_id = 0
        
        
#     try:
#         PDB_ins_code_list = list()
#         # auth_comp_id_list = mmcif_dict[auth_comp_id] for debug only
#         auth_seq_id_list = mmcif_dict[auth_seq_id]
#         auth_asym_id_list = mmcif_dict[auth_asym_id]
#         print(auth_seq_id)
#         print(len(auth_seq_id_list))

#         if PDB_ins_code == 0:
#             for _ in range(len(auth_seq_id_list)):
#                 PDB_ins_code_list.append("?")
#         else:
#             PDB_ins_code_list = mmcif_dict[PDB_ins_code]

#         if type(auth_asym_id_list) == str:
#             # auth_comp_id_list = [auth_comp_id_list] for debug only
#             auth_seq_id_list = [auth_seq_id_list]
#             auth_asym_id_list = [auth_asym_id_list]
#             PDB_ins_code_list = [PDB_ins_code]
#             if PDB_ins_code == 0:
#                 PDB_ins_code_list = ["?"]
#             else:
#                 PDB_ins_code_list = [PDB_ins_code]
            
#         auth_seq_id_list_zip = list(zip(auth_seq_id_list, auth_asym_id_list))
#         df_mmCIF_auth_seq_id_list_zip = pd.DataFrame(zip(auth_seq_id_list_zip, PDB_ins_code_list))
#         df_mmCIF_auth_seq_id_list_zip = df_mmCIF_auth_seq_id_list_zip.rename(columns={0: "auth_seq_id_list_zip", 1: "ins_code"})

        
#         df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code"] = np.where(df_mmCIF_auth_seq_id_list_zip['ins_code'].apply(lambda x: x not in lost_ins_code_list), 
#                                                               (df_mmCIF_auth_seq_id_list_zip['auth_seq_id_list_zip'].apply(lambda x: x[0])                        
#                                                                + df_mmCIF_auth_seq_id_list_zip['ins_code'].apply(lambda y: y[0]) + ", "
#                                                                + df_mmCIF_auth_seq_id_list_zip['auth_seq_id_list_zip'].apply(lambda x: x[1])),
#                                                               df_mmCIF_auth_seq_id_list_zip['ins_code'].apply(lambda x: x if x not in lost_ins_code_list else x))

#         df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code_cor"] = np.where(df_mmCIF_auth_seq_id_list_zip['PDB_with_ins_code'].apply(lambda x: x not in lost_ins_code_list), 
#                                                                           df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code"].apply(lambda x: tuple(x.split(","))),
#                                                                           df_mmCIF_auth_seq_id_list_zip["auth_seq_id_list_zip"].apply(lambda x: x if x not in lost_ins_code_list else x))

#         df_mmCIF_auth_seq_id_list_zip["auth_seq_id_list_zip"] = df_mmCIF_auth_seq_id_list_zip["PDB_with_ins_code_cor"]
#         df_mmCIF_auth_seq_id_list_zip = df_mmCIF_auth_seq_id_list_zip.drop(columns=["PDB_with_ins_code_cor", "ins_code", "PDB_with_ins_code"])
#         df_final_dropped_dup["auth_num_chain"] = df_final_dropped_dup["auth_mmCIF"].apply(lambda x: (x[0], x[2]))
        
#         # drop_duplicates auth_num_chain column
#         df_final_dropped_dup = df_final_dropped_dup.drop_duplicates(subset="auth_num_chain", keep='first')
        
#         # merging
#         df_auth_seq_id_list_zip_final = df_mmCIF_auth_seq_id_list_zip.merge(df_final_dropped_dup, left_on="auth_seq_id_list_zip", right_on="auth_num_chain", how='left')
        
#         # masterpiece function
#         df_auth_seq_id_list_zip_final["final"] = np.where(df_auth_seq_id_list_zip_final["Three_Rows_CIF_Num_Uni"].apply(lambda x: x is np.nan), 
#                                                       df_auth_seq_id_list_zip_final["auth_seq_id_list_zip"].apply(
#                                                           lambda x: "?" if x[0] == "?" 
#                                                           else ("." if x[0] == "." 
#                                                            else (str(int(''.join(filter(str.isdigit, str(x[0])))) + default_mmCIF_num) if x[1] in chains_to_change 
#                                                             else (''.join(filter(str.isdigit, str(x[0]))))))), 
#                                                       df_auth_seq_id_list_zip_final["Three_Rows_CIF_Num_Uni"].apply(
#                                                           lambda x: (str(int(''.join(filter(str.isdigit, x[1]))) + default_mmCIF_num + 10000) if (x[0][0] == "." and x[2][2].strip() in chains_to_change)
#                                                                      else ''.join(filter(str.isdigit, str(x[1])))) if x is not np.nan else x))
        

#         print(len(df_auth_seq_id_list_zip_final["final"]))
        
# #         df_auth_seq_id_list_zip_final = df_auth_seq_id_list_zip_final["final"]
# #         final_list=list()
# #         for value in df_auth_seq_id_list_zip_final:
# #             final_list.append(value)
        
# #         all_num_list.append(final_list)
# #         # actual replacing auth_num with UniProt_num and of ins_code with '?'
# #         PDB_ins_code_list = list()
# #         if PDB_ins_code != 0:
# #             if "." in mmcif_dict[PDB_ins_code]:
# #                 for _ in range(len(final_list)):
# #                     PDB_ins_code_list.append(".")
# #             else:
# #                 for _ in range(len(final_list)):
# #                     PDB_ins_code_list.append("?")

# #                 if "_pdbx_poly_seq_scheme" not in auth_seq_id and "_pdbx_nonpoly_scheme" not in auth_seq_id:
# #                     mmcif_dict[PDB_ins_code] = PDB_ins_code_list
# #         mmcif_dict[auth_seq_id] = final_list
        
#     except KeyError:
#         pass

In [189]:
def check_assemblies(mmCIF_assembly, default_output_path_to_mmCIF_assembly):
    """Validate one output mmCIF file and move a leading ``_atom_site`` loop to its end.

    The file is read as gzip text first; if that raises ``OSError`` it is read
    again as plain text.  The file is deleted from disk when it cannot be read
    either way, when it has fewer than four lines, or when the leading
    ``_atom_site`` loop is not followed by a line consisting only of ``#``.

    When line index 2 contains ``loop_`` and line index 3 contains
    ``_atom_site``, the lines from index 2 up to (not including) the next ``#``
    line are moved behind the rest of the file, a closing ``#`` line is
    appended, and the file is rewritten in the format it was read in (gzip or
    plain).  Any other readable file is left untouched.

    Args:
        mmCIF_assembly: file name inside ``default_output_path_to_mmCIF_assembly``.
        default_output_path_to_mmCIF_assembly: directory containing the file.

    Returns:
        None.  All effects are on the file system.
    """
    assembly_path = Path(str(default_output_path_to_mmCIF_assembly) + "/" + mmCIF_assembly)

    def discard_file():
        # Best-effort removal: a file that is already gone needs no cleanup and
        # must not crash the worker process.
        try:
            os.remove(assembly_path)
        except FileNotFoundError:
            pass

    # Read the file; the context managers make sure every handle is closed
    # before the file is possibly removed or rewritten below.
    is_gzipped = True
    try:
        with gzip.open(assembly_path, 'rt') as gzip_in:
            assembly_lines = gzip_in.readlines()
    except OSError:
        # maybe not archived
        try:
            with open(assembly_path, 'rt') as plain_in:
                assembly_lines = plain_in.readlines()
            is_gzipped = False
        except Exception:
            # broken archive
            discard_file()
            return
    except Exception:
        # broken archive
        discard_file()
        return

    # check if file starts with the "_atom_site" table at the beginning
    try:
        starts_with_atom_site = "_atom_site" in assembly_lines[3] and "loop_" in assembly_lines[2]
    except IndexError:
        # empty (or too short) file
        discard_file()
        return
    if not starts_with_atom_site:
        return

    try:
        # First separator line at or after index 2 closes the leading loop.
        loop_end = assembly_lines.index("#\n", 2)
    except ValueError:
        # file isn't complete
        discard_file()
        return

    new_order_for_assembly_file = (assembly_lines[:1]
                                   + assembly_lines[loop_end:]
                                   + assembly_lines[2:loop_end]
                                   + ["#\n"])

    # Write back in the same format the file was read in.
    opener = gzip.open if is_gzipped else open
    with opener(assembly_path, "wt") as file_out:
        file_out.writelines(new_order_for_assembly_file)


def ProcessPool_run_renum_mmCIF(format_mmCIF, mmCIF_to_renumber, default_input_path_to_mmCIF,
                                default_input_path_to_SIFTS, default_output_path_to_mmCIF, default_mmCIF_num,
                                gzip_mode, exception_AccessionIDs, nproc):
    """Renumber mmCIF files in parallel, then check the output, in up to three passes.

    Each pass submits every name in ``mmCIF_to_renumber`` to
    ``master_mmCIF_renumber_function`` in a process pool, then runs
    ``check_assemblies`` on every file listed in the output directory (that
    check may delete broken files).  If no file disappeared during the check
    the loop stops; otherwise the next pass processes the input names that are
    not present in the output listing.

    Args:
        format_mmCIF: ``"mmCIF_assembly"`` selects the assembly output listing;
            any other value selects the regular mmCIF output listing.  Also
            used in the progress-bar captions.
        mmCIF_to_renumber: iterable of input file names to renumber.
        default_input_path_to_mmCIF: forwarded to ``master_mmCIF_renumber_function``.
        default_input_path_to_SIFTS: forwarded to ``master_mmCIF_renumber_function``.
        default_output_path_to_mmCIF: output directory; created if missing.
        default_mmCIF_num: forwarded to ``master_mmCIF_renumber_function``.
        gzip_mode: forwarded to ``master_mmCIF_renumber_function``.
        exception_AccessionIDs: forwarded to ``master_mmCIF_renumber_function``.
        nproc: ``max_workers`` for each process pool.

    Returns:
        The list of results of the first renumbering pass, in completion order.
    """
    def list_output_files():
        # Same listing call is needed before and after the check step.
        if format_mmCIF == "mmCIF_assembly":
            return look_what_is_inside('output_mmCIF_assembly',
                                       default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF)
        return look_what_is_inside('output_mmCIF', default_output_path_to_mmCIF=default_output_path_to_mmCIF)

    # Loop-invariant callables, built once instead of once per pass.
    partial_master_mmCIF_renumber_function = partial(master_mmCIF_renumber_function,
                                                     default_input_path_to_mmCIF=default_input_path_to_mmCIF,
                                                     default_input_path_to_SIFTS=default_input_path_to_SIFTS,
                                                     default_output_path_to_mmCIF=default_output_path_to_mmCIF,
                                                     default_mmCIF_num=default_mmCIF_num, gzip_mode=gzip_mode,
                                                     exception_AccessionIDs=exception_AccessionIDs)
    partial_reform_assembly = partial(check_assemblies,
                                      default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF)

    first_res = 0

    for attempt in range(3):
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(default_output_path_to_mmCIF, exist_ok=True)

        # renumber loop; the context manager shuts the worker processes down
        # so pools do not pile up across passes
        resulting = list()
        with ProcessPoolExecutor(max_workers=nproc) as executor:
            jobs = [executor.submit(partial_master_mmCIF_renumber_function, mmCIF_files)
                    for mmCIF_files in mmCIF_to_renumber]
            for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), miniters=1, position=0,
                                 leave=True, desc="Renumbering " + format_mmCIF + " files"):
                resulting.append(job.result())

        if attempt == 0:
            first_res = resulting

        output_mmCIF = list_output_files()

        # checker loop
        check_list = list()
        with ProcessPoolExecutor(max_workers=nproc) as executor:
            jobs = [executor.submit(partial_reform_assembly, assembly_files) for assembly_files in output_mmCIF]
            for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), miniters=1, position=0,
                                 leave=True, desc="Checking " + format_mmCIF + " files"):
                check_list.append(job.result())

        # List again: the checker may have deleted broken files.
        output_mmCIF = list_output_files()

        if len(check_list) == len(output_mmCIF):
            break
        mmCIF_to_renumber = list(set(mmCIF_to_renumber) - set(output_mmCIF))

    return first_res


In [190]:
# def ProcessPool_run_renum(format_to_download="mmCIF", input_mmCIF_files_were_found=(),
#               default_input_path_to_mmCIF=default_input_path_to_mmCIF,
#               default_input_path_to_SIFTS=default_input_path_to_SIFTS, 
#               default_output_path_to_mmCIF=default_output_path_to_mmCIF, 
#               default_mmCIF_num=default_mmCIF_num, gzip_mode=gzip_mode):
    
#     resulting = list()
#     executor = ProcessPoolExecutor()
#     partial_master_mmCIF_renumber_function = partial(master_mmCIF_renumber_function,
#                                                      default_input_path_to_mmCIF=default_input_path_to_mmCIF,
#                                                      default_input_path_to_SIFTS=default_input_path_to_SIFTS, 
#                                                      default_output_path_to_mmCIF=default_output_path_to_mmCIF, 
#                                                      default_mmCIF_num=default_mmCIF_num, gzip_mode=gzip_mode)
    
#     jobs = [executor.submit(partial_master_mmCIF_renumber_function, mmCIF_files) for mmCIF_files in input_mmCIF_files_were_found]
#     for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), position=0, leave=True, desc="Renumbering "+format_to_download+" files"):
#         resultus = job.result()
#         if resultus is not None:
#             resulting.append(resultus)
    
#     return resulting
        

In [191]:
# input_mmCIF_files_were_found =look_what_is_inside("mmCIF_assembly")
# default_output_path_to_mmCIF = default_output_path_to_mmCIF_assembly
# default_input_path_to_mmCIF = default_input_path_to_mmCIF_assembly

# input_mmCIF_files_were_found =look_what_is_inside("mmCIF")
# default_input_path_to_mmCIF = current_directory + "/mmCIF"
# default_output_path_to_mmCIF = current_directory + "/output_mmCIF"

In [192]:
#for asymmetric_unit
# input_mmCIF_files_were_found = look_what_is_inside('mmCIF', default_input_path_to_mmCIF=default_input_path_to_mmCIF)
# if not os.path.exists(default_output_path_to_mmCIF):
#     os.makedirs(default_output_path_to_mmCIF)
# if __name__ == '__main__':
#     resulting = ProcessPool_run_renum("mmCIF", input_mmCIF_files_were_found,
#                           default_input_path_to_mmCIF,
#                           default_input_path_to_SIFTS,
#                           default_output_path_to_mmCIF,
#                           default_mmCIF_num, gzip_mode)
#for assembly
# input_mmCIF_assembly_files_were_found = look_what_is_inside('mmCIF_assembly',default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
# if not os.path.exists(default_output_path_to_mmCIF):
#     os.makedirs(default_output_path_to_mmCIF)
    
# if __name__ == '__main__':
#     resulting2 = ProcessPool_run_renum("mmCIF_assembly", input_mmCIF_assembly_files_were_found[207294:207295],
#                           default_input_path_to_mmCIF_assembly,
#                           default_input_path_to_SIFTS,
#                           default_output_path_to_mmCIF_assembly,
#                           default_mmCIF_num,gzip_mode)

In [228]:
# Trial run: only the first 1000 entries of input_mmCIF_files_were_found are passed
# to ProcessPool_run_renum_mmCIF (see the [:1000] slice below).
# NOTE(review): every argument is a notebook-level name defined in earlier cells;
# this cell fails on a fresh kernel unless those cells have been run first.
if __name__ == '__main__':
    ProcessPool_run_renum_mmCIF("mmCIF", input_mmCIF_files_were_found[:1000], default_input_path_to_mmCIF,
                                default_input_path_to_SIFTS, default_output_path_to_mmCIF, default_mmCIF_num,
                                gzip_mode, exception_AccessionIDs, nproc)

Renumbering mmCIF files: 100%|██████████| 1000/1000 [01:58<00:00,  8.44it/s]
Checking mmCIF files: 100%|██████████| 176198/176198 [02:15<00:00, 1300.50it/s]


In [230]:
# Larger run of the same call: the first 20000 entries of input_mmCIF_files_were_found
# are passed to ProcessPool_run_renum_mmCIF (see the [:20000] slice below).
# NOTE(review): every argument is a notebook-level name defined in earlier cells;
# this cell fails on a fresh kernel unless those cells have been run first.
if __name__ == '__main__':
    ProcessPool_run_renum_mmCIF("mmCIF", input_mmCIF_files_were_found[:20000], default_input_path_to_mmCIF,
                                default_input_path_to_SIFTS, default_output_path_to_mmCIF, default_mmCIF_num,
                                gzip_mode, exception_AccessionIDs, nproc)

Renumbering mmCIF files: 100%|██████████| 20000/20000 [39:52<00:00,  8.36it/s]  
Checking mmCIF files: 100%|██████████| 176185/176185 [02:23<00:00, 1228.31it/s]


In [None]:
"""ValueError Warning this file is not renumbered: 6ysc.cif.gz
ValueError Warning this file is not renumbered: 4d6a.cif.gz
ValueError Warning this file is not renumbered: 2rdt.cif.gz
ValueError Warning this file is not renumbered: 3mlh.cif.gz
ValueError Warning this file is not renumbered: 3bln.cif.gz
ValueError Warning this file is not renumbered: 6v2c.cif.gz
ValueError Warning this file is not renumbered: 3qks.cif.gz
ValueError Warning this file is not renumbered: 3qkr.cif.gz
ValueError Warning this file is not renumbered: 5o61.cif.gz
ValueError Warning this file is not renumbered: 1xpl.cif.gz
TypeError Warning this file is not renumbered: 6zsg.cif.gz
ValueError Warning this file is not renumbered: 3bvl.cif.gz
ValueError Warning this file is not renumbered: 1f32.cif.gz
ValueError Warning this file is not renumbered: 2zs0.cif.gz
ValueError Warning this file is not renumbered: 2bwx.cif.gz
ValueError Warning this file is not renumbered: 4mpb.cif.gz
ValueError Warning this file is not renumbered: 5hbl.cif.gz
ValueError Warning this file is not renumbered: 2rft.cif.gz
ValueError Warning this file is not renumbered: 2rfu.cif.gz
ValueError Warning this file is not renumbered: 5za2.cif.gz
ValueError Warning this file is not renumbered: 2zcf.cif.gz
ValueError Warning this file is not renumbered: 6z8m.cif.gz
ValueError Warning this file is not renumbered: 1eta.cif.gz
ValueError Warning this file is not renumbered: 2j6v.cif.gz
ValueError Warning this file is not renumbered: 6ju4.cif.gz
ValueError Warning this file is not renumbered: 6b9z.cif.gz
ValueError Warning this file is not renumbered: 3cir.cif.gz
ValueError Warning this file is not renumbered: 6uqc.cif.gz
ValueError Warning this file is not renumbered: 3qe1.cif.gz
ValueError Warning this file is not renumbered: 5ezq.cif.gz
ValueError Warning this file is not renumbered: 3bsq.cif.gz
ValueError Warning this file is not renumbered: 4ko2.cif.gz
ValueError Warning this file is not renumbered: 5ka8.cif.gz
ValueError Warning this file is not renumbered: 4v8y.cif.gz
ValueError Warning this file is not renumbered: 6ni5.cif.gz
ValueError Warning this file is not renumbered: 3gju.cif.gz
ValueError Warning this file is not renumbered: 6dwq.cif.gz
ValueError Warning this file is not renumbered: 3r3m.cif.gz
ValueError Warning this file is not renumbered: 3ov6.cif.gz
ValueError Warning this file is not renumbered: 3bvf.cif.gz
ValueError Warning this file is not renumbered: 2wgw.cif.gz
TypeError Warning this file is not renumbered: 6zsc.cif.gz
ValueError Warning this file is not renumbered: 2ihb.cif.gz
ValueError Warning this file is not renumbered: 2x01.cif.gz
ValueError Warning this file is not renumbered: 5cl6.cif.gz
ValueError Warning this file is not renumbered: 3ers.cif.gz
ValueError Warning this file is not renumbered: 2pxh.cif.gz
ValueError Warning this file is not renumbered: 4wq9.cif.gz
ValueError Warning this file is not renumbered: 3oad.cif.gz
ValueError Warning this file is not renumbered: 6ni6.cif.gz
ValueError Warning this file is not renumbered: 5ndw.cif.gz
ValueError Warning this file is not renumbered: 3mlt.cif.gz
ValueError Warning this file is not renumbered: 4dpz.cif.gz
ValueError Warning this file is not renumbered: 1xpm.cif.gz
ValueError Warning this file is not renumbered: 4ueq.cif.gz
ValueError Warning this file is not renumbered: 3h7s.cif.gz
ValueError Warning this file is not renumbered: 5cl5.cif.gz
ValueError Warning this file is not renumbered: 2z3c.cif.gz
ValueError Warning this file is not renumbered: 3r3g.cif.gz
ValueError Warning this file is not renumbered: 5jt1.cif.gz
ValueError Warning this file is not renumbered: 6v24.cif.gz
ValueError Warning this file is not renumbered: 2qna.cif.gz
ValueError Warning this file is not renumbered: 2qdv.cif.gz
ValueError Warning this file is not renumbered: 5d9e.cif.gz
ValueError Warning this file is not renumbered: 5jsu.cif.gz
ValueError Warning this file is not renumbered: 6r94.cif.gz
ValueError Warning this file is not renumbered: 3h4t.cif.gz
ValueError Warning this file is not renumbered: 1tg1.cif.gz
ValueError Warning this file is not renumbered: 2pxs.cif.gz
ValueError Warning this file is not renumbered: 4ur7.cif.gz
ValueError Warning this file is not renumbered: 5t1x.cif.gz
ValueError Warning this file is not renumbered: 3bua.cif.gz
ValueError Warning this file is not renumbered: 2hal.cif.gz
ValueError Warning this file is not renumbered: 3q26.cif.gz
ValueError Warning this file is not renumbered: 2vy1.cif.gz
ValueError Warning this file is not renumbered: 3cfn.cif.gz
ValueError Warning this file is not renumbered: 3gbm.cif.gz
ValueError Warning this file is not renumbered: 5hbo.cif.gz
ValueError Warning this file is not renumbered: 2z70.cif.gz
ValueError Warning this file is not renumbered: 3ze9.cif.gz
ValueError Warning this file is not renumbered: 3hh1.cif.gz
ValueError Warning this file is not renumbered: 6yce.cif.gz
ValueError Warning this file is not renumbered: 1enm.cif.gz
ValueError Warning this file is not renumbered: 5t20.cif.gz
ValueError Warning this file is not renumbered: 5jsy.cif.gz
ValueError Warning this file is not renumbered: 5jfn.cif.gz
ValueError Warning this file is not renumbered: 3nyn.cif.gz
ValueError Warning this file is not renumbered: 6h8t.cif.gz
ValueError Warning this file is not renumbered: 6sdg.cif.gz
ValueError Warning this file is not renumbered: 2qjb.cif.gz
ValueError Warning this file is not renumbered: 3pct.cif.gz
ValueError Warning this file is not renumbered: 3bve.cif.gz
ValueError Warning this file is not renumbered: 3lf3.cif.gz
ValueError Warning this file is not renumbered: 2jge.cif.gz
ValueError Warning this file is not renumbered: 4kl8.cif.gz
ValueError Warning this file is not renumbered: 4dz8.cif.gz
ValueError Warning this file is not renumbered: 2wz5.cif.gz
ValueError Warning this file is not renumbered: 3q25.cif.gz
ValueError Warning this file is not renumbered: 5fhv.cif.gz
ValueError Warning this file is not renumbered: 3h4i.cif.gz
ValueError Warning this file is not renumbered: 3s7k.cif.gz
ValueError Warning this file is not renumbered: 3m5n.cif.gz
ValueError Warning this file is not renumbered: 2rg8.cif.gz
ValueError Warning this file is not renumbered: 1fh2.cif.gz
TypeError Warning this file is not renumbered: 6zsd.cif.gz
ValueError Warning this file is not renumbered: 4bsw.cif.gz
ValueError Warning this file is not renumbered: 4z3a.cif.gz
ValueError Warning this file is not renumbered: 4bkc.cif.gz
ValueError Warning this file is not renumbered: 2zfo.cif.gz
ValueError Warning this file is not renumbered: 3spf.cif.gz
ValueError Warning this file is not renumbered: 6ap9.cif.gz
ValueError Warning this file is not renumbered: 1eis.cif.gz
ValueError Warning this file is not renumbered: 5ojo.cif.gz
ValueError Warning this file is not renumbered: 1aw8.cif.gz
ValueError Warning this file is not renumbered: 6v2g.cif.gz
ValueError Warning this file is not renumbered: 7a0l.cif.gz
ValueError Warning this file is not renumbered: 3q6v.cif.gz
ValueError Warning this file is not renumbered: 4ebu.cif.gz
ValueError Warning this file is not renumbered: 3dl7.cif.gz
ValueError Warning this file is not renumbered: 5cl4.cif.gz
ValueError Warning this file is not renumbered: 3ned.cif.gz
ValueError Warning this file is not renumbered: 2qdw.cif.gz
ValueError Warning this file is not renumbered: 2xf1.cif.gz
ValueError Warning this file is not renumbered: 6h2u.cif.gz
ValueError Warning this file is not renumbered: 3dtq.cif.gz
ValueError Warning this file is not renumbered: 4kuk.cif.gz
ValueError Warning this file is not renumbered: 6dtk.cif.gz
ValueError Warning this file is not renumbered: 3aaj.cif.gz
ValueError Warning this file is not renumbered: 5kbt.cif.gz
ValueError Warning this file is not renumbered: 1en2.cif.gz
ValueError Warning this file is not renumbered: 4ubv.cif.gz
ValueError Warning this file is not renumbered: 2q86.cif.gz
ValueError Warning this file is not renumbered: 7a5v.cif.gz
ValueError Warning this file is not renumbered: 6gp0.cif.gz
TypeError Warning this file is not renumbered: 6zse.cif.gz
ValueError Warning this file is not renumbered: 2fo0.cif.gz
ValueError Warning this file is not renumbered: 3i14.cif.gz
ValueError Warning this file is not renumbered: 2pxw.cif.gz
ValueError Warning this file is not renumbered: 4wq8.cif.gz"""


'5a51.cif.gz'

In [16]:
def check_assemblies(mmCIF_assembly, default_output_path_to_mmCIF_assembly):
    """Move a leading ``_atom_site`` loop of one assembly file to the end of that file.

    The file is read as gzip text first and, if that fails with ``OSError``,
    as plain text. When line 2 contains ``loop_`` and line 3 contains
    ``_atom_site`` (0-based indices), the lines from index 2 up to the next
    ``"#\\n"`` separator are moved behind the rest of the file and a closing
    ``"#\\n"`` is appended. The file is rewritten in place in the same
    (gzip / plain) format it was read in.

    Files that cannot be read, have fewer than four lines, or have no
    ``"#\\n"`` separator at or after line index 2 are deleted.

    Parameters
    ----------
    mmCIF_assembly : str
        File name inside ``default_output_path_to_mmCIF_assembly``.
    default_output_path_to_mmCIF_assembly : str or path-like
        Directory that contains the file.

    Returns
    -------
    None
    """
    assembly_path = Path(str(default_output_path_to_mmCIF_assembly) + "/" + mmCIF_assembly)

    is_gzipped = True
    try:
        with gzip.open(assembly_path, 'rt') as gzip_in:
            list_of_lines_from_assembly_file = gzip_in.readlines()
    except OSError:
        # maybe not archived: retry as plain text
        try:
            with open(assembly_path, 'rt') as file_in:
                list_of_lines_from_assembly_file = file_in.readlines()
            is_gzipped = False
        except Exception:
            # unreadable either way
            os.remove(assembly_path)
            return
    except Exception:
        # broken archive
        os.remove(assembly_path)
        return

    # Fewer than four lines: treated as an empty/incomplete file and deleted.
    if len(list_of_lines_from_assembly_file) < 4:
        os.remove(assembly_path)
        return

    # Only files that start with the "_atom_site" loop need to be reordered.
    if not ("_atom_site" in list_of_lines_from_assembly_file[3]
            and "loop_" in list_of_lines_from_assembly_file[2]):
        return

    try:
        end_of_atom_site = list_of_lines_from_assembly_file.index("#\n", 2)
    except ValueError:
        # file isn't complete: the loop is never terminated by a "#" line
        os.remove(assembly_path)
        return

    # The reordered lines must be what gets written back. Previously they were
    # bound to an unrelated name and an empty list was written, truncating the file.
    new_order_for_assembly_file = (list_of_lines_from_assembly_file[:1]
                                   + list_of_lines_from_assembly_file[end_of_atom_site:]
                                   + list_of_lines_from_assembly_file[2:end_of_atom_site]
                                   + ["#\n"])

    opener = gzip.open if is_gzipped else open
    with opener(assembly_path, "wt") as file_out:
        file_out.writelines(new_order_for_assembly_file)
            



def ProcessPool_run_reform_assembly(default_output_path_to_mmCIF_assembly, current_directory):
    """Run ``check_assemblies`` in a process pool over every assembly file in the output folder.

    The file names come from ``look_what_is_inside('output_mmCIF_assembly', ...)``;
    only names containing the substring ``"assembly"`` are processed.

    Parameters
    ----------
    default_output_path_to_mmCIF_assembly : str or path-like
        Folder that holds the assembly files; the process changes into it while
        the jobs run.
    current_directory : str or path-like
        Directory to change back to afterwards.

    Returns
    -------
    list
        One entry per processed file, in completion order: the return value of
        ``check_assemblies`` for that file.

    Raises
    ------
    Exception
        Whatever a worker raised is re-raised by ``job.result()``.
    """
    files_found = look_what_is_inside('output_mmCIF_assembly', default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF_assembly)
    output_mmCIF_assembly = [file_name for file_name in files_found if "assembly" in file_name]

    partial_reform_assembly = partial(check_assemblies, default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF_assembly)

    resulting = list()
    os.chdir(default_output_path_to_mmCIF_assembly)
    try:
        # The context manager shuts the pool down (and reaps its worker processes)
        # even when a job raises; previously the executor was never shut down.
        with ProcessPoolExecutor() as executor:
            jobs = [executor.submit(partial_reform_assembly, assembly_files) for assembly_files in output_mmCIF_assembly]
            for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), position=0, leave=True, desc="Checking assembly files"):
                resulting.append(job.result())
    finally:
        # Always restore the working directory, including on failure.
        os.chdir(current_directory)

    return resulting



In [17]:
# Run the assembly check over the assembly output folder; the list returned by
# ProcessPool_run_reform_assembly is kept in resulting3.
# NOTE(review): default_output_path_to_mmCIF_assembly and current_directory are
# notebook-level names that must already exist when this cell runs.
if __name__ == '__main__':
    resulting3 = ProcessPool_run_reform_assembly(default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF_assembly,
                                                 current_directory=current_directory)

Checking assembly files: 100%|██████████| 263532/263532 [35:13<00:00, 124.71it/s] 


In [17]:
# output_mmCIF_assembly = look_what_is_inside('output_mmCIF_assembly', default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF_assembly)

In [None]:

# get partial_occupancy or ins_code
# input_mmCIF_files_were_found =look_what_is_inside("mmCIF")
# default_input_path_to_mmCIF = current_directory + "/mmCIF"
# default_output_path_to_mmCIF = current_directory + "/output_mmCIF"

# ### partial_occupancy or ins_code catcher
# def master_mmCIF_renumber_function(input_mmCIF_files_were_found, default_input_path_to_mmCIF,
#                                    default_input_path_to_SIFTS, default_output_path_to_mmCIF,
#                                    default_mmCIF_num, gzip_mode):

#     input_mmCIF_assembly_files_were_found_list = list()
#     input_mmCIF_assembly_files_were_found_list.append(input_mmCIF_files_were_found)

#     for mmCIF_name in input_mmCIF_assembly_files_were_found_list:
        
#         mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
#         if mmcif_dict == 0:
#             continue
#         ### ins_code_catcher
# #         non_dot_ins_code = 0
# #         try:
# #             for n in mmcif_dict["_pdbx_poly_seq_scheme.pdb_ins_code"]:
# #                 if "." != n:
# #                     non_dot_ins_code = ("pdb_ins_code:", mmCIF_name)
# #                     return (non_dot_ins_code)
# #         except KeyError:
# #             pass
        
#         ### partial_occupancy_catcher
#         non_whole_occupancy = 0
#         try:
#             for n in mmcif_dict["_atom_site.occupancy"]:
#                 if n[0] != "1":
#                     non_whole_occupancy = ("partial_occupancy:", mmCIF_name)
#                     return (non_whole_occupancy)
#         except KeyError:
#             pass

# def ProcessPool_run_renum(format_to_download="mmCIF", input_mmCIF_files_were_found=(),
#               default_input_path_to_mmCIF=default_input_path_to_mmCIF,
#               default_input_path_to_SIFTS=default_input_path_to_SIFTS, 
#               default_output_path_to_mmCIF=default_output_path_to_mmCIF, 
#               default_mmCIF_num=default_mmCIF_num, gzip_mode=gzip_mode):
    
#     resulting = list()
#     executor = ProcessPoolExecutor()
#     partial_master_mmCIF_renumber_function = partial(master_mmCIF_renumber_function,
#                                                      default_input_path_to_mmCIF=default_input_path_to_mmCIF,
#                                                      default_input_path_to_SIFTS=default_input_path_to_SIFTS, 
#                                                      default_output_path_to_mmCIF=default_output_path_to_mmCIF, 
#                                                      default_mmCIF_num=default_mmCIF_num, gzip_mode=gzip_mode)
    
#     jobs = [executor.submit(partial_master_mmCIF_renumber_function, mmCIF_files) for mmCIF_files in input_mmCIF_files_were_found]
#     for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), position=0, leave=True, desc="Renumbering "+format_to_download+" files"):
#         resultus = job.result()
#         if resultus != None:
#             resulting.append(resultus)
    
#     return resulting

# if __name__ == '__main__':
#     resulting = ProcessPool_run_renum("mmCIF", input_mmCIF_files_were_found,
#                           default_input_path_to_mmCIF,
#                           default_input_path_to_SIFTS,
#                           default_output_path_to_mmCIF,
#                           default_mmCIF_num, gzip_mode)
    
# with open('partial_occupancy' + '.txt', 'w') as filehandle:
#     for banch in resulting:
#         filehandle.write('%s\n' % list(banch))
        
# with open('pdb_ins_code' + '.txt', 'w') as filehandle:
#     for banch in resulting:
#         filehandle.write('%s\n' % list(banch))

In [27]:
# Annotation

# add_set_of_annotation = ['Isoform beta',
#  'Cloning site residue',
#  'Gst tag',
#  'Propionation',
#  'Initial methionine',
#  'See remark9 99',
#  'Linker',
#  'Amidation',
#  'Oxidized cys',
#  'Missing from gb',
#  'Polymorphism',
#  'Chromophor; rem 999',
#  'Strain difference',
#  'Intrachain his tag',
#  'Mod. residue/cloning artifact',
#  'Differences in map',
#  'E',
#  'Initiating residue',
#  'Myc tag',
#  'Modified tyr',
#  'Variation',
#  'Possible isoform',
#  'Substitution',
#  'Random mutagenesis',
#  'See remrak 999',
#  'Lys tag',
#  'Gap in sws p00734',
#  'Chromophore',
#  'Expression tag',
#  'N-acetylation',
#  'Initiating mse',
#  'Modified',
#  'helix',
#  'Chromophore; rem 999',
#  'Methylated asn',
#  'Variant see remark 999',
#  'Chemical modification',
#  'Somatic variant',
#  'Benzoylation',
#  'Autophosphorylation',
#  'Deletion',
#  'See remerk 999',
#  'Allele',
#  'Myristoylated',
#  'Modified chromophore',
#  'Conflict',
#  'See remark 999; engineered',
#  'See sequence details',
#  'Gpgs tag',
#  'Insertion',
#  'Insertion; see remark 999',
#  '?',
#  'Sequencing error',
#  'strand',
#  'Missing in sws',
#  'Initiating met',
#  'Variant',
#  'Thrombin cleavage site',
#  'Protease cleavage site',
#  'Frameshift error',
#  'Methylation',
#  'Microheterogeneity; see remark 999',
#  'Natural variant',
#  'See remark 999',
#  'Isoform',
#  'Formylation',
#  'Polymorphic variant',
#  'Gcn4 tag',
#  'Detection tag',
#  'Variant strain',
#  'Initiator n-formyl-met',
#  'Modified amino acid',
#  'Allelic variant',
#  'Modification',
#  'Modified residues',
#  'Myc epitope',
#  'loop',
#  'Kt3 tag',
#  'Cleavage site',
#  'See remark 400',
#  'Chromophore; see remark 999',
#  'Correction',
#  'Initiator methionine',
#  'Sequence correction',
#  'Polymorphic site',
#  'T',
#  'See sequence_details',
#  'Modified residue',
#  'Engineered mutation',
#  'Hydroxylation',
#  'Remark 999',
#  'H',
#  'Microheterogeneity',
#  'Acetylation',
#  'Initiating methionine',
#  'New isozyme',
#  'See reamrk 999',
#  'Cloning artifact',
#  'Prescission site',
#  'Strain',
#  'D-configuration',
#  'Myristoylation',
#  'Modified initiating methionine',
#  'Signal peptide',
#  'Phosphorylation']

# None_type_list = list()
# for annotation in add_set_of_annotation:
#     try:
#         with open(annotation + '.txt', 'w') as filehandle:
#             for banch in resulting:
#                 try:
#                     for li in banch:
#                         try:
#                             if li[1][0] == "Annotation:" and li[1][1] == annotation:
#                                 filehandle.write('%s\n' % list(li))
#                         except IndexError:
#                             pass
#                 except TypeError:
#                     None_type_list.append(banch)
                    
#     except FileNotFoundError:
#         pass
    
# with open("all_annotation" + '.txt', 'w') as filehandle:
#     for banch in resulting:
#         try:
#             for li in banch:
#                 try:
#                     if li[1][0] == "Annotation:":
#                         filehandle.write('%s\n' % list(li))
#                 except IndexError:
#                     pass
#         except TypeError:
#             pass

In [None]:
### for the analysis
# sums_50k = 0
# sums_renum = 0
# new_sums_list = list()
# for n in resulting:
#     for z in n:
#         try:
#             sums_50k += int(z[-1])
#             sums_renum += int(z[-2])
#             PDBid = z[0]
#                 # print(z)
#         except TypeError:
#             new_sums_list.append([PDBid, sums_renum, sums_50k])
#             # print(sums)
#             sums_50k = 0
#             sums_renum = 0

# both = set()
# Uni = set()
# _50k = set()
# nothing = set()

# for n in new_sums_list:
#     if n[1] == 0 and n[2] == 0:
#         nothing.add(n[0])
#     if n[1] != 0 and n[2] == 0:
#         Uni.add(n[0])
#     if n[1] == 0 and n[2] != 0:
#         _50k.add(n[0])
#     if n[1] != 0 and n[2] != 0:
#         both.add(n[0])
        
# print(len(nothing))
# print(len(_50k))
# print(len(Uni))
# print(len(both))

# no_UNi_set = set()
# no_cnange_set = set()
# no_SIFTS_set = set()
# for n in input_mmCIF_files_were_found:
#     if "no_UniProt" in n:
#         no_UNi_set.add(n[:4])
#     if "no_SIFTS" in n:
#         no_SIFTS_set.add(n[:4])
#     if "no_change" in n:
#         no_cnange_set.add(n[:4])

# no_UNi_set = set()
# no_cnange_set = set()
# no_SIFTS_set = set()
# for n in input_mmCIF_files_were_found:
#     if "no_UniProt" in n:
#         no_UNi_set.add(n[:4])
#     if "no_SIFTS" in n:
#         no_SIFTS_set.add(n[:4])
#     if "no_change" in n:
#         no_cnange_set.add(n[:4])

# A) no sifts file available (we don't renumber) 4026
# B) sifts file available but no Uniprot information (we don't renumber) 5591
# C) sifts available with UniProt information (we try to renumber) 164220

# These are mutually exclusive sets. So A+B+C = N, the number of entries in PDB. N total 173837

# We can divide C into:

# D) no changes in numbering at all 71246
# E) NumChanges>0 but only changes that are 50000+SeqResNum 25857
# F) NumChanges>0 but only changes due to UNP 45651
# G) NumChanges>0 with changes due to UNP and 50000+SeqResNum, 21466

# These are also mutually exclusive. Of most interest is the sum F+G, which is 67117 (45651 + 21466).

In [None]:
### cases when pdb_seq_num diff from auth_seq_num
# cases_of_diff_in_poly_seq_scheme = set()
# cases_of_diff_in_nonpoly_scheme = set()
# for input_file in tqdm.tqdm(input_mmCIF_files_were_found, total=len(input_mmCIF_files_were_found), position=0, leave=True, desc="Checking files"):

#     input_mmCIF_assembly_files_were_found_list = list()
#     input_mmCIF_assembly_files_were_found_list.append(input_file)
    
#     for mmCIF_name in input_mmCIF_assembly_files_were_found_list:
#         mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
#         if mmcif_dict == 0:
#             continue
#     try:       
#         for n in range(len(mmcif_dict["_pdbx_poly_seq_scheme.pdb_seq_num"])-1):
#             if mmcif_dict["_pdbx_poly_seq_scheme.pdb_seq_num"][n] != mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"][n] and mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"][n] != "?":
#                 return mmCIF_name
#                 # cases_of_diff_in_poly_seq_scheme.add(mmCIF_name)
#     except KeyError:
#         pass
            
#     try:
#         for n in range(len(mmcif_dict["_pdbx_nonpoly_scheme.pdb_seq_num"])-1):
#             if mmcif_dict["_pdbx_nonpoly_scheme.pdb_seq_num"][n] != mmcif_dict["_pdbx_nonpoly_scheme.auth_seq_num"][n] and mmcif_dict["_pdbx_nonpoly_scheme.auth_seq_num"][n] != "?":
#                 cases_of_diff_in_nonpoly_scheme.add(mmCIF_name)
#     except KeyError:
#         pass

# def master_mmCIF_renumber_function(input_mmCIF_files_were_found, default_input_path_to_mmCIF,
#                                    default_input_path_to_SIFTS, default_output_path_to_mmCIF,
#                                    default_mmCIF_num, gzip_mode):

#     input_mmCIF_assembly_files_were_found_list = list()
#     input_mmCIF_assembly_files_were_found_list.append(input_mmCIF_files_were_found)
    
#     for mmCIF_name in input_mmCIF_assembly_files_were_found_list:
#         mmcif_dict = try_MMCIF2Dict(default_input_path_to_mmCIF, mmCIF_name)
#         if mmcif_dict == 0:
#             continue
#     try:
#         mmcif_dict["_pdbx_poly_seq_scheme.pdb_seq_num"]
# #         for n in range(len(mmcif_dict["_pdbx_poly_seq_scheme.pdb_seq_num"])-1):
# #             if mmcif_dict["_pdbx_poly_seq_scheme.pdb_seq_num"][n] != mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"][n] and mmcif_dict["_pdbx_poly_seq_scheme.auth_seq_num"][n] != "?":
# #                 pass
#                 # cases_of_diff_in_poly_seq_scheme.add(mmCIF_name)
#     except KeyError:
#         return mmCIF_name


# # # nonpoly_diff = resulting
# with open('without_poly_scheme' + '.txt', 'w') as filehandle:
#     for banch in resulting:
#         filehandle.write('%s\n' % banch)

In [None]:
################################PDB format renum#####################################

In [61]:
from src.download.modules import *
from src.renum.shared.handling_chain_numbering_clashes import handling_chain_numbering_clashes
from src.renum.shared.SIFTS_tree_parser import SIFTS_tree_parser
from src.renum.shared.renumbered_count_in_chains import renumbered_count_in_chains
from src.download.downloadwithThreadPool import download_with_pool, url_formation_for_pool
# "REMARK   0" header lines stating that a file was processed by PDBrenum and how
# residues were renumbered.
# NOTE(review): not referenced in the visible part of this notebook; presumably
# written into renumbered PDB-format output - confirm against the writer.
# The strings look space-padded to a fixed width (presumably the 80-column PDB
# record width) - keep the padding when editing the text.
# NOTE(review): the "(where label_seq_id ..." line never closes its parenthesis,
# and the text hard-codes 5000 while SIFTS_data_parser_for_PDB receives the offset
# as default_PDB_num - confirm the two always agree.
PDBrenum_REMARK = [
    "REMARK   0  File processed by PDBrenum: http://dunbrack3.fccc.edu/PDBrenum      ",
    "REMARK   0  Author sequence numbering is replaced with UniProt numbering        ",
    "REMARK   0  according to alignment by SIFTS                                     ",
    "REMARK   0  (https://www.ebi.ac.uk/pdbe/docs/sifts/).                           ",                 
    "REMARK   0  Only chains with UniProt sequences in SIFTS are renumbered.         ",
    "REMARK   0  Residues in UniProt chains without UniProt residue numbers in SIFTS ",
    "REMARK   0  (e.g., sequence tags) are given residue numbers 5000+label_seq_id   ",
    "REMARK   0  (where label_seq_id is the 1-to-N residue numbering of each chain.  ",
    "REMARK   0  Ligands are numbered 5000+their residue number in the original      ",
    "REMARK   0  file. The _poly_seq_scheme table contains a correspondence between  ",
    "REMARK   0  the 1-to-N sequence (seq_id), the new numbering based on UniProt    ",
    "REMARK   0  (pdb_seq_num = auth_seq_id in the _atom_site records), and          ",
    "REMARK   0  the author numbering in the original mmCIF file from the PDB        ",
    "REMARK   0  (auth_seq_num).                                                     "]

def SIFTS_data_parser_for_PDB(tuple_PDBe_for_PDB_and_tuple_PDB, tuple_PDBe_for_UniProt_and_tuple_UniProt,
                              default_PDB_num, chains_to_change="all"):
    """Combine the PDBe->PDB and PDBe->UniProt mappings into one renumbering table.

    Parameters
    ----------
    tuple_PDBe_for_PDB_and_tuple_PDB : iterable of (PDBe, PDB) rows
        Each ``PDB`` value is indexed as ``value[0]`` (compared with ``"null"``)
        and ``value[2]`` (stripped and looked up in ``chains_to_change``).
        Presumably (residue number, residue name, chain id) tuples produced by
        the SIFTS parser - confirm against the caller.
    tuple_PDBe_for_UniProt_and_tuple_UniProt : iterable of (PDBe, UniProt, AccessionID) rows
        ``PDBe`` is the join key; only the first row per ``PDBe`` value is kept.
    default_PDB_num : int
        Offset added to ``int(PDBe[0])`` for rows that have no UniProt mapping.
    chains_to_change : str or container of str, default "all"
        ``"all"`` renumbers every row; otherwise only rows whose ``PDB[2].strip()``
        is ``in chains_to_change`` get the new number, the others keep ``PDB[0]``.

    Returns
    -------
    list
        ``[table_without_null, full_table]``: ``full_table`` has the columns
        ``PDBe, PDB, UniProt, AccessionID, Uni_moD, new_col_Uni, UniProt_5k,
        Three_Rows_CIF_Num_Uni``; ``table_without_null`` is the same table without
        the rows whose ``PDB[0]`` is ``"null"``, indexed by ``PDBe``.

    Raises
    ------
    ValueError
        If the number of a row without UniProt mapping cannot be converted by ``int``.
    """
    df_PDBe_UniProt = pd.DataFrame(tuple_PDBe_for_UniProt_and_tuple_UniProt, columns=['PDBe', 'UniProt', "AccessionID"])
    df_PDBe_UniProt = df_PDBe_UniProt.drop_duplicates(subset="PDBe", keep='first')
    df_PDBe_PDB = pd.DataFrame(tuple_PDBe_for_PDB_and_tuple_PDB, columns=['PDBe', 'PDB'])
    df_PDBe_PDB = df_PDBe_PDB.drop_duplicates(subset="PDBe", keep='first')

    df_PDBe_PDB_UniProt = df_PDBe_PDB.merge(df_PDBe_UniProt, on="PDBe", how='left')

    # Rows without a UniProt partner are marked with the sentinel string "5000"
    # (a sentinel only; the numeric offset is default_PDB_num).
    df_PDBe_PDB_UniProt['UniProt'] = df_PDBe_PDB_UniProt['UniProt'].replace(np.nan, "5000")
    has_UniProt = df_PDBe_PDB_UniProt['UniProt'] != "5000"

    # Uni_moD: the UniProt entry where there is one, otherwise the PDBe entry.
    df_PDBe_PDB_UniProt["Uni_moD"] = np.where(has_UniProt, df_PDBe_PDB_UniProt['UniProt'], df_PDBe_PDB_UniProt["PDBe"])
    df_PDBe_PDB_UniProt['new_col_Uni'] = df_PDBe_PDB_UniProt.Uni_moD.map(lambda x: x[0])

    # Build the final column in one pass: mapped rows keep their number as-is,
    # unmapped string numbers become int(number) + default_PDB_num. The previous
    # version created an integer column and then assigned strings into it via .loc,
    # which relies on silent upcasting (deprecated, an error in newer pandas).
    UniProt_5k = []
    for number, is_mapped in zip(df_PDBe_PDB_UniProt['new_col_Uni'], has_UniProt):
        if is_mapped or type(number) != str:
            UniProt_5k.append(number)
        else:
            UniProt_5k.append(int(number) + default_PDB_num)
    df_PDBe_PDB_UniProt["UniProt_5k"] = UniProt_5k

    renumber_every_chain = chains_to_change == "all"
    Three_Rows_CIF_Num_Uni = []
    for PDBe, new_number, Uni_moD, PDB, AccessionID in zip(df_PDBe_PDB_UniProt["PDBe"],
                                                           df_PDBe_PDB_UniProt["UniProt_5k"],
                                                           df_PDBe_PDB_UniProt["Uni_moD"],
                                                           df_PDBe_PDB_UniProt["PDB"],
                                                           df_PDBe_PDB_UniProt["AccessionID"]):
        if not renumber_every_chain and PDB[2].strip() not in chains_to_change:
            # chain not selected for renumbering: keep the number stored in PDB[0]
            new_number = PDB[0]
        Three_Rows_CIF_Num_Uni.append([PDBe, new_number, Uni_moD, PDB, AccessionID])

    df_PDBe_PDB_UniProt["Three_Rows_CIF_Num_Uni"] = Three_Rows_CIF_Num_Uni
    df_PDBe_PDB_UniProt_without_null = df_PDBe_PDB_UniProt[df_PDBe_PDB_UniProt.PDB.map(lambda x: x[0]) != "null"]
    df_PDBe_PDB_UniProt_without_null_index_PDBe = df_PDBe_PDB_UniProt_without_null.set_index("PDBe")

    return [df_PDBe_PDB_UniProt_without_null_index_PDBe, df_PDBe_PDB_UniProt]


def try_SIFTS_tree_parser(default_input_path_to_SIFTS, SIFTS_name):
    """Parse one gzipped SIFTS file, re-downloading it on failure (up to three attempts).

    Parameters
    ----------
    default_input_path_to_SIFTS : str or path-like
        Folder that holds the SIFTS files.
    SIFTS_name : str
        File name of the SIFTS archive inside that folder.

    Returns
    -------
    object or int
        Whatever ``SIFTS_tree_parser`` returns for the opened file, or ``0`` when
        all three attempts failed.
    """
    SIFTS_path = Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name)

    def download_again():
        # Fetch a fresh copy of this one file into the SIFTS folder.
        download_with_pool(url_formation_for_pool("SIFTS", [SIFTS_name], default_input_path_to_SIFTS=default_input_path_to_SIFTS)[0],
                           default_input_path_to_SIFTS=default_input_path_to_SIFTS)

    product_tree_SIFTS = 0
    for _ in range(3):
        try:
            # Close the handle before any handler runs: an open handle is leaked
            # otherwise and makes the os.remove below fail on Windows.
            # NOTE(review): assumes SIFTS_tree_parser consumes the handle eagerly - confirm.
            with gzip.open(SIFTS_path, 'rt') as SIFTS_handle:
                product_tree_SIFTS = SIFTS_tree_parser(SIFTS_handle)
            break
        except (EOFError, ValueError):
            # truncated or undecodable file: drop it and fetch it again
            os.remove(SIFTS_path)
            download_again()
        except OSError:
            # missing file or not a gzip archive: fetch it again (file is not removed)
            download_again()
    return product_tree_SIFTS


def try_PDB(default_input_path_to_PDB, PDB):
    """Read a gzipped PDB file into a list of lines, re-downloading it on failure.

    Up to three attempts are made. After a failed attempt the file is
    downloaded again through ``download_with_pool``; for ``EOFError`` and
    ``ValueError`` the unreadable local copy is deleted first.

    Args:
        default_input_path_to_PDB: directory that holds the file.
        PDB: file name inside that directory.

    Returns:
        The file content as a list of lines without line endings, or the
        integer ``0`` when all three attempts failed.
    """
    PDB_path = Path(str(default_input_path_to_PDB) + "/" + PDB)
    # File names matching this pattern are fetched as "PDB_assembly",
    # everything else as "PDB".
    is_assembly_file = re.search(r'\.pdb(.*).gz', PDB) is not None

    def download_again():
        if is_assembly_file:
            download_with_pool(url_formation_for_pool("PDB_assembly", [PDB], default_input_path_to_PDB_assembly=default_input_path_to_PDB)[0],
                               default_input_path_to_PDB_assembly=default_input_path_to_PDB)
        else:
            download_with_pool(url_formation_for_pool("PDB", [PDB], default_input_path_to_PDB=default_input_path_to_PDB)[0],
                               default_input_path_to_PDB=default_input_path_to_PDB)

    split = 0
    for _ in range(3):
        try:
            # close the handle before the file may be removed or replaced
            with gzip.open(PDB_path, 'rt') as PDB_handle:
                split = PDB_handle.read().splitlines()
            break
        except (EOFError, ValueError):
            # unreadable content: drop the local copy, then fetch a fresh one
            os.remove(PDB_path)
            download_again()
        except OSError:
            # includes a missing file: there may be nothing to delete, just fetch
            download_again()
    return split


def if_no_SIFTS_data_log_for_PDB(default_input_path_to_PDB, PDB_id, PDB):
    """Build the log rows for a PDB file that is copied without renumbering.

    The file is read through ``try_PDB`` and the distinct residues found on
    ATOM / TER / ANISOU / SIGUIJ lines are counted in total and per chain.

    Args:
        default_input_path_to_PDB: directory that holds the file.
        PDB_id: identifier written into the first field of every row.
        PDB: file name inside the directory.

    Returns:
        A list of ten-field rows, one per chain in sorted chain order:
        ``[PDB_id, "-", chain, "-", "-", total, "-", in_chain, "0", "0"]``.
        A single row of dashes is returned when the file could not be read
        or contains no residue lines.
    """
    dash_only_row = [PDB_id, "-", "-", "-", "-", "-", "-", "-", "-", "-"]

    split = try_PDB(default_input_path_to_PDB, PDB)
    if split == 0:
        # try_PDB signals "could not be read" with 0, which cannot be iterated
        return [dash_only_row]

    residue_records = ("ATOM", "TER", "ANISOU", "SIGUIJ")
    distinct_residues = set()
    for line in split:
        # lines shorter than 22 characters have no chain column to read
        if line.startswith(residue_records) and len(line) > 21:
            distinct_residues.add((line[22:27].strip(" "), line[17:20], line[21]))

    if len(distinct_residues) == 0:
        return [dash_only_row]

    residues_per_chain = dict()
    for _, _, chain in distinct_residues:
        residues_per_chain[chain] = residues_per_chain.get(chain, 0) + 1

    total_residues = len(distinct_residues)
    # sorted() keeps the row order identical from run to run
    return [[PDB_id, "-", chain, "-", "-", total_residues, "-", residues_per_chain[chain], "0", "0"]
            for chain in sorted(residues_per_chain)]


def copy_file(inpath, file_name, outpath, postfix, gzip_mode):
    """Copy a gzipped structure file to the output directory under its ID.

    Args:
        inpath: directory of the source file (``str`` or ``pathlib.Path``).
        file_name: source file name. For names of the form ``pdbXXXX.ent.gz``
            the ID is the part between "pdb" and ".ent.gz"; otherwise the
            first four characters are used.
        outpath: destination directory (``str`` or ``pathlib.Path``).
        postfix: text appended to the ID to form the output name.
        gzip_mode: "off" writes the decompressed content to ``ID + postfix``;
            any other value copies the compressed file to
            ``ID + postfix + ".gz"``.
    """
    if file_name.endswith(".ent.gz") and file_name.startswith("pdb"):
        PDB_id = file_name[3:-len(".ent.gz")]
    else:
        PDB_id = file_name[:4]

    # str() lets callers pass Path objects, as the rest of the file does
    absolute_path_in = str(inpath) + "/" + file_name
    absolute_path_out = str(outpath) + "/" + PDB_id + postfix

    if gzip_mode == "off":
        with gzip.open(absolute_path_in, 'rb') as f_in, open(absolute_path_out, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    else:
        shutil.copyfile(absolute_path_in, absolute_path_out + ".gz")


def PDB_parser(split, df_PDBe_PDB_UniProt_without_null_index_PDBe, default_PDB_num):
    """Collect the residues of a PDB file and pair them with their new numbers.

    Args:
        split: the PDB file as a list of lines.
        df_PDBe_PDB_UniProt_without_null_index_PDBe: mapping table that is
            left-joined on its "PDB" column; the columns "Uni_moD", "UniProt"
            and "AccessionID" are also used here.
        default_PDB_num: offset added to all-digit numbers of residues that
            fall back to their own numbering.

    Returns:
        A five-element list:
        [0] Series indexed by the original (number, name, chain) tuple whose
            values are ``[original tuple, new number, Uni_moD]``;
        [1] list of (number, name, chain) tuples taken from HETATM lines;
        [2] DataFrame of mapped residues with the search string "PDB_str"
            and the replacement string "renum_str";
        [3] list of (number, name, chain) tuples taken from REMARK 465 lines;
        [4] flag that is False as soon as one new number is longer than
            four characters and True otherwise.
    """
    res_number_name_chainID_from_PDB_tuple = list()
    num_ins_code_name_chain_HETATM = list()
    missing_res_remark_465 = list()
    skipper_for_remark_465 = True
    # NOTE(review): despite its name the flag stays True while every number fits;
    # the caller copies the file unchanged when it is False.
    Num_is_Too_Big = True

    for n in split:
        # NOTE(review): "ANISOU" is tested twice below; the second test is redundant.
        if n.startswith("ATOM") or n.startswith("TER") or n.startswith("ANISOU") or n.startswith("ANISOU") or n.startswith("SIGUIJ"):
            # (residue number incl. column 27, residue name, chain ID)
            res_number_name_chainID_from_PDB_tuple.append((n[22:27].strip(" "), n[17:20], n[21]))
        if n.startswith("HETATM"):
            num_ins_code_name_chain_HETATM.append((n[22:27].strip(" "), n[17:20], n[21]))

        if n.startswith("REMARK 465"):
            # entries are collected only after the "RES C SSSEQI" column header line was seen
            if not skipper_for_remark_465:
                missing_res_remark_465.append((n[20:27].strip(" "), n[15:18], n[19]))
            if n[15:27] == "RES C SSSEQI":
                skipper_for_remark_465 = False

    # Two identical columns: one becomes the index, the other is kept as the join key.
    df_mmCIF = pd.DataFrame(list(zip(res_number_name_chainID_from_PDB_tuple, res_number_name_chainID_from_PDB_tuple)))
    df_mmCIF = df_mmCIF.rename(columns={0: "PDB_old", 1: "PDB_old_copy"})
    df_mmCIF = df_mmCIF.set_index("PDB_old")
    df_mmCIF = df_mmCIF.drop_duplicates()

    df_final = df_mmCIF.merge(df_PDBe_PDB_UniProt_without_null_index_PDBe, left_on="PDB_old_copy", right_on="PDB", how='left')
    # Residues without a match have NaN in "Uni_moD" after the left join; they are
    # marked with the sentinel "5000" and then given their own original tuple.
    df_final['Uni_moD'] = df_final['Uni_moD'].replace(np.nan, "5000")
    df_final["Uni_moD"] = np.where(df_final['Uni_moD'] != "5000", df_final['Uni_moD'], df_final["PDB_old_copy"])
    # first element of the tuple; for the fallback rows this is the residue number string
    df_final.loc[:, 'new_col_Uni'] = df_final.Uni_moD.map(lambda x: x[0])
    # all-digit numbers are shifted by default_PDB_num, anything else is kept as is
    df_final["UniProt_5k"] = df_final.new_col_Uni.apply(lambda x: (int(x) + default_PDB_num if x.isdigit() else x))
    # rows whose "UniProt" value differs from '5000' get the unshifted number back
    # NOTE(review): NaN also compares unequal to '5000', so unmatched rows appear to be reset here too - confirm intent.
    df_final.loc[df_final['UniProt'] != '5000', 'UniProt_5k'] = df_final['new_col_Uni']

    Three_Rows_CIF_Num_Uni = list()
    for index, rows in df_final.iterrows():
        intermediate_list = [rows.PDB_old_copy, rows.UniProt_5k, rows.Uni_moD]
        # a new number longer than four characters clears the flag
        if type(rows.UniProt_5k) == int:
            if len(str(rows.UniProt_5k)) > 4:
                Num_is_Too_Big = False
        else:
            if len(rows.UniProt_5k) > 4:
                Num_is_Too_Big = False
        Three_Rows_CIF_Num_Uni.append(intermediate_list)

    df_final["Three_Rows_CIF_Num_Uni"] = Three_Rows_CIF_Num_Uni
    df_final_index_PDBe = df_final.set_index("PDB_old_copy")
    # only residues that found a partner in the mapping table are renumbered
    df_final_index_PDBe_drop_NAN = df_final_index_PDBe.dropna(subset=['PDB'])
    pd_series_index_PDBe = df_final_index_PDBe["Three_Rows_CIF_Num_Uni"]

    # search string: name, chain right-aligned in 2, then the number right-aligned in 4
    # plus a blank (all digits) or right-aligned in 5 (anything else)
    PDB_str = df_final_index_PDBe_drop_NAN.PDB.map(
        lambda x: x[1] + "{:>2}".format(x[2]) + "{:>4}".format(x[0]) + " " if x[0].isdigit() else x[1] + "{:>2}".format(x[2]) + "{:>5}".format(x[0]))
    df_final_index_PDBe_drop_NAN = df_final_index_PDBe_drop_NAN.merge(PDB_str.rename('PDB_str'), left_index=True, right_index=True)
    df_final_poly_corrected = df_final_index_PDBe_drop_NAN.drop(columns=['UniProt', 'AccessionID', "new_col_Uni", "UniProt_5k", "Uni_moD"])
    # replacement string: same name and chain, new number right-aligned in 4 plus a blank
    renum_str = df_final_poly_corrected.PDB.map(lambda x: x[1] + "{:>2}".format(x[2])) + df_final_poly_corrected.Three_Rows_CIF_Num_Uni.map(
        lambda x: "{:>4}".format(str(int(x[1])))) + " "
    df_final_poly_corrected = df_final_poly_corrected.merge(renum_str.rename('renum_str'), left_index=True, right_index=True)
    df_final_poly_corrected = df_final_poly_corrected.reset_index(drop=True)

    return [pd_series_index_PDBe, num_ins_code_name_chain_HETATM, df_final_poly_corrected, missing_res_remark_465, Num_is_Too_Big]


def _highest_free_numbers(taken_numbers, how_many, highest_number):
    """Return up to ``how_many`` numbers not in ``taken_numbers``, counting down from ``highest_number`` to 1."""
    free_numbers = list()
    candidate = highest_number
    while candidate >= 1 and len(free_numbers) < how_many:
        if candidate not in taken_numbers:
            free_numbers.append(candidate)
        candidate -= 1
    return free_numbers


def _hetatm_search_string(residue):
    """Format a (number, name, chain) tuple the way it is searched for in the PDB text."""
    if residue[0].isdigit():
        return residue[1] + "{:>2}".format(residue[2]) + "{:>4}".format(residue[0]) + " "
    return residue[1] + "{:>2}".format(residue[2]) + "{:>5}".format(residue[0])


def non_poly_num(pd_series_index_PDBe, num_ins_code_name_chain_HETATM):
    """Give every HETATM residue a new number that is unused in its chain.

    Within each chain the distinct HETATM residues are sorted by their number
    string and receive the highest free numbers, counting down from 9999.
    A number counts as taken when the digits of a polymer residue's new
    number in that chain spell it.

    Unlike the earlier implementation this does not rely on
    ``DataFrame.append`` or ``count(level=...)`` (both removed in pandas 2.0),
    tolerates empty input, and treats a chain whose residues are not
    contiguous in the input as one chain instead of several independent runs.

    Args:
        pd_series_index_PDBe: iterable of ``[(number, name, chain), new number, ...]``
            entries describing the polymer residues.
        num_ins_code_name_chain_HETATM: list of (number, name, chain) tuples
            taken from HETATM lines; duplicates are allowed.

    Returns:
        DataFrame with the columns "PDB" (original tuple), "available_number"
        (assigned number, stored as float), "PDB_str" (text to search for)
        and "renum_str" (replacement text).
    """
    highest_number = 9999

    # digits of every new polymer number, collected per chain
    taken_numbers_per_chain = dict()
    for entry in pd_series_index_PDBe:
        chain_label = entry[0][2]
        digits_only = ''.join(symbol for symbol in str(entry[1]) if symbol.isdigit())
        chain_numbers = taken_numbers_per_chain.setdefault(chain_label, set())
        if digits_only:
            chain_numbers.add(int(digits_only))

    df_nonpoly = pd.DataFrame(
        list(zip(num_ins_code_name_chain_HETATM,
                 [residue[2] for residue in num_ins_code_name_chain_HETATM],
                 [residue[0] for residue in num_ins_code_name_chain_HETATM])),
        columns=['PDB', "PDB_chain", "numbers"])
    df_nonpoly_dropped_dup_sorted = df_nonpoly.drop_duplicates(subset="PDB", keep='first').sort_values(
        ["PDB_chain", "numbers"], ascending=[True, True]).reset_index(drop=True)

    # the frame is sorted by chain, so the insertion order of this dict is the sorted chain order
    hetatm_per_chain = dict()
    for chain_label in df_nonpoly_dropped_dup_sorted["PDB_chain"]:
        hetatm_per_chain[chain_label] = hetatm_per_chain.get(chain_label, 0) + 1

    assigned_numbers = list()
    for chain_label, how_many in hetatm_per_chain.items():
        assigned_numbers.extend(
            _highest_free_numbers(taken_numbers_per_chain.get(chain_label, ()), how_many, highest_number))

    # rows and assigned numbers are paired by position; surplus rows (no free number left) are dropped
    df_final_nonpoly_corrected = df_nonpoly_dropped_dup_sorted.loc[:, ["PDB"]].iloc[:len(assigned_numbers)].copy()
    residues = list(df_final_nonpoly_corrected["PDB"])
    df_final_nonpoly_corrected["available_number"] = [float(number) for number in assigned_numbers]
    df_final_nonpoly_corrected["PDB_str"] = [_hetatm_search_string(residue) for residue in residues]
    df_final_nonpoly_corrected["renum_str"] = [
        residue[1] + "{:>2}".format(residue[2]) + "{:>4}".format(str(number)) + " "
        for residue, number in zip(residues, assigned_numbers)]

    return df_final_nonpoly_corrected


def remark_465(missing_res_remark_465, df_PDBe_PDB_UniProt):
    """Pair the REMARK 465 residues with the mapping rows whose PDB number is "null".

    The distinct REMARK 465 tuples and the "null" rows are joined on their
    position, and a search string ("PDB_str") and a replacement string
    ("renum_str") are built for each pair.

    Args:
        missing_res_remark_465: list of (number, name, chain) tuples.
        df_PDBe_PDB_UniProt: mapping table containing at least the columns
            "PDB", "UniProt", "AccessionID", "new_col_Uni", "UniProt_5k",
            "Uni_moD" and "Three_Rows_CIF_Num_Uni".

    Returns:
        DataFrame with a fresh index that holds "PDB_old", the remaining
        mapping columns, "PDB_str" and "renum_str".
    """
    def name_and_chain(residue):
        # residue name followed by the chain ID right-aligned in two characters
        return residue[1] + "{:>2}".format(residue[2])

    def search_string(residue):
        # all-digit numbers: width 6 plus a trailing blank; anything else: width 5
        if residue[0].isdigit():
            return name_and_chain(residue) + "{:>6}".format(residue[0]) + " "
        return name_and_chain(residue) + "{:>5}".format(residue[0])

    has_null_number = df_PDBe_PDB_UniProt['PDB'].apply(lambda mapped: mapped[0] == "null")
    null_rows = df_PDBe_PDB_UniProt.loc[has_null_number].reset_index(drop=True)

    remark_rows = pd.DataFrame(list(zip(missing_res_remark_465))).rename(columns={0: "PDB_old"}).drop_duplicates()

    paired = remark_rows.merge(null_rows, left_index=True, right_index=True)
    paired.loc[:, 'PDB_str'] = paired.PDB_old.map(search_string)

    corrected = paired.drop(columns=['UniProt', 'AccessionID', "new_col_Uni", "UniProt_5k", "Uni_moD"])
    new_numbers = corrected.Three_Rows_CIF_Num_Uni.map(lambda triple: "{:>6}".format(str(int(triple[1]))))
    corrected.loc[:, 'renum_str'] = corrected.PDB_old.map(name_and_chain) + new_numbers + " "

    return corrected.reset_index(drop=True)


def _indent_variants(text):
    """Return ``text`` in the four spacings used by different PDB record types."""
    return [text,
            text[:3] + " " + text[3:],   # present HET
            text[:5] + " " + text[5:],   # present most common
            text[:5] + "  " + text[5:]]  # present REMARK 500


def final_dict_formation(df_final_poly_corrected, df_final_nonpoly_corrected, final_remark_465, chains_to_change):
    """Build the search-text -> replacement-text dictionary for one PDB file.

    The three tables are stacked in the given order, the first row per
    "PDB_str" is kept, and only residues whose chain (third element of the
    "PDB" tuple) is in ``chains_to_change`` are used. Every pair is stored in
    four spacing variants.

    ``pd.concat`` is used for stacking because ``DataFrame.append`` was
    removed in pandas 2.0.

    Args:
        df_final_poly_corrected: table with "PDB", "PDB_str" and "renum_str".
        df_final_nonpoly_corrected: table with the same three columns.
        final_remark_465: table with the same three columns, or ``None``.
        chains_to_change: container of chain IDs that are renumbered.

    Returns:
        dict mapping each search string to its replacement string, in the
        order in which the strings were first produced.
    """
    frames = [frame for frame in (df_final_poly_corrected, df_final_nonpoly_corrected, final_remark_465)
              if frame is not None]
    all_data_df = pd.concat(frames, ignore_index=True, sort=False)
    all_data_df_drop_dup = all_data_df.drop_duplicates(subset="PDB_str", keep='first')

    # astype(bool) keeps the mask usable when the table is empty
    in_chains_to_change = all_data_df_drop_dup.PDB.map(lambda x: x[2] in chains_to_change).astype(bool)
    selected = all_data_df_drop_dup[in_chains_to_change]

    dict_for_replacement = dict()
    for PDB_str, renum_str in zip(selected["PDB_str"], selected["renum_str"]):
        if "-" in PDB_str:
            # drop the character at position 5 and pad on the right, which gives
            # these strings the same length as the trailing-blank form
            PDB_str = PDB_str[:5] + PDB_str[6:] + " "
        for search_text, replacement_text in zip(_indent_variants(PDB_str), _indent_variants(renum_str)):
            dict_for_replacement[search_text] = replacement_text
    return dict_for_replacement


def replace_all(lines, dict_for_replacement):
    """Apply the replacement dictionary to one line of text.

    The entries are tried in dictionary order. An entry is skipped when its
    key is absent, or when the key's first occurrence sits exactly where the
    previous replacement was written (position 0 before any replacement).

    Args:
        lines: the text of a single line.
        dict_for_replacement: mapping of search text to replacement text.

    Returns:
        The line after all applicable replacements.
    """
    last_written_at = 0
    for search_text, replacement_text in dict_for_replacement.items():
        found_at = lines.find(search_text)
        if found_at == -1:
            continue
        if found_at == last_written_at:
            # do not touch the spot that was just rewritten
            continue
        lines = lines.replace(search_text, replacement_text)
        last_written_at = lines.find(replacement_text)
    return lines


def master_PDB_renumber_function(input_PDB_files_were_found, default_input_path_to_PDB, default_input_path_to_SIFTS,
                                 default_output_path_to_PDB, default_PDB_num, gzip_mode, exception_AccessionIDs):
    """Renumber one gzipped PDB file with the help of its SIFTS file.

    The file is copied to the output directory unchanged when there is no
    usable SIFTS file, when the SIFTS data holds no UniProt mapping, when
    nothing would change, or when a new number does not fit. Otherwise every
    line is passed through ``replace_all`` and written to the output
    directory (gzipped when ``gzip_mode`` is "on").

    Args:
        input_PDB_files_were_found: name of the single file to process.
        default_input_path_to_PDB: directory that holds the input file.
        default_input_path_to_SIFTS: directory that holds the SIFTS files.
        default_output_path_to_PDB: directory for the result; created if missing.
        default_PDB_num: offset passed on to the SIFTS and PDB parsers.
        gzip_mode: "on" to gzip the result, "off" to leave it uncompressed.
        exception_AccessionIDs: passed on to ``handling_chain_numbering_clashes``.

    Returns:
        The log rows for this file as a list. When the SIFTS file or the PDB
        file could not be read even after the retries, a single row of dashes
        is returned (the earlier implementation returned ``None`` here, which
        the log writer cannot iterate over).
    """
    # exist_ok avoids a crash when several workers create the directory at once
    os.makedirs(default_output_path_to_PDB, exist_ok=True)

    PDB = input_PDB_files_were_found

    assembly_match = re.search(r'\.pdb(.*).gz', PDB)
    if assembly_match is not None:
        assembly_num = assembly_match.group(1)
        PDB_id = PDB[:4]
    else:
        assembly_num = ""
        PDB_id = PDB[3:7]
    SIFTS_name = PDB_id + ".xml.gz"

    SIFTS_path = Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name)
    output_postfix = ".pdb" + assembly_num
    output_path = Path(str(default_output_path_to_PDB) + "/" + PDB_id + output_postfix)
    gzipped_output_path = Path(str(default_output_path_to_PDB) + "/" + PDB_id + output_postfix + ".gz")
    unreadable_input_log_message = [[PDB_id, "-", "-", "-", "-", "-", "-", "-", "-", "-"]]

    def copy_unchanged():
        copy_file(default_input_path_to_PDB, PDB, default_output_path_to_PDB, output_postfix, gzip_mode)

    def copy_unchanged_and_count_residues():
        copy_unchanged()
        return if_no_SIFTS_data_log_for_PDB(default_input_path_to_PDB, PDB_id, PDB)

    # for no corresponding SIFTS
    try:
        # opened only to learn whether the file exists; closed right away
        gzip.open(SIFTS_path, 'rt').close()
    except FileNotFoundError:
        return copy_unchanged_and_count_residues()

    # for zero byte SIFTS
    if os.path.getsize(SIFTS_path) == 0:
        return copy_unchanged_and_count_residues()

    product_tree_SIFTS = try_SIFTS_tree_parser(default_input_path_to_SIFTS, SIFTS_name)
    if product_tree_SIFTS == 0:
        return unreadable_input_log_message

    tuple_PDBe_for_PDB_and_tuple_PDB = product_tree_SIFTS[0]
    tuple_PDBe_for_UniProt_and_tuple_UniProt = product_tree_SIFTS[1]
    UniProt_conversion_dict = product_tree_SIFTS[2]

    # for no UniProt in SIFTS
    if len(tuple_PDBe_for_UniProt_and_tuple_UniProt) == 0:
        return copy_unchanged_and_count_residues()

    split = try_PDB(default_input_path_to_PDB, PDB)
    if split == 0:
        return unreadable_input_log_message

    # first pass over all chains, used to resolve numbering clashes
    product_of_SIFTS_data_parser = SIFTS_data_parser_for_PDB(tuple_PDBe_for_PDB_and_tuple_PDB,
                                                             tuple_PDBe_for_UniProt_and_tuple_UniProt,
                                                             default_PDB_num, 'all')
    df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

    handling_chain_numbering = handling_chain_numbering_clashes(df_PDBe_PDB_UniProt, exception_AccessionIDs)
    chains_to_change = handling_chain_numbering[0]
    combined_tuple_PDBe_UniProt_AccessionID = handling_chain_numbering[1]
    longest_AccessionID_list = handling_chain_numbering[3]
    chains_to_change_one_to_end = handling_chain_numbering[4]

    # second pass restricted to the chains that are going to change
    product_of_SIFTS_data_parser = SIFTS_data_parser_for_PDB(tuple_PDBe_for_PDB_and_tuple_PDB, combined_tuple_PDBe_UniProt_AccessionID,
                                                             default_PDB_num, chains_to_change)
    df_PDBe_PDB_UniProt_without_null_index_PDBe = product_of_SIFTS_data_parser[0]
    df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

    renumbered_count = renumbered_count_in_chains(chains_to_change_one_to_end, df_PDBe_PDB_UniProt_without_null_index_PDBe,
                                                  PDB_id, UniProt_conversion_dict, longest_AccessionID_list)
    chain_total_renum = renumbered_count[0]
    nothing_changed = renumbered_count[1]

    # the log rows are followed by the "nothing changed" indicator
    chain_total_renum.append(nothing_changed)
    mod_log_message = chain_total_renum

    if nothing_changed == 0:
        copy_unchanged()
        return mod_log_message

    parsed_PDB = PDB_parser(split, df_PDBe_PDB_UniProt_without_null_index_PDBe, default_PDB_num)
    pd_series_index_PDBe = parsed_PDB[0]
    num_ins_code_name_chain_HETATM = parsed_PDB[1]
    df_final_poly_corrected = parsed_PDB[2]
    missing_res_remark_465 = parsed_PDB[3]
    Num_is_Too_Big = parsed_PDB[4]

    # when numbers get too big (the flag is False in that case)
    if not Num_is_Too_Big:
        copy_unchanged()
        return mod_log_message

    df_final_nonpoly_corrected = non_poly_num(pd_series_index_PDBe, num_ins_code_name_chain_HETATM)
    if len(missing_res_remark_465) != 0:
        final_remark_465 = remark_465(missing_res_remark_465, df_PDBe_PDB_UniProt)
    else:
        final_remark_465 = None

    dict_for_replacement = final_dict_formation(df_final_poly_corrected, df_final_nonpoly_corrected, final_remark_465, chains_to_change)

    with open(output_path, "w") as outF:
        for lines in split:
            outF.write(replace_all(lines, dict_for_replacement))
            outF.write("\n")

    if gzip_mode == "on":
        with open(output_path, 'rb') as f_in:
            with gzip.open(gzipped_output_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(output_path)

    return mod_log_message


def ProcessPool_run_renum_PDB(format_to_download, input_PDB_files_were_found, default_input_path_to_PDB, default_input_path_to_SIFTS,
                              default_output_path_to_PDB, default_PDB_num, gzip_mode, exception_AccessionIDs, nproc):
    """Run ``master_PDB_renumber_function`` for every file in a process pool.

    Args:
        format_to_download: label shown in the progress bar description.
        input_PDB_files_were_found: iterable of file names, one job per name.
        default_input_path_to_PDB: directory that holds the input files.
        default_input_path_to_SIFTS: directory that holds the SIFTS files.
        default_output_path_to_PDB: directory for the results; created if missing.
        default_PDB_num: passed on to every job.
        gzip_mode: passed on to every job.
        exception_AccessionIDs: passed on to every job.
        nproc: number of worker processes.

    Returns:
        The job results as a list, in the order in which the jobs finished.
    """
    os.makedirs(default_output_path_to_PDB, exist_ok=True)

    partial_master_PDB_renumber_function = partial(master_PDB_renumber_function,
                                                   default_input_path_to_PDB=default_input_path_to_PDB,
                                                   default_input_path_to_SIFTS=default_input_path_to_SIFTS,
                                                   default_output_path_to_PDB=default_output_path_to_PDB,
                                                   default_PDB_num=default_PDB_num, gzip_mode=gzip_mode,
                                                   exception_AccessionIDs=exception_AccessionIDs)

    resulting = list()
    # the context manager shuts the worker processes down, also when a job raises
    with ProcessPoolExecutor(max_workers=nproc) as executor:
        jobs = [executor.submit(partial_master_PDB_renumber_function, pdb_files) for pdb_files in input_PDB_files_were_found]
        # one progress bar, advanced once per finished job
        for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), position=0, leave=True,
                             desc="Renumbering " + format_to_download + " files"):
            resulting.append(job.result())

    return resulting

In [58]:
# Show the first entry of the list of discovered PDB files.
input_PDB_files_were_found[:1]

['pdb1g97.ent.gz']

In [111]:
# Show the first entry of the list of discovered PDB files (the file used by the debugging cell that follows).
input_PDB_files_were_found[:1]

['pdb1g97.ent.gz']

In [109]:
    
# Debugging copy of the body of master_PDB_renumber_function, run inline for the
# first discovered file only, so that every intermediate value stays inspectable
# in the notebook. Each `return` of the function is replaced by `continue`.
# NOTE(review): unlike the function, this cell calls SIFTS_data_parser_for_mmCIF
# and inserts PDBrenum_REMARK after the HEADER lines - confirm that both are intended.
for n in input_PDB_files_were_found[:1]:
    if not os.path.exists(default_output_path_to_PDB):
        os.makedirs(default_output_path_to_PDB)

    input_PDB_files_were_found_list = list()
    input_PDB_files_were_found_list.append(n)

    for PDB in input_PDB_files_were_found_list:

        try:
            assembly_num = re.search(r'\.pdb(.*).gz', PDB).group(1)
            SIFTS_name = PDB[:4] + ".xml.gz"
            PDB_id = PDB[:4]
        except AttributeError:
            assembly_num = ""
            SIFTS_name = PDB[3:7] + ".xml.gz"
            PDB_id = PDB[3:7]

        # for no corresponding SIFTS
        try:
            # opened only to learn whether the file exists; closed right away
            with gzip.open(Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name), 'rt') as handle_SIFTS:
                pass
        except FileNotFoundError:
            copy_file(default_input_path_to_PDB, PDB, default_output_path_to_PDB, ".pdb" + assembly_num, gzip_mode)
            log_message = if_no_SIFTS_data_log_for_PDB(default_input_path_to_PDB, PDB_id, PDB)
            # return log_message
            continue

        # for zero byte SIFTS
        if os.path.getsize(Path(str(default_input_path_to_SIFTS) + "/" + SIFTS_name)) == 0:
            copy_file(default_input_path_to_PDB, PDB, default_output_path_to_PDB, ".pdb" + assembly_num, gzip_mode)
            log_message = if_no_SIFTS_data_log_for_PDB(default_input_path_to_PDB, PDB_id, PDB)
            # return log_message
            continue

        product_tree_SIFTS = try_SIFTS_tree_parser(default_input_path_to_SIFTS, SIFTS_name)
        if product_tree_SIFTS == 0:
            continue

        tuple_PDBe_for_PDB_and_tuple_PDB = product_tree_SIFTS[0]
        tuple_PDBe_for_UniProt_and_tuple_UniProt = product_tree_SIFTS[1]
        UniProt_conversion_dict = product_tree_SIFTS[2]

        # for no UniProt in SIFTS
        if len(tuple_PDBe_for_UniProt_and_tuple_UniProt) == 0:
            copy_file(default_input_path_to_PDB, PDB, default_output_path_to_PDB, ".pdb" + assembly_num, gzip_mode)
            log_message = if_no_SIFTS_data_log_for_PDB(default_input_path_to_PDB, PDB_id, PDB)
            # return log_message
            continue

        split = try_PDB(default_input_path_to_PDB, PDB)
        if split == 0:
            continue

        product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, tuple_PDBe_for_UniProt_and_tuple_UniProt, default_PDB_num, 'all')
        df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

        handling_chain_numbering = handling_chain_numbering_clashes(df_PDBe_PDB_UniProt, exception_AccessionIDs)
        chains_to_change = handling_chain_numbering[0]
        combined_tuple_PDBe_UniProt_AccessionID = handling_chain_numbering[1]
        longest_AccessionID_list = handling_chain_numbering[3]
        chains_to_change_one_to_end = handling_chain_numbering[4]

        product_of_SIFTS_data_parser = SIFTS_data_parser_for_mmCIF(tuple_PDBe_for_PDB_and_tuple_PDB, combined_tuple_PDBe_UniProt_AccessionID, default_PDB_num, chains_to_change)
        df_PDBe_PDB_UniProt_without_null_index_PDBe = product_of_SIFTS_data_parser[0]
        df_PDBe_PDB_UniProt = product_of_SIFTS_data_parser[1]

        renumbered_count = renumbered_count_in_chains(chains_to_change_one_to_end, df_PDBe_PDB_UniProt_without_null_index_PDBe, PDB_id, UniProt_conversion_dict, longest_AccessionID_list)
        chain_total_renum = renumbered_count[0]
        nothing_changed = renumbered_count[1]

        chain_total_renum.append(nothing_changed)
        mod_log_message = chain_total_renum

        if nothing_changed == 0:
            copy_file(default_input_path_to_PDB, PDB, default_output_path_to_PDB, ".pdb" + assembly_num, gzip_mode)
            # return mod_log_message
            # without this the file was copied and then processed again below
            continue

        parsed_PDB = PDB_parser(split, df_PDBe_PDB_UniProt_without_null_index_PDBe, default_PDB_num)
        pd_series_index_PDBe = parsed_PDB[0]
        num_ins_code_name_chain_HETATM = parsed_PDB[1]
        df_final_poly_corrected = parsed_PDB[2]
        missing_res_remark_465 = parsed_PDB[3]
        Num_is_Too_Big = parsed_PDB[4]

        # when numbers get too big
        if not Num_is_Too_Big:
            copy_file(default_input_path_to_PDB, PDB, default_output_path_to_PDB, ".pdb" + assembly_num, gzip_mode)
            # return mod_log_message
            continue

        df_final_nonpoly_corrected = non_poly_num(pd_series_index_PDBe, num_ins_code_name_chain_HETATM)
        if len(missing_res_remark_465) != 0:
            final_remark_465 = remark_465(missing_res_remark_465, df_PDBe_PDB_UniProt)
        else:
            final_remark_465 = None

        dict_for_replacement = final_dict_formation(df_final_poly_corrected, df_final_nonpoly_corrected, final_remark_465, chains_to_change)

        # PDBrenum REMARK 0 insert: directly after the leading HEADER lines.
        # Nothing is inserted when every line is a HEADER line.
        start_remark_0 = 0
        while start_remark_0 < len(split) and split[start_remark_0].startswith("HEADER"):
            start_remark_0 += 1
        if start_remark_0 < len(split):
            split = split[:start_remark_0] + PDBrenum_REMARK + split[start_remark_0:]

        # renumbering
        with open(Path(str(default_output_path_to_PDB) + "/" + PDB_id + ".pdb" + assembly_num), "w") as outF:
            for lines in split:
                lines = replace_all(lines, dict_for_replacement)
                outF.write(lines)
                outF.write("\n")

        if gzip_mode == "on":
            with open(Path(str(default_output_path_to_PDB) + "/" + PDB_id + ".pdb" + assembly_num), 'rb') as f_in:
                with gzip.open(Path(str(default_output_path_to_PDB) + "/" + PDB_id + ".pdb" + assembly_num + ".gz"), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.remove(Path(str(default_output_path_to_PDB) + "/" + PDB_id + ".pdb" + assembly_num))

        # return mod_log_message

In [56]:
def ProcessPool_run_renum_PDB(format_to_download="PDB", input_PDB_files_were_found=(),
                              default_input_path_to_PDB=default_input_path_to_PDB,
                              default_input_path_to_SIFTS=default_input_path_to_SIFTS,
                              default_output_path_to_PDB=default_output_path_to_PDB,
                              default_PDB_num=default_PDB_num, gzip_mode=gzip_mode,
                              exception_AccessionIDs=None, nproc=None):
    """Run ``master_PDB_renumber_function`` for every file in a process pool.

    The first seven parameters and their defaults are unchanged. The two
    trailing optional parameters put the signature in the same order as the
    nine-parameter definition of this function, so calls written for either
    form work.

    Args:
        format_to_download: label shown in the progress bar description.
        input_PDB_files_were_found: iterable of file names, one job per name.
        default_input_path_to_PDB: directory that holds the input files.
        default_input_path_to_SIFTS: directory that holds the SIFTS files.
        default_output_path_to_PDB: directory for the results; created if missing.
        default_PDB_num: passed on to every job.
        gzip_mode: passed on to every job.
        exception_AccessionIDs: passed on to every job, which requires it.
            ``None`` is replaced by an empty list.
            NOTE(review): assumes an empty list means "no exceptions" - confirm
            against handling_chain_numbering_clashes.
        nproc: number of worker processes; ``None`` keeps the executor default.

    Returns:
        The job results as a list, in the order in which the jobs finished.
    """
    if exception_AccessionIDs is None:
        exception_AccessionIDs = list()

    if not os.path.exists(default_output_path_to_PDB):
        os.makedirs(default_output_path_to_PDB)

    partial_master_PDB_renumber_function = partial(master_PDB_renumber_function,
                                                   default_input_path_to_PDB=default_input_path_to_PDB,
                                                   default_input_path_to_SIFTS=default_input_path_to_SIFTS,
                                                   default_output_path_to_PDB=default_output_path_to_PDB,
                                                   default_PDB_num=default_PDB_num, gzip_mode=gzip_mode,
                                                   exception_AccessionIDs=exception_AccessionIDs)

    resulting = list()
    # the context manager shuts the worker processes down, also when a job raises
    with ProcessPoolExecutor(max_workers=nproc) as executor:
        jobs = [executor.submit(partial_master_PDB_renumber_function, pdb_files) for pdb_files in input_PDB_files_were_found]
        for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), position=0, leave=True, desc="Renumbering "+format_to_download+" files"):
            resultus = job.result()
            resulting.append(resultus)

    return resulting

In [47]:
# List the files found for "PDB" and pass them to ProcessPool_run_renum_PDB.
# NOTE(review): the output recorded for this cell is a NameError, i.e. it was run
# before the cell that defines ProcessPool_run_renum_PDB - run that cell first.
input_PDB_files_were_found = look_what_is_inside("PDB")
if __name__ == '__main__':
    resulting2 = ProcessPool_run_renum_PDB("PDB", input_PDB_files_were_found,
                                          default_input_path_to_PDB, 
                                          default_input_path_to_SIFTS,
                                          default_output_path_to_PDB,
                                          default_PDB_num, gzip_mode)

NameError: name 'ProcessPool_run_renum_PDB' is not defined

In [15]:
# List the files found for "PDB_assembly" and pass them to ProcessPool_run_renum_PDB,
# reading from and writing to the assembly directories.
# The progress bar recorded for this cell shows 245437 files taking about 22 hours.
input_PDB_assembly_files_were_found = look_what_is_inside("PDB_assembly")
if __name__ == '__main__':
    resulting = ProcessPool_run_renum_PDB("PDB_assembly", input_PDB_assembly_files_were_found,
                                          default_input_path_to_PDB_assembly, 
                                          default_input_path_to_SIFTS,
                                          default_output_path_to_PDB_assembly,
                                          default_PDB_num, gzip_mode)

Renumbering PDB_assembly files: 100%|██████████| 245437/245437 [21:55:39<00:00,  3.11it/s]   


[[['3isb', 'A', 'A', 'P06746', 'DPOLB_HUMAN', 327, 327, 327, 0, 0], 0],
 [['3mef', 'A', 'A', 'P0A9X9', 'CSPA_ECOLI', 69, 69, 69, 0, 0], 0],
 [['3rv5', 'A', 'A', 'P63316', 'TNNC1_HUMAN', 322, 322, 83, 0, 0],
  ['3rv5', 'B', 'B', 'P63316', 'TNNC1_HUMAN', 322, 322, 78, 0, 0],
  ['3rv5', 'C', 'C', 'P63316', 'TNNC1_HUMAN', 322, 322, 83, 0, 0],
  ['3rv5', 'D', 'D', 'P63316', 'TNNC1_HUMAN', 322, 322, 78, 0, 0],
  0],
 [['1kmq', 'A', 'A', 'P61586', 'RHOA_HUMAN', 177, 177, 177, 0, 0], 0],
 [['2ibs', 'E', 'A', 'P14385', 'MTTA_THEAQ', 786, 786, 393, 0, 0],
  ['2ibs', 'F', 'D', 'P14385', 'MTTA_THEAQ', 786, 786, 393, 0, 0],
  0],
 [['3wuz', 'A', 'A', nan, None, 120, 0, 120, 0, 1],
  ['3wuz', 'A', 'A', 'Q9UKJ1', 'PILRA_HUMAN', 120, 119, 120, 0, 1],
  1],
 [['1yqz', 'A', 'A', 'O52582', 'CDR_STAA8', 874, 874, 437, 0, 0],
  ['1yqz', 'B', 'B', 'O52582', 'CDR_STAA8', 874, 874, 437, 0, 0],
  0],
 [['5zhr', 'A', 'A', 'Q10QA5', 'D14_ORYSJ', 530, 530, 265, 0, 0],
  ['5zhr', 'B', 'B', 'Q10QA5', 'D14_ORYSJ', 5

In [100]:
# Scratch check: int() raises ValueError for a string that is not a number.
# NOTE(review): this cell always fails, so it stops a "Run All" at this point.
int("b")

ValueError: invalid literal for int() with base 10: 'b'

In [22]:
def log_writer(resulting):
    """Write the renumbering summary and the UniProt/PDB translation table.

    Two files are (re)created in the current working directory:

    * 'log_corrected.txt' - a header line followed by one fixed-width line per
      chain record found in `resulting`.
    * 'log_translator.txt' - the unique "comp_uni human_uni PDB_id" triples of
      the records that were written, skipping those whose comp_uni is "-".

    Args:
        resulting: iterable of per-entry iterables. Items whose type is exactly
            int are ignored; every other item is indexed as a 10-field record
            (id, chain_PDB, chain_auth, comp_uni, human_uni, prot_len, uni_len,
            chain_len, renum, 5k_or_50k).

    Records that raise IndexError or TypeError while being formatted (too few
    fields, or a field such as None that rejects the format spec) are skipped.
    """
    # Width/alignment of the ten data columns; the marker column in front is "<3".
    column_specs = ("<7", "<12", "<12", "<20", "<20", ">10", ">10", ">10", ">10", ">10")
    header_cells = ("PDB_id", "chain_PDB", "chain_auth", "comp_uni", "human_uni",
                    "prot_len", "uni_len", "chain_len", "renum", "5k_or_50k")

    def render(marker, cells):
        # One fixed-width line: marker column followed by the ten data columns.
        return format(marker, "<3") + "".join(
            format(cell, spec) for cell, spec in zip(cells, column_specs))

    compuni_humanuni_PDBid = []
    with open('log_corrected.txt', 'w') as log_file:
        log_file.write("%s\n" % render("SP", header_cells))

        for entry in resulting:
            for record in entry:
                if type(record) == int:
                    continue
                try:
                    # An id ending in "*" is logged with the "*" marker and cut to 4 characters;
                    # any other id is logged unchanged with the "+" marker.
                    starred = record[0][-1] == "*"
                    short_id = record[0][:4]
                    first_cell = short_id if starred else record[0]
                    line = render("*" if starred else "+",
                                  [first_cell] + [record[position] for position in range(1, 10)])
                    compuni_humanuni_PDBid.append((record[3], record[4], short_id))
                    log_file.write("%s\n" % line)
                except (IndexError, TypeError):
                    # Malformed record: leave it out of both files.
                    pass

    uniq_compuni_humanuni_PDBid_translation = {
        triple for triple in compuni_humanuni_PDBid if not triple[0] == "-"}

    with open('log_translator.txt', 'w') as filehandle:
        for triple in uniq_compuni_humanuni_PDBid_translation:
            filehandle.write(" ".join(triple) + "\n")

In [26]:
# Writes 'log_corrected.txt' and 'log_translator.txt' into the current working directory.
# NOTE(review): `resulting` must already be in the kernel (it is assigned by a renumbering cell).
log_writer(resulting)

In [16]:
# def log_translator_reader(list_of_uni, mod="AccessionId"):
#     if mod == "Human_readble_UniProt":
#         index_to_look = 1
#     else:
#         index_to_look = 0
        
#     target_pdb = set()
#     if type(list_of_uni) is str:
#         list_of_uni = [list_of_uni]

#     with open('log_translator.txt', 'r') as filehandle:
#         for n in filehandle.readlines():
#             for uni in list_of_uni:
#                 if n.split()[index_to_look] == uni:
#                     target_pdb.add(n.split()[2])
#     return(target_pdb)

# log_translator_reader(["GGACT_HUMAN","NBS1_SCHPO"], "Human_readble_UniProt")

In [None]:
# Count the distinct values of the second whitespace-separated column of 'log_PDBrenum.txt',
# and how many of them occur on a line whose second-to-last column is "0".
# NOTE(review): presumably the file has the layout log_writer() produces (column 2 = PDB id,
# second-to-last = "renum"); if it also starts with that header row, the header token is counted too — confirm.
set_of_PDB_entries_with_renum_0 = set()
all_data = set()
with open('log_PDBrenum.txt', 'rt') as f:
    for n in f:
        columns = n.split()
        second_column = columns[1]
        all_data.add(second_column)
        if columns[-2] == "0":
            set_of_PDB_entries_with_renum_0.add(second_column)
print("total:", len(all_data))
print("zero_renum:", len(set_of_PDB_entries_with_renum_0))

In [96]:
##########################DOWNLOADER################################
from src.download.modules import *

In [97]:
###read latest catalog and return list of all file names
def latest_catalog_reader():
    """Read the most recent downloaded catalogs and return the file names they list.

    `current_directory` is scanned for entries whose names start with 'ls-lR'
    or 'xml' (catalog_downloader() creates them as 'ls-lR_YYYY_MM_DD' and
    'xml_YYYY_MM_DD'). The lexicographically last entry of each kind is used,
    and the listing file inside it ('ls-lR' or 'xml') is parsed.

    Returns:
        list: [all_mmCIF_files, all_PDB_files, all_SIFTS_files], each a list of
        file names ('*cif.gz', '*ent.gz', '*xml.gz'). If either catalog is
        missing, a hint is printed and [None, None, None] is returned.
    """
    listing_columns = ["1", "2", "3", "4", "Data_size", "Month", "Day", "Time",
                       "file_name", "10", "file_names_path"]

    def read_listing(path_to_listing):
        # Raw string: "\s" inside a normal literal is an invalid escape sequence.
        listing = pd.read_csv(path_to_listing, names=listing_columns, sep=r"\s+", low_memory=False)
        # Keep only rows that fill all eleven columns.
        return listing.dropna()

    def names_matching(listing, suffix, data_size):
        with_suffix = listing[listing['file_name'].str.endswith(suffix)]
        return with_suffix[with_suffix.Data_size == data_size]["file_name"].tolist()

    paths_to_ls_lR = list()
    paths_to_xml = list()
    for entry in os.listdir(current_directory):
        # elif: an entry belongs to at most one catalog kind, and the test must look at the
        # directory entry itself rather than at the path that was just built from it.
        if entry.startswith('ls-lR'):
            paths_to_ls_lR.append(current_directory + "/" + entry + "/" + 'ls-lR')
        elif entry.startswith('xml'):
            paths_to_xml.append(current_directory + "/" + entry + "/" + 'xml')

    try:
        # The date suffix sorts chronologically, so the first item in reverse order is the newest.
        ls_lR_listing = read_listing(sorted(paths_to_ls_lR, reverse=True)[0])

        # NOTE(review): the Data_size values 34 / 35 / 27 are presumably the byte length of the
        # symlink target shown in the listing, which singles out the links of the flat "all"
        # directories rather than the real files — TODO confirm against a current catalog.
        all_mmCIF_files = names_matching(ls_lR_listing, 'cif.gz', 34.0)
        all_PDB_files = names_matching(ls_lR_listing, 'ent.gz', 35.0)

        xml_listing = read_listing(sorted(paths_to_xml, reverse=True)[0])
        all_SIFTS_files = names_matching(xml_listing, 'xml.gz', 27.0)

    except IndexError:
        # No 'ls-lR*' or no 'xml*' catalog directory was found.
        print("Sorry, nothing to read from. Try catalog_downloader() command first.")
        all_mmCIF_files = None
        all_PDB_files = None
        all_SIFTS_files = None

    return [all_mmCIF_files, all_PDB_files, all_SIFTS_files]

def my_hook(t):
    """Build a reporthook that drives progress bar `t`.

    The returned callable takes (b, b_size, t_size): the number of blocks
    transferred so far, the block size, and the total size. When `t_size` is
    given it is stored in `t.total`; `t.update()` then receives only the amount
    transferred since the previous call.
    """
    blocks_reported = 0

    def update_to(b=1, b_size=1, t_size=None):
        nonlocal blocks_reported
        if t_size is not None:
            t.total = t_size
        # Report the increment, not the cumulative count.
        t.update((b - blocks_reported) * b_size)
        blocks_reported = b

    return update_to


def downloader_for_catalog_with_urllib(ftp_to_download, where_the_file_goes):
    """Download one catalog listing into `where_the_file_goes`, showing a progress bar.

    The file is stored under the last path component of `ftp_to_download`.
    Up to 10 attempts are made, pausing one second after each failure.

    Args:
        ftp_to_download: URL of the listing to fetch.
        where_the_file_goes: existing directory that receives the file.

    Returns:
        None. Failure is not raised: after the last failed attempt a message is
        printed and, if this call created the target file, the incomplete file
        is removed so a later existence check does not mistake it for a finished download.
    """
    # Imported here so the function does not depend on a star import or on another
    # package having loaded the urllib.request submodule.
    import socket
    import urllib.request

    # Default timeout (seconds) for sockets created from now on; urlretrieve has no timeout argument.
    socket.setdefaulttimeout(300)

    last_slash = ftp_to_download.rsplit('/', 1)[-1]
    destination = Path(str(where_the_file_goes) + "/" + last_slash)
    existed_before = destination.exists()
    attempts = 10
    last_error = None

    for _ in range(attempts):
        try:
            with tqdm.tqdm(unit="B", unit_scale=True, desc="Downloading mmCIF/SIFTS catalogs " + last_slash, position=0, leave=True) as t:
                reporthook = my_hook(t)
                urllib.request.urlretrieve(ftp_to_download, destination, reporthook=reporthook)
            return
        except Exception as error:
            # Best effort: remember the reason and try again.
            last_error = error
            time.sleep(1)

    # Every attempt failed: do not leave a truncated listing behind.
    if not existed_before and destination.exists():
        destination.unlink()
    print("Failed to download " + str(ftp_to_download) + " after " + str(attempts) + " attempts: " + repr(last_error))


def catalog_downloader():
    """Fetch today's PDB and SIFTS catalog listings and pair up the files they list.

    For each catalog (the RCSB 'ls-lR' listing and the EBI SIFTS 'xml' listing)
    a directory '<name>_YYYY_MM_DD' is created under `current_directory`, and
    the listing is downloaded into it unless the file is already there. The
    listings are parsed, and mmCIF ('cif.gz'), PDB ('ent.gz') and SIFTS
    ('xml.gz') file names are matched on their 4-character id.

    Returns:
        tuple of three equally long lists (mmCIF names, PDB names, SIFTS names),
        one position per mmCIF file; a missing PDB or SIFTS partner is "0000".
    """
    listing_columns = ["1", "2", "3", "4", "Data_size", "Month", "Day", "Time",
                       "file_name", "10", "file_names_path"]
    today_date_str = date.today().strftime("_%Y_%m_%d")

    def fetch_listing(ftp_to_download):
        # Make sure today's copy of the listing is on disk, then parse it.
        last_slash = ftp_to_download.rsplit('/', 1)[-1]
        where_the_file_goes = current_directory + "/" + last_slash + today_date_str
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(where_the_file_goes, exist_ok=True)
        if not os.path.isfile(where_the_file_goes + "/" + last_slash):
            downloader_for_catalog_with_urllib(ftp_to_download, where_the_file_goes)
        # Raw string: "\s" inside a normal literal is an invalid escape sequence.
        listing = pd.read_csv(Path(str(where_the_file_goes) + "/" + last_slash),
                              names=listing_columns, sep=r"\s+", low_memory=False)
        # Keep only rows that fill all eleven columns.
        return listing.dropna()

    def names_matching(listing, suffix, data_size):
        with_suffix = listing[listing['file_name'].str.endswith(suffix)]
        return with_suffix[with_suffix.Data_size == data_size]["file_name"].tolist()

    # NOTE(review): the Data_size values 34 / 35 / 27 are presumably the byte length of the
    # symlink target shown in the listing, which singles out the links of the flat "all"
    # directories rather than the real files — TODO confirm against a current catalog.

    # PDB ls-lR catalog: lists both the mmCIF and the PDB-format files.
    ls_lR_listing = fetch_listing("ftp://ftp.rcsb.org/pub/pdb/data/structures/ls-lR")
    list_of_mmCIF_cif_gz_file_names = names_matching(ls_lR_listing, 'cif.gz', 34.0)
    list_of_PDB_ent_gz_file_names = names_matching(ls_lR_listing, 'ent.gz', 35.0)

    # SIFTS xml catalog.
    xml_listing = fetch_listing("ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml")
    list_of_SIFTS_xml_gz_file_names = names_matching(xml_listing, 'xml.gz', 27.0)

    # The 4-character id is the join key: characters 0-3 of mmCIF/SIFTS names,
    # characters 3-6 of PDB names (which carry a 3-character prefix).
    df_list_of_mmCIF_cif_gz_file_names = pd.DataFrame({
        "mmCIF": list_of_mmCIF_cif_gz_file_names,
        "4mmCIF": [name[:4] for name in list_of_mmCIF_cif_gz_file_names]})
    df_list_of_PDB_ent_gz_file_names = pd.DataFrame({
        "PDB": list_of_PDB_ent_gz_file_names,
        "4PDB": [name[3:7] for name in list_of_PDB_ent_gz_file_names]})
    df_list_of_SIFTS_xml_gz_file_names = pd.DataFrame({
        "SIFTS": list_of_SIFTS_xml_gz_file_names,
        "4SIFTS": [name[:4] for name in list_of_SIFTS_xml_gz_file_names]})

    # Left joins keep every mmCIF row; absent partners come back as NaN.
    merged_df_mmCIF_PDB_SIFTS_file_names = (
        df_list_of_mmCIF_cif_gz_file_names
        .merge(df_list_of_PDB_ent_gz_file_names, left_on='4mmCIF', right_on='4PDB', how="left")
        .merge(df_list_of_SIFTS_xml_gz_file_names, left_on='4mmCIF', right_on='4SIFTS', how="left"))

    # "0000" is the placeholder url_formation_for_pool() skips.
    return (merged_df_mmCIF_PDB_SIFTS_file_names['mmCIF'].tolist(),
            merged_df_mmCIF_PDB_SIFTS_file_names['PDB'].fillna("0000").tolist(),
            merged_df_mmCIF_PDB_SIFTS_file_names['SIFTS'].fillna("0000").tolist())

In [98]:
from src.download.modules import *
from src.download.catalogdownloader import catalog_downloader
from src.download.latestcatreader import latest_catalog_reader

from src.download.modules import *
# from src.download.catalogdownloader import catalog_downloader
# from src.download.latestcatreader import latest_catalog_reader
# from src.download.lookfilesinside import look_what_is_inside

# default_input_path_to_mmCIF = current_directory + "/mmCIF"
# default_input_path_to_mmCIF_assembly = current_directory + "/mmCIF_assembly"
# default_input_path_to_PDB = current_directory + "/PDB"
# default_input_path_to_PDB_assembly = current_directory + "/PDB_assembly"
# default_input_path_to_SIFTS = current_directory + "/SIFTS"
# default_output_path_to_mmCIF = current_directory + "/output_mmCIF"
# default_output_path_to_mmCIF_assembly = current_directory + "/output_mmCIF_assembly"
# default_output_path_to_PDB = current_directory + "/output_PDB"
# default_output_path_to_PDB_assembly = current_directory + "/output_PDB_assembly"


def download_pdb_assemblies_list_with_lxml():
    """Collect the URLs of all '.gz' files linked from the wwPDB biounit index page.

    The index is requested up to 5 times; an attempt counts as failed when
    requests raises a RequestException, which now includes an HTTP error
    status (so an error page is retried instead of being parsed as the index).

    Returns:
        set of str: absolute URLs (index URL + href) of every link containing ".gz",
        or None if all attempts failed.
    """
    # rcsb = "https://files.rcsb.org/pub/pdb/data/biounit/PDB/all/"
    wwpdb = "https://ftp.wwpdb.org/pub/pdb/data/biounit/PDB/all/"

    for _ in range(5):
        try:
            # Both context managers release the connection, also when an attempt fails.
            with requests.Session() as session, session.get(wwpdb, stream=True, timeout=100) as r:
                r.raise_for_status()
                dom = html.fromstring(r.content)
            return {wwpdb + link for link in dom.xpath('//a/@href') if ".gz" in link}
        except requests.exceptions.RequestException:
            continue

    # Explicit: callers receive None when the listing could not be fetched.
    return None


def url_formation_for_pool(format_to_download="mmCIF", list_of_file_names=(),
                           default_input_path_to_mmCIF=current_directory + "/mmCIF",
                           default_input_path_to_PDB=current_directory + "/PDB",
                           default_input_path_to_SIFTS=current_directory + "/SIFTS",
                           default_input_path_to_mmCIF_assembly=current_directory + "/mmCIF_assembly",
                           default_input_path_to_PDB_assembly=current_directory + "/PDB_assembly"):
    """Turn file names into download URLs and create the matching input directories.

    Args:
        format_to_download: "mmCIF", "PDB", "SIFTS", "mmCIF_assembly", "PDB_assembly",
            or "all" (which covers the first four, not "PDB_assembly").
        list_of_file_names: names or ids; the placeholder "0000" is skipped.
        default_input_path_to_*: directory for each format, created the first time a
            URL of that format is produced.

    Returns:
        list of str: URLs in input order; with "all", each name yields its mmCIF,
        PDB, SIFTS and mmCIF_assembly URL in that order.

    Raises:
        ValueError: if a name (other than "0000") is shorter than 4 characters.
    """
    def requested(kind):
        return format_to_download == kind or format_to_download == "all"

    def ensure_directory(path):
        if not os.path.exists(path):
            os.makedirs(path)

    def entry_id(name):
        # 'pdbXXXX.ent...' names carry the id after the 3-character prefix.
        if "ent" in name and name.startswith('pdb'):
            return name[3:7]
        return name[0:4]

    want_mmCIF = requested("mmCIF")
    want_PDB = requested("PDB")
    want_SIFTS = requested("SIFTS")
    want_mmCIF_assembly = requested("mmCIF_assembly")
    want_PDB_assembly = format_to_download == "PDB_assembly"

    urls_to_target_files = []
    for file_name in list_of_file_names:
        if file_name == "0000":
            continue
        if len(file_name) < 4:
            raise ValueError("Input file names list is not correct!!! It cannot be less than 4 characters")

        if want_mmCIF:
            ensure_directory(default_input_path_to_mmCIF)
            urls_to_target_files.append(
                "https://files.rcsb.org/pub/pdb/data/structures/all/mmCIF/" + entry_id(file_name) + ".cif.gz")

        if want_PDB:
            ensure_directory(default_input_path_to_PDB)
            urls_to_target_files.append(
                "https://files.rcsb.org/pub/pdb/data/structures/all/pdb/" + "pdb" + entry_id(file_name) + ".ent.gz")

        if want_SIFTS:
            ensure_directory(default_input_path_to_SIFTS)
            urls_to_target_files.append(
                "http://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/" + entry_id(file_name) + ".xml.gz")

        if want_mmCIF_assembly:
            ensure_directory(default_input_path_to_mmCIF_assembly)
            # The first four characters of '<id>.cif.gz' name the assembly index of the entry.
            urls_to_target_files.append(
                "https://www.ebi.ac.uk/pdbe/static/entry/" + (entry_id(file_name) + ".cif.gz")[:4] + "-assembly.xml")

        if want_PDB_assembly:
            ensure_directory(default_input_path_to_PDB_assembly)
            # Assembly names are used verbatim.
            urls_to_target_files.append("https://ftp.wwpdb.org/pub/pdb/data/biounit/PDB/all/" + file_name)

    return urls_to_target_files


def download_with_pool(urls_to_target_files=(),
                       default_input_path_to_mmCIF=current_directory + "/mmCIF",
                       default_input_path_to_PDB=current_directory + "/PDB",
                       default_input_path_to_SIFTS=current_directory + "/SIFTS",
                       default_input_path_to_mmCIF_assembly=current_directory + "/mmCIF_assembly",
                       default_input_path_to_PDB_assembly=current_directory + "/PDB_assembly"):
    """Download the file behind one URL into the directory that matches its format.

    The format is recognised from the three characters just before the last "/"
    of the URL, which line up with the URLs url_formation_for_pool() builds:
    "CIF" (.../mmCIF/), "pdb" (.../pdb/), "xml" (.../xml/), "all" (.../PDB/all/)
    and "try" (.../entry/). A "try" URL is an assembly index in XML: for each
    child element whose second attribute is "id", the file
    '<entry>-assembly-<id>.cif.gz' is fetched into the mmCIF_assembly directory.

    Args:
        urls_to_target_files: a single URL string (despite the plural name).
        default_input_path_to_*: destination directory per format.

    Returns:
        None. Non-OK HTTP statuses are ignored and requests' RequestException
        is swallowed, so one failed download does not stop a batch.
    """
    # Seconds; the same value download_pdb_assemblies_list_with_lxml() passes. Without a
    # timeout a stalled server would block this worker (and the caller's progress loop) forever.
    request_timeout = 100

    def save_stream(response, destination):
        # Stream the body to disk; a file cut short by a network error is removed
        # so it cannot be mistaken for a complete download.
        try:
            with open(destination, 'wb') as f:
                for data in response.iter_content(chunk_size=65536):
                    f.write(data)
        except requests.exceptions.RequestException:
            if os.path.exists(destination):
                os.remove(destination)
            raise

    file_name_start_pos = urls_to_target_files.rfind("/") + 1
    format_start_pos = file_name_start_pos - 4
    file_name = urls_to_target_files[file_name_start_pos:]
    format_of_db = urls_to_target_files[format_start_pos:format_start_pos + 3]

    # Formats whose response body is the file itself.
    single_file_directories = {
        "CIF": default_input_path_to_mmCIF,
        "pdb": default_input_path_to_PDB,
        "xml": default_input_path_to_SIFTS,
        "all": default_input_path_to_PDB_assembly,
    }

    try:
        # `with` returns the connection to the pool even if the body is not fully read.
        with requests.get(urls_to_target_files, stream=True, timeout=request_timeout) as r:
            if r.status_code != requests.codes.ok:
                return

            if format_of_db in single_file_directories:
                save_stream(r, single_file_directories[format_of_db] + "/" + file_name)

            elif format_of_db == "try":
                root = ET.fromstring(r.text)
                for n in root:
                    compos_ID_list = list(n.attrib.items())
                    # Only elements whose second attribute is "id" name an assembly;
                    # elements with fewer attributes are skipped instead of raising IndexError.
                    if len(compos_ID_list) < 2 or compos_ID_list[1][0] != "id":
                        continue
                    assembly_file_name = file_name[0:4] + "-assembly-" + compos_ID_list[1][1] + ".cif.gz"
                    with requests.get("https://www.ebi.ac.uk/pdbe/static/entry/" + assembly_file_name,
                                      stream=True, timeout=request_timeout) as req_child:
                        if req_child.status_code == requests.codes.ok:
                            save_stream(req_child, default_input_path_to_mmCIF_assembly + "/" + assembly_file_name)

    except requests.exceptions.RequestException:
        # Best effort: skip this URL.
        pass


def run_downloads_with_ThreadPool(format_to_download="mmCIF", urls_to_target=(),
                                  default_input_path_to_mmCIF=current_directory + "/mmCIF",
                                  default_input_path_to_PDB=current_directory + "/PDB",
                                  default_input_path_to_SIFTS=current_directory + "/SIFTS",
                                  default_input_path_to_mmCIF_assembly=current_directory + "/mmCIF_assembly",
                                  default_input_path_to_PDB_assembly=current_directory + "/PDB_assembly"):
    """Download every URL in `urls_to_target` on a thread pool, showing a progress bar.

    Args:
        format_to_download: label used only in the progress-bar text.
        urls_to_target: iterable of URLs; each is passed to download_with_pool().
        default_input_path_to_*: destination directories forwarded to download_with_pool().

    Returns:
        None. Job results are not inspected, so an exception raised inside a
        download stays in its future and does not stop the batch.
    """
    partial_download_with_pool = partial(download_with_pool,
                                         default_input_path_to_mmCIF=default_input_path_to_mmCIF,
                                         default_input_path_to_PDB=default_input_path_to_PDB,
                                         default_input_path_to_SIFTS=default_input_path_to_SIFTS,
                                         default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly,
                                         default_input_path_to_PDB_assembly=default_input_path_to_PDB_assembly)

    # The context manager shuts the pool down on exit, so worker threads do not
    # pile up when this function is called once per format.
    with ThreadPoolExecutor() as executor:
        jobs = [executor.submit(partial_download_with_pool, url) for url in urls_to_target]

        for _ in tqdm.tqdm(as_completed(jobs), total=len(jobs), position=0, leave=True, desc="Downloading " + format_to_download + " files"):
            pass


# if __name__ == '__main__':
#     all_files = latest_catalog_reader()
#     all_mmCIF_files = all_files[0]
#     all_PDB_files = all_files[1]
#     all_SIFTS_files = all_files[2]
#
#     urls_to_target_mmCIF_files = url_formation_for_pool("mmCIF", all_mmCIF_files)
#     urls_to_target_PDB_files = url_formation_for_pool("PDB", all_PDB_files)
#     urls_to_target_SIFTS_files = url_formation_for_pool("SIFTS", all_SIFTS_files)
#
#     run_downloads_with_ThreadPool(urls_to_target_mmCIF_files)
#     run_downloads_with_ThreadPool(urls_to_target_PDB_files)
#     run_downloads_with_ThreadPool(urls_to_target_SIFTS_files)
#
#     download_pdb_assemblies = download_pdb_assemblies_list_with_lxml()
#     run_downloads_with_ThreadPool("pdb_assembly", download_pdb_assemblies)
#
#     urls_to_target_mmCIF_assembly_files = url_formation_for_pool("mmCIF_assembly", all_mmCIF_files)
#     run_downloads_with_ThreadPool("mmCIF_assembly", urls_to_target_mmCIF_assembly_files)

# uniprot_sprot = gzip.open("uniprot_sprot.fasta.gz", 'rt')
# uniprot_translation = dict()
# for line in uniprot_sprot:
#     if line.startswith(">sp"):
#         Computer_readble = line.split("|", 2)[1]
#         Human_readble = line.split("|", 2)[2].split(" ", 1)[0]
#         uniprot_translation[Computer_readble] = Human_readble
        
# uniprot_trembl = gzip.open("uniprot_trembl.fasta.gz", 'rt')
# uniprot_trembl_translation = dict()
# for line in uniprot_trembl:
#     if line.startswith(">tr"):
#         Computer_readble = line.split("|", 2)[1]
#         Human_readble = line.split("|", 2)[2].split(" ", 1)[0]
#         uniprot_trembl_translation[Computer_readble] = Human_readble

# from UniProt webserver
# exception list
# GFP_AEQVI
# Primary (citable) accession number: P42212
# Secondary accession number(s): Q17104, Q27903, Q93125

# GCN4_YEAST
# Primary (citable) accession number: P03069
# Secondary accession number(s): D3DLN9, Q96UT3

# C562_ECOLX
# Primary (citable) accession number: P0ABE7
# Secondary accession number(s): P00192, P76805, Q8XCE3

# ENLYS_BPT4
# Primary (citable) accession number: P00720
# Secondary accession number(s): Q38170, Q94N07

# MALE_ECOLI
# Primary (citable) accession number: P0AEX9
# Secondary accession number(s): P02928, Q2M6S0

# exception_AccessionIDs = ["P42212", "Q17104", "Q27903", "Q93125", "P03069", "D3DLN9", "Q96UT3", "P0ABE7", "P00192", "P76805", "Q8XCE3", "P00720", "Q38170", "Q94N07", "P0AEX9", "P02928", "Q2M6S0"]
# GFP_AEQVI GCN4_YEAST C562_ECOLX ENLYS_BPT4 MALE_ECOLI

# output_mmCIF_files_were_found = look_what_is_inside('output_mmCIF', default_output_path_to_mmCIF = default_output_path_to_mmCIF)
# no_change_list = list()
# no_SIFTS_list = list()
# no_UniProt_in_SIFTS_list = list()
# for n in output_mmCIF_files_were_found:
#     if "_no_change_out" in n:
#         no_change_list.append(n)
#     if "_no_SIFTS" in n:
#         no_SIFTS_list.append(n)
#     if "_no_UniProt" in n:
#         no_UniProt_in_SIFTS_list.append(n)

# changed = len(output_mmCIF_files_were_found) - len(no_change_list) - len(no_SIFTS_list) - len(no_UniProt_in_SIFTS_list)
# len(no_change_list)
# len(no_SIFTS_list)
# len(no_UniProt_in_SIFTS_list)
# len(output_mmCIF_files_were_found)
# changed * 100 / len(output_mmCIF_files_were_found)

In [100]:
# if __name__ == '__main__':
#     catalog_downloader()
#     all_files = latest_catalog_reader()
#     all_mmCIF_files = all_files[0]
#     all_PDB_files = all_files[1]
#     all_SIFTS_files = all_files[2]

#     urls_to_target_mmCIF_files = url_formation_for_pool("mmCIF_assembly", all_mmCIF_files)
#     urls_to_target_SIFTS_files = url_formation_for_pool("SIFTS", all_SIFTS_files)

#     run_downloads_with_ThreadPool("mmCIF_assembly", urls_to_target_mmCIF_files)
#     run_downloads_with_ThreadPool("SIFTS", urls_to_target_SIFTS_files)

Downloading mmCIF/SIFTS catalogs ls-lR: 181MB [05:37, 534kB/s]  
Downloading mmCIF/SIFTS catalogs xml: 0.00B [05:07, ?B/s]
Downloading mmCIF/SIFTS catalogs xml: 0.00B [05:02, ?B/s]
Downloading mmCIF/SIFTS catalogs xml: 0.00B [05:02, ?B/s]
Downloading mmCIF/SIFTS catalogs xml: 0.00B [05:01, ?B/s]
Downloading mmCIF/SIFTS catalogs xml: 0.00B [05:02, ?B/s]
Downloading mmCIF/SIFTS catalogs xml: 0.00B [05:02, ?B/s]
Downloading mmCIF/SIFTS catalogs xml: 17.2MB [01:43, 165kB/s] 
Downloading mmCIF_assembly files: 100%|██████████| 175508/175508 [3:52:01<00:00, 12.61it/s]   
Downloading SIFTS files: 100%|██████████| 171841/171841 [51:46<00:00, 55.31it/s]  


In [None]:
#####################################################################################################################################################
# DEPENDENCIES #
#####################################################################################################################################################


import os
import argparse
from src.download import lefttorenumber
from src.download.inputtextfileparser import input_text_file_parser
from src.download.shortusagemessage import short_usage_messenger
from src.download.longusagemessage import long_usage_messenger
from src.download.supremedownloader import supreme_download_master
from src.download.lookfilesinside import look_what_is_inside


from src.renum.PDB.renumPDB import master_PDB_renumber_function
from src.renum.mmCIF.ProcessPool_run_renum import ProcessPool_run_renum
from src.renum.write_log import log_writer
from downloadwithThreadPool import run_downloads_with_ThreadPool, url_formation_for_pool


# Working directory at start-up; the default input/output folders further down are built
# by appending a sub-folder name to it.
current_directory = os.getcwd()
# Accession IDs forwarded to ProcessPool_run_renum as its exception list.
# NOTE(review): these look like UniProt accessions that need special handling during
# renumbering -- confirm the exact meaning against ProcessPool_run_renum.
exception_AccessionIDs = ["P42212", "Q17104", "Q27903", "Q93125", "P03069", "D3DLN9", "Q96UT3", "P0ABE7", "P00192",
                          "P76805", "Q8XCE3", "P00720", "Q38170", "Q94N07", "P0AEX9", "P02928", "Q2M6S0"]

#####################################################################################################################################################
# ARGUMENTS #
#####################################################################################################################################################


# Command-line interface. argparse's built-in help is switched off (add_help=False) and every
# option hides its help text; the usage texts come from the project's own usage messengers.
argpar = argparse.ArgumentParser(usage=short_usage_messenger(), add_help=False)

# Hand-rolled help switches; they are acted upon after parsing.
for help_switch in ("-h", "--help"):
    argpar.add_argument(help_switch, action="store_true", help=argparse.SUPPRESS)

# Partial-database work: the IDs come either from a text file or straight from the command line.
# Note that the renumber list accepts zero or more IDs ("*") while the download list needs at least one ("+").
argpar.add_argument("-rftf", "--renumber_from_text_file", type=str, help=argparse.SUPPRESS)
argpar.add_argument("-rfla", "--renumber_from_list_of_arguments",
                    metavar="6dbp 3v03 2jit", nargs="*", type=str, help=argparse.SUPPRESS)
argpar.add_argument("-dftf", "--download_from_text_file", type=str, help=argparse.SUPPRESS)
argpar.add_argument("-dfla", "--download_from_list_of_arguments",
                    metavar="6dbp 3v03 2jit", nargs="+", type=str, help=argparse.SUPPRESS)

# Whole-database actions followed by the format selectors: all plain on/off switches.
for short_option, long_option in (
    ("-redb", "--renumber_entire_database"),
    ("-dall", "--download_entire_database"),
    ("-refr", "--refresh_entire_database"),
    ("-PDB", "--PDB_format_only"),
    ("-mmCIF", "--mmCIF_format_only"),
    ("-PDB_assembly", "--PDB_assembly_format_only"),
    ("-mmCIF_assembly", "--mmCIF_assembly_format_only"),
    ("-all", "--all_formats"),
):
    argpar.add_argument(short_option, long_option, action="store_true", help=argparse.SUPPRESS)

# Overrides for the input and output folders (string valued).
for short_option, long_option in (
    ("-sipm", "--set_default_input_path_to_mmCIF"),
    ("-sipma", "--set_default_input_path_to_mmCIF_assembly"),
    ("-sipp", "--set_default_input_path_to_PDB"),
    ("-sippa", "--set_default_input_path_to_PDB_assembly"),
    ("-sips", "--set_default_input_path_to_SIFTS"),
    ("-sopm", "--set_default_output_path_to_mmCIF"),
    ("-sopma", "--set_default_output_path_to_mmCIF_assembly"),
    ("-sopp", "--set_default_output_path_to_PDB"),
    ("-soppa", "--set_default_output_path_to_PDB_assembly"),
):
    argpar.add_argument(short_option, long_option, type=str, help=argparse.SUPPRESS)

# Integer-valued tuning knobs.
for short_option, long_option in (
    ("-sdmn", "--set_default_mmCIF_num"),
    ("-sdpn", "--set_default_PDB_num"),
    ("-nproc", "--set_number_of_processes"),
):
    argpar.add_argument(short_option, long_option, type=int, help=argparse.SUPPRESS)

# Switch that turns gzip mode off (it is "on" unless this flag is given).
argpar.add_argument("-offz", "--set_to_off_mode_gzip", action="store_true", help=argparse.SUPPRESS)


args = argpar.parse_args()


#####################################################################################################################################################
# FLAGS #
#####################################################################################################################################################


# Print the requested usage text(s); the long one first, then the short one.
# Printing help does not stop the script -- the remaining sections still run.
for help_requested, usage_messenger in ((args.help, long_usage_messenger), (args.h, short_usage_messenger)):
    if help_requested:
        print(usage_messenger())

# Input folders: a non-empty command-line override wins, otherwise a sub-folder of the
# current working directory is used.
default_input_path_to_mmCIF = args.set_default_input_path_to_mmCIF or current_directory + "/mmCIF"
default_input_path_to_mmCIF_assembly = (args.set_default_input_path_to_mmCIF_assembly
                                        or current_directory + "/mmCIF_assembly")
default_input_path_to_PDB = args.set_default_input_path_to_PDB or current_directory + "/PDB"
default_input_path_to_PDB_assembly = (args.set_default_input_path_to_PDB_assembly
                                      or current_directory + "/PDB_assembly")
default_input_path_to_SIFTS = args.set_default_input_path_to_SIFTS or current_directory + "/SIFTS"

# Output folders follow the same rule.
default_output_path_to_mmCIF = args.set_default_output_path_to_mmCIF or current_directory + "/output_mmCIF"
default_output_path_to_mmCIF_assembly = (args.set_default_output_path_to_mmCIF_assembly
                                         or current_directory + "/output_mmCIF_assembly")
default_output_path_to_PDB = args.set_default_output_path_to_PDB or current_directory + "/output_PDB"
default_output_path_to_PDB_assembly = (args.set_default_output_path_to_PDB_assembly
                                       or current_directory + "/output_PDB_assembly")

# Numeric settings: a missing or zero value falls back to the built-in default.
default_mmCIF_num = args.set_default_mmCIF_num or 50000
default_PDB_num = args.set_default_PDB_num or 5000

# gzip mode is "on" unless -offz was given.
gzip_mode = "off" if args.set_to_off_mode_gzip else "on"

# Number of worker processes; None when not specified (or specified as 0).
nproc = args.set_number_of_processes or None


#####################################################################################################################################################
# PARTIAL DB WORK #
#####################################################################################################################################################

# RENUMBER
# RENUMBER FROM TEXT FILE or RENUMBER FROM LIST OF ARGUMENTS
# Renumber a user-selected subset of entries. For the chosen format(s) the structure files and
# the SIFTS files are requested first, then the files found in the input folder are filtered
# down to the requested IDs and handed to the renumbering routine.
if args.renumber_from_text_file or args.renumber_from_list_of_arguments:
    # The text file takes precedence when both sources are supplied.
    parsed_input_text = (
        input_text_file_parser(args.renumber_from_text_file)
        if args.renumber_from_text_file
        else args.renumber_from_list_of_arguments
    )

    if args.all_formats:
        # Build every URL list first, then download mmCIF, PDB and SIFTS in that order.
        urls_to_target_mmCIF_files = url_formation_for_pool(
            "mmCIF", parsed_input_text, default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        urls_to_target_PDB_files = url_formation_for_pool(
            "PDB", parsed_input_text, default_input_path_to_PDB=default_input_path_to_PDB)
        urls_to_target_SIFTS_files = url_formation_for_pool(
            "SIFTS", parsed_input_text, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        run_downloads_with_ThreadPool(
            "mmCIF", urls_to_target_mmCIF_files, default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        run_downloads_with_ThreadPool(
            "PDB", urls_to_target_PDB_files, default_input_path_to_PDB=default_input_path_to_PDB)
        run_downloads_with_ThreadPool(
            "SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)

        # --- PDB part: only the first four characters of each requested name are compared.
        passed_as_arg_file_4Char_PDB = [requested[:4] for requested in parsed_input_text]

        input_PDB_files_were_found = look_what_is_inside("PDB", default_input_path_to_PDB=default_input_path_to_PDB)
        # PDB file names are matched on characters 3..6
        # (presumably names look like "pdbXXXX..." -- verify against look_what_is_inside).
        target_files_list_PDB = [found for found in input_PDB_files_were_found
                                 if found[3:7] in passed_as_arg_file_4Char_PDB]
        master_PDB_renumber_function(target_files_list_PDB, default_input_path_to_PDB, default_input_path_to_SIFTS,
                                     default_output_path_to_PDB, default_PDB_num, gzip_mode)

        # --- mmCIF part: file names are matched on their first four characters.
        input_mmCIF_files_were_found = look_what_is_inside("mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        passed_as_arg_file_4Char_mmCIF = [requested[:4] for requested in parsed_input_text]
        target_files_list_mmCIF = [found for found in input_mmCIF_files_were_found
                                   if found[:4] in passed_as_arg_file_4Char_mmCIF]

        if not os.path.exists(default_output_path_to_mmCIF):
            os.makedirs(default_output_path_to_mmCIF)
        res = ProcessPool_run_renum("mmCIF", target_files_list_mmCIF, default_input_path_to_mmCIF,
                                    default_input_path_to_SIFTS, default_output_path_to_mmCIF, default_mmCIF_num,
                                    gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

    elif args.PDB_format_only:
        urls_to_target_PDB_files = url_formation_for_pool(
            "PDB", parsed_input_text, default_input_path_to_PDB=default_input_path_to_PDB)
        urls_to_target_SIFTS_files = url_formation_for_pool(
            "SIFTS", parsed_input_text, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        run_downloads_with_ThreadPool(
            "PDB", urls_to_target_PDB_files, default_input_path_to_PDB=default_input_path_to_PDB)
        run_downloads_with_ThreadPool(
            "SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)

        input_PDB_files_were_found = look_what_is_inside("PDB", default_input_path_to_PDB=default_input_path_to_PDB)
        passed_as_arg_file_4Char_PDB = [requested[:4] for requested in parsed_input_text]
        # Same [3:7] matching rule for PDB file names as in the all-formats branch.
        target_files_list_PDB = [found for found in input_PDB_files_were_found
                                 if found[3:7] in passed_as_arg_file_4Char_PDB]
        master_PDB_renumber_function(target_files_list_PDB, default_input_path_to_PDB, default_input_path_to_SIFTS,
                                     default_output_path_to_PDB, default_PDB_num, gzip_mode)

    elif args.mmCIF_assembly_format_only:
        urls_to_target_mmCIF_files = url_formation_for_pool(
            "mmCIF_assembly", parsed_input_text,
            default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
        urls_to_target_SIFTS_files = url_formation_for_pool(
            "SIFTS", parsed_input_text, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        run_downloads_with_ThreadPool(
            "mmCIF_assembly", urls_to_target_mmCIF_files,
            default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
        run_downloads_with_ThreadPool(
            "SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)

        input_mmCIF_files_were_found = look_what_is_inside(
            "mmCIF_assembly", default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
        passed_as_args_files_list_4Char = [requested[:4] for requested in parsed_input_text]
        target_files_list = [found for found in input_mmCIF_files_were_found
                             if found[:4] in passed_as_args_files_list_4Char]

        if not os.path.exists(default_output_path_to_mmCIF_assembly):
            os.makedirs(default_output_path_to_mmCIF_assembly)
        res = ProcessPool_run_renum("mmCIF_assembly", target_files_list, default_input_path_to_mmCIF_assembly,
                                    default_input_path_to_SIFTS, default_output_path_to_mmCIF_assembly,
                                    default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

    else:
        # No format flag given: regular mmCIF is the default format.
        urls_to_target_mmCIF_files = url_formation_for_pool(
            "mmCIF", parsed_input_text, default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        urls_to_target_SIFTS_files = url_formation_for_pool(
            "SIFTS", parsed_input_text, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        run_downloads_with_ThreadPool(
            "mmCIF", urls_to_target_mmCIF_files, default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        run_downloads_with_ThreadPool(
            "SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)

        input_mmCIF_files_were_found = look_what_is_inside("mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        passed_as_args_files_list_4Char = [requested[:4] for requested in parsed_input_text]
        target_files_list = [found for found in input_mmCIF_files_were_found
                             if found[:4] in passed_as_args_files_list_4Char]

        if not os.path.exists(default_output_path_to_mmCIF):
            os.makedirs(default_output_path_to_mmCIF)
        res = ProcessPool_run_renum("mmCIF", target_files_list, default_input_path_to_mmCIF,
                                    default_input_path_to_SIFTS, default_output_path_to_mmCIF, default_mmCIF_num,
                                    gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

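# RENUMBER FROM LIST OF ARGUMENTS -- superseded: the combined "text file or list of arguments"
# section above already handles this case. The old implementation is kept commented out for reference only.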
# if args.renumber_from_list_of_arguments:
#     if args.all_formats:
#
#         urls_to_target_mmCIF_files = url_formation_for_pool("mmCIF",  args.renumber_from_list_of_arguments,
#                                                             default_input_path_to_mmCIF=default_input_path_to_mmCIF)
#         urls_to_target_PDB_files = url_formation_for_pool("PDB",  args.renumber_from_list_of_arguments,
#                                                           default_input_path_to_PDB=default_input_path_to_PDB)
#         urls_to_target_SIFTS_files = url_formation_for_pool("SIFTS",  args.renumber_from_list_of_arguments,
#                                                             default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#
#         run_downloads_with_ThreadPool("mmCIF", urls_to_target_mmCIF_files, default_input_path_to_mmCIF=default_input_path_to_mmCIF)
#         run_downloads_with_ThreadPool("PDB", urls_to_target_PDB_files, default_input_path_to_PDB=default_input_path_to_PDB)
#         run_downloads_with_ThreadPool("SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#
#         input_PDB_files_were_found = look_what_is_inside("PDB", default_input_path_to_PDB=default_input_path_to_PDB)
#         passed_as_arg_file_4Char_PDB = list()
#         for file_name in args.renumber_from_list_of_arguments:
#             passed_as_arg_file_4Char_PDB.append(file_name[:4])
#
#         target_files_list_PDB = list()
#         for file_name in input_PDB_files_were_found:
#             if file_name[3:7] in passed_as_arg_file_4Char_PDB:
#                 target_files_list_PDB.append(file_name)
#         master_PDB_renumber_function(target_files_list_PDB, default_input_path_to_PDB, default_input_path_to_SIFTS,
#                                      default_output_path_to_PDB, default_PDB_num, gzip_mode)
#
#         input_mmCIF_files_were_found = look_what_is_inside("mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF)
#         passed_as_arg_file_4Char_mmCIF = list()
#         for file_name in args.renumber_from_list_of_arguments:
#             passed_as_arg_file_4Char_mmCIF.append(file_name[:4])
#
#         target_files_list_mmCIF = list()
#         for file_name in input_mmCIF_files_were_found:
#             if file_name[:4] in passed_as_arg_file_4Char_mmCIF:
#                 target_files_list_mmCIF.append(file_name)
#
#         if not os.path.exists(default_output_path_to_mmCIF):
#             os.makedirs(default_output_path_to_mmCIF)
#         res = ProcessPool_run_renum("mmCIF", target_files_list_mmCIF, default_input_path_to_mmCIF, default_input_path_to_SIFTS,
#                                     default_output_path_to_mmCIF, default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
#         log_writer(res)
#
#     elif args.PDB_format_only:
#         urls_to_target_PDB_files = url_formation_for_pool("PDB",  args.renumber_from_list_of_arguments,
#                                                           default_input_path_to_PDB=default_input_path_to_PDB)
#         urls_to_target_SIFTS_files = url_formation_for_pool("SIFTS",  args.renumber_from_list_of_arguments,
#                                                             default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#
#         run_downloads_with_ThreadPool("PDB", urls_to_target_PDB_files, default_input_path_to_PDB=default_input_path_to_PDB)
#         run_downloads_with_ThreadPool("SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#
#         input_PDB_files_were_found = look_what_is_inside("PDB", default_input_path_to_PDB=default_input_path_to_PDB)
#         passed_as_arg_file_4Char_PDB = list()
#         for file_name in args.renumber_from_list_of_arguments:
#             passed_as_arg_file_4Char_PDB.append(file_name[:4])
#
#         target_files_list_PDB = list()
#         for file_name in input_PDB_files_were_found:
#             if file_name[3:7] in passed_as_arg_file_4Char_PDB:
#                 target_files_list_PDB.append(file_name)
#         master_PDB_renumber_function(target_files_list_PDB, default_input_path_to_PDB, default_input_path_to_SIFTS,
#                                      default_output_path_to_PDB, default_PDB_num, gzip_mode)
#
#     elif args.mmCIF_assembly_format_only:
#         urls_to_target_mmCIF_files = url_formation_for_pool("mmCIF_assembly", args.renumber_from_list_of_arguments,
#                                                             default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
#         urls_to_target_SIFTS_files = url_formation_for_pool("SIFTS", args.renumber_from_list_of_arguments,
#                                                             default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#
#         run_downloads_with_ThreadPool("mmCIF_assembly", urls_to_target_mmCIF_files,
#                                       default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
#         run_downloads_with_ThreadPool("SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#
#         input_mmCIF_files_were_found = look_what_is_inside("mmCIF_assembly",
#                                                            default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
#         passed_as_args_files_list_4Char = list()
#         for file_name in args.renumber_from_list_of_arguments:
#             passed_as_args_files_list_4Char.append(file_name[:4])
#
#         target_files_list = list()
#         for file_name in input_mmCIF_files_were_found:
#             if file_name[:4] in passed_as_args_files_list_4Char:
#                 target_files_list.append(file_name)
#
#         if not os.path.exists(default_output_path_to_mmCIF_assembly):
#             os.makedirs(default_output_path_to_mmCIF_assembly)
#         res = ProcessPool_run_renum("mmCIF_assembly", target_files_list, default_input_path_to_mmCIF_assembly, default_input_path_to_SIFTS,
#                                     default_output_path_to_mmCIF_assembly, default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
#         log_writer(res)
#
#     else:
#         urls_to_target_mmCIF_files = url_formation_for_pool("mmCIF",  args.renumber_from_list_of_arguments,
#                                                             default_input_path_to_mmCIF=default_input_path_to_mmCIF)
#         urls_to_target_SIFTS_files = url_formation_for_pool("SIFTS",  args.renumber_from_list_of_arguments,
#                                                             default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#         run_downloads_with_ThreadPool("mmCIF", urls_to_target_mmCIF_files, default_input_path_to_mmCIF=default_input_path_to_mmCIF)
#         run_downloads_with_ThreadPool("SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
#         input_mmCIF_files_were_found = look_what_is_inside("mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF)
#         passed_as_args_files_list_4Char = list()
#         for file_name in args.renumber_from_list_of_arguments:
#             passed_as_args_files_list_4Char.append(file_name[:4])
#
#         target_files_list = list()
#         for file_name in input_mmCIF_files_were_found:
#             if file_name[:4] in passed_as_args_files_list_4Char:
#                 target_files_list.append(file_name)
#
#         if not os.path.exists(default_output_path_to_mmCIF):
#             os.makedirs(default_output_path_to_mmCIF)
#         res = ProcessPool_run_renum("mmCIF", target_files_list, default_input_path_to_mmCIF, default_input_path_to_SIFTS,
#                                     default_output_path_to_mmCIF, default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
#         log_writer(res)

# DOWNLOAD
# DOWNLOAD FROM TEXT FILE
# Download a user-selected subset of entries. IDs may come from a text file and/or from the
# command line; when both are given, the text-file list is processed first.
id_lists_to_download = []
if args.download_from_text_file:
    parsed_input_text = (input_text_file_parser(args.download_from_text_file))
    id_lists_to_download.append(parsed_input_text)
if args.download_from_list_of_arguments:
    id_lists_to_download.append(args.download_from_list_of_arguments)

if id_lists_to_download:
    # Each entry is (format name, keyword argument carrying that format's input folder).
    if args.all_formats:
        formats_to_download = [
            ("mmCIF", {"default_input_path_to_mmCIF": default_input_path_to_mmCIF}),
            ("PDB", {"default_input_path_to_PDB": default_input_path_to_PDB}),
        ]
    elif args.PDB_format_only:
        formats_to_download = [
            ("PDB", {"default_input_path_to_PDB": default_input_path_to_PDB}),
        ]
    elif args.mmCIF_assembly_format_only:
        # Previously this flag was ignored here and regular mmCIF files were downloaded instead;
        # the calls mirror the ones used by the partial-renumber section for mmCIF_assembly.
        formats_to_download = [
            ("mmCIF_assembly", {"default_input_path_to_mmCIF_assembly": default_input_path_to_mmCIF_assembly}),
        ]
    else:
        # No format flag given: regular mmCIF is the default format.
        formats_to_download = [
            ("mmCIF", {"default_input_path_to_mmCIF": default_input_path_to_mmCIF}),
        ]
    # SIFTS files are fetched alongside every structure format, always last.
    formats_to_download.append(("SIFTS", {"default_input_path_to_SIFTS": default_input_path_to_SIFTS}))

    for requested_ids in id_lists_to_download:
        # Form all URL lists before starting any download (same order of calls as before).
        url_batches = [
            (format_name, path_kwarg, url_formation_for_pool(format_name, requested_ids, **path_kwarg))
            for format_name, path_kwarg in formats_to_download
        ]
        for format_name, path_kwarg, target_urls in url_batches:
            run_downloads_with_ThreadPool(format_name, target_urls, **path_kwarg)


#####################################################################################################################################################
# WHOLE DB WORK #
#####################################################################################################################################################

# RENUMBER ENTIRE DB
# Renumber every entry of the selected database(s). For each format: run supreme_download_master,
# ask lefttorenumber which files are still left to renumber, then renumber exactly those files.
if args.renumber_entire_database:
    if args.all_formats:
        print("Starting to renumber tree databases...")
        print("Please, be patient...")
        supreme_download_master("mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF,
                                default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        input_mmCIF_files_were_found = look_what_is_inside("mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        # Pass the configured folders explicitly (as the mmCIF-only branch does); without them the
        # helper falls back to its own defaults and ignores -sipm / -sopm overrides.
        mmCIF_files_left_to_renumber = lefttorenumber.left_to_renumber_mmCIF(
            default_input_path_to_mmCIF=default_input_path_to_mmCIF,
            default_output_path_to_mmCIF=default_output_path_to_mmCIF)

        res = ProcessPool_run_renum("mmCIF", mmCIF_files_left_to_renumber, default_input_path_to_mmCIF, default_input_path_to_SIFTS,
                                    default_output_path_to_mmCIF, default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

        supreme_download_master("PDB", default_input_path_to_PDB=default_input_path_to_PDB, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        input_PDB_files_were_found = look_what_is_inside("PDB", default_input_path_to_PDB=default_input_path_to_PDB)
        # NOTE(review): called without path arguments, so it relies on its own defaults --
        # confirm whether custom -sipp / -sopp folders are honoured here.
        PDB_files_left_to_renumber = lefttorenumber.left_to_renumber_PDB()

        master_PDB_renumber_function(PDB_files_left_to_renumber, default_input_path_to_PDB, default_input_path_to_SIFTS, default_output_path_to_PDB,
                                     default_PDB_num, gzip_mode)
    elif args.PDB_format_only:
        print("Starting to renumber entire PDB database...")
        print("Please, be patient...")
        supreme_download_master("PDB", default_input_path_to_PDB=default_input_path_to_PDB, default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        input_PDB_files_were_found = look_what_is_inside("PDB", default_input_path_to_PDB=default_input_path_to_PDB)
        # NOTE(review): same as above -- no paths are passed, so the helper's defaults apply.
        PDB_files_left_to_renumber = lefttorenumber.left_to_renumber_PDB()

        master_PDB_renumber_function(PDB_files_left_to_renumber, default_input_path_to_PDB, default_input_path_to_SIFTS, default_output_path_to_PDB,
                                     default_PDB_num, gzip_mode)

    elif args.mmCIF_assembly_format_only:
        print("Starting to renumber entire mmCIF_assembly database...")
        print("Please, be patient...")
        supreme_download_master("mmCIF_assembly", default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly,
                                default_input_path_to_SIFTS=default_input_path_to_SIFTS)

        # The mmCIF helper is reused for assemblies by pointing it at the assembly folders.
        mmCIF_assembly_left_to_renumber = lefttorenumber.left_to_renumber_mmCIF(default_input_path_to_mmCIF=default_input_path_to_mmCIF_assembly,
                                                                                default_output_path_to_mmCIF=default_output_path_to_mmCIF_assembly)

        # Positional order must be (..., default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc),
        # matching every other ProcessPool_run_renum call; gzip_mode and the exception list were swapped here.
        res = ProcessPool_run_renum("mmCIF_assembly", mmCIF_assembly_left_to_renumber, default_input_path_to_mmCIF_assembly,
                                    default_input_path_to_SIFTS, default_output_path_to_mmCIF_assembly, default_mmCIF_num,
                                    gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

    else:
        # No format flag given: regular mmCIF is the default format.
        print("Starting to renumber entire mmCIF database...")
        print("Please, be patient...")
        supreme_download_master("mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF,
                                default_input_path_to_SIFTS=default_input_path_to_SIFTS)

        mmCIF_files_left_to_renumber = lefttorenumber.left_to_renumber_mmCIF(default_input_path_to_mmCIF=default_input_path_to_mmCIF,
                                                                             default_output_path_to_mmCIF=default_output_path_to_mmCIF)

        res = ProcessPool_run_renum("mmCIF", mmCIF_files_left_to_renumber, default_input_path_to_mmCIF, default_input_path_to_SIFTS,
                                    default_output_path_to_mmCIF, default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

# DOWNLOAD ENTIRE DB
# Download a whole database. Each branch only selects the banner text, the database kind and
# the folder keyword arguments; the announcement and the single download call happen at the end.
if args.download_entire_database:
    if args.all_formats:
        banner = "Starting to download tree databases..."
        database_kind = "all"
        location_kwargs = {
            "default_input_path_to_mmCIF": default_input_path_to_mmCIF,
            "default_input_path_to_PDB": default_input_path_to_PDB,
            "default_input_path_to_SIFTS": default_input_path_to_SIFTS,
            "default_input_path_to_mmCIF_assembly": default_input_path_to_mmCIF_assembly,
            "default_input_path_to_PDB_assembly": default_input_path_to_PDB_assembly,
        }
    elif args.PDB_format_only:
        banner = "Starting to download entire PDB database..."
        database_kind = "PDB"
        location_kwargs = {
            "default_input_path_to_PDB": default_input_path_to_PDB,
            "default_input_path_to_SIFTS": default_input_path_to_SIFTS,
        }
    elif args.mmCIF_assembly_format_only:
        banner = "Starting to download entire mmCIF_assembly database..."
        database_kind = "mmCIF_assembly"
        location_kwargs = {
            "default_input_path_to_mmCIF_assembly": default_input_path_to_mmCIF_assembly,
            "default_input_path_to_SIFTS": default_input_path_to_SIFTS,
        }
    elif args.PDB_assembly_format_only:
        banner = "Starting to download entire PDB_assembly database..."
        database_kind = "PDB_assembly"
        location_kwargs = {
            "default_input_path_to_PDB_assembly": default_input_path_to_PDB_assembly,
            "default_input_path_to_SIFTS": default_input_path_to_SIFTS,
        }
    else:
        # No format flag given: regular mmCIF is the default format.
        banner = "Starting to download entire mmCIF database..."
        database_kind = "mmCIF"
        location_kwargs = {
            "default_input_path_to_mmCIF": default_input_path_to_mmCIF,
            "default_input_path_to_SIFTS": default_input_path_to_SIFTS,
        }

    print(banner)
    print("Please, be patient...")
    supreme_download_master(database_kind, **location_kwargs)

# REFRESH ENTIRE DB
# Refresh mode: supreme_download_master is called with "refresh" as its second
# argument and its return value is forwarded to the renumbering step for the
# selected format. The format switches are tested in order; when none is set,
# the plain mmCIF branch runs.
if args.refresh_entire_database:
    if args.all_formats:
        # Three input directories are involved here: mmCIF, PDB and SIFTS.
        print("Starting to refresh three databases...")
        print("Please, be patient...")
        left_to_refresh = supreme_download_master("all", "refresh", default_input_path_to_mmCIF=default_input_path_to_mmCIF,
                                                  default_input_path_to_PDB=default_input_path_to_PDB,
                                                  default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        # Element 0 of the result is used as the mmCIF work list, element 1 as the PDB one.
        left_to_refresh_mmCIF = left_to_refresh[0]
        left_to_refresh_PDB = left_to_refresh[1]
        # PDB files are renumbered first, then mmCIF; only the mmCIF result is logged.
        master_PDB_renumber_function(left_to_refresh_PDB, default_input_path_to_PDB, default_input_path_to_SIFTS, default_output_path_to_PDB,
                                     default_PDB_num, gzip_mode)
        res = ProcessPool_run_renum("mmCIF", left_to_refresh_mmCIF, default_input_path_to_mmCIF, default_input_path_to_SIFTS,
                                    default_output_path_to_mmCIF, default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

    elif args.PDB_format_only:
        print("Starting to refresh entire PDB database...")
        print("Please, be patient...")
        left_to_refresh_PDB = supreme_download_master("PDB", "refresh", default_input_path_to_PDB=default_input_path_to_PDB,
                                                      default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        master_PDB_renumber_function(left_to_refresh_PDB, default_input_path_to_PDB, default_input_path_to_SIFTS, default_output_path_to_PDB,
                                     default_PDB_num, gzip_mode)

    elif args.PDB_assembly_format_only:
        # The banner names PDB_assembly (it previously repeated the plain-PDB
        # wording, which misreported what was being refreshed).
        print("Starting to refresh entire PDB_assembly database...")
        print("Please, be patient...")
        left_to_refresh_PDB_assembly = supreme_download_master("PDB_assembly", "refresh",
                                                               default_input_path_to_PDB_assembly=default_input_path_to_PDB_assembly,
                                                               default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        master_PDB_renumber_function(left_to_refresh_PDB_assembly, default_input_path_to_PDB_assembly, default_input_path_to_SIFTS,
                                     default_output_path_to_PDB_assembly, default_PDB_num, gzip_mode)

    elif args.mmCIF_assembly_format_only:
        print("Starting to refresh entire mmCIF_assembly database...")
        print("Please, be patient...")
        left_to_refresh_mmCIF_assembly = supreme_download_master("mmCIF_assembly", "refresh",
                                                                 default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly,
                                                                 default_input_path_to_SIFTS=default_input_path_to_SIFTS)

        res = ProcessPool_run_renum("mmCIF_assembly", left_to_refresh_mmCIF_assembly, default_input_path_to_mmCIF_assembly,
                                    default_input_path_to_SIFTS, default_output_path_to_mmCIF_assembly, default_mmCIF_num,
                                    gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)

    else:
        # No format switch set: refresh and renumber the plain mmCIF files.
        print("Starting to refresh entire mmCIF database...")
        print("Please, be patient...")
        left_to_refresh_mmCIF = supreme_download_master("mmCIF", "refresh", default_input_path_to_mmCIF=default_input_path_to_mmCIF,
                                                        default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        res = ProcessPool_run_renum("mmCIF", left_to_refresh_mmCIF, default_input_path_to_mmCIF, default_input_path_to_SIFTS,
                                    default_output_path_to_mmCIF, default_mmCIF_num, gzip_mode, exception_AccessionIDs, nproc)
        log_writer(res)
