In [None]:
# Mount Google Drive at /content/drive; later cells read their input from, and
# create folders under, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Abrir el dataframe

import os
import pandas as pd

# Folder on the mounted Drive and the CSV file inside it that is loaded below.
input_folder = "/content/drive/MyDrive/TMF/T1/ENLACE_COVALENTE"
input_file = os.path.join(input_folder, "df_harm_affinity.csv")

# A comma is read_csv's default separator, so it does not need to be passed.
df_harm = pd.read_csv(input_file)

# Show the number of rows followed by the column index.
print(len(df_harm), df_harm.columns)

110850 Index(['PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id', 'Ligand_id',
       'Ligand_InChi', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Kd/Ki'],
      dtype='object')


In [None]:
#@title installar librerías

!pip install pdbecif
!pip install pandas
!pip install Biopython

Collecting pdbecif
  Downloading PDBeCif-1.5-py3-none-any.whl.metadata (2.8 kB)
Downloading PDBeCif-1.5-py3-none-any.whl (48 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdbecif
Successfully installed pdbecif-1.5
Collecting Biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Biopython
Successfully installed Biopython-1.85


In [None]:
#@title Crear los directorios y clonar el repositorio GitHub
import os
import subprocess
import sys

repo_path = "/content/drive/MyDrive/TMF/T1/ENLACE_COVALENTE"

# Later cells import the module and read the blacklist from /content/PDB-CAT,
# so the clone and its marker file are pinned to /content instead of depending
# on whatever the current working directory happens to be.
colab_root = "/content"
clone_dir = os.path.join(colab_root, "PDB-CAT")
ready_marker = os.path.join(colab_root, "PDB-CAT_READY")

# Check if the PDB-CAT repository has been cloned and installed
if not os.path.isfile(ready_marker):
    # Skip the clone when a previous (partial) run already created the folder;
    # `git clone` refuses to write into an existing non-empty directory.
    if not os.path.isdir(clone_dir):
        # check=True raises CalledProcessError on failure, so the marker file is
        # never written after a failed clone.
        subprocess.run(
            ["git", "clone", "https://github.com/URV-cheminformatics/PDB-CAT.git", clone_dir],
            check=True,
        )

    # Install the requirements shipped with the clone. The Drive folder is kept
    # as a fallback because that is where the previous version of this cell
    # looked for requirements.txt.
    for candidate_dir in (clone_dir, repo_path):
        requirements_file = os.path.join(candidate_dir, "requirements.txt")
        if os.path.isfile(requirements_file):
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "-r", requirements_file],
                check=True,
            )
            break

    # Create the PDB-CAT_READY file to indicate successful cloning and installation
    with open(ready_marker, "w"):
        pass
print("PDB-CAT installed")

PDB-CAT installed


In [None]:
#@title Definir carpetas de input y de output

import os

# Base folder on the mounted Drive; the cif/ and out/ folders are created under it
github = "/content/drive/MyDrive/TMF/T1/ENLACE_COVALENTE/PDB-CAT/PDB-CAT"

def ensure_directories(base_dir=None):
    """Create the ``cif`` and ``out`` working folders if they are missing.

    Args:
        base_dir: Folder in which the two sub-folders are created. When it is
            omitted, the notebook-level ``github`` path is used, which is the
            behaviour this function had before the parameter existed.

    Returns:
        None. The two folder paths are printed.
    """
    if base_dir is None:
        # Resolved at call time so callers passing base_dir do not need `github`.
        base_dir = github

    cif_dir = os.path.join(base_dir, "cif")
    out_dir = os.path.join(base_dir, "out")

    # exist_ok=True makes the call safe to repeat on every notebook run.
    for folder in (cif_dir, out_dir):
        os.makedirs(folder, exist_ok=True)

    print("Directorios creados o ya existentes:", cif_dir, out_dir)

# Create the cif/ and out/ folders now (nothing happens if they already exist).
ensure_directories()

Directorios creados o ya existentes: /content/drive/MyDrive/TMF/T1/ENLACE_COVALENTE/PDB-CAT/PDB-CAT/cif /content/drive/MyDrive/TMF/T1/ENLACE_COVALENTE/PDB-CAT/PDB-CAT/out


In [None]:
#@title Import librarias

from pdbecif.mmcif_io import CifFileReader
from pdbecif.mmcif_tools import MMCIF2Dict
import pandas as pd
import time
import re
import shutil
from Bio.Align import PairwiseAligner
from Bio.PDB import *
from datetime import datetime
from google.colab import files
import sys
sys.path.append('/content/PDB-CAT')
from PDBCAT_module import *

In [None]:
#@title Definir las rutas de las carpetas
"""
=========
INITIAL INFORMATION. CHANGE THE CONTENT OF THESE VARIABLES IF NECESSARY
"""
# File name (inside directory_path) of the CIF used as reference sequence.
reference = ''

# Input / output locations. The trailing slashes are part of the values because
# file names are appended to these strings directly.
directory_path = f"{github}/cif/9/"                  # Folder with the cif files to process
output_path = f"{github}/out/"                       # Folder for the new categorizing folders
out_file = f"{output_path}df.csv"                    # FIRST csv output file (protein-centered)
out_file_ligands = f"{output_path}df_ligands.csv"    # SECOND csv output file (ligand-centered)

# Analysis options.
mutation = False                                     # Analyze mutations. True or False
pdb_reference_sequence = f"{directory_path}{reference}"  # Path to the file holding the reference sequence
entity_reference = 0                                 # 0 selects the first _entity_poly of pdb_reference_sequence
res_threshold = 15                                   # Threshold to discriminate between peptides and protein subunits


In [None]:
#@title MAIN CODE
"""
MAIN CODE. YOU DO NOT NEED TO CHANGE THIS PART
"""
# Blacklist file: codes of the small molecules that are not considered ligands.
blacklist, blacklist_dict = read_blacklist("/content/PDB-CAT/blacklist.txt")

# Reference sequence. It is only read (from a file in CIF format) when mutation
# analysis is enabled; otherwise it stays an empty string.
reference_seq = ''
if mutation == True:
    ref_cfr = CifFileReader()
    # The '_atom_site' category is skipped when parsing.
    ref_cif_obj = ref_cfr.read(
        pdb_reference_sequence,
        output='cif_wrapper',
        ignore=['_atom_site'],
    )
    # Take the first data block of the parsed file.
    ref_cif_data = list(ref_cif_obj.values())[0]
    if '_entity_poly' in ref_cif_data:
        # Pick the configured entity and drop the line breaks from its sequence.
        reference_seq = ref_cif_data['_entity_poly']['pdbx_seq_one_letter_code_can'][entity_reference].replace("\n", "")

# Accumulators for the two csv outputs:
#   data         -> one record per processed CIF file (protein-centered)
#   data_ligands -> one record per ligand             (ligand-centered)
data = []
data_ligands = []
fields_to_include = ["PDB_ID", "Ligand", "Ligand_names","Ligand_types", "Ligand_functions", "Covalent_Bond", "Bond"]
fields_to_append = ["PDB_ID"]

# Fields of a file record that are split on newlines into one entry per ligand row.
per_ligand_fields = ["Ligand", "Ligand_names", "Ligand_types", "Covalent_Bond", "Bond"]

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of both csv files identical from one run to the next.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.cif'):
        continue

    file_path = os.path.join(directory_path, filename)
    data_from_file = process_cif_file(file_path, mutation, blacklist, reference_seq, res_threshold)
    data.append(data_from_file)

    # Split every per-ligand field into its individual entries.
    split_values = {field: data_from_file[field].split('\n') for field in per_ligand_fields}
    descarted_ligands = data_from_file["Discarted_Ligands"].split('\n')

    # The lists may differ in length; iterate up to the longest one and pad
    # the shorter ones with empty strings.
    max_length = max(len(descarted_ligands), *(len(values) for values in split_values.values()))

    for i in range(max_length):
        # Start from the file-level values, then overwrite the per-ligand fields
        # with the i-th entry of each list.
        ligand_row = {field: data_from_file[field] for field in fields_to_include}
        for field, values in split_values.items():
            ligand_row[field] = values[i].strip() if i < len(values) else ""
        data_ligands.append(ligand_row)

        # Add an extra row for the i-th discarded ligand, if there is one.
        discarded_code = descarted_ligands[i].strip() if i < len(descarted_ligands) else ""
        if discarded_code:
            ligand_row_discarded = {field: data_from_file[field] for field in fields_to_include}
            ligand_row_discarded["Ligand"] = discarded_code
            # Fall back to an empty name instead of aborting the whole run with
            # a KeyError when a code is missing from the blacklist dictionary.
            ligand_row_discarded["Ligand_names"] = blacklist_dict.get(discarded_code, "")
            ligand_row_discarded["Ligand_types"] = "Discarded"
            ligand_row_discarded["Covalent_Bond"] = ""
            ligand_row_discarded["Bond"] = ""
            data_ligands.append(ligand_row_discarded)


# First csv output. Protein-centered: one row per processed CIF file.
df = pd.DataFrame(data)
df.to_csv(out_file, index=False)

# Second csv output. Ligand-centered: one row per ligand.
# The column list is passed explicitly so the frame has its columns even when
# no CIF file was processed; an empty record list would otherwise produce a
# frame without a 'Ligand' column and the filtering below would raise KeyError.
ligand_columns = ["PDB_ID", "Ligand", "Ligand_names", "Ligand_types", "Ligand_functions", "Covalent_Bond", "Bond"]
df_ligand = pd.DataFrame(data_ligands, columns=ligand_columns)

# Remove rows where 'Ligand' is empty or contains only white spaces.
# .copy() makes the filtered frame independent of the unfiltered one before its
# header is replaced.
df_ligand['Ligand'] = df_ligand['Ligand'].str.strip()
df_ligand = df_ligand[df_ligand['Ligand'] != ''].copy()

# Define the new names for the columns (same order as ligand_columns).
new_header = ['ID', 'Molecule', 'Name', 'Type', 'Function', 'Covalent', 'Bond']
df_ligand.columns = new_header

df_ligand.to_csv(out_file_ligands, index=False)

# Classify whether there is a mutation
if mutation:
    # mutation_classification returns the list of entries and a path; that path
    # replaces output_path for the steps that follow.
    no_mutated_list, non_mut_path = mutation_classification(directory_path, out_file, output_path)
    output_path = non_mut_path
else:
    # Without mutation analysis every CIF file in the folder is kept. Only
    # '.cif' entries are considered (the same filter the processing loop uses),
    # so other files or sub-folders no longer end up in the list with their
    # last four characters chopped off.
    no_mutated_list = [
        os.path.splitext(filename)[0]
        for filename in os.listdir(directory_path)
        if filename.endswith('.cif')
    ]

# Classify depending on the bond (currently disabled)
#bond_classification(directory_path, out_file, no_mutated_list, output_path, mutation)