In [6]:
import os
import random
from random import sample

import numpy as np
import pandas as pd
import pymol

import mol2

In [7]:
files = "../Data/final_data/"
SAMPLE_SIZE = 1000  # upper bound on how many structures to draw
SEED = 42           # fixed so Restart & Run All draws the same subset
structures = {}

# os.listdir returns entries in arbitrary order; sorting makes the seeded
# sample below reproducible across machines and runs.
for structure_id in sorted(os.listdir(files)):
    # Each structure lives in its own sub-folder; skip stray files
    # (e.g. .DS_Store, checkpoints) that are not structures.
    if os.path.isdir(os.path.join(files, structure_id)):
        structures[structure_id] = ['cavityALL.mol2', 'protein.mol2', 'site.mol2', 'ligand.mol2']

# random.sample raises ValueError when asked for more items than the
# population holds, so clamp the request to what is actually available.
random_files = random.Random(SEED).sample(
    list(structures), min(SAMPLE_SIZE, len(structures))
)
random_files

['3h4v_9',
 '2idx_2',
 '4ffm_1',
 '2pa7_1',
 '4l80_5',
 '2nv4_2',
 '1gzf_2',
 '5far_1',
 '4r7u_1',
 '4rkf_1',
 '4jnq_1',
 '3bc1_1',
 '4uyg_3',
 '4dt8_2',
 '1d4d_1',
 '2y7j_3',
 '1xp8_1',
 '2i5b_5',
 '4yzc_1',
 '4twr_1',
 '4ng2_1',
 '5d3e_1',
 '3i6d_2',
 '4pxn_1',
 '1krh_1',
 '3o6r_2',
 '5cpm_1',
 '3gfb_4',
 '2a59_4',
 '4cid_1',
 '2bwp_1',
 '3ghu_1',
 '4zhx_1',
 '1fvp_1',
 '5esd_4',
 '3nmt_1',
 '3rf7_1',
 '4dma_1',
 '3qbd_2',
 '4ifc_1',
 '3x3x_2',
 '4a62_2',
 '1i2l_1',
 '4fgc_2',
 '3rpn_6',
 '4jem_2',
 '3p0h_2',
 '3icz_2',
 '2fna_1',
 '3v2b_1',
 '4y0k_1',
 '3sx2_7',
 '3nv6_1',
 '2wb2_1',
 '1nv9_1',
 '1af7_1',
 '4cta_2',
 '4du6_4',
 '3kk6_1',
 '2q97_1',
 '3hja_1',
 '2q1w_1',
 '1zh8_1',
 '1b5e_2',
 '1rv7_1',
 '4zir_1',
 '4nh0_2',
 '2io8_1',
 '1uoq_1',
 '4zoh_1',
 '4aqk_1',
 '4xiw_1',
 '3ut3_1',
 '1agn_3',
 '3tzl_1',
 '3ea0_1',
 '3x1d_1',
 '1ny5_2',
 '4s1i_2',
 '1yun_1',
 '2yiu_1',
 '4azw_1',
 '3vw9_1',
 '4x9m_1',
 '4ec3_1',
 '5aq1_2',
 '4o9c_1',
 '2ztl_3',
 '2hmv_2',
 '1u80_2',
 '2qe6_2',

Create the new basic data frame for the protein.

In [38]:
def _mean_distances(points, targets):
    """Mean Euclidean distance from every row of ``points`` to all rows of ``targets``.

    Args:
        points: (n, 3) float array of query coordinates.
        targets: array-like of reference coordinates, one row per atom.

    Returns:
        1-D float array of length n. Every entry is NaN when ``targets`` is
        empty, since the mean over zero atoms is undefined.
    """
    targets = np.asarray(targets, dtype=float)
    if targets.size == 0:
        return np.full(len(points), np.nan)
    targets = np.atleast_2d(targets)
    # One vectorised pass over the targets per point: keeps memory at O(m)
    # instead of materialising the full n x m distance matrix.
    return np.array(
        [np.linalg.norm(targets - point, axis=1).mean() for point in points],
        dtype=float,
    )


def _pseudo_angles(coords):
    """Per-atom direction angles (radians) derived from consecutive coordinates.

    For each interior atom i the step vectors (i-1 -> i) and (i -> i+1) are
    used:
        phi_i = azimuth(i-1 -> i) - elevation(i-1 -> i)
        psi_i = azimuth(i-1 -> i) - elevation(i -> i+1)
    where azimuth = arctan2(dy, dx) and elevation = arctan2(dz, sqrt(dx^2 + dy^2)).

    NOTE(review): these are not backbone dihedral angles in the structural
    biology sense (those need the N, CA and C atoms); the column names are
    kept for compatibility -- confirm this is the intended feature.

    Args:
        coords: (n, 3) float array of coordinates in chain order.

    Returns:
        (phi, psi): two float arrays of length n. The first and last entries
        are 0 because those atoms lack a neighbour on one side.
    """
    n_atoms = len(coords)
    phi = np.zeros(n_atoms)
    psi = np.zeros(n_atoms)
    if n_atoms >= 3:
        steps = np.diff(coords, axis=0)  # steps[k] = coords[k + 1] - coords[k]
        incoming = steps[:-1]            # (i-1 -> i) for every interior atom i
        outgoing = steps[1:]             # (i -> i+1) for every interior atom i
        azimuth = np.arctan2(incoming[:, 1], incoming[:, 0])
        phi[1:-1] = azimuth - np.arctan2(
            incoming[:, 2], np.sqrt(incoming[:, 0] ** 2 + incoming[:, 1] ** 2)
        )
        psi[1:-1] = azimuth - np.arctan2(
            outgoing[:, 2], np.sqrt(outgoing[:, 0] ** 2 + outgoing[:, 1] ** 2)
        )
    return phi, psi


def NewDataFrame(protein):
    """
    Build the per-alpha-carbon feature table for one protein.

    Args:
        protein: object exposing ``get_proteinCA()``, ``get_siteCA()``,
            ``get_ligand()`` and ``get_cavity()``, each returning an array of
            coordinates with one row per atom.

    Returns:
        pandas.DataFrame with one row per alpha carbon and the columns:
            X_COORD, Y_COORD, Z_COORD: alpha-carbon coordinates.
            PROTEIN_AVG_LENGTH: mean distance to every alpha carbon of the
                protein (the atom's zero distance to itself is included).
            PROTEIN_LIGAND_LENGTH: mean distance to the ligand atoms.
            PROTEIN_CAVITY_LENGTH: mean distance to the cavity atoms.
            PROTEIN_PSI, PROTEIN_PHI: direction angles in radians, 0 for the
                first and last atom (see ``_pseudo_angles``).
            BINDING_ATOM: 1 if the atom's coordinates appear exactly among
                the site alpha carbons, else 0.

    Proteins with fewer than three alpha carbons get all-zero angle columns,
    and an empty ligand or cavity yields NaN in the matching distance column.
    """
    # Fetch the alpha carbons once; every feature below is derived from them.
    ca_coords = np.asarray(protein.get_proteinCA(), dtype=float)
    if ca_coords.size == 0:
        ca_coords = ca_coords.reshape(0, 3)
    new_df = pd.DataFrame(ca_coords, columns=["X_COORD", "Y_COORD", "Z_COORD"])

    # Exact coordinate match against the site atoms; a set of tuples gives
    # constant-time lookups instead of scanning the site list per atom.
    site_coords = np.asarray(protein.get_siteCA(), dtype=float)
    site_atoms = set()
    if site_coords.size:
        site_atoms = {tuple(row) for row in np.atleast_2d(site_coords).tolist()}
    binding = [1 if tuple(row) in site_atoms else 0 for row in ca_coords.tolist()]

    phi, psi = _pseudo_angles(ca_coords)

    # Add the values to the dataframe.
    new_df["PROTEIN_AVG_LENGTH"] = _mean_distances(ca_coords, ca_coords)
    new_df["PROTEIN_LIGAND_LENGTH"] = _mean_distances(ca_coords, protein.get_ligand())
    new_df["PROTEIN_CAVITY_LENGTH"] = _mean_distances(ca_coords, protein.get_cavity())
    new_df["PROTEIN_PSI"] = psi
    new_df["PROTEIN_PHI"] = phi
    new_df["BINDING_ATOM"] = binding

    return new_df


TEST

In [39]:
path = "../Data/final_data/1a2n_1/"
files = ['cavity6.mol2', 'protein.mol2', 'site.mol2', 'ligand.mol2']

# Unpack the file names so each keyword below is paired with its file explicitly.
cavity_file, protein_file, site_file, ligand_file = files

# Load the example structure; every input file sits directly under `path`.
my_protein = mol2.Protein(
    name="1a2n",
    protein=path + protein_file,
    cavity=path + cavity_file,
    ligand=path + ligand_file,
    site=path + site_file,
)

In [43]:
# Build the per-alpha-carbon feature table for the example protein and show it.
my_new_df = NewDataFrame(protein=my_protein)
my_new_df

Unnamed: 0,X_COORD,Y_COORD,Z_COORD,PROTEIN_AVG_LENGTH,PROTEIN_LIGAND_LENGTH,PROTEIN_CAVITY_LENGTH,PROTEIN_PSI,PROTEIN_PHI,BINDING_ATOM
0,28.483,40.949,50.038,30.203966,26.570205,26.180816,0.000000,0.000000,0
1,26.575,38.219,51.784,29.236928,26.462434,26.130136,-3.230472,-2.663619,0
2,28.107,37.098,55.091,28.821448,26.338629,26.043562,-1.308518,-1.681389,0
3,27.252,34.276,57.460,28.285636,26.654877,26.437346,-3.106973,-2.541799,0
4,26.404,35.148,61.025,30.406867,29.838038,29.618584,1.730930,1.100250,0
...,...,...,...,...,...,...,...,...,...
413,22.099,33.158,59.362,30.566866,30.708636,30.518980,3.083350,2.470040,0
414,22.739,34.794,55.987,29.857913,29.097156,28.865362,1.157243,2.288775,0
415,23.005,38.521,56.139,31.783441,30.895894,30.598239,2.297255,1.458888,0
416,22.336,41.073,53.435,32.969824,31.691888,31.350848,1.951506,2.624883,0


Next step (sketch, not yet implemented): train a random forest on the per-protein data frames.
```
# prots is a dictionary with the proteins as keys and the dataframes
# returned by NewDataFrame as values.
# Fit once on all proteins stacked together: with a scikit-learn-style
# estimator, calling fit() inside a per-protein loop discards the previous
# fit each time, so only the last protein would be learned.
# NOTE(review): assumes BINDING_ATOM is the label to predict -- confirm.
all_atoms = pd.concat(list(prots.values()), ignore_index=True)
randomforest.fit(all_atoms.drop(columns="BINDING_ATOM"), all_atoms["BINDING_ATOM"])
```