In [10]:
# importing stuff
from utils import get_pdb_resolution_list, get_sequences_from_file, download_pdb_sequences
import pandas as pd
from io import StringIO
import re

In [11]:
# Defining constants
# We're limiting the resolution [Combined Resolution] to 3.0 as higher (lower quality) resolutions are not reliable
MAX_RESOLUTION = 3.0
MIN_Z_SCORE = 2.0 # Suggested on the DALI website
MAX_RMSD = 2.0
MAX_EVALUE_FOLDSEEK = 0.001 # suggested on the foldseek website

In [12]:
# Common names for metrics
TARGET_COLUMN = 'target'
TARGET_PDB_COLUMN = 'target_pdb'
QUERY_COLUMN = 'query'
Z_SCORE_COLUMN = 'z-score'
RMSD_COLUMN = 'rmsd'
SEQUENCE_COLUMN = 'sequence'
RESOLUTION_COLUMN = 'resolution'

In [13]:
# the path to the incoming dump files
PDBE_SEARCH_RESULTS_PATH ="./data/search_results/pdbe.txt"
DALI_SEARCH_RESULTS_PATH = "./data/search_results/dali.txt"
FOLDSEEK_SEARCH_RESULTS_PATH = "./data/search_results/foldseek.tsv"

# The path to the output (codes) files.
PDBE_CODES_PATH = "./data/codes/pdbe.txt"   
DALI_CODES_PATH = "./data/codes/dali.txt"
FOLDSEEK_CODES_PATH = "./data/codes/foldseek.txt"
INTERSECTION_CODES_PATH = "./data/codes/intersection.txt"

# The path to the output (sequences pre-clustering) files.
PDBE_PRECLUSTERING_SEQUENCES_PATH = "./data/sequences/preclustering-pdbe.txt"
DALI_PRECLUSTERING_SEQUENCES_PATH = "./data/sequences/preclustering-dali.txt"
FOLDSEEK_PRECLUSTERING_SEQUENCES_PATH = "./data/sequences/preclustering-foldseek.txt"
INTERSECTION_PRECLUSTERING_SEQUENCES_PATH = "./data/sequences/preclustering-intersection.txt"

In [14]:
# we now add a sequences column to the dataframe
# we do so by getting the sequences from the PDB SEQRES file
SEQ_RES_PATH = download_pdb_sequences()

# Common Functions

In [15]:
# filters a datagrame
# it filters by resolution, z-score, and rmsd and evalue
# for each of these filters it first checks if the column exists
# since for example foldseek doesn't have a z-score column
# it keeps track of which filters were applied
# and prints out a message with the filters that were applied
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Filter a dataframe by resolution, z-score, and rmsd and evalue"""
    # keep track of which filters were applied
    filters = []
    len_before = len(df)
    
    # filter by resolution
    if RESOLUTION_COLUMN in df.columns:
        df = df[df[RESOLUTION_COLUMN] <= MAX_RESOLUTION]
        filters.append(f"resolution <= {MAX_RESOLUTION}")
    
    # filter by z-score
    if Z_SCORE_COLUMN in df.columns:
        df = df[df[Z_SCORE_COLUMN] >= MIN_Z_SCORE]
        filters.append(f"z-score >= {MIN_Z_SCORE}")
    
    # filter by rmsd
    if RMSD_COLUMN in df.columns:
        df = df[df[RMSD_COLUMN] <= MAX_RMSD]
        filters.append(f"rmsd <= {MAX_RMSD}")
    
    # filter by evalue
    if 'evalue' in df.columns:
        df = df[df['evalue'] <= MAX_EVALUE_FOLDSEEK]
        filters.append(f"evalue <= {MAX_EVALUE_FOLDSEEK}")
    
    # print out the filters that were applied
    print(f"Applied filters: {', '.join(filters)} ({len_before - len(df)} rows removed)")
    
    return df

# removes duplicate rows from a dataframe (rows with the same target_pdb and sequence)
# also logs the number of duplicate rows that were removed
def remove_duplicate_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Remove duplicate rows from a dataframe"""
    # get the number of rows before removing duplicates
    num_rows_before = len(df)
    
    # remove duplicate rows
    df = df.drop_duplicates(subset=[TARGET_PDB_COLUMN, SEQUENCE_COLUMN])
    
    # get the number of rows after removing duplicates
    num_rows_after = len(df)
    
    # log the number of rows removed
    print(f"Removed {num_rows_before - num_rows_after} duplicate rows")
    
    return df

# Adds a column to the dataframe with the resolution of the PDB ID
# The resolution is obtained using the get_pdb_resolution_list function
async def add_resolution_column(df: pd.DataFrame) -> pd.DataFrame:
    """Add a column to the dataframe with the resolution of the PDB ID"""
    # get the resolutions
    resolutions : dict[str, float] = await get_pdb_resolution_list(set(df[TARGET_PDB_COLUMN].unique()))
    
    # add the resolution column
    df[RESOLUTION_COLUMN] = df[TARGET_PDB_COLUMN].map(lambda x: resolutions[x])
    
    return df


# adds a column to the dataframe with the sequence of the PDB ID
# The sequence is obtained using the get_sequences_from_file function
def add_sequence_column(df: pd.DataFrame) -> pd.DataFrame:
    """Add a column to the dataframe with the sequence of the PDB ID"""
    # get the sequences
    sequences : dict[str, str] = get_sequences_from_file((df[TARGET_COLUMN].values), res_seq_path=SEQ_RES_PATH)
    
    # add the sequence column
    df[SEQUENCE_COLUMN] = df[TARGET_COLUMN].map(lambda x: sequences[x])
    
    return df

# A utility function which groups all the above functions
async def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Process a dataframe by filtering and removing duplicates"""
    df = await add_resolution_column(df)
    
    # If the sequence column doesn't exist, add it
    if SEQUENCE_COLUMN not in df.columns:
        df = add_sequence_column(df)
    else:
        print("Sequence column already exists")

    df = remove_duplicate_rows(df)
    df = filter_dataframe(df)
    
    return df

In [16]:
# Writes a fasta file with the sequences of the filtered dataframe
# To the file specified by the path
def write_fasta(df: pd.DataFrame, path: str) -> None:
    """Writes a fasta file with the sequences of the filtered dataframe"""
    
    with open(path, "w") as f:
        for index, row in df.iterrows():
            f.write(f">{row[TARGET_COLUMN]}\n{row[SEQUENCE_COLUMN]}\n\n")
        
# writes a file with all the pdb codes and chains of the filtered dataframe to the path
# with the provided separator
def write_pdb_codes(df: pd.DataFrame, path: str, sep = ","):
    """Writes a file with all the pdb codes and chains of the filtered dataframe to the path"""
    with open(path, "w") as f:
        f.write(sep.join(df[TARGET_COLUMN].unique()))
        
# Reads a file with the pdb codes and chains and returns a list of them
def read_pdb_codes(path: str, sep = ",") -> list[str]:
    """Reads a file with the pdb codes and chains and returns a list of them"""
    with open(path, "r") as f:
        return f.read().split(sep)

# Parsing all the file types
For each of the methods below, the database will follow the conventions for the column names

### Parsing the PDBe Results

In [17]:
# open the file read it, remove any "PDB " in the file and open it as a fixed width file in pandas 
# PDBe results are partially a fixed-width file for the exception of the first row (header)
# so, to open we first format the data.
with open(PDBE_SEARCH_RESULTS_PATH, "r") as f:
    # The PDB code is redundant as we performed a search only against the PDB database
    # Moreover, spaces in the last two columns would cause problems when reading the file
    data = f.read().replace("PDB ", "")
    
    # replace any amount of whitespaces (from 1 on) with a single tab
    data = re.sub(r"[ ]+", "\t", data)
    
    # remove tabs at the beginning of the line (if any)
    data = re.sub(r"^\t", "", data, flags=re.MULTILINE)

    # read the data into a pandas dataframe as a TSV file
    pdbe_results = pd.read_csv(StringIO(data), sep="\t")
    
    # we remove duplicate rows (based on the Target column (if any))
    # and log how many rows were removed for debugging purposes
    len_before = len(pdbe_results)
    pdbe_results.drop_duplicates(subset="Target", inplace=True)
    print(f"Removed {len_before - len(pdbe_results)} duplicate rows")
  
    # replace : in the Query and Target columns with _ (since it's how foldseek names the files and how sequences with chains are named in the seqres file from PDB)
    pdbe_results["Query"] = pdbe_results["Query"].str.replace(":", "_")
    pdbe_results["Target"] = pdbe_results["Target"].str.replace(":", "_") 
    
    # Rename the Target and Query columns to target and query respectively
    pdbe_results.rename(columns={
        "Target": TARGET_COLUMN, 
        "Query": QUERY_COLUMN ,
        "RMSD": RMSD_COLUMN,
        "Z-score": Z_SCORE_COLUMN
    }, inplace=True)
    
    # Add a column called target_pdb which is the first 4 characters of the Target column
    pdbe_results["target_pdb"] = pdbe_results["target"].str.split("_").str[0]
    
    # drop the column called ##
    pdbe_results.drop("##", axis=1, inplace=True)

# Print the first 5 rows of the dataframe
pdbe_results.head()

Removed 0 duplicate rows


Unnamed: 0,Q-score,P-score,z-score,rmsd,Nalgn,Nsse,Ngaps,Seq-%,Nmd,Nres-Q,Nsse-Q,Nres-T,Nsse-T,query,target,target_pdb
0,1.0,16.18,11.93,0.0,58,4,0,1.0,0,58,4,58,4,5pti_A,5pti_A,5pti
1,0.999,14.33,11.2,0.093,58,4,0,0.9828,0,58,4,58,4,5pti_A,9pti_A,9pti
2,0.9956,12.59,10.48,0.199,58,4,0,0.9655,0,58,4,58,4,5pti_A,7pti_A,7pti
3,0.959,12.25,10.33,0.248,56,4,0,0.9821,0,58,4,56,4,5pti_A,1bpt_A,1bpt
4,0.9297,12.12,10.27,0.592,57,4,0,0.9825,0,58,4,58,4,5pti_A,1fan_A,1fan


In [18]:
pdbe_results = await process_dataframe(pdbe_results)

Removed 135 duplicate rows
Applied filters: resolution <= 3.0, z-score >= 2.0, rmsd <= 2.0 (23 rows removed)


In [19]:
write_fasta(pdbe_results, PDBE_PRECLUSTERING_SEQUENCES_PATH)
write_pdb_codes(pdbe_results, PDBE_CODES_PATH)

### Parsing Outputs from foldseek

In [20]:
# Foldseek output is just a tsv, open it as a dataframe, just add a column called target_pdb to the dataframe
foldseek_results = pd.read_csv(FOLDSEEK_SEARCH_RESULTS_PATH, sep="\t")

# rename the tseq column to sequence and add a column called target_pdb to the dataframe
foldseek_results.rename(columns={"tseq": SEQUENCE_COLUMN}, inplace=True)
foldseek_results[TARGET_PDB_COLUMN] = foldseek_results[TARGET_COLUMN].str.split("_").str[0]

# Process the dataframe
foldseek_results = await process_dataframe(foldseek_results)

# Write the sequences to a fasta file
write_fasta(foldseek_results, FOLDSEEK_PRECLUSTERING_SEQUENCES_PATH)
write_pdb_codes(foldseek_results, FOLDSEEK_CODES_PATH)

Sequence column already exists
Removed 137 duplicate rows
Applied filters: resolution <= 3.0, rmsd <= 2.0, evalue <= 0.001 (35 rows removed)


In [21]:
foldseek_results.head()

Unnamed: 0,query,target,qstart,tstart,evalue,prob,rmsd,sequence,target_pdb,resolution
0,5pti.pdb,7pti_A,1,1,1.582e-10,1.0,0.199,RPDFCLEPPYTGPCKARIIRYFYNAKAGLAQTFVYGGCRAKRNNFK...,7pti,1.6
1,5pti.pdb,1bti_A,1,1,7.734e-11,1.0,0.9165,RPDFCLEPPYTGPCKARIIRYAYNAKAGLCQTFVYGGCRAKRNNFK...,1bti,2.2
2,5pti.pdb,1fan_A,1,1,1.483e-10,1.0,1.02,RPDFCLEPPYTGPCKARIIRYFYNAKAGLCQTFVYGGCRAKRNNAK...,1fan,2.0
3,5pti.pdb,1t7c_D,1,1,1.483e-10,1.0,1.283,RPDFCLEPPYTGPCEARIIRYFYNAKAGLCQTFVYGGCRAKRNNFK...,1t7c,1.85
4,5pti.pdb,3p92_E,1,1,1.483e-10,1.0,0.9127,RPDFCLEPPYTGPCRAGIIRYFYNAKAGLCQTFVYGGCRAKRNNFK...,3p92,1.5992


### Parsing DALI output

In [22]:
# DALI outputs are strangely formatted:
# this is an example of a DALI output:
# No:  Chain   Z    rmsd lali nres  %id PDB  Description
#    1:  5pti-A 14.8  0.0   58    58  100   MOLECULE: TRYPSIN INHIBITOR;                                         
#    2:  9pti-A 14.3  0.1   58    58   98   MOLECULE: BOVINE PANCREATIC TRYPSIN INHIBITOR;    

# to start, we open the file and read it as a string
with open(DALI_SEARCH_RESULTS_PATH, "r") as f:
    data = f.read()
    
    # We remove from MOLECULE: to the end of the line (since we don't need the description)
    data = re.sub(r"MOLECULE:.*$", "", data, flags=re.MULTILINE)
    
    # We remove the # at the beginning of any line
    data = re.sub(r"^#", "", data, flags=re.MULTILINE)
    
    # We then replace any amount of whitespaces (from 1 on) with a single tab and remove any tabs at the beginning of the line (if any)
    data = re.sub(r"[ ]+", "\t", data)
    data = re.sub(r"^\t", "", data, flags=re.MULTILINE)
    
    # We then read the data into a pandas dataframe as a TSV file
    dali_results = pd.read_csv(StringIO(data), sep="\t")
    
    # We now fix columns
    # - remove the Description column and No:, PDB and Description columns
    # - rename the Chain column to target
    # - rename the Z column to Z-score
    # - rename the rmsd column to RMSD
    dali_results.drop(["Description", "No:", "PDB"], axis=1, inplace=True)
    dali_results.rename(columns={"Chain": TARGET_COLUMN, "Z": Z_SCORE_COLUMN, "rmsd": RMSD_COLUMN}, inplace=True)
    
    # We then lastly format the target column to be in the form of PDB_CHAIN instead of PDB-CHAIN
    dali_results[TARGET_COLUMN] = dali_results[TARGET_COLUMN].str.replace("-", "_")
    
    # We then add a column called target_pdb which is the first 4 characters of the Target column
    dali_results["target_pdb"] = dali_results[TARGET_COLUMN].str.split("_").str[0]
    
dali_results.head()

Unnamed: 0,target,z-score,rmsd,lali,nres,%id,target_pdb
0,5pti_A,14.8,0.0,58,58,100,5pti
1,9pti_A,14.3,0.1,58,58,98,9pti
2,7pti_A,13.9,0.2,58,58,97,7pti
3,1g6x_A,13.3,1.1,58,58,93,1g6x
4,1bpi_A,13.3,1.2,58,58,100,1bpi


In [23]:
# Process the dataframe
dali_results = await process_dataframe(dali_results)

# Write the sequences to a fasta file
write_fasta(dali_results, DALI_PRECLUSTERING_SEQUENCES_PATH)
write_pdb_codes(dali_results, DALI_CODES_PATH)

Removed 142 duplicate rows
Applied filters: resolution <= 3.0, z-score >= 2.0, rmsd <= 2.0 (37 rows removed)


# Comparing the results of the three methods

In [24]:
# We open and read the codes file as a string and compare intersections between the codes
pdbe_codes = set(read_pdb_codes(PDBE_CODES_PATH))
foldseek_codes = set(read_pdb_codes(FOLDSEEK_CODES_PATH))
dali_codes = set(read_pdb_codes(DALI_CODES_PATH))

# We then get the intersection between the codes
print("PDBE and Foldseek intersection:", len(pdbe_codes.intersection(foldseek_codes)))
print("PDBE and DALI intersection:", len(pdbe_codes.intersection(dali_codes)))
print("Foldseek and DALI intersection:", len(foldseek_codes.intersection(dali_codes)))
print("PDBE, Foldseek and DALI intersection:", len(pdbe_codes.intersection(foldseek_codes).intersection(dali_codes)))

# Print the number of codes in each set (For reference)
print("PDBE codes:", len(pdbe_codes))
print("Foldseek codes:", len(foldseek_codes))
print("DALI codes:", len(dali_codes))

PDBE and Foldseek intersection: 83
PDBE and DALI intersection: 138
Foldseek and DALI intersection: 77
PDBE, Foldseek and DALI intersection: 68
PDBE codes: 168
Foldseek codes: 122
DALI codes: 169


In [25]:
# We save a fasta with the names of the codes that are in all 3 datasets
# to do so, we get the intersection between the codes and then write the fasta
intersection = pdbe_codes.intersection(foldseek_codes).intersection(dali_codes)

# we then take the intersection from the pdbe codes and write the fasta
intersection_results = pdbe_results[pdbe_results[TARGET_COLUMN].isin(intersection)]

# We then write the fasta and codes
write_fasta(intersection_results, INTERSECTION_PRECLUSTERING_SEQUENCES_PATH)
write_pdb_codes(intersection_results, INTERSECTION_CODES_PATH)

In [36]:
# We save a fasta with the names of the codes that are in at least one dataset
# to do so, we get the union between the codes and then write the fasta
union = pdbe_codes.union(foldseek_codes).union(dali_codes)

# we then create a dataframe by creating a dataframe with just three columns: target, target_pdb and sequence
# and for each code in the union, we add a row with the code, the first 4 characters of the code and the sequence
# to take the sequence, we look first in the pdbe results, then in the foldseek results and lastly in the dali results
# if the code is not in any of the results, we throw an error
union_results = pd.DataFrame(columns=[TARGET_COLUMN, TARGET_PDB_COLUMN, SEQUENCE_COLUMN])

for code in union:
    # get the sequence from the pdbe results, foldseek results and dali results
    _pdbe_sequence = pdbe_results[pdbe_results[TARGET_COLUMN] == code][SEQUENCE_COLUMN].values
    _foldseek_sequence = foldseek_results[foldseek_results[TARGET_COLUMN] == code][SEQUENCE_COLUMN].values
    _dali_sequence = dali_results[dali_results[TARGET_COLUMN] == code][SEQUENCE_COLUMN].values
    
    _pdbe_sequence = _pdbe_sequence[0] if len(_pdbe_sequence) > 0 else None
    _foldseek_sequence = _foldseek_sequence[0] if len(_foldseek_sequence) > 0 else None
    _dali_sequence = _dali_sequence[0] if len(_dali_sequence) > 0 else None
    
    # we check that the non null sequences are the same 
    # if they are not, we throw an error
    _all_results = [s for s in [_pdbe_sequence, _foldseek_sequence, _dali_sequence] if s is not None]
    if len(_all_results) > 0 and len(set(_all_results)) > 1:
        # foldseek only keeps the sequence for the aligned portion of the protein, not the whole sequence so it would be different from the one found on 
        # the PDB SEQRES file
        pass
    elif len(_all_results) == 0:
        raise ValueError(f"Sequence for {code} not found in any of the results")
    
    sequence = _all_results[0]
    
    # we then add a row to the dataframe with the code, the first 4 characters of the code and the sequence
    union_results = pd.concat([union_results, pd.DataFrame([{
        TARGET_COLUMN: code,
        TARGET_PDB_COLUMN: code[:4],
        SEQUENCE_COLUMN: sequence
    }])])

union_results.head()

Unnamed: 0,target,target_pdb,sequence
0,3byb_B,3byb,KDRPDFCELPADTGPCRVRFPSFYYNPDEKKCLEFIYGGCEGNANN...
0,3auc_A,3auc,RPAFCLEPPYAGPGKARIIRYFYNAAAGAAQTFVYGGVRAKRNNFA...
0,1zr0_D,1zr0,PTGNNAEICLLPLDYGPCRALLLRYYYDRYTQSCRQFLYGGCEGNA...
0,3aui_B,3aui,RPAFCLEPPYAGPGKARIIRYFYNAAAGAAQAFVYGGVRAKRNNFA...
0,3bth_I,3bth,RPDFCLEPPYTGPCHARIIRYFYNAKAGLCQTFVYGGCRAKRNNFK...


In [37]:
# Save the fasta and codes
write_fasta(union_results, "./data/sequences/preclustering-union.txt")
write_pdb_codes(union_results, "./data/codes/union.txt")

# Clustering and Converting to Stockholm format

In [38]:
# Constants
CLUSTER_FILE_PATH = "./data/clusters/union-alfatclust_out.clust"
CLUSTER_EVAL_FILE_PATH = "./data/clusters/union-alfatclust_out.clust.eval"
CLUSTER_CODES_PATH = "./data/clusters/union-codes.txt"

ALIGNMENTS_PATH = "./data/alignment/union.fasta"
ALIGNMENTS_STOCKHOLM_PATH = "./data/alignment/union.sto"

In [39]:
# Parses the files produced by alfatclust and returns a dataframe and a dictionary
# the dataframe is the eval file and the dictionary maps cluster ids to pdb ids
def parse_clust_file(clust_file_path : str, clust_eval_file_path : str) -> tuple[pd.DataFrame, dict[str, list[str]]]:
    
    # parse the clust file and create a dictionary mapping cluster id to a list of pdb ids
    clusters = dict()
    with open(clust_file_path, "r") as f:
        current_cluster = None
        for line in f:
            if line.startswith("#"):
                current_cluster = int(line.split()[1])
                clusters[current_cluster] = []
            else:
                clusters[current_cluster].append(line.strip())

    # parse the eval file and create a dataframe from it
    with open(clust_eval_file_path, "r") as f:
        eval_df = pd.read_csv(f)
    
    # set the index to the Cluster Id column
    eval_df.set_index("Cluster Id", inplace=True)
        
    out_clusters = {}
    
    # loop over the clusters in the clusters dictionary and:
    # if the cluster is in the eval dataframe then we add the id from the "Center sequence" column to the out_clusters dictionary
    # otherwise, if the cluster has only one pdb id then we add that pdb id to the out_clusters dictionary
    # otherwise (if the cluster has more than one pdb id and is not in the eval dataframe) we print a warning
    for cluster_id, cluster in clusters.items():
        if cluster_id in eval_df.index:
            out_clusters[cluster_id] = eval_df.loc[cluster_id]["Center sequence"]
        elif len(cluster) == 1:
            out_clusters[cluster_id] = cluster[0]
        else:
            print(f"Warning: cluster {cluster_id} is not in the eval file and has more than one pdb id")

    return eval_df, out_clusters

cluster_eval_df, clusters = parse_clust_file(CLUSTER_FILE_PATH, CLUSTER_EVAL_FILE_PATH)

In [40]:
# save the values in the cluster dictionary to a file called clusters_codes.txt
# replacing the _ with : in the pdb id
with open(CLUSTER_CODES_PATH, "w") as f:
    for cluster_id, pdb_id in clusters.items():
        f.write(f"{pdb_id.replace('_', ':')}\n")

# Converting the alignments to Stockholm

In [42]:
from Bio import SeqIO


# parse the alignments file and create a dictionary mapping the target pdb id to the sequence
records = SeqIO.parse(ALIGNMENTS_PATH, "fasta")
count = SeqIO.write(records, ALIGNMENTS_STOCKHOLM_PATH, "stockholm")
print("Converted %i records" % count)

Converted 24 records
