### Fixing FoldSeek default filters

1. Read in FoldSeek files for each protein
2. Filter by Evalue, throw out those with E-values > 0.001
3. Apply functions & store newly filtered FoldSeek files 

### Requires:
* FoldSeek structure comparisons generated during viral acetyltransferase search, stored in Desktop/phage_annotation_input/structure_comparison:
    * in each folder batch_XX with XX a batch number between 0 and 34:
        * PROTEIN_relaxed.pdb / PROTEIN_best.pdb : copy of relaxed (if available, otherwise best) AlphaFold predicted structure for PROTEIN
        * data_foldseek_5.csv : comma-separated input file used to fill in the job script with array tools; each row contains info on 5 proteins, listing for each protein consecutively (i) the input protein structure file name and (ii-iii) the file names of the 2 FoldSeek output files for the input protein
        * script_foldseek_5.slurm : SLURM job script for the FoldSeek job (optimized for KUL cluster)
        * FoldSeek output files (see FoldSeek docs for more information on the file format):
            * PROTEIN_aln_af50m.txt : output file of the FoldSeek run for PROTEIN against the AlphaFold database
            * PROTEIN_aln_pdb.txt : output file of the FoldSeek run for PROTEIN against the PDB database
            * slurm-XXXXX.out files : error/output files for the HPC jobs
            * PROTEIN_foldseek_af50m_filtered.csv and PROTEIN_foldseek_pdb_filtered.csv : FoldSeek output file against AlphaFold/PDB database for PROTEIN which has been subjected to (i) the addition of the average pLDDT of the aligned region (pLDDT_qAln), and (ii) filtering based on the filters described in the notebook

### Generates:
* FoldSeek structure comparisons filtered with the default settings, obtained by filtering the raw (_aln_) FoldSeek result files on the default E-value cutoff of 0.001:
    * in each folder batch_XX with XX a batch number between 0 and 34:
        * PROTEIN_foldseek_af50m_default.csv and PROTEIN_foldseek_pdb_default.csv : FoldSeek output file against AlphaFold/PDB database for PROTEIN which has been subjected to E-value filtering

In [1]:
!conda list --explicit

# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
# created-by: conda 24.11.0
@EXPLICIT
https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda
https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2024.11.26-h06a4308_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda
https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-11.2.0-h1234567_1.conda
https://repo.anaconda.com/pkgs/main/noarch/pybind11-abi-5-hd3eb1b0_0.conda
https://repo.anaconda.com/pkgs/main/noarch/tzdata-2024a-h04d1e81_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda
https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda
https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-11.2.0-h1234567_1.conda
https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h5eee18b_6.conda
https://repo.anaconda.com/pkgs/main/linux-64/c-ares-1.

In [2]:
# imports
import os
import pandas as pd

In [3]:
# Root directory that holds the per-batch folders (batch_1, batch_2, ...).
# It is a relative path, so it is resolved against the working directory the
# notebook is run from; the cells below join it with the batch folder name.
base_path = "c_structure_annotation/structure_comp"

In [8]:
# Reading in the FoldSeek output files for a specific protein in a batch - code copied from notebook 2_best_hit
def read_foldseek_protein(batch, protein):
    """Load the three raw FoldSeek result tables for one protein of a batch.

    The files are looked up in ``<base_path>/batch_<batch>/`` under the names
    ``<protein>_relaxed_aln_pdb.txt``, ``<protein>_relaxed_aln_af50m.txt`` and
    ``<protein>_relaxed_phold.txt``. They are parsed as header-less tables and
    the 15 column names listed below are assigned to them.

    Args:
        batch: Batch number; used to build the ``batch_<batch>`` folder name.
        protein: Protein identifier, i.e. the file-name prefix before
            ``_relaxed_``.

    Returns:
        A tuple ``(df_pdb_raw, df_af50m_raw, df_phold_raw)`` of DataFrames,
        in that order.
    """
    column_names = [
        "query", "target", "fident", "alnlen", "mismatch", "gapopen",
        "qstart", "qend", "tstart", "tend", "evalue", "bits", "prob",
        "lddt", "lddtfull",
    ]
    batch_dir = os.path.join(base_path, f"batch_{batch}")

    # Same read order as the returned tuple: PDB, AlphaFold (af50m), phold.
    tables = []
    for suffix in ("aln_pdb", "aln_af50m", "phold"):
        raw_file = os.path.join(batch_dir, f"{protein}_relaxed_{suffix}.txt")
        tables.append(pd.read_table(raw_file, header=None, names=column_names))

    df_pdb_raw, df_af50m_raw, df_phold_raw = tables
    return df_pdb_raw, df_af50m_raw, df_phold_raw

In [4]:
# Filter by Evalue, throw out those with E-values > 0.001
def filter_foldseek(foldseek_df, max_evalue=0.001):
    """Keep only the FoldSeek hits whose E-value does not exceed a cutoff.

    Args:
        foldseek_df: DataFrame of FoldSeek hits; must have an ``evalue`` column.
        max_evalue: Largest E-value that is still kept (inclusive). Defaults
            to 0.001, the cutoff used throughout this notebook.

    Returns:
        A DataFrame with the rows of ``foldseek_df`` where
        ``evalue <= max_evalue``; the input frame is not modified.

    Raises:
        KeyError: If ``foldseek_df`` has no ``evalue`` column.
    """
    # Rows with a missing E-value compare as False and are therefore dropped.
    keep = foldseek_df["evalue"] <= max_evalue
    data_filtered = foldseek_df[keep]
    return data_filtered

In [11]:
# Apply functions & store newly filtered FoldSeek files 
# loop over batches
# Filter the raw FoldSeek results of every protein in batches 1-4 and store
# the filtered tables next to the raw files.
for i in range(1, 5):
    batch = f"batch_{i}"
    path_batch = os.path.join(base_path, batch)

    # A protein identifier is the part of a raw result file name that precedes
    # "_relaxed_"; a set removes the duplicates coming from the several result
    # files that exist per protein.
    ncbi_proteins = {
        file.split("_relaxed_")[0]
        for file in os.listdir(path_batch)
        if file.endswith(".txt") and "_relaxed_" in file
    }

    # Sorted so the processing order is the same on every run.
    for protein in sorted(ncbi_proteins):
        # Read the three raw tables once; calling read_foldseek_protein
        # separately for each table would parse all three files every time.
        df_pdb_raw, df_af50m_raw, df_phold_raw = read_foldseek_protein(i, protein)

        # filter out dataframes
        pdb_r_filtered = filter_foldseek(df_pdb_raw)
        af50m_r_filtered = filter_foldseek(df_af50m_raw)
        phold_r_filtered = filter_foldseek(df_phold_raw)

        # store new dataframes
        pdb_r_filtered.to_csv(os.path.join(path_batch, f"{protein}_foldseek_pdb_default.csv"), index=False)
        af50m_r_filtered.to_csv(os.path.join(path_batch, f"{protein}_foldseek_af50m_default.csv"), index=False)
        phold_r_filtered.to_csv(os.path.join(path_batch, f"{protein}_foldseek_phold_default.csv"), index=False)

    print(f"Finished {batch} FoldSeek hit filtering.")

Finished batch_1 FoldSeek hit filtering.
Finished batch_2 FoldSeek hit filtering.
Finished batch_3 FoldSeek hit filtering.
Finished batch_4 FoldSeek hit filtering.


In [14]:
# loop over batches
# Filter the raw FoldSeek results of every protein in batch 5 and store the
# filtered tables next to the raw files.
for i in range(5, 6):
    batch = f"batch_{i}"
    path_batch = os.path.join(base_path, batch)

    # A protein identifier is the part of a raw result file name that precedes
    # "_relaxed_"; a set removes the duplicates coming from the several result
    # files that exist per protein.
    ncbi_proteins = {
        file.split("_relaxed_")[0]
        for file in os.listdir(path_batch)
        if file.endswith(".txt") and "_relaxed_" in file
    }

    # Sorted so the processing order is the same on every run.
    for protein in sorted(ncbi_proteins):
        # Read the three raw tables once; calling read_foldseek_protein
        # separately for each table would parse all three files every time.
        df_pdb_raw, df_af50m_raw, df_phold_raw = read_foldseek_protein(i, protein)

        # filter out dataframes
        pdb_r_filtered = filter_foldseek(df_pdb_raw)
        af50m_r_filtered = filter_foldseek(df_af50m_raw)
        phold_r_filtered = filter_foldseek(df_phold_raw)

        # store new dataframes
        pdb_r_filtered.to_csv(os.path.join(path_batch, f"{protein}_foldseek_pdb_default.csv"), index=False)
        af50m_r_filtered.to_csv(os.path.join(path_batch, f"{protein}_foldseek_af50m_default.csv"), index=False)
        phold_r_filtered.to_csv(os.path.join(path_batch, f"{protein}_foldseek_phold_default.csv"), index=False)

    print(f"Finished {batch} FoldSeek hit filtering.")

Finished batch_5 FoldSeek hit filtering.
