In [5]:
import pandas as pd
import os
from pathlib import Path
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import numpy as np
#import protein_design_utils

import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from isambard import ampal

In [9]:
def PMPNN_directory_to_df(pdb_paths=str,temp_dir=str, temp_label=float):
    """Collect the PMPNN FASTA records for every ``.pdb`` in a directory into a DataFrame.

    For each ``<pdb_paths>/<name>.pdb`` the file
    ``<pdb_paths>/seqs/<temp_dir>/<name>/seqs/<name>.fa`` is parsed. Header lines
    (starting with ``>``) are split on ``", "`` and every ``key=value`` part is
    collected; the sequence lines that follow a header are joined into one
    sequence. A message is printed for each ``.pdb`` that has no ``.fa`` file.

    Parameters
    ----------
    pdb_paths : str or path-like
        Directory that holds the ``.pdb`` files and the ``seqs`` sub-directory.
    temp_dir : str
        Name of the sub-directory of ``seqs`` to read from.
    temp_label : float
        Value written to the ``'pmpnn temp'`` column of every returned row.

    NOTE: the defaults in the signature are the type objects themselves (they
    read like annotations written with ``=``). They are left unchanged for
    compatibility; pass all three arguments explicitly.

    Returns
    -------
    pandas.DataFrame
        Columns ``pdb_file``, ``sequence``, ``score``, ``global_score``,
        ``seq_recovery``, ``fasta_file_path``, ``sequence_length``,
        ``RDiff_pdb_file_path`` and ``pmpnn temp``. The three score columns hold
        the header text as-is (strings). Records whose header has no
        ``seq_recovery`` entry are dropped. The frame is empty, but still has
        all columns, when nothing could be parsed.
    """
    base_dir = Path(pdb_paths)
    PMPNN_dir = base_dir / "seqs" / temp_dir

    # Fixed column list so an empty result still has the expected schema.
    columns = [
        "pdb_file",
        "sequence",
        "score",
        "global_score",
        "seq_recovery",
        "fasta_file_path",
    ]
    data = []

    for pdb_file in base_dir.glob("*.pdb"):
        pdb_file_name = pdb_file.stem  # file name without the .pdb extension
        fa_file_path = PMPNN_dir / pdb_file_name / "seqs" / f"{pdb_file_name}.fa"

        if not fa_file_path.exists():
            print(f'file was not found at {fa_file_path}')
            continue

        # Reset per file so a stray sequence line can never touch a record
        # that belongs to a previously parsed file.
        row = None
        with open(fa_file_path, "r") as fa_file:
            for raw_line in fa_file:
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue

                if line.startswith(">"):
                    # maxsplit=1 keeps values that themselves contain "=" intact.
                    metadata = dict(
                        part.split("=", 1) for part in line.split(", ") if "=" in part
                    )
                    row = {
                        "pdb_file": pdb_file_name,
                        "sequence": "",
                        "score": metadata.get("score"),
                        "global_score": metadata.get("global_score"),
                        "seq_recovery": metadata.get("seq_recovery"),
                        "fasta_file_path": fa_file_path,
                    }
                elif row is not None:
                    # A record is only kept once it has at least one sequence
                    # line; wrapped sequences are concatenated.
                    if not row["sequence"]:
                        data.append(row)
                    row["sequence"] += line
                # Sequence text before the first header is ignored.

    pmpnn_df = pd.DataFrame(data, columns=columns)

    pmpnn_df['sequence_length'] = pmpnn_df['sequence'].str.len()
    pmpnn_df['RDiff_pdb_file_path'] = pmpnn_df['pdb_file'].apply(lambda x: f"{base_dir}/{x}.pdb")
    # Headers without a seq_recovery entry are not kept.
    pmpnn_df = pmpnn_df.dropna(subset=['seq_recovery']).reset_index(drop=True)

    pmpnn_df['pmpnn temp']=temp_label
    return pmpnn_df

In [None]:
# Scratch cell: a bare path string. Evaluating it only echoes the literal; nothing is assigned.
'./02SEP24/seqs/tempp1/'

In [11]:
# Build one frame per 'seqs' sub-folder of the same run directory, each tagged
# with the value stored in its 'pmpnn temp' column.
PMPNN_RUN_DIR = './02SEP24/'

temp_1_df = PMPNN_directory_to_df(PMPNN_RUN_DIR, temp_dir='temp1', temp_label=1.0)
temp_p1_df = PMPNN_directory_to_df(PMPNN_RUN_DIR, temp_dir='tempp1', temp_label=0.1)
temp_p01_df = PMPNN_directory_to_df(PMPNN_RUN_DIR, temp_dir='tempp01', temp_label=0.01)

In [12]:
# Stack the three per-folder frames into one table with a fresh 0..N-1 index.
per_temp_frames = [temp_1_df, temp_p1_df, temp_p01_df]
master_df = pd.concat(per_temp_frames, ignore_index=True)

In [14]:
# Keep the row number as an explicit 'index' column (write_fasta_files reads
# row['index'] to name each FASTA file).
# Guarded so that re-running this cell does not insert an extra 'level_0' column.
if 'index' not in master_df.columns:
    master_df = master_df.reset_index()

In [15]:
# Display the combined frame (pandas truncates the rendering of long frames).
master_df

Unnamed: 0,index,pdb_file,sequence,score,global_score,seq_recovery,fasta_file_path,sequence_length,RDiff_pdb_file_path,pmpnn temp
0,0,normal_28,LTKYENSQEQACNIYKRNALGADNAAEISRGNEIAARENPVKASEE...,2.2076,2.1258,0.0500,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00
1,1,normal_28,EEELKRAEYAANHIWKEAQMGMEYADTYMLGCDIRAQIYPEQQAES...,2.1360,2.1018,0.0375,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00
2,2,normal_28,YPSFYIALNLSQIFDHSANLSLLYANQKAAGDASRARKPPENAVSA...,2.3515,2.1603,0.0250,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00
3,3,normal_28,ELKYSEGLADAASQIRAGGNGTAEAMDEAVGAITYAKQAVGLRVHA...,2.3720,2.1852,0.1125,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00
4,4,normal_633,LDKNDNSQESACNAAKRNALLADNEAEIQRANGVDKRTVNVMLSTE...,2.3719,2.2206,0.0250,02SEP24/seqs/temp1/normal_633/seqs/normal_633.fa,80,02SEP24/normal_633.pdb,1.00
...,...,...,...,...,...,...,...,...,...,...
2263,2263,normal_992,SEEEKKKEEEKKAAEAAKAAKEAALAEGKKDPALYAKVKGAIDAAE...,1.2751,1.7939,0.0375,02SEP24/seqs/tempp01/normal_992/seqs/normal_99...,80,02SEP24/normal_992.pdb,0.01
2264,2264,beta_377,AAATVTAANIAIATALANAPTWAPDEAIARVTAAAAKLSPKYAEAA...,1.0538,1.7782,0.0500,02SEP24/seqs/tempp01/beta_377/seqs/beta_377.fa,80,02SEP24/beta_377.pdb,0.01
2265,2265,beta_377,AEETVTKAELEAAEALAKAPEWSPEETLAKVTAAYAKLSPKRAEEA...,1.0532,1.7615,0.0625,02SEP24/seqs/tempp01/beta_377/seqs/beta_377.fa,80,02SEP24/beta_377.pdb,0.01
2266,2266,beta_377,AAATTTAAAVAAAAALAAAPTADAAATRAALAAAYATLSPARAAAA...,0.9235,1.6996,0.0500,02SEP24/seqs/tempp01/beta_377/seqs/beta_377.fa,80,02SEP24/beta_377.pdb,0.01


In [16]:
def write_fasta_files(df, folder_path, max_files_per_folder=1000):
    """Write one single-record FASTA file per row, split over ``set<N>`` sub-folders.

    Files are written to ``<folder_path>/set1``, ``set2``, ... with at most
    ``max_files_per_folder`` files in each. ``set1`` is created even when
    ``df`` has no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'sequence'`` column. If it has an ``'index'`` column, that
        value names each file (``<value>.fasta``); otherwise the DataFrame index
        label is used. The FASTA header line is always ``>`` followed by the
        DataFrame index label.
    folder_path : str
        Base output directory; created if missing.
    max_files_per_folder : int, default 1000
        Number of files after which a new ``set<N>`` folder is started.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with a ``'fasta_path'`` column
        holding the path written for each row.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder_path, exist_ok=True)

    folder_counter = 1
    file_counter = 0
    current_folder = os.path.join(folder_path, f"set{folder_counter}")
    os.makedirs(current_folder, exist_ok=True)

    # Fall back to the index label when the frame was not reset_index()'d.
    record_ids = df['index'] if 'index' in df.columns else df.index

    fasta_paths = []
    for label, record_id, sequence in zip(df.index, record_ids, df['sequence']):
        if file_counter >= max_files_per_folder:
            # Current folder is full: move on to the next one.
            folder_counter += 1
            current_folder = os.path.join(folder_path, f"set{folder_counter}")
            os.makedirs(current_folder, exist_ok=True)
            file_counter = 0

        file_path = os.path.join(current_folder, f"{record_id}.fasta")
        with open(file_path, "w") as fasta_file:
            fasta_file.write(f">{label}\n")
            fasta_file.write(f"{sequence}\n")

        fasta_paths.append(file_path)
        file_counter += 1

    # Single column assignment instead of one df.at write per row.
    df['fasta_path'] = fasta_paths
    return df

In [17]:
# Write one FASTA file per row under ../ESM/03SEP24 and record each path in 'fasta_path'.
master_df = write_fasta_files(df=master_df, folder_path='../ESM/03SEP24')


In [18]:
# Display the frame again, now including the 'fasta_path' column added by write_fasta_files.
master_df

Unnamed: 0,index,pdb_file,sequence,score,global_score,seq_recovery,fasta_file_path,sequence_length,RDiff_pdb_file_path,pmpnn temp,fasta_path
0,0,normal_28,LTKYENSQEQACNIYKRNALGADNAAEISRGNEIAARENPVKASEE...,2.2076,2.1258,0.0500,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00,../ESM/03SEP24/set1/0.fasta
1,1,normal_28,EEELKRAEYAANHIWKEAQMGMEYADTYMLGCDIRAQIYPEQQAES...,2.1360,2.1018,0.0375,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00,../ESM/03SEP24/set1/1.fasta
2,2,normal_28,YPSFYIALNLSQIFDHSANLSLLYANQKAAGDASRARKPPENAVSA...,2.3515,2.1603,0.0250,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00,../ESM/03SEP24/set1/2.fasta
3,3,normal_28,ELKYSEGLADAASQIRAGGNGTAEAMDEAVGAITYAKQAVGLRVHA...,2.3720,2.1852,0.1125,02SEP24/seqs/temp1/normal_28/seqs/normal_28.fa,80,02SEP24/normal_28.pdb,1.00,../ESM/03SEP24/set1/3.fasta
4,4,normal_633,LDKNDNSQESACNAAKRNALLADNEAEIQRANGVDKRTVNVMLSTE...,2.3719,2.2206,0.0250,02SEP24/seqs/temp1/normal_633/seqs/normal_633.fa,80,02SEP24/normal_633.pdb,1.00,../ESM/03SEP24/set1/4.fasta
...,...,...,...,...,...,...,...,...,...,...,...
2263,2263,normal_992,SEEEKKKEEEKKAAEAAKAAKEAALAEGKKDPALYAKVKGAIDAAE...,1.2751,1.7939,0.0375,02SEP24/seqs/tempp01/normal_992/seqs/normal_99...,80,02SEP24/normal_992.pdb,0.01,../ESM/03SEP24/set3/2263.fasta
2264,2264,beta_377,AAATVTAANIAIATALANAPTWAPDEAIARVTAAAAKLSPKYAEAA...,1.0538,1.7782,0.0500,02SEP24/seqs/tempp01/beta_377/seqs/beta_377.fa,80,02SEP24/beta_377.pdb,0.01,../ESM/03SEP24/set3/2264.fasta
2265,2265,beta_377,AEETVTKAELEAAEALAKAPEWSPEETLAKVTAAYAKLSPKRAEEA...,1.0532,1.7615,0.0625,02SEP24/seqs/tempp01/beta_377/seqs/beta_377.fa,80,02SEP24/beta_377.pdb,0.01,../ESM/03SEP24/set3/2265.fasta
2266,2266,beta_377,AAATTTAAAVAAAAALAAAPTADAAATRAALAAAYATLSPARAAAA...,0.9235,1.6996,0.0500,02SEP24/seqs/tempp01/beta_377/seqs/beta_377.fa,80,02SEP24/beta_377.pdb,0.01,../ESM/03SEP24/set3/2266.fasta


## Go run ESMFold for monomer refolding

In [None]:
# Maps the value of a row's 'PMPNN weights' column to the prefix used in the
# ESM result folder names.
mapping_dict = {
    'normal': 'normal',
    'soluble': 'soluble'
}
# Directory that is searched for '<prefix><pdb_file>_result/' folders.
path_to_ESM = './RF_beta/ESM_outputs/'


def _first_pdb_in(pdb_dir):
    """Return the path of the alphabetically first ``.pdb`` file in ``pdb_dir``, or None.

    None is returned when the directory is missing, is not a directory, or
    holds no ``.pdb`` file.
    """
    try:
        # os.listdir returns names in arbitrary order; sort so the choice is
        # the same on every run and on every platform.
        pdb_files = sorted(f for f in os.listdir(pdb_dir) if f.endswith('.pdb'))
    except (FileNotFoundError, NotADirectoryError):
        return None
    if not pdb_files:
        return None
    return os.path.join(pdb_dir, pdb_files[0])


# Function to map pdb_file to result file path
def map_result_path(row):
    """Return the path of a ``.pdb`` file in the ESM result folder for ``row``.

    The folder looked up is
    ``path_to_ESM/<mapping_dict[row['PMPNN weights']]><row['pdb_file']>_result/``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'PMPNN weights'`` and ``'pdb_file'``.

    Returns
    -------
    str or None
        Path of the first ``.pdb`` file (sorted by name) in that folder, or
        None when the folder or a ``.pdb`` file is missing. The result is also
        printed.
    """
    # An unmapped weights value yields None here, so the folder name starts
    # with the text "None"; that folder normally does not exist -> None.
    weight_type = mapping_dict.get(row['PMPNN weights'])
    pdb_number = row['pdb_file']
    result_file = f"{weight_type}{pdb_number}_result/"
    result_path = os.path.join(path_to_ESM, result_file)
    result = _first_pdb_in(result_path)
    print(result)
    return result

# Add an 'ESM_pdb_path' column by calling map_result_path on every row.
# NOTE(review): RF_beta_df is not defined in the visible cells, and the
# 'PMPNN weights' column read by map_result_path is not among the master_df
# columns shown above — confirm where this frame is built before running.
RF_beta_df['ESM_pdb_path'] = RF_beta_df.apply(map_result_path, axis=1)


