In [None]:
import pandas as pd
from Bio.PDB import PDBParser, parse_pdb_header
import ast
import numpy as np
import os
from Bio.SeqUtils import seq1

: 

In [32]:
def parse_pdb(
    pdb_id,
    path_to_pdb_dir="/Users/Eck00018/Documents/PhD/dea_chapter_multi_task/chapter-multi-task/data/molpatch/input/",
    path_to_pdb_processed_dir="/Users/Eck00018/Documents/PhD/dea_chapter_multi_task/chapter-multi-task/data/molpatch/processed/",
):
    """
    Parse the PDB files for one chain and return a DataFrame with amino acid information.

    Two sources are combined:
    * the header of ``<path_to_pdb_dir>/<pdb>.pdb``, whose 'missing_residues'
      entries contribute the residues listed there for the chain;
    * the first model of ``<path_to_pdb_processed_dir>/<pdb>_<chain>.pdb``,
      which contributes the residues actually present in the chain.

    Parameters
    ----------
    pdb_id : str
        Identifier of the form ``"<pdb>_<chain>"`` (exactly one underscore).
    path_to_pdb_dir : str, optional
        Directory holding the input ``<pdb>.pdb`` files.
    path_to_pdb_processed_dir : str, optional
        Directory holding the processed ``<pdb>_<chain>.pdb`` files.

    Returns
    -------
    pandas.DataFrame
        Columns 'pdbid', 'chain', 'pdb_index' and 'amino_acid' (the residue
        name passed through ``seq1``), sorted by 'pdb_index' with a fresh
        integer index. The frame is empty, but still has these columns, when
        no residue belongs to the requested chain.

    Raises
    ------
    ValueError
        If ``pdb_id`` does not split into exactly two parts on '_'.
    """
    id_parts = pdb_id.split('_')
    if len(id_parts) != 2:
        raise ValueError(
            f"pdb_id must look like '<pdb>_<chain>', got {pdb_id!r}"
        )
    pdb_id, chain_input = id_parts

    path_to_pdb_input = os.path.join(path_to_pdb_dir, f"{pdb_id}.pdb")
    path_to_pdb_processed = os.path.join(
        path_to_pdb_processed_dir, f"{pdb_id}_{chain_input}.pdb"
    )

    columns = ['pdbid', 'chain', 'pdb_index', 'amino_acid']
    # Chain ids are compared case-insensitively in both sources.
    wanted_chain = chain_input.lower()

    # Residues the header reports as missing for the requested chain.
    missing_residues = parse_pdb_header(path_to_pdb_input)['missing_residues']
    data = [
        {
            'pdbid': pdb_id,
            'chain': missing_residue['chain'],
            'pdb_index': missing_residue['ssseq'],
            'amino_acid': seq1(missing_residue['res_name']),
        }
        for missing_residue in missing_residues
        if missing_residue['chain'].lower() == wanted_chain
    ]

    # Residues present in the processed structure; only model 0 is read.
    parser = PDBParser()
    structure = parser.get_structure(pdb_id, path_to_pdb_processed)[0]

    for chain in structure:
        chain_id = chain.id
        if chain_id.lower() != wanted_chain:
            continue
        for residue in chain:
            data.append({
                'pdbid': pdb_id,
                'chain': chain_id,
                # Second element of the residue id tuple is used as the index.
                'pdb_index': residue.id[1],
                'amino_acid': seq1(residue.get_resname()),
            })

    # Explicit columns keep sort_values from raising KeyError when `data` is empty.
    return (
        pd.DataFrame(data, columns=columns)
        .sort_values(by="pdb_index")
        .reset_index(drop=True)
    )

In [33]:
def prepare_patches_df(path_to_csv):
    """
    Load the patches CSV and expose the residue location as plain columns.

    The 'residue_ID' column holds a Python literal stored as text. It is
    evaluated with ``ast.literal_eval``; element ``[0]`` becomes the 'chain'
    column and element ``[1][1]`` becomes the 'pdb_index' column. The columns
    'residue_ID', 'protein_id', 'patch_rank' and 'residue_type' are removed
    from the returned DataFrame.
    """
    patches = pd.read_csv(path_to_csv)

    # Turn the textual literal back into a Python object, once per row.
    residue_ids = patches['residue_ID'].map(ast.literal_eval)
    patches['residue_ID'] = residue_ids

    patches['chain'] = residue_ids.map(lambda residue_id: residue_id[0])
    patches['pdb_index'] = residue_ids.map(lambda residue_id: residue_id[1][1])

    no_longer_needed = ['residue_ID', 'protein_id', 'patch_rank', 'residue_type']
    return patches.drop(columns=no_longer_needed)

In [34]:
def merge_dataframes(df_amino_acids, df_patches):
    """
    Attach patch information to every residue, keeping one row per residue.

    The amino acid frame is left-joined with the patches frame on
    'pdb_index' and 'chain'. When a residue matches several patches, only the
    row with the largest 'patch_size' is kept. Residues without any patch
    keep NaN in 'patch_size'. Rows are returned in the order of the
    ('pdb_index', 'chain') group keys, with a fresh integer index.
    """
    join_keys = ['pdb_index', 'chain']
    no_patch = -1  # placeholder so idxmax always has a value to compare

    combined = df_amino_acids.merge(df_patches, how='left', on=join_keys)
    combined['patch_size'] = combined['patch_size'].fillna(no_patch)

    # Row label of the largest patch for each residue.
    largest_patch_rows = combined.groupby(join_keys)['patch_size'].idxmax()

    selected = combined.loc[largest_patch_rows].reset_index(drop=True)
    # Turn the placeholder back into NaN (a real value of -1 is converted too).
    selected['patch_size'] = selected['patch_size'].replace(no_patch, np.nan)
    return selected

In [35]:
path_to_csv = "/Users/Eck00018/Documents/PhD/dea_chapter_multi_task/chapter-multi-task/data/molpatch/result/"
path_to_output = "/Users/Eck00018/Documents/PhD/dea_chapter_multi_task/chapter-multi-task/data/patches/raw/"

# os.listdir returns every directory entry in arbitrary order. Keep only the
# patch result files (skipping hidden entries such as '.DS_Store', which would
# otherwise yield an empty identifier) and sort them for a reproducible run.
csv_files = sorted(
    name
    for name in os.listdir(path_to_csv)
    if name.lower().endswith('.csv') and not name.startswith('.')
)

# Writing fails if the target directory is absent, so create it up front.
os.makedirs(path_to_output, exist_ok=True)

for csv_file in csv_files:
    # File names are expected to look like '<pdb>_<chain>.csv'.
    pdb_id = csv_file.split('.')[0]
    output_file = pdb_id.upper().replace('_', '') + ".csv"

    df_amino_acids = parse_pdb(pdb_id)
    df_patches = prepare_patches_df(os.path.join(path_to_csv, csv_file))
    merged_df = merge_dataframes(df_amino_acids, df_patches)
    merged_df.to_csv(os.path.join(path_to_output, output_file))


