In [1]:
# Visualize computational screening results
import os
import zipfile
from pathlib import Path
from typing import List, Union
from zipfile import ZipFile
import shutil
import pandas as pd
from tqdm import tqdm
import seaborn as sns

In [2]:
def parse_mutations(mutation_string):
    """
    Convert a mutation string like "F239Y,H123R,420W" to a list of
    position+substitution labels, dropping the original amino acid letter
    when one is present.

    Whitespace around each comma-separated token is ignored and empty tokens
    are skipped, so "F239Y, H123R ," and "F239Y,H123R" give the same result.

    Args:
        mutation_string (str): A comma-separated string of mutations

    Returns:
        list: A list of mutation strings without the original amino acid
            letter, in the order they appear in the input (e.g.
            ["239Y", "123R", "420W"]). An empty or blank input gives [].
    """
    result = []
    for token in mutation_string.split(','):
        # Strip first: a leading space would otherwise hide the residue
        # letter from the isalpha() check below, and a trailing space would
        # produce a label that differs from the same mutation written cleanly.
        token = token.strip()
        if not token:
            continue
        # A leading letter is the original amino acid; tokens that already
        # start with the position (e.g. "420W") are kept unchanged.
        result.append(token[1:] if token[0].isalpha() else token)

    return result

In [3]:
def parse_txt(file_path):
    """
    Read a text file and return its content as a list of lines.

    Args:
        file_path: Path of the text file to read

    Returns:
        list: One string per line, with line terminators removed
    """
    with open(file_path, 'r') as handle:
        content = handle.read()
    return content.splitlines()

def parse_fasta_path(file_path):
    """
    Read the sequence stored in a FASTA file.

    The first line is skipped without being inspected (presumably the FASTA
    header); every remaining line is stripped of surrounding whitespace and
    the pieces are concatenated in file order.

    Args:
        file_path: Path of the FASTA file to read

    Returns:
        str: The concatenated sequence ('' if there are no lines after the first)
    """
    with open(file_path, 'r') as handle:
        body_lines = handle.read().splitlines()[1:]
    return ''.join(line.strip() for line in body_lines)

In [4]:
# Input locations: per-library definitions (fasta_paths.txt / mutations.txt)
# and the screening outputs (structures + one results spreadsheet per library).
LIBRARY_DIR = Path("/home/azamh/anc_iter/evolib/library")
SCREEN_DIR = Path("/home/azamh/anc_iter/evolib/screen")
STRUCTURE_DIR = SCREEN_DIR / "structures"
LIBRARY_RESULTS_DIR = SCREEN_DIR / "library_data"
# Output locations.
RESULTS_COPY_DIR = Path('/home/azamh/isde_article/directed_evolution/computational_results')
STRUCTURE_ZIP_DIR = Path('/home/azamh/isde_article/directed_evolution/structures')

# Library names: 'wt', 'r1'..'r4', then 'r0a'..'r0j', 'r1a'..'r1j',
# 'r2a'..'r2j', 'r3a'..'r3j'.
SUB_LIBRARIES = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
LIBRARIES = ['wt', "r1", "r2", "r3", "r4"] + \
    [f'r{round_index}{x}' for round_index in range(4) for x in SUB_LIBRARIES]
print(LIBRARIES)


def _with_mutation_indicators(library_df, mutation_columns):
    """Return library_df with one 0/1 column appended per entry of mutation_columns.

    The indicator matrix is built row by row from the 'Mutations' column, so
    each row is encoded from its own mutation list only. Label-based writes
    (df.loc[label, col] = 1) would instead mark every row sharing an index label.
    """
    indicator_rows = []
    for row_mutations in library_df['Mutations']:
        present = set(row_mutations)
        indicator_rows.append([int(column in present) for column in mutation_columns])
    indicators = pd.DataFrame(
        indicator_rows,
        index=library_df.index,
        columns=mutation_columns,
        dtype="int64",
    )
    # Both frames carry the same index in the same order, so this is a plain
    # side-by-side join of the original columns and the indicator columns.
    return pd.concat([library_df, indicators], axis=1)


# Load every library: screening results from the spreadsheet, plus the
# mutation list and sequence of each variant from the library directory.
dfs = []
all_mutations = set()
for library in tqdm(LIBRARIES):
    library_dir = LIBRARY_DIR / library
    library_results_xlsx = LIBRARY_RESULTS_DIR / f"{library}_screen.xlsx"
    assert library_results_xlsx.exists(), f"Missing screening results: {library_results_xlsx}"

    library_df = pd.read_excel(library_results_xlsx, header = 0, index_col = 0)
    # The text files are attached by position: line i describes spreadsheet row i.
    # pandas raises if the number of lines differs from the number of rows.
    library_df['Fasta_Paths'] = parse_txt(library_dir / 'fasta_paths.txt')
    library_df['Mutations'] = parse_txt(library_dir / 'mutations.txt')
    library_df['Mutations'] = library_df["Mutations"].apply(parse_mutations)
    library_df["Sequence"] = library_df["Fasta_Paths"].apply(parse_fasta_path)
    library_df['Library'] = library
    dfs.append(library_df)

    for mutations in library_df['Mutations']:
        all_mutations.update(mutations)

# Fixed (string-sorted) column order shared by every library.
all_mutations = sorted(all_mutations)

# One-hot encode the mutations of every library against the shared column set.
encoded_dfs = []
for df in tqdm(dfs):
    encoded_dfs.append(_with_mutation_indicators(df, all_mutations))
dfs = encoded_dfs

MASTER_DF = pd.concat(dfs)
MASTER_DF = MASTER_DF.drop("Fasta_Paths", axis = 1)
MASTER_DF

['wt', 'r1', 'r2', 'r3', 'r4', 'r0a', 'r0b', 'r0c', 'r0d', 'r0e', 'r0f', 'r0g', 'r0h', 'r0i', 'r0j', 'r1a', 'r1b', 'r1c', 'r1d', 'r1e', 'r1f', 'r1g', 'r1h', 'r1i', 'r1j', 'r2a', 'r2b', 'r2c', 'r2d', 'r2e', 'r2f', 'r2g', 'r2h', 'r2i', 'r2j', 'r3a', 'r3b', 'r3c', 'r3d', 'r3e', 'r3f', 'r3g', 'r3h', 'r3i', 'r3j']


100%|███████████████████████████████████████████| 45/45 [00:16<00:00,  2.68it/s]
100%|███████████████████████████████████████████| 45/45 [00:03<00:00, 12.03it/s]


Unnamed: 0,Avg_Fad_Dist,Avg_Fad_Angle,Avg_Docking_Energy_Efficiency,Avg_Pafnucy_pKa_Efficiency,Average_Stereo,Predicted_Conversion,Mutations,Sequence,Library,119F,...,327S,397F,420W,54I,55A,80A,96L,97R,98W,98Y
278,6.277537,64.2975,-3.071432,0.341764,1,1,[],LLLLLLLLLLAILGGGPTGLLLGLGLLERGLEYLLYERALPYYGLG...,wt,0,...,0,0,0,0,0,0,0,0,0,0
278a,6.693740,50.7625,-2.931220,0.334547,-1,0,[],LLLLLLLLLLAVIGAGPTGLLLALGLLERGLEYLLLEKALPYYGLG...,wt,0,...,0,0,0,0,0,0,0,0,0,0
279,5.164363,43.6200,-3.148335,0.360252,0,0,[],EDDSRAPLQVAIIGGGMTGLALALGLLNRDVDFTVYERAATFGELG...,wt,0,...,0,0,0,0,0,0,0,0,0,0
279a,5.382963,43.9150,-3.156252,0.340591,0,0,[],EDDSRAPLQVAIIGGGMTGLALALGLLNRDVDFTVYERAATFGELG...,wt,0,...,0,0,0,0,0,0,0,0,0,0
280,5.455963,46.3025,-3.308199,0.351833,1,0,[],NGNSRSPLEVAIVGGGITGLALAVGLLKRNVNFTIYERAASFGELG...,wt,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xp_659718_r3j,4.328954,12.0450,-4.739025,0.346347,-1,1,"[80A, 122C, 119F]",MTPSTKPKTFHVAIVGGGIAGLSLAIALHHRDVSVKIYEQAHAFAE...,r3j,1,...,0,0,0,0,0,1,0,0,0,0
xp_660831_r3j,5.031949,38.4625,-4.317498,0.338380,-1,1,"[55A, 119F]",MGSLWSSPSLLPSQQDNETEPFSHLPKEIGTDPTLREDSNVSNRNS...,r3j,1,...,0,0,0,0,1,0,0,0,0,0
xp_660986_r2i,4.564863,24.6750,-4.489273,0.352515,1,1,"[239Y, 123R, 420W]",MSASTPTVNGTNEPISIAIIGAGIIGTVLALGLTRRKDAFPLPVNV...,r3j,0,...,0,0,1,0,0,0,0,0,0,0
xp_681171_r1e,5.088867,35.8800,-5.153571,0.357517,0,1,"[225T, 80A]",MPGTVRPGEPVQVAIIGGGIVGVVLAVGLIRQNVKVRLFEQSQGFR...,r3j,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
# Create zip of structures

def create_zipfile_files_only(paths: List[Path], output_path: Union[str, Path], 
                             compression: int = zipfile.ZIP_LZMA) -> Path:
    """
    Create a ZIP file from a list of file paths, assuming all paths are files.

    Every file is stored under its bare filename (no directory structure), so
    callers should pass files with distinct names.

    The archive is first written to a temporary sibling file
    ("<output name>.part") and only renamed to ``output_path`` once it has been
    written completely. A failed or interrupted run therefore never leaves a
    truncated archive at ``output_path``, which matters for callers that skip
    work when the archive already exists.

    Args:
        paths: List of file paths (Path objects or strings) to include in the ZIP file
        output_path: Path where the ZIP file will be created
        compression: Compression method (default: zipfile.ZIP_LZMA)

    Returns:
        Path object pointing to the created ZIP file

    Raises:
        Any exception raised while reading the inputs or writing the archive is
        propagated after the temporary file has been removed.
    """
    # Convert output_path to Path if it's a string
    output_path = Path(output_path)

    # Ensure the output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Temporary file in the same directory, so the final rename stays on one filesystem
    partial_path = output_path.with_name(output_path.name + '.part')
    try:
        with zipfile.ZipFile(partial_path, 'w', compression=compression) as zipf:
            for path in paths:
                path = Path(path)
                # Add file to zip with just its filename (no directory structure)
                zipf.write(path, path.name)
        # Publish the finished archive under its final name
        partial_path.replace(output_path)
    finally:
        # Only still present if something failed before the rename
        if partial_path.exists():
            partial_path.unlink()

    return output_path

# Build one archive of .pdb structures per library; existing archives are kept as they are.
for lib in tqdm(LIBRARIES):
    zip_file = STRUCTURE_ZIP_DIR / f'{lib}.zip'
    structure_lib_dir = STRUCTURE_DIR / lib
    assert structure_lib_dir.exists()

    # Nothing to do when this library was already archived.
    if zip_file.exists():
        continue

    # Only direct children of the library directory with a .pdb suffix are archived.
    pdb_files = [entry for entry in structure_lib_dir.iterdir() if entry.suffix == '.pdb']
    create_zipfile_files_only(pdb_files, zip_file)

100%|█████████████████████████████████████████| 45/45 [00:00<00:00, 3576.38it/s]
