In [27]:
import py3Dmol

def view_custom_pdb(pdb_file, residue_num, chain='A'):
    """
    Visualize a PDB structure from a file, highlight a specific residue with thicker sticks,
    show the backbone for the selected residue, and add an arrow-like indicator.
    
    Parameters:
    - pdb_file: str, path to the local PDB file.
    - residue_num: int, residue number to highlight.
    - chain: str, the chain identifier (default 'A').
    """
    # Read the PDB file content
    with open(pdb_file, 'r') as f:
        pdb_content = f.read()

    # Initialize py3Dmol viewer
    viewer = py3Dmol.view(width=800, height=600)
    
    # Add the PDB structure to the viewer
    viewer.addModel(pdb_content, 'pdb')
    
    # Apply cartoon style for the overall structure
    viewer.setStyle({'cartoon': {'color': 'cyan'}})
    
    # Highlight the specified residue by number and chain with larger sticks and backbone
    viewer.setStyle({'chain': chain, 'resi': str(residue_num)}, 
                    {'stick': {'colorscheme': 'orangeCarbon', 'radius': 0.6},  # Larger sticks
                     'cartoon': {'color': 'orange', 'style': 'trace'}})  # Backbone trace
    
    # Add molecular surface for the rest of the structure
    viewer.addSurface(py3Dmol.VDW, {'opacity': 0.5})
    
    # Add an arrow-like indicator near the selected residue
    viewer.addLabel(f'Residue {residue_num}', 
                    {'position': {'resi': residue_num, 'chain': chain}, 
                     'backgroundColor': 'white', 'fontColor': 'black', 'fontSize': 16,
                     'alignment': 'bottomCenter'})
    
    # Set zoom and background color
    viewer.zoomTo()
    viewer.setBackgroundColor('white')
    
    # Show the structure
    return viewer.show()



In [28]:
#alignments should be in the encoded alphabet but come from foldmason alignment logic. assumed to be true
#trees are from FT1. assumed to be true
res = {}
import glob
import os
import pandas as pd
import tqdm

dfs = []

import sys
sys.path.append('/home/dmoi/projects/foldtree2')

import torch
import torch.nn.functional as F
from Bio import Phylo, AlignIO, SeqIO
from scipy.special import gamma as gamma_function
import numpy as np

# Helper function to read a tree (returns a Phylo object)
def read_tree(tree_file):
    return Phylo.read(tree_file, 'newick')

# Helper function to read a multiple sequence alignment (MSA)
def read_msa(msa_file, format='fasta'):
    return AlignIO.read(msa_file, format)

def msa2array(msa):
    #use biopython to read the alignment
    msa = AlignIO.read(msa, 'fasta')
    index = {seq.id: i for i, seq in enumerate(msa)}
    return index, np.array([list(rec) for rec in msa], np.character)

class msaarray:
    def __init__(self, msa):
        self.index, self.array = msa2array(msa)
        self.n, self.L = self.array.shape
        self.alphabet = np.unique(self.array)
        self.alphabet_size = len(self.alphabet)

    def __getitem__(self, i):
        if type(i) is slice:
            return self.array[i]
        else:
            return self.array[self.index[i]]

    def __len__(self):
        return self.n

    def __iter__(self):
        for i in self.index:
            yield i


def read_seq(seq_file, format='fasta'):
    print(seq_file)
    try:
        return SeqIO.read(seq_file, format)
    except:
        print('Error reading sequence file')
        print('Trying to read as a list of sequences')
        return [ s for s in SeqIO.parse(seq_file, format) ]


def fasta2df(fasta):
    seqs = {}
    s = read_seq(fasta)
    for i, seq in enumerate(s):
        seqs[i] = {'id': seq.id, 'seq': str(seq.seq)}    
    return pd.DataFrame.from_dict(seqs, orient='index')
def df2fasta(df, fasta , key = 'seq'):
    with open(fasta, 'w') as f:
        for i, row in df.iterrows():
            f.write('>' + row['id'] + '\n')
            f.write(row[key] + '\n')
def copy_aln(row):
    encoded = list(row['encoded'])
    foldmason = list(row['foldmason'])
    for i, c in enumerate(foldmason):
        if c == '-':
            encoded.insert(i, '-')
    return ''.join(encoded)
datadir = '/home/dmoi/datasets/'
families = glob.glob( datadir + 'afdbclusters/structfams/*/')
for f in tqdm.tqdm( families):
    if os.path.exists(f+'fident_distmat.txt_tree.txt') and os.path.exists(f+'foldmason.fasta_aa.fa') and os.path.exists(f+'encoded.fasta'):
        #read encoded, foldtree tree, and foldmason msa
        encoded = fasta2df(f+'encoded.fasta')
        msa = fasta2df(f+'foldmason.fasta_aa.fa')
        #merge encoded and msa
        merged = pd.merge(encoded, msa, on='id')
        merged.columns = ['id', 'encoded', 'foldmason']
        merged['family'] = f.split('/')[-2]
        #transfer gaps to encoded
        merged['encoded_aln_foldmason'] = merged.apply( copy_aln , axis=1)
        merged['aln_len'] = merged['encoded_aln_foldmason'].apply(len)
        print(merged.aln_len.value_counts())
        print(merged)
        #output encoded aln to fasta
        merged['pdbfile'] = merged['id'].apply(lambda x: f +'/structs/'+x + '.pdb')
        df2fasta(merged[['id', 'encoded_aln_foldmason']], f+'encoded_aln_foldmason.fasta', key = 'encoded_aln_foldmason')
        dfs.append( merged )

df = pd.concat(dfs)

  0%|                                                                                         | 0/101 [00:00<?, ?it/s]

 11%|████████▌                                                                      | 11/101 [00:00<00:00, 105.29it/s]

/home/dmoi/datasets/afdbclusters/structfams/A0A011N458/encoded.fasta
Error reading sequence file
Trying to read as a list of sequences
/home/dmoi/datasets/afdbclusters/structfams/A0A011N458/foldmason.fasta_aa.fa
Error reading sequence file
Trying to read as a list of sequences
aln_len
170    10
Name: count, dtype: int64
           id                                            encoded  \
0      G6F3X8  TT2+_S)}|2E]|_77]E4M+]<|8|4']S/VVSZKR2/IZ8I|||...   
1  A0A0H3KU55  DDG]9]9=/44XX999828UD9=8=KZ]9/:=9G=X2&:Z8IK:Z=...   
2      A9HPZ2  DDYE9]:R|:}PRF'/]RV&8|&:DJ|]:8J|6JRC$$}/FX2]PZ...   
3  A0A4P0Y7M4  TT9_ZE4]{RR_}]}RZ4{8]='SS/::Y}:RPC:Z8'==V|9:92...   
4  A0A535F069  TT2P+2|EE+_A38_77_}_R+8_{8M]'RE/]:7'KR'):ZK'|:...   
5  A0A378W409  DD:&~8P_B]8_RJ{WR@}:|K8DV'VR/V$&WVR2):ZR:|:)|S...   
6  A0A645JEF6  DD2=9C842]{E98{RS_}]8]]||8|:'8}/::}/KR+|<]JL|_...   
7  A0A7X6IPW2  ]488R4{{R_}P}_234{_R}{W{[,844CR_6JR/:|}Z\J2)1Z...   
8      E3T6K5  TT2{E9{{9S_9_}+8==8I4I&8/:V}9VR2/:Z}P||/]W:W]6...  

 35%|███████████████████████████▍                                                   | 35/101 [00:00<00:00, 113.48it/s]

Error reading sequence file
Trying to read as a list of sequences
/home/dmoi/datasets/afdbclusters/structfams/A0A009LHJ9/foldmason.fasta_aa.fa
Error reading sequence file
Trying to read as a list of sequences
aln_len
302    10
Name: count, dtype: int64
           id                                            encoded  \
0  A0A2L1UX87  TT++4EO548{845[]W74944,%4P08]0/#R*AZ8<2%]ROR0%...   
1  A0A077LAA9  TTSGSZGSW5854R94[4{84]{)G+IRLP+8O/8L4OZ[Z8<ZLZ...   
2  A0A7W4L670  TT4+R+8GE/E999&89&2,48,{NP#80+O]%ZAZ8N2%M[]R0%...   
3  A0A009LHJ9  TT++G3R438354R85]Z[+Z{Z%]2,O]8Z0#25C81#Q^,ZX[0...   
4  A0A257KAW4  TT99E824G+]CZ=]=S4G9{G]]/99G9I2ZETG999:&&K&&&8...   
5  A0A0H3HJY2  TT/P7]3)3O3}}8A}}]2EE43{4N]8*%O0%ZAZ8<4%MROR0%...   
6  A0A8B2BWA3  TTTGS[[85W9R3]8[]8*O_SS]9]P+]/_]+))Z2884L4P47H...   
7  A0A1H1WPZ6  TTSZ/9ER_4R4{9E]9{X,%%22P%8]Z/PR5Z8NNP#R%5R408...   
8  A0A090AZI0  TT+GS8S492TSES9+A4_8%]X{Z4{2M8LO8L0Z+5O8_L%+RO...   
9  A0A1V4X3G2  TTG{+E[#/8{_RRER9S9548X,%N]R*NO#53]0RP#[%80P%R...   

 59%|██████████████████████████████████████████████▉                                | 60/101 [00:00<00:00, 116.61it/s]

/home/dmoi/datasets/afdbclusters/structfams/A0A011MIK5/encoded.fasta
Error reading sequence file
Trying to read as a list of sequences
/home/dmoi/datasets/afdbclusters/structfams/A0A011MIK5/foldmason.fasta_aa.fa
Error reading sequence file
Trying to read as a list of sequences
aln_len
255    10
Name: count, dtype: int64
           id                                            encoded  \
0  A0A7U9FC42  TT4{{[]#[_{8]4/9/=/Y89+=/29{]RC2/58/A+}IRIT//9...   
1  A0A1B0C1P2  TT2RGR/_]_{29]9/{R/22_SR{#O]88/O/49T]]_{/]49I2...   
2  A0A011MIK5  TTS4{24#A/8/4!8=]KD]=G]&{SR4_S=4{]]R2_/]2/}])}...   
3  A0A1Y2GRZ0  TT++ZGP]{_44I2{G_P_GG//}'8PP'4I+{IR]{_/G{2G/{/...   
4  A0A482W1D7  TTREG+CPZ4PP44_TT4G]2]4GE9_G{/{_+4Z8]K]+44R__}...   
5  A0A212SGZ0  TA{8[5G_PC]4///{R/I2429{_//N8W42_9Y2989]&==I=_...   
6  A0A445CDB1  T!I298U]89{9/9]C/924G_2]]G2GEP}}}}}}E4E+4S/P}}...   
7      Q6ZH04  TT_Z&]2//8/Z//E229]YZ8]XC2&I98K8Y:XK__9{_//R22...   
8      R8BPF0  TTR9]]4844_{{SS82R{I2]S]44+4{42]_{4C_C__94]4{G...  

 84%|██████████████████████████████████████████████████████████████████▍            | 85/101 [00:00<00:00, 113.65it/s]

/home/dmoi/datasets/afdbclusters/structfams/A0A010YLN3/encoded.fasta
Error reading sequence file
Trying to read as a list of sequences
/home/dmoi/datasets/afdbclusters/structfams/A0A010YLN3/foldmason.fasta_aa.fa
Error reading sequence file
Trying to read as a list of sequences
aln_len
250    10
Name: count, dtype: int64
           id                                            encoded  \
0  A0A2N0JIB4  !;6X>5(R8Q1FFWW2/P]6/W:6/QP]W1XPWXX/2277/<PMM/...   
1  A0A0D7QJG5  ;;^XBQ<2F(Q6JP;/>CQRQ(8Q22X//1PMQ0RZ18P^^R^XP1...   
2  A0A843FYU0  AA2PX,X,5N205/Z0#QL/^52/P,PNP8N/X0X#8A22[O/NP[...   
3  A0A7V8BDC4  M}|/%/LJ9P{P3/227)/<PM2){MML<8/L7177P<LHMZ<M4R...   
4      I4HFY3  TTT{W4O%2R2RH8R<20M0ZLLR%/,,02[5QP[/227//+PMP8...   
5  A0A380Q314  T!CF6>"8W8^12]\8+M2}PZWJ2Z8(6XM22X//1P^ZCWJ$W\...   
6  A0A1Q7Z3U8  ;;,Q<20H8N4+#554#%4Z0Q#^/22XZ/CPX6>WXQ6C221XX^...   
7  A0A842LBY7  TT__<+0MOQH^,MN20M0CQLQ/>[MPQP8/22RO/NP572784L...   
8  A0A7K4A191  ;;/C1^X^,51205PZQQQ1/5WQQP8/22%O/NP^]PQQ%#NQ/^...  

100%|███████████████████████████████████████████████████████████████████████████████| 101/101 [00:01<00:00, 98.78it/s]

aln_len
413    10
Name: count, dtype: int64
           id                                            encoded  \
0      W4JRI3  DDX||}KK/J]J}]|VCPPR\C~$Y8|]]V}&V'CY882CD/ZP&K...   
1      A4WYG6  TT/&{I{|2//}2|]R8}/I}/'/:8I::&KS9GP89=IZC2=R2{...   
2  A0A6J3LAH8  T;C0<]ZQ]}8VR/Z/882XPP/4ZX8/{[O_8X/88/%Z222RI8...   
3  A0A523W4J3  TT_]/S}4'S)G'_8]_4}]]}8/E}G'S]GP22RO]]8/8L]P2<...   
4  A0A377LXT5  T;28X]48QC/WPPMMMTR+PRM1J\R0/V(M+/8}G2{{{GZG_Q...   
5  A0A433TVS0  TD=GPSERP{R/]}_42{4O4}[8P}#}}3]Z{Z7+}]E/{4P/49...   
6  A0A793EX33  P]R7{/'7/PZ<8/__T]S9/{ES/494R2ZIZI2{X/B#(#21/Z...   
7  A0A366F5W4  TT|//_R4[/'A{OZ}_84/+]7//8M/L{P{ZZ8G2/I{/]])}P...   
8      B7N320  TTE=24S/PZ_8/__T]S9/{ES/494R4ZIZI2{X/,#(#2N/Z;...   
9  A0A140NQJ6  TT_75/H#(L2<OZMZRPH\7J//<)Z\M)QHR]]M<80/Q08RCH...   

                                           foldmason      family  \
0  ----------------------------------------------...  A0A009YGQ6   
1  ----------------------------------------------...  A0A009YGQ6   
2  




In [29]:
print( df.pdbfile.values)

['/home/dmoi/datasets/afdbclusters/structfams/A0A011N458//structs/G6F3X8.pdb'
 '/home/dmoi/datasets/afdbclusters/structfams/A0A011N458//structs/A0A0H3KU55.pdb'
 '/home/dmoi/datasets/afdbclusters/structfams/A0A011N458//structs/A9HPZ2.pdb'
 ...
 '/home/dmoi/datasets/afdbclusters/structfams/A0A010R7U6//structs/A0A2G5F175.pdb'
 '/home/dmoi/datasets/afdbclusters/structfams/A0A010R7U6//structs/A0A453NZA8.pdb'
 '/home/dmoi/datasets/afdbclusters/structfams/A0A010R7U6//structs/A0A1U8J7P2.pdb']


In [30]:
#select a few chars from embedding and visualize them

inputchars = """0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ! " # $ % & ' ( ) * + , / : ; < = > @ [ \ ] ^ _ { | } ~""".split()

selected = inputchars[5]
print(selected)

#find selected residues in sequence
df['selected'] = df['encoded_aln_foldmason'].apply(lambda x: x.find(selected))
maxcount = 5
found = df[df['selected'] != -1].sample(maxcount)
count = 0
for i, row in found.iterrows():
    count += 1
    view_custom_pdb(row['pdbfile'], row['selected'], chain='A')

5


  inputchars = """0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ! " # $ % & ' ( ) * + , / : ; < = > @ [ \ ] ^ _ { | } ~""".split()
