In [3]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

import biofile

# Build the dataset handle for the 'TSP_Aam' run and call .unpickle() on it.
# NOTE(review): .unpickle() presumably restores previously saved dataset state
# from disk -- confirm against biofile, and only use it on trusted files.
bfd = biofile.Dataset(
    identifier='TSP_Aam',
    local='../output/',
    remote='s3://arcadia-protein-evolution/cartography/tsp/',
).unpickle()

# Working directories for this analysis, nested under the dataset's local folder.
output_folder = bfd.Local / '20230331_test'
alphafold_folder = output_folder / 'alphafold'

# Last expression of the cell: show the dataset attributes as the cell output.
bfd.attrs

local files will be saved in ../output/
remote files will be saved in s3://arcadia-protein-evolution/cartography/tsp/


{'identifier': 'TSP_Aam',
 'local': '../output/',
 'remote': 's3://arcadia-protein-evolution/cartography/tsp/',
 'files': {},
 'blast_refseq_list': <biofile.biofile.Biofile at 0x17898fd00>,
 'blast_uniprot_list': <biofile.biofile.Biofile at 0x17898fa30>,
 'TSP_hit0': <biofile.biofile.Biofile at 0x17898ffd0>,
 'TSP_hit1': <biofile.biofile.Biofile at 0x17898ef80>,
 'TSP_hit2': <biofile.biofile.Biofile at 0x17898ee90>,
 'TSP_blasthits0': <biofile.biofile.Biofile at 0x17898f1c0>,
 'TSP_blasthits1': <biofile.biofile.Biofile at 0x17898f0d0>,
 'TSP_blasthits2': <biofile.biofile.Biofile at 0x1789e9e70>,
 'foldseek_uniprot_list': <biofile.biofile.Biofile at 0x1789e9f30>}

In [4]:
# Foldseek database built from the structures in alphafold_folder.
db_folder = output_folder / 'all_foldomesDB'

# Final product of this cell; its presence is used as the "already done" marker.
foldseek_clustertsv = output_folder / 'clu_greedy.tsv'

if not os.path.exists(foldseek_clustertsv):

    # Intermediate outputs of the individual foldseek steps.
    foldseek_out = output_folder / 'all_by_all'
    foldseek_tmp = output_folder / 'tmp'
    foldseek_tmscore = output_folder / 'all_by_all_tmscore'
    foldseek_tsv = str(foldseek_tmscore) + '.tsv'
    foldseek_cluster = output_folder / 'clu'

    # The steps are order-dependent: each one consumes the output of an earlier one.
    foldseek_steps = [
        # 1. create the structure database
        ['createdb', alphafold_folder, db_folder],
        # 2. all-vs-all search of the database against itself ('-a' flag kept as in the original run)
        ['search', db_folder, db_folder, foldseek_out, foldseek_tmp, '-a'],
        # 3. derive TM-scores from the search alignments
        ['aln2tmscore', db_folder, db_folder, foldseek_out, foldseek_tmscore],
        # 4. export the TM-score results as TSV
        ['createtsv', db_folder, db_folder, foldseek_tmscore, foldseek_tsv],
        # 5. cluster the search results
        ['clust', db_folder, foldseek_out, foldseek_cluster, '--cluster-mode', '0', '--similarity-type', '2'],
        # 6. export the clustering as the final TSV
        ['createtsv', db_folder, db_folder, foldseek_cluster, foldseek_clustertsv],
    ]

    for foldseek_args in foldseek_steps:
        # check=True raises subprocess.CalledProcessError on a non-zero exit status,
        # so a failed step stops the pipeline instead of letting the following
        # steps run on missing or partial inputs.
        subprocess.run(['foldseek', *foldseek_args], check = True)

else:
    print('final cluster tsv file already found at', foldseek_clustertsv)

final cluster tsv file already found at ../output/20230331_test/clu_greedy.tsv


In [5]:
# Load the two-column cluster table: cluster representative, cluster member.
df = pd.read_csv(foldseek_clustertsv, sep='\t', names=['ClusterRep', 'uniprot_id'])

# Keep only the second '-'-separated field of each entry in both columns.
# (presumably entries look like 'AF-<accession>-F1-model_v4', leaving the
# UniProt accession -- verify against the TSV)
for column in ('ClusterRep', 'uniprot_id'):
    df[column] = df[column].str.split('-', expand=True)[1]

# Collapse to one row per cluster representative, with the member ids gathered
# into a list; the representative itself is not kept as a column.
df_merged = (
    df.groupby('ClusterRep')['uniprot_id']
    .agg(list)
    .reset_index(drop=True)
    .to_frame()
)

# Label each cluster by its row position: SC0, SC1, ...
df_merged.insert(0, 'StruCluster', 'SC' + df_merged.index.astype('str'))

In [6]:
# Back to one row per member: explode the member lists, derive the structure
# filename from each id, and fix the column order.
df_exploded = (
    df_merged
    .explode('uniprot_id')
    .assign(filename=lambda frame: 'AF-' + frame['uniprot_id'] + '-F1-model_v4.pdb')
    [['filename', 'uniprot_id', 'StruCluster']]
)

# Save the per-structure cluster assignment as TSV in the dataset's local folder.
# NOTE(review): index=None is kept from the original; presumably meant as
# index=False (no index column written) -- confirm.
struclusters_tsv = bfd.Local / 'foldseek_struclusters.tsv'
df_exploded.to_csv(struclusters_tsv, sep='\t', index=None)

display(df_exploded)

Unnamed: 0,filename,uniprot_id,StruCluster
0,AF-A0A1L8FE91-F1-model_v4.pdb,A0A1L8FE91,SC0
0,AF-A0A044RHJ8-F1-model_v4.pdb,A0A044RHJ8,SC0
0,AF-A0A060XTX6-F1-model_v4.pdb,A0A060XTX6,SC0
0,AF-A0A060YZZ2-F1-model_v4.pdb,A0A060YZZ2,SC0
0,AF-A0A087T3W5-F1-model_v4.pdb,A0A087T3W5,SC0
...,...,...,...
9,AF-W5N2B7-F1-model_v4.pdb,W5N2B7,SC9
9,AF-W5N8M6-F1-model_v4.pdb,W5N8M6,SC9
9,AF-X1WGX6-F1-model_v4.pdb,X1WGX6,SC9
10,AF-Q9V776-F1-model_v4.pdb,Q9V776,SC10


In [19]:
# Read the two header-less accession lists registered on the dataset.
blast_list = pd.read_csv(bfd.blast_uniprot_list.path, header=None)
foldseek_list = pd.read_csv(bfd.foldseek_uniprot_list.path, header=None)

# Stack both lists and de-duplicate the values of the first column (label 0).
combined_hits = pd.concat([blast_list, foldseek_list])
joint_list = combined_hits[0].unique()

accessions_record_file = bfd.Local / 'TSP_Aam-ALL_hits.uniprot_list'

# Write one accession per line, but only if the record file is not there yet;
# an existing file is left untouched.
if not os.path.exists(accessions_record_file):
    with open(accessions_record_file, 'w+') as handle:
        for accession in joint_list:
            handle.write(accession + '\n')