In [2]:
!pip install git+https://github.com/Arcadia-Science/biofile.git@das/dev --upgrade

Collecting git+https://github.com/Arcadia-Science/biofile.git@das/dev
  Cloning https://github.com/Arcadia-Science/biofile.git (to revision das/dev) to /private/var/folders/5b/71_5djmd0p5_yhs0tpbbq68r0000gn/T/pip-req-build-_q_xefjb
  Running command git clone --filter=blob:none --quiet https://github.com/Arcadia-Science/biofile.git /private/var/folders/5b/71_5djmd0p5_yhs0tpbbq68r0000gn/T/pip-req-build-_q_xefjb
  Running command git checkout -b das/dev --track origin/das/dev
  Switched to a new branch 'das/dev'
  branch 'das/dev' set up to track 'origin/das/dev'.
  Resolved https://github.com/Arcadia-Science/biofile.git to commit 81d00b1dd02c3d259d234ce0d79df5aff6d82ac2
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting boto3
  Downloading boto3-1.26.115-py3-none-any.whl (135 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s3path
  Using cached s3path-0.4.1-py3-none-an

In [3]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

import biofile

# Build the 'TSP_Aam' dataset handle (local mirror plus S3 remote) and then
# call unpickle() on it.
# NOTE(review): presumably unpickle() restores previously saved dataset state;
# confirm against the biofile package.
bfd = biofile.Dataset(
    identifier = 'TSP_Aam',
    local = '../output/',
    remote = 's3://arcadia-protein-evolution/cartography/tsp/',
)
bfd = bfd.unpickle()

# Working directory for this run, and the folder of structures inside it.
output_folder = bfd.Local / '20230418_test'
alphafold_folder = output_folder / 'alphafold'

# Last expression of the cell: show the dataset's attributes.
bfd.attrs

local files will be saved in ../output/
remote files will be saved in s3://arcadia-protein-evolution/cartography/tsp/


{'identifier': 'TSP_Aam',
 'local': '../output/',
 'remote': 's3://arcadia-protein-evolution/cartography/tsp/',
 'files': {},
 'blast_refseq_list': <biofile.biofile.Biofile at 0x17cb8ef50>,
 'blast_uniprot_list': <biofile.biofile.Biofile at 0x17cb8efb0>,
 'TSP_hit0': <biofile.biofile.Biofile at 0x17cb8ef20>,
 'TSP_hit1': <biofile.biofile.Biofile at 0x17cb8d8a0>,
 'TSP_hit2': <biofile.biofile.Biofile at 0x17cb8d930>,
 'TSP_blasthits0': <biofile.biofile.Biofile at 0x17cb8d990>,
 'TSP_blasthits1': <biofile.biofile.Biofile at 0x17cbd08e0>,
 'TSP_blasthits2': <biofile.biofile.Biofile at 0x17cbd09a0>,
 'foldseek_uniprot_list': <biofile.biofile.Biofile at 0x17cbd0850>}

In [4]:
db_folder = output_folder / 'all_foldomesDB'

foldseek_clustertsv = output_folder / 'clu_greedy.tsv'


def run_foldseek(*args):
    """Run a single ``foldseek`` sub-command, raising if it fails.

    Args:
        *args: the sub-command name followed by its arguments; they are passed
            to ``subprocess.run`` unchanged after the ``'foldseek'`` executable.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished command.

    Raises:
        subprocess.CalledProcessError: if foldseek exits with a non-zero
            status, so that later steps never run on missing or partial
            outputs of an earlier step.
    """
    return subprocess.run(['foldseek', *args], check = True)


# The final cluster TSV doubles as the "already done" marker: the whole
# pipeline is skipped when it exists.
if not os.path.exists(foldseek_clustertsv):

    # 1. Build a foldseek database from the structure folder.
    run_foldseek('createdb', alphafold_folder, db_folder)

    # 2. All-vs-all search of the database against itself.
    foldseek_out = output_folder / 'all_by_all'
    foldseek_tmp = output_folder / 'tmp'
    run_foldseek('search', db_folder, db_folder, foldseek_out, foldseek_tmp, '-a')

    # 3. Convert the alignments to TM-scores and export them as a TSV.
    foldseek_tmscore = output_folder / 'all_by_all_tmscore'
    run_foldseek('aln2tmscore', db_folder, db_folder, foldseek_out, foldseek_tmscore)

    foldseek_tsv = str(foldseek_tmscore) + '.tsv'
    run_foldseek('createtsv', db_folder, db_folder, foldseek_tmscore, foldseek_tsv)

    # 4. Cluster on the search results and export the clusters as a TSV.
    foldseek_cluster = output_folder / 'clu'
    run_foldseek('clust', db_folder, foldseek_out, foldseek_cluster, '--cluster-mode', '0', '--similarity-type', '2')

    run_foldseek('createtsv', db_folder, db_folder, foldseek_cluster, foldseek_clustertsv)

else:
    print('final cluster tsv file already found at', foldseek_clustertsv)

createdb ../output/20230418_test/alphafold ../output/20230418_test/all_foldomesDB 

MMseqs Version:        	5.53465f0
Chain name mode        	0
Mask b-factor threshold	0
Coord store mode       	2
Write lookup file      	1
Tar Inclusion Regex    	.*
Tar Exclusion Regex    	^$
Threads                	10
Verbosity              	3

Output file: ../output/20230418_test/all_foldomesDB
Time for merging to all_foldomesDB_ss: 0h 0m 0s 2ms
Time for merging to all_foldomesDB_h: 0h 0m 0s 1ms
Time for merging to all_foldomesDB_ca: 0h 0m 0s 2ms
Time for merging to all_foldomesDB: 0h 0m 0s 2ms
Ignore 0 out of 1738.
Too short: 0, incorrect  0.
Time for processing: 0h 0m 0s 532ms
Create directory ../output/20230418_test/tmp
search ../output/20230418_test/all_foldomesDB ../output/20230418_test/all_foldomesDB ../output/20230418_test/all_by_all ../output/20230418_test/tmp -a 

MMseqs Version:              	5.53465f0
Seq. id. threshold           	0
Coverage threshold           	0
Coverage mode             

In [5]:
# The cluster TSV has no header row; its two tab-separated columns are read as
# (cluster representative, cluster member), both given as structure file names.
df = pd.read_csv(foldseek_clustertsv, sep = '\t', names = ['ClusterRep', 'uniprot_id'])

# AlphaFold file names look like 'AF-<accession>-F1-model_v4.pdb', so the
# accession is the second '-'-separated token. `.str.get(1)` gives a missing
# value for names without a '-' and, unlike `expand = True` followed by `[1]`,
# does not raise when no name contains one.
rep_accession = df['ClusterRep'].str.split('-').str.get(1)

# A representative whose name has no '-' would otherwise become a missing
# group key, and groupby silently drops such groups. Fall back to the raw file
# name so every cluster keeps its own key. The key is only used for grouping
# and ordering; it is not kept in the result.
df['ClusterRep'] = rep_accession.fillna(df['ClusterRep'])

# Members whose file name has no '-' (e.g. 'TSP_Aam2220_hit1.pdb') have no
# accession to extract and are left with a missing uniprot_id; they still
# count as members of their cluster.
df['uniprot_id'] = df['uniprot_id'].str.split('-').str.get(1)

# One row per cluster, holding the list of its members' ids. Clusters are
# ordered by representative key and labelled SC0, SC1, ... in that order.
df_merged = df.groupby('ClusterRep')['uniprot_id'].agg(list).reset_index(drop = True).to_frame()
df_merged.insert(0, 'StruCluster', 'SC' + df_merged.index.astype('str'))

In [6]:
# Expand the per-cluster id lists into one row per structure, rebuild the
# AlphaFold-style file name from each id, and put the columns in output order.
# Rows whose uniprot_id is missing also end up with a missing filename, because
# the file name is derived from the id.
df_exploded = (
    df_merged
    .explode('uniprot_id')
    .assign(filename = lambda frame: 'AF-' + frame['uniprot_id'] + '-F1-model_v4.pdb')
    .loc[:, ['filename', 'uniprot_id', 'StruCluster']]
)

# Save the structure -> cluster table (without the index column) and show it.
df_exploded.to_csv(bfd.Local / 'foldseek_struclusters.tsv', sep = '\t', index = None)
display(df_exploded)

Unnamed: 0,filename,uniprot_id,StruCluster
0,AF-A0A1L8FE91-F1-model_v4.pdb,A0A1L8FE91,SC0
0,AF-A0A044RHJ8-F1-model_v4.pdb,A0A044RHJ8,SC0
0,AF-A0A059F9B8-F1-model_v4.pdb,A0A059F9B8,SC0
0,AF-A0A060XTX6-F1-model_v4.pdb,A0A060XTX6,SC0
0,AF-A0A060YZZ2-F1-model_v4.pdb,A0A060YZZ2,SC0
...,...,...,...
9,,,SC9
9,,,SC9
9,,,SC9
10,AF-Q9V776-F1-model_v4.pdb,Q9V776,SC10


In [7]:
# Both hit lists are header-less files; only their first column is used.
blast_list = pd.read_csv(bfd.blast_uniprot_list.path, header = None)
foldseek_list = pd.read_csv(bfd.foldseek_uniprot_list.path, header = None)

# Stack the two tables and keep each value of the first column once.
joint_list = pd.concat([blast_list, foldseek_list]).loc[:, 0].unique()

accessions_record_file = bfd.Local / 'TSP_Aam-ALL_hits.uniprot_list'

# Write the combined list, one entry per line, only if the record file does
# not exist yet; an existing file is never overwritten.
if not os.path.exists(accessions_record_file):
    with open(accessions_record_file, 'w+') as fileobj:
        fileobj.write(''.join(acc + '\n' for acc in joint_list))

In [10]:
# Re-read the cluster TSV with its file names unparsed, for inspection.
df2 = pd.read_csv(
    foldseek_clustertsv,
    sep = '\t',
    header = None,  # the file has no header row; this is already implied by passing `names`
    names = ['ClusterRep', 'uniprot_id'],
)

In [12]:
# Show the (ClusterRep, uniprot_id) file-name pairs exactly as read from the cluster TSV.
display(df2)

Unnamed: 0,ClusterRep,uniprot_id
0,AF-A0A1L8FE91-F1-model_v4.pdb,AF-A0A1L8FE91-F1-model_v4.pdb
1,AF-A0A1L8FE91-F1-model_v4.pdb,AF-A0A044RHJ8-F1-model_v4.pdb
2,AF-A0A1L8FE91-F1-model_v4.pdb,AF-A0A059F9B8-F1-model_v4.pdb
3,AF-A0A1L8FE91-F1-model_v4.pdb,AF-A0A060XTX6-F1-model_v4.pdb
4,AF-A0A1L8FE91-F1-model_v4.pdb,AF-A0A060YZZ2-F1-model_v4.pdb
...,...,...
1733,AF-D5LHI6-F1-model_v4.pdb,TSP_Aam1030859_hit1.pdb
1734,AF-D5LHI6-F1-model_v4.pdb,TSP_Aam172335_hit1.pdb
1735,AF-D5LHI6-F1-model_v4.pdb,TSP_Aam2220_hit1.pdb
1736,AF-Q9V776-F1-model_v4.pdb,AF-Q9V776-F1-model_v4.pdb
