In [1]:
!pip install git+https://github.com/Arcadia-Science/biofile.git@das/dev --upgrade

Collecting git+https://github.com/Arcadia-Science/biofile.git@das/dev
  Cloning https://github.com/Arcadia-Science/biofile.git (to revision das/dev) to /private/var/folders/5b/71_5djmd0p5_yhs0tpbbq68r0000gn/T/pip-req-build-oppr0wu3
  Running command git clone --filter=blob:none --quiet https://github.com/Arcadia-Science/biofile.git /private/var/folders/5b/71_5djmd0p5_yhs0tpbbq68r0000gn/T/pip-req-build-oppr0wu3
  Running command git checkout -b das/dev --track origin/das/dev
  Switched to a new branch 'das/dev'
  branch 'das/dev' set up to track 'origin/das/dev'.
  Resolved https://github.com/Arcadia-Science/biofile.git to commit 81d00b1dd02c3d259d234ce0d79df5aff6d82ac2
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting boto3
  Downloading boto3-1.26.104-py3-none-any.whl (135 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s3path
  Using cached s3path-0.4.1-py3-none-an

In [80]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

import biofile

# Dataset handle used by every later cell: it ties the 'TSP_Aam' identifier to a
# local working folder and to the S3 prefix the files are pushed to.
bfd = biofile.Dataset(
    identifier='TSP_Aam',
    local='../output/',
    remote='s3://arcadia-protein-evolution/cartography/tsp/',
)

local files will be saved in ../output/
remote files will be saved in s3://arcadia-protein-evolution/cartography/tsp/


In [81]:
# Query files: every '*.txt' file in the dataset's local folder whose name contains
# 'TSP'. os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep this list (and any index-based key derived from it later) deterministic.
input_files = [
    str(bfd.Local / name)
    for name in sorted(os.listdir(bfd.Local))
    if 'TSP' in name and name.endswith('.txt')
]
# Swap only the trailing '.txt'; a '.txt' elsewhere in the path must stay untouched.
output_files = [path[:-len('.txt')] + '_blastp_results.out' for path in input_files]

# Maps each query file to the BLAST result file produced from it.
files_dict = dict(zip(input_files, output_files))

for input_file, output_file in tqdm(files_dict.items()):

    # An existing result file is treated as a cache and is never recomputed.
    if os.path.exists(output_file):
        continue

    # Remote blastp against nr, tabular output (-outfmt 6). Passing an argument list
    # (no shell) keeps paths with spaces or shell metacharacters intact.
    blast_run = subprocess.run(
        [
            'blastp',
            '-db', 'nr',
            '-query', input_file,
            '-out', output_file,
            '-remote',
            '-max_target_seqs', '50000',
            '-outfmt', '6',
        ],
        capture_output=True,
        text=True,
    )

    # Surface whatever blastp reported, as the shell magic used to do.
    if blast_run.stdout:
        print(blast_run.stdout)
    if blast_run.stderr:
        print(blast_run.stderr)

    if blast_run.returncode != 0:
        # Remove a partial result so the exists-check above retries it next run
        # instead of mistaking it for a finished search.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise RuntimeError(
            'blastp failed for {} (exit code {})'.format(input_file, blast_run.returncode)
        )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 16844.59it/s]


In [82]:
# Gather the subject accessions (second column of the tabular BLAST output) from
# every result file, writing one '.refseq_list' per result file plus a combined,
# de-duplicated list for the whole dataset.
all_accessions = []

for file in files_dict.values():
    try:
        df = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # A result file with no rows (no hits) is valid: it contributes nothing.
        accessions = []
    else:
        accessions = df[1].tolist()
    all_accessions.extend(accessions)

    # Replace only the file extension; str.replace would also rewrite any other
    # '.out' occurring earlier in the path.
    newfile = os.path.splitext(file)[0] + '.refseq_list'

    # Existing lists are kept as-is (never overwritten).
    if not os.path.exists(newfile):
        with open(newfile, 'w') as fileobj:
            fileobj.writelines(acc + '\n' for acc in accessions)

all_accessions_set = set(all_accessions)

outfile = '../output/TSP_Aam-RefSeq_hits.refseq_list'

if not os.path.exists(outfile):
    with open(outfile, 'w') as fileobj:
        # Set iteration order changes between interpreter runs; sorting makes the
        # written file reproducible.
        fileobj.writelines(acc + '\n' for acc in sorted(all_accessions_set))

In [83]:
# Tab-separated ID-mapping result tables; each must provide 'From' (the source
# accession) and 'Entry' (the mapped entry) columns.
uniprot_idmm_results = ['../output/TSP_BLAST_to_EMBL-Genbank_DDBJ.tsv', '../output/TSP_BLAST_to_RefSeq.tsv']

# Read every table once and concatenate in a single call instead of growing a
# frame inside the loop from an empty seed frame.
mapping_frames = [
    pd.read_csv(file, sep = '\t')[['From', 'Entry']]
    for file in uniprot_idmm_results
]

# ignore_index=True gives the combined table unique row labels (each input file
# restarts its own index at 0); drop_duplicates() returns a new frame, so re-running
# this cell never mutates a previously displayed object.
dummy_df = pd.concat(mapping_frames, ignore_index = True).drop_duplicates()
display(dummy_df)

Unnamed: 0,From,Entry
0,AIW62652.1,A0A0A0V684
1,TFK09892.1,A0A4D9ERY4
2,GFU45762.1,A0A8X6R426
3,AIW62496.1,A0A0A0V9N6
4,KXN91798.1,A0A137QWX7
...,...,...
48,XP_019635888.1,A0A6P4ZGM4
49,XP_018082512.1,A0A1L8FE91
50,XP_016353901.1,A0A671KLV9
51,XP_032046946.1,A0A6J3D6B3


In [84]:
# Working folder for this run and the sub-folder that receives AlphaFold models.
output_folder = bfd.Local / '20230331_test'
alphafold_folder = output_folder / 'alphafold'

# makedirs creates output_folder on the way to alphafold_folder; exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race of exists()+mkdir().
os.makedirs(alphafold_folder, exist_ok=True)

In [85]:
# Unique mapped entries; missing values are dropped because they cannot be written
# to the list file or turned into a download URL.
accessions = dummy_df['Entry'].dropna().unique().tolist()

# Same location and stem as the combined RefSeq list, with a '.uniprot_list' suffix.
accessions_record_file = outfile.replace('.refseq_list', '.uniprot_list')

# An existing record file is kept as-is (never overwritten).
if not os.path.exists(accessions_record_file):
    with open(accessions_record_file, 'w') as fileobj:
        fileobj.writelines(acc + '\n' for acc in accessions)

# Accessions whose model could not be fetched (for example, no model is published).
failed_accessions = []

for accession in tqdm(accessions):
    output = alphafold_folder / 'AF-{}-F1-model_v4.pdb'.format(accession)
    source = 'https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.pdb'.format(accession)

    # Already-downloaded models are treated as a cache.
    if os.path.exists(output):
        continue

    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page under the .pdb name.
    download = subprocess.run(
        ['curl', '--fail', '--location', '--output', str(output), source],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    if download.returncode != 0:
        # Remove any partial file so the exists-check retries it on the next run.
        if os.path.exists(output):
            os.remove(output)
        failed_accessions.append(accession)

if failed_accessions:
    print('{} of {} AlphaFold models could not be downloaded: {}'.format(
        len(failed_accessions), len(accessions), failed_accessions))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 165/165 [00:00<00:00, 63006.21it/s]


In [86]:
# Named files to register on the dataset: the two combined accession lists ...
keyfiles_dict = {
    'blast_refseq_list': outfile,
    'blast_uniprot_list': accessions_record_file,
}

# ... plus one numbered entry per BLAST query file and per BLAST result file.
inputfiles_dict = dict(
    (f'TSP_hit{position}', path) for position, path in enumerate(input_files)
)
outputfiles_dict = dict(
    (f'TSP_blasthits{position}', path) for position, path in enumerate(output_files)
)

# Merge left to right; on a key clash the later mapping wins.
keyfiles_dict = {**keyfiles_dict, **inputfiles_dict, **outputfiles_dict}

bfd.add_keyfiles(keyfiles_dict)

In [93]:
# Publish the dataset. The three calls run in this fixed order.
# NOTE(review): biofile's implementation is not visible in this notebook; the notes
# below are inferred from the method names and the recorded output — confirm them
# against the biofile source.
# presumably derives/uploads the remote (S3) counterparts of the local files — TODO confirm
bfd.local_to_s3()
# presumably serialises the Dataset object itself — TODO confirm format and location
bfd.pickle()
# overwrite = True asks for remote objects that already exist to be replaced (the
# recorded output prints "exists. Set overwrite = True to replace." for existing keys)
bfd.push_to_s3(overwrite = True)

s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-RefSeq_hits.refseq_list exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-RefSeq_hits.uniprot_list exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-1030859.txt exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-2220.txt exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-172335.txt exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-1030859_blastp_results.out exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-2220_blastp_results.out exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam-172335_blastp_results.out exists. Set overwrite = True to replace.
s3://arcadia-protein-evolution/cartography/tsp/TSP_Aam.json exists. overwr

In [95]:
# Display the dataset's attribute mapping: identifier, local/remote locations, and
# one Biofile object per registered keyfile (see the recorded output below).
bfd.attrs

{'identifier': 'TSP_Aam',
 'local': '../output/',
 'remote': 's3://arcadia-protein-evolution/cartography/tsp/',
 'files': {},
 'blast_refseq_list': <biofile.biofile.Biofile at 0x17af2b790>,
 'blast_uniprot_list': <biofile.biofile.Biofile at 0x17af32150>,
 'TSP_hit0': <biofile.biofile.Biofile at 0x17aede450>,
 'TSP_hit1': <biofile.biofile.Biofile at 0x17aef1a10>,
 'TSP_hit2': <biofile.biofile.Biofile at 0x17af284d0>,
 'TSP_blasthits0': <biofile.biofile.Biofile at 0x17af2a890>,
 'TSP_blasthits1': <biofile.biofile.Biofile at 0x17af55990>,
 'TSP_blasthits2': <biofile.biofile.Biofile at 0x17af55b50>}