In [1]:
import os
import sys
import json
import ssl
from time import sleep
from urllib import request
from urllib.error import HTTPError
from tqdm import tqdm
import pandas as pd

In [2]:
class InterProFetcher:
    """Download reviewed proteins matching an InterPro entry and save them as FASTA.

    The query is restricted to one InterPro accession and one NCBI taxonomy id,
    and asks the InterPro API to include each protein's sequence. Results are
    paginated; ``output_list_to_file`` follows the ``next`` links and appends
    every protein to a single FASTA file.

    Class-level tunables:
        HEADER_SEPARATOR: string placed between the FASTA header fields.
        LINE_LENGTH: number of residues per sequence line.
        MAX_ATTEMPTS: how many times one URL is tried while the server keeps
            answering HTTP 408.
        RETRY_DELAY: seconds to wait between those attempts.
        REQUEST_DELAY: seconds to wait between consecutive page requests.
    """

    HEADER_SEPARATOR = "|"
    LINE_LENGTH = 80
    MAX_ATTEMPTS = 4
    RETRY_DELAY = 61
    REQUEST_DELAY = 1

    def __init__(self, interpro_accession, ncbi_taxid, subunit, organism, output_dir=".", verify_ssl=True):
        """Build the query URL and the output path; performs no I/O.

        Args:
            interpro_accession: InterPro entry accession inserted into the URL
                and (lower-cased) into the output file name.
            ncbi_taxid: NCBI taxonomy id inserted into the URL.
            subunit: Subunit label; lower-cased into the output file name.
            organism: Label used only in the output file name.
            output_dir: Directory that will receive the FASTA file.
            verify_ssl: Verify the server's TLS certificate (default). Pass
                ``False`` only as a workaround on machines whose certificate
                store is broken; it removes protection against tampering.
        """
        self.interpro_accession = interpro_accession
        self.ncbi_taxid = ncbi_taxid
        self.subunit = subunit
        self.organism = organism
        self.verify_ssl = verify_ssl
        self.base_url = f"https://www.ebi.ac.uk:443/interpro/api/protein/reviewed/entry/InterPro/{interpro_accession}/taxonomy/uniprot/{ncbi_taxid}/?page_size=200&extra_fields=sequence"
        self.output_dir = output_dir
        self.output_file = os.path.join(output_dir, f"{subunit.lower()}_{organism}_{interpro_accession.lower()}_interpro.faa")

    def fetch_data(self, url):
        """Fetch one page of JSON from ``url``.

        HTTP 408 responses are retried up to ``MAX_ATTEMPTS`` times with
        ``RETRY_DELAY`` seconds between tries. Any other failure is reported on
        stderr and ends the attempt immediately.

        Returns:
            The decoded JSON payload, or ``None`` when the server answered 204
            (no content), when an error occurred, or when every retry timed out.
        """
        if self.verify_ssl:
            context = ssl.create_default_context()
        else:
            # Explicit opt-out only: the connection is then open to interception.
            context = ssl._create_unverified_context()

        req = request.Request(url, headers={"Accept": "application/json"})

        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            try:
                with request.urlopen(req, context=context) as res:
                    if res.status == 204:
                        return None  # No content
                    return json.loads(res.read().decode())
            except HTTPError as e:
                if e.code != 408:
                    sys.stderr.write(f"Error fetching data: {e}\n")
                    return None
                # Timed out: wait before the next try, but not after the last one.
                if attempt < self.MAX_ATTEMPTS:
                    sleep(self.RETRY_DELAY)
            except Exception as e:
                sys.stderr.write(f"An error occurred: {e}\n")
                return None

        sys.stderr.write(f"Giving up on {url}: {self.MAX_ATTEMPTS} attempts timed out (HTTP 408)\n")
        return None

    @staticmethod
    def _format_entry(entry):
        """Render one entry as ``ACCESSION(start...end,start...end|start...end)``.

        Fragments of one location are joined with ``,``; locations with ``|``.
        """
        locations = [
            ",".join(f'{fragment["start"]}...{fragment["end"]}' for fragment in location["fragments"])
            for location in entry["entry_protein_locations"]
        ]
        return f'{entry["accession"]}({"|".join(locations)})'

    def write_fasta(self, item, file):
        """Write one protein record to ``file`` in FASTA format.

        The header is ``>accession|name`` followed, when the item carries
        ``entry_subset`` or ``entries``, by one formatted field per entry. The
        sequence is wrapped at ``LINE_LENGTH`` characters per line.

        Args:
            item: One element of an API page's ``results`` list.
            file: Writable text file object.
        """
        header_parts = [item["metadata"]["accession"], item["metadata"]["name"]]
        entries = item.get("entry_subset") or item.get("entries")

        if entries:
            header_parts.append(
                self.HEADER_SEPARATOR.join(self._format_entry(entry) for entry in entries)
            )

        file.write(">" + self.HEADER_SEPARATOR.join(header_parts) + "\n")

        seq = item["extra_fields"]["sequence"]
        for i in range(0, len(seq), self.LINE_LENGTH):
            file.write(seq[i:i + self.LINE_LENGTH] + "\n")

    def output_list_to_file(self):
        """Download every result page and write all proteins to ``output_file``.

        The output directory is created if needed and the file is always
        (re)created, so a query without results leaves an empty file. If a page
        cannot be fetched after at least one page was written, a warning is
        printed to stderr because the file is then incomplete.

        Returns:
            The number of FASTA records written.
        """
        # Ensure the output directory exists ("" means the current directory).
        if self.output_dir:
            os.makedirs(self.output_dir, exist_ok=True)

        records_written = 0
        pages_fetched = 0
        next_url = self.base_url

        with open(self.output_file, "w", encoding="utf-8") as file:
            while next_url:
                data = self.fetch_data(next_url)
                if not data:
                    if pages_fetched:
                        # A "next" link was advertised but could not be read.
                        sys.stderr.write(
                            f"Warning: download stopped after {pages_fetched} page(s); "
                            f"{self.output_file} is incomplete\n"
                        )
                    break

                pages_fetched += 1
                for item in data.get("results", []):
                    self.write_fasta(item, file)
                    records_written += 1

                next_url = data.get("next")  # URL of the next page, if any
                if next_url:
                    sleep(self.REQUEST_DELAY)  # Throttle only between requests

        return records_written

In [3]:
# Table pairing each subunit (column "Protein") with its "InterPro Accession".
interpro_csv_path = "/Users/akshayonly/Work/Updated/Data/Misc/InterPro/nuo_interpro_classification_accessions.csv"
interpro = pd.read_csv(interpro_csv_path)

# Summarise columns, dtypes and non-null counts as a quick sanity check.
interpro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Protein             17 non-null     object
 1   InterPro Accession  17 non-null     object
dtypes: object(2)
memory usage: 404.0+ bytes


In [4]:
# Preview the first five subunit / accession pairs.
interpro.head(n=5)

Unnamed: 0,Protein,InterPro Accession
0,NuoE,IPR002023
1,NuoF,IPR011537
2,NuoG,IPR010228
3,NuoB,IPR006138
4,NuoC,IPR010218


In [5]:
# (organism label, NCBI taxonomy id) pairs to query for every accession.
prok_ncbi_taxid = [('archaea', 2157), ('bacteria', 2)]

output_dir = "/Users/akshayonly/Work/Sequence-Data/InterPro"

# exist_ok avoids the separate (and racy) existence check.
os.makedirs(output_dir, exist_ok=True)

for organism, ncbi_taxid in prok_ncbi_taxid:
    # Only two columns are needed, so pair them directly instead of building a
    # Series per row with iterrows().
    subunit_accessions = zip(interpro['Protein'], interpro['InterPro Accession'])

    # total= gives tqdm a real progress bar with an ETA; desc= labels each pass.
    for subunit_protein, interpro_accession in tqdm(subunit_accessions, total=len(interpro), desc=organism):
        fetcher = InterProFetcher(interpro_accession, ncbi_taxid, subunit_protein, organism, output_dir)
        fetcher.output_list_to_file()

17it [00:31,  1.88s/it]
17it [05:57, 21.01s/it]


In [6]:
# Remove zero-byte downloads (queries that returned no proteins).
# Done in Python rather than by interpolating the path into a shell command, so
# it also works when the path contains spaces or shell metacharacters and does
# not depend on a `find` that supports -delete.
removed_empty_downloads = []
for dirpath, _dirnames, filenames in os.walk(output_dir):
    for filename in filenames:
        path = os.path.join(dirpath, filename)
        # Regular files only; symlinks are left alone.
        if os.path.isfile(path) and not os.path.islink(path) and os.path.getsize(path) == 0:
            os.remove(path)
            removed_empty_downloads.append(path)

# Show how many empty files were deleted.
len(removed_empty_downloads)

0