In [1]:
"""
Download proteins and other features from InterPro for a given pfam protein family
"""

import json
import sys
from urllib.error import HTTPError
from urllib.request import urlopen
from time import sleep

In [24]:
# Pfam accession whose member proteins are requested from InterPro.
# Accessions noted by the author (presumably alternatives for `query`):
# PF16754, PF06737, PF01832, PF05838, PF00959
query = "PF05838"

# Root of the InterPro REST API
api_url = "https://www.ebi.ac.uk/interpro/api"

# First page of UniProt proteins matching the Pfam entry, 20 records per page
url = f"{api_url}/protein/uniprot/entry/pfam/{query}/" + "?page_size=20"

In [None]:
# Download every page of results starting at `url`, following the "next" link
# returned with each page until no further page is advertised.
data = []
next_url = url

max_retries = 20  # Consecutive failed attempts tolerated before giving up
retry_delay = 3  # Delay in seconds between retries

retry_count = 0
while next_url and retry_count < max_retries:
    try:
        with urlopen(next_url) as response:
            if response.status == 204:
                # 204 No Content carries an empty body; feeding it to
                # json.loads would fail and burn every retry, so stop paging.
                print("no data returned")
                next_url = None
                continue
            result = json.loads(response.read().decode("utf-8"))
        data.extend(result["results"])
        next_url = result.get("next")
        retry_count = 0  # Reset retry count on successful request
        print("chunk arrived")
        print(next_url)
    except Exception as e:
        # Best-effort: any failure (network, HTTP, malformed payload) is
        # reported and the same page is requested again after a pause.
        print("An error occurred:", str(e))
        retry_count += 1
        sleep(retry_delay)  # Delay before retrying

if next_url:
    # The loop ended because the retry budget ran out, not because the last
    # page was reached, so `data` holds only part of the result set.
    print(
        f"WARNING: download stopped after {max_retries} consecutive failed attempts; "
        f"only {len(data)} records were retrieved. Pending page: {next_url}"
    )


chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=source%3As%3Aa0a011pw38&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=source%3As%3Aa0a031gy41&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=source%3As%3Aa0a060cgk7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=source%3As%3Aa0a069pqm5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=source%3As%3Aa0a077kxn6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=source%3As%3Aa0a085tye9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=source%3As%3Aa0a090mqi4&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF05838/?cursor=sourc

In [None]:
# Number of records accumulated in `data` by the download loop
len(data)

In [None]:
# Show the first downloaded record to inspect its structure
data[0]

In [None]:
# Process the downloaded data: request the protein record of every entry in
# `data` and keep only the entries whose record could be retrieved.
max_retries = 8  # Attempts per protein before the entry is skipped
retry_delay = 2  # Delay in seconds between retries
total_len = 0
total_sequences = 0
filtered_data = []

for result in data:
    metadata = result["metadata"]
    accession = metadata["accession"]
    length = metadata["length"]

    sequence_url = f"{api_url}/protein/uniprot/{accession}"

    sequence = None  # Stays None when every attempt fails
    retry_count = 0
    while retry_count < max_retries:
        try:
            with urlopen(sequence_url) as response:
                # The whole decoded JSON response is kept, not just one field
                sequence = json.loads(response.read().decode("utf-8"))
            break  # Break out of retry loop on successful request
        except Exception as e:
            print("An error occurred while retrieving the sequence:", str(e))
            retry_count += 1
            sleep(retry_delay)  # Delay before retrying

    if sequence is not None:
        # Attach the retrieved record to the entry (this mutates the dict
        # that also lives in `data`) and include it in the statistics.
        result["sequence"] = sequence
        total_len += length
        total_sequences += 1
        filtered_data.append(result)

print("Sequences retrieved: ", total_sequences)
if total_sequences:
    print("Sequence average length: ", total_len / total_sequences)
else:
    # Nothing was retrieved (empty `data` or every request failed); an average
    # is undefined and dividing would raise ZeroDivisionError.
    print("Sequence average length:  n/a (no sequences retrieved)")

In [None]:
import random

# Fraction of the entries to hold out (10% of the data)
ID_test_percentage = 0.1
num_entries_to_select = int(len(filtered_data) * ID_test_percentage)

# Draw positions rather than entries: membership tests on integers are O(1),
# and duplicate records that compare equal are not removed together.
# A dedicated, seeded generator makes the split identical on every run
# without touching the global `random` state.
split_rng = random.Random(0)
selected_positions = split_rng.sample(range(len(filtered_data)), num_entries_to_select)
selected_lookup = set(selected_positions)

# Held-out entries, in the (random) order in which they were drawn
random_selection = [filtered_data[pos] for pos in selected_positions]

# Remaining entries, original order preserved.
# NOTE: this rebinds `filtered_data`, so re-running the cell splits the
# already reduced list again.
filtered_data = [
    entry for pos, entry in enumerate(filtered_data) if pos not in selected_lookup
]


In [None]:
# Sizes of the held-out selection and of the remaining entries
print(len(random_selection))
print(len(filtered_data))

In [None]:
# Show the first held-out entry
random_selection[0]

In [None]:
import os
import pickle

# Output folder, relative to the current working directory; created on demand
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# One .p file per split, tagged with the queried accession
random_selection_file = os.path.join(data_dir, "random_selection_" + query + ".p")
filtered_data_file = os.path.join(data_dir, "filtered_data_" + query + ".p")

# Serialize the held-out selection first, then the remaining entries
for target_path, payload in (
    (random_selection_file, random_selection),
    (filtered_data_file, filtered_data),
):
    with open(target_path, "wb") as file:
        pickle.dump(payload, file)