In [1]:
"""
Download proteins and other features from InterPro for a given Pfam protein family
"""

import json
import sys
from urllib.error import HTTPError
from urllib.request import urlopen
from time import sleep

In [2]:
# Pfam accession of the protein family to download.
# Accessions noted by the author (presumably the alternative families to
# query): PF16754, PF06737, PF01832, PF05838, PF00959
query = "PF00959"
api_url = "https://www.ebi.ac.uk/interpro/api"

# Number of results requested per API page. Kept as a named setting so it can
# be tuned in one place instead of being buried in the URL string.
# NOTE(review): a larger page size would reduce the number of requests --
# check the maximum the InterPro API accepts before raising it.
page_size = 20

# First page of the "UniProt proteins matching this Pfam entry" listing.
url = f"{api_url}/protein/uniprot/entry/pfam/{query}/?page_size={page_size}"

In [3]:
# Download every page of results, starting at `url` and following the "next"
# link returned with each page until the API reports no further page.
data = []
next_url = url

max_retries = 20
retry_delay = 3  # Delay in seconds between retries

retry_count = 0  # consecutive failures for the page currently being fetched
while next_url and retry_count < max_retries:
    try:
        with urlopen(next_url) as response:
            if response.status == 204:
                # 204 No Content: the body is empty, so there is nothing to
                # parse and nothing more to download.
                next_url = None
                break
            result = json.loads(response.read().decode("utf-8"))
        # Pull out everything we need while still inside the try block, so a
        # malformed payload is retried like any other failed request and
        # `data` is never touched by a page that did not parse completely.
        page_results = result["results"]
        following_url = result.get("next")
    except Exception as e:
        # Network errors, HTTP errors and undecodable payloads are all
        # retried after a short pause.
        print("An error occurred:", str(e))
        retry_count += 1
        sleep(retry_delay)  # Delay before retrying
    else:
        data.extend(page_results)
        next_url = following_url
        retry_count = 0  # Reset retry count on successful request
        print("chunk arrived")
        print(next_url)

# If a page is still pending here, the retry budget ran out. Fail loudly
# instead of letting the following cells work on a silently truncated list.
if next_url:
    raise RuntimeError(
        f"Download incomplete: gave up on {next_url} after {max_retries} "
        f"consecutive failed attempts ({len(data)} records fetched so far)"
    )


chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a011mkx9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a014dsx7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a022j442&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a031i8v8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a060bdq1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a060c997&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a060qh24&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a0q4uhu3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a0q6akv8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a0q7hva6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a0q8umr7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a0r2jft7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a0s3tpx2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a0s4wgt3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a1i5arr6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a1i5xsv6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a1i7jmh8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a1j0lp19&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a1j7d160&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a1l2cuv5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a1l6jbv4&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a265b7q9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a285d1j8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a291ax84&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a291lbj2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2a2cal9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2a4fjh8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2a5mib1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2t4b1e9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2t5nrd2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2t7q7e6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2t9jwp5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2u1twa1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2u3bdg9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a2u8i6h5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a3c0hir1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a3c2bdk0&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a3d1dcr2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a3d3kgu8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a3d4tr32&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a3d9bjz3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a3e1r9n1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a437rh68&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a442bb57&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a443vmj1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a447jgz2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a447xfe6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a451a1h6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a455xiz2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a530r7q7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a542mab3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a551yfc3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a554xax6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a560cfw3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a562bcy2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a564mvb0&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a5w5h716&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a5w7q8d2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a5w9h622&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a5x0zjj8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a5x3p4v7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a5x7ekx5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a5x8ywm5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a6g9rf74&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a6h0zkr5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a6h2gck8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a6h9g5x0&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a6i2ixm7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a6i4u279&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a6i5qry1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a744k7s5&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a747ddd6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a747xch9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a750cit4&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a752ipq3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a753dzy7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a756lae2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a7w5vtm6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a7w6jca7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a7w7kja7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a7w8vv11&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a7x0alu3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a7x1efm1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a7x3jry4&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a8e6il22&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a8e7bys6&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a8e7nz79&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a8f1nim8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a8f5n2g7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a8f7ge98&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a8g2cm84&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a965cc07&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a965mrs1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a965qfx9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a966k8v9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a967xnp1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a968v5j3&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aa0a969d7m8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=sourc

chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Ae6bpz2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Ae6yw70&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Ae9g4n4&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Af0qbp2&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Af2vxg7&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Af4t8i8&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Af4xzt4&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Af8ji71&page_size=20


chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aw6r8g9&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Aw9bfi1&page_size=20
chunk arrived
https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/PF00959/?cursor=source%3As%3Ax5lyk4&page_size=20
chunk arrived
None


In [4]:
# Total number of records collected across all downloaded pages.
len(data)

19473

In [5]:
# Show the first record to inspect the structure of one API result.
data[0]

{'metadata': {'accession': 'A0A009F987',
  'name': 'Lysozyme',
  'source_database': 'unreviewed',
  'length': 164,
  'source_organism': {'taxId': '1310618',
   'scientificName': 'Acinetobacter baumannii 118362',
   'fullName': 'Acinetobacter baumannii 118362'}},
 'entries': [{'accession': 'PF00959',
   'entry_protein_locations': [{'fragments': [{'start': 65,
       'end': 163,
       'dc-status': 'CONTINUOUS'}],
     'model': 'PF00959',
     'score': 5e-21}],
   'protein_length': 164,
   'source_database': 'pfam',
   'entry_type': 'domain',
   'entry_integrated': 'ipr002196'}]}

In [None]:
# Process the downloaded data: for every record in `data`, request
# {api_url}/protein/uniprot/<accession> and attach the decoded JSON response
# to the record under the "sequence" key. Records whose request keeps failing
# are reported and left out of `filtered_data`.
# NOTE(review): the value stored under "sequence" is the whole decoded
# response body, not necessarily a bare sequence string -- confirm what the
# consumers of the pickled files expect.
max_retries = 8
retry_delay = 2  # Delay in seconds between retries
retry_count = 0
total_len = 0
total_sequences = 0
filtered_data = []

for result in data:
    metadata = result["metadata"]
    accession = metadata["accession"]
    length = metadata["length"]

    sequence_url = f"{api_url}/protein/uniprot/{accession}"

    retry_count = 0
    while retry_count < max_retries:
        try:
            with urlopen(sequence_url) as response:
                sequence = json.loads(response.read().decode("utf-8"))
            break  # Break out of retry loop on successful request
        except Exception as e:
            print("An error occurred while retrieving the sequence:", str(e))
            retry_count += 1
            sleep(retry_delay)  # Delay before retrying
    else:
        # Only reached when the retry budget ran out without a successful
        # request: report the dropped accession instead of skipping silently.
        print(f"Giving up on {accession} after {max_retries} failed attempts")
        continue

    # Note: this adds the key to the dict that is also stored in `data`.
    result["sequence"] = sequence
    total_len += length
    total_sequences += 1
    filtered_data.append(result)

print("Sequences retrieved: ", total_sequences)
if total_sequences:
    print("Sequence average length: ", total_len/total_sequences)
else:
    # Avoid a ZeroDivisionError when no record could be retrieved.
    print("Sequence average length:  n/a (no sequences retrieved)")

In [11]:
# Summary of the per-protein download: number of records kept and the mean
# of their metadata "length" values.
print("Sequences retrieved: ", total_sequences)
if total_sequences:
    print("Sequence average length: ", total_len/total_sequences)
else:
    # Avoid a ZeroDivisionError when no record could be retrieved.
    print("Sequence average length:  n/a (no sequences retrieved)")

Sequences retrieved:  19473
Sequence average length:  201.6551122066451


In [None]:
import random

# Optional hold-out split -- currently DISABLED.
# Everything below is commented out, so the only statement this cell executes
# is the import above. If re-enabled, the code would draw a random
# ID_test_percentage share of `filtered_data` and then remove the drawn
# entries from `filtered_data`.

# Calculate the number of entries to select (10% of the data)
# ID_test_percentage = 0.1
# num_entries_to_select = int(len(filtered_data) * ID_test_percentage)

# Randomly select ID_test_percentage of the data entries
# random_selection = random.sample(filtered_data, num_entries_to_select)

# Print the first randomly selected entry (the loop stops after one item)
#for entry in random_selection:
#    print(entry)
#    break

# Remove the randomly selected entries from `filtered_data` if needed
# filtered_data = [entry for entry in filtered_data if entry not in random_selection]


In [None]:
# Number of records currently held in filtered_data.
print(len(filtered_data))

In [None]:
import os
import pickle

# Ensure the output folder "data" exists, relative to the current working
# directory; an already existing folder is not an error.
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# Pickle the complete filtered_data list to data/filtered_data_<query>.p
filtered_data_file = os.path.join(data_dir, f"filtered_data_{query}.p")
with open(filtered_data_file, mode="wb") as pickle_out:
    pickle.dump(filtered_data, pickle_out)

In [None]:
# Save random subsets of filtered_data, one pickle file per subset size:
#   data/random_selection_<size>_<query>.p
# Each subset is drawn independently from the full list, so a smaller subset
# is not necessarily contained in a larger one.
selection_sizes = (10, 100, 500, 1000, 2000, 5000, 10000)

# Dedicated, seeded generator: re-running the cell on the same filtered_data
# (same order) reproduces the same subsets, and the global `random` state is
# left untouched.
selection_rng = random.Random(0)

for selection_size in selection_sizes:
    if selection_size > len(filtered_data):
        # sample() raises ValueError when asked for more items than the
        # population holds (e.g. for a small protein family), so skip sizes
        # that cannot be satisfied instead of aborting the whole cell.
        print(
            f"Skipping random selection of {selection_size}: "
            f"only {len(filtered_data)} proteins available"
        )
        continue

    random_selection = selection_rng.sample(filtered_data, selection_size)
    random_selection_file = os.path.join(
        data_dir, f"random_selection_{selection_size}_{query}.p"
    )
    with open(random_selection_file, "wb") as file:
        pickle.dump(random_selection, file)