In [1]:
import requests
from requests.adapters import HTTPAdapter, Retry
import json
import re

In [2]:
# Shared HTTP session used by every request in this notebook.
session = requests.Session()

# Retry policy: total=5, backoff factor 0.25, and forced retries on
# HTTP 500/502/503/504 responses.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)

# The policy is attached only to URLs that start with "https://".
session.mount("https://", HTTPAdapter(max_retries=retries))

In [3]:
def get_next_link(headers):
    """Return the URL marked ``rel="next"`` in the ``Link`` header, or None.

    Args:
        headers: Mapping of response headers (anything with a ``.get``
            method, e.g. a dict or the headers object of a response).
            May be empty or None.

    Returns:
        The URL found between ``<`` and ``>`` in the entry whose relation is
        ``next``, or None when there is no ``Link`` header or no such entry.
    """
    link_header = headers.get("Link") if headers else None
    if not link_header:
        return None
    # Search the whole header instead of anchoring at its start, and keep the
    # captured URL inside a single <...> pair, so a header that lists several
    # relations (prev, next, last, ...) still yields only the "next" URL.
    found = re.search(r'<([^<>]+)>;\s*rel="next"', link_header)
    return found.group(1) if found else None

def get_batch(batch_url, timeout=60):
    """Iterate over the pages of a paginated search, following ``Link`` headers.

    Args:
        batch_url: URL of the first page. A falsy value yields nothing.
        timeout: Seconds passed to ``session.get`` as its ``timeout`` so a
            stalled connection raises instead of blocking indefinitely.

    Yields:
        ``(response, total)`` tuples, where ``response`` is the object
        returned by ``session.get`` and ``total`` is the value of its
        ``x-total-results`` header, unconverted.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status,
        and ``KeyError`` if the ``x-total-results`` header is missing.
    """
    while batch_url:
        response = session.get(batch_url, timeout=timeout)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        # Stop when the response advertises no further page.
        batch_url = get_next_link(response.headers)


In [4]:
# Presumably the intended page size for the searches.
# NOTE(review): no visible cell references this name; the search URLs
# hard-code `size=500` instead. Confirm whether it is still needed.
batch_size = 500

def filter_entry_positive(entry, min_signal_end=13):
    """Decide whether an entry is kept in the positive dataset.

    An entry is kept when at least one of its features has type ``"Signal"``,
    an empty description, and an end position strictly greater than
    ``min_signal_end``.

    Args:
        entry: Parsed entry dict; must contain a ``"features"`` list whose
            items carry ``"type"``, ``"description"`` and
            ``["location"]["end"]["value"]``.
        min_signal_end: Exclusive lower bound for the end position.

    Returns:
        True if a qualifying feature exists, False otherwise.
    """
    for feature in entry["features"]:
        if feature["type"] != "Signal" or feature["description"] != "":
            continue
        end_value = feature["location"]["end"]["value"]
        if end_value is None:
            # No usable end position (presumably an undetermined position in
            # the payload): the bound cannot be checked, so skip the feature
            # instead of letting int(None) raise TypeError.
            continue
        if int(end_value) > min_signal_end:
            return True
    return False

def extract_fields_positive(entry):
    """Build the TSV row for a positive-dataset entry.

    Returns a 6-tuple: primary accession, organism scientific name, the
    second element of the organism lineage, sequence length, the end position
    of the first ``"Signal"`` feature with an empty description (None when no
    such feature exists), and the literal string ``"Null"``.
    """
    # First matching feature wins; None is the fallback when nothing matches.
    cleavage_site = next(
        (
            feature["location"]["end"]["value"]
            for feature in entry["features"]
            if feature["type"] == "Signal" and feature["description"] == ""
        ),
        None,
    )

    return (
        entry["primaryAccession"],
        entry["organism"]["scientificName"],
        entry["organism"]["lineage"][1],
        entry["sequence"]["length"],
        cleavage_site,
        "Null",
    )

In [9]:
# Search URL for the positive set. URL-decoded, the query reads:
#   ((taxonomy_id:2759) NOT (length:[1 TO 40]) NOT (fragment:true)
#    AND (reviewed:true) AND (ft_signal_exp:*))
# Results are requested as JSON, 500 per page.
url_positive = "https://rest.uniprot.org/uniprotkb/search?format=json&query=%28%28taxonomy_id%3A2759%29+NOT+%28length%3A%5B1+TO+40%5D%29+NOT+%28fragment%3Atrue%29+AND+%28reviewed%3Atrue%29+AND+%28ft_signal_exp%3A*%29%29&size=500"
# Search URL for the negative set. URL-decoded, the query reads:
#   ((fragment:false) AND (length:[40 TO *]) AND (taxonomy_id:2759)
#    NOT (ft_signal:*)
#    AND ((cc_scl_term_exp:SL-0091) OR (cc_scl_term_exp:SL-0191)
#         OR (cc_scl_term_exp:SL-0173) OR (cc_scl_term_exp:SL-0209)
#         OR (cc_scl_term_exp:SL-0204) OR (cc_scl_term_exp:SL-0039))
#    AND (reviewed:true) AND (existence:1))
# NOTE(review): the positive query excludes lengths 1-40 while this one admits
# length 40; confirm whether the two length bounds are meant to match.
url_negative= "https://rest.uniprot.org/uniprotkb/search?format=json&query=%28%28fragment%3Afalse%29+AND+%28length%3A%5B40+TO+*%5D%29+AND+%28taxonomy_id%3A2759%29+NOT+%28ft_signal%3A*%29+AND+%28%28cc_scl_term_exp%3ASL-0091%29+OR+%28cc_scl_term_exp%3ASL-0191%29+OR+%28cc_scl_term_exp%3ASL-0173%29+OR+%28cc_scl_term_exp%3ASL-0209%29+OR+%28cc_scl_term_exp%3ASL-0204%29+OR+%28cc_scl_term_exp%3ASL-0039%29%29+AND+%28reviewed%3Atrue%29+AND+%28existence%3A1%29%29&size=500"

# Header row shared by both TSV files; the order matches the 6-tuples returned
# by extract_fields_positive and extract_fields_negative.
col_unificate = ["Accession", "Organism", "Kingdom", "Sequence length", "SP cleavage", "transmembrane_term"]


In [10]:
def filter_entry_negative(entry):
    """Accept every entry: this filter always returns True.

    The ``entry`` argument is ignored; the function exists so the negative
    dataset can be built through the same filter/extract call shape as the
    positive one.
    """
    return True

def extract_fields_negative(entry, max_tm_start=90):
    """Build the TSV row for a negative-dataset entry.

    Args:
        entry: Parsed entry dict with ``"primaryAccession"``, ``"organism"``
            (``"scientificName"`` and ``"lineage"``), ``"sequence"``
            (``"length"``) and a ``"features"`` list.
        max_tm_start: Exclusive upper bound on the start position of a
            transmembrane feature for it to be flagged.

    Returns:
        A 6-tuple: primary accession, organism scientific name, the second
        element of the organism lineage, sequence length, the literal string
        ``"False"``, and ``"True"``/``"False"`` (as strings) telling whether
        some ``"Transmembrane"`` feature described exactly as ``"Helical"``
        starts before ``max_tm_start``.
    """
    transmembrane_check = "False"
    for feature in entry["features"]:
        # NOTE(review): the description must equal "Helical" exactly, so a
        # description with any extra text is not counted. Confirm intended.
        if feature["type"] != "Transmembrane" or feature["description"] != "Helical":
            continue
        start_value = feature["location"]["start"]["value"]
        # A missing start position cannot be compared with the bound; skip it
        # rather than letting `None < max_tm_start` raise TypeError.
        if start_value is not None and start_value < max_tm_start:
            transmembrane_check = "True"
            break

    return (
        entry["primaryAccession"],
        entry["organism"]["scientificName"],
        entry["organism"]["lineage"][1],
        entry["sequence"]["length"],
        "False",
        transmembrane_check,
    )

In [11]:
def get_dataset(search_url, filter_function, extract_function, columns,
                output_file_name, output_fasta_file_name):
    """Download every page of a search and write a TSV table plus a FASTA file.

    Args:
        search_url: URL of the first result page, handed to ``get_batch``.
        filter_function: Callable taking an entry dict; entries for which it
            returns a falsy value are skipped.
        extract_function: Callable taking an entry dict and returning the
            iterable of fields written as one tab-separated row.
        columns: Header names written as the first TSV row.
        output_file_name: Path of the TSV file (overwritten).
        output_fasta_file_name: Path of the FASTA file (overwritten).

    Side effects:
        Writes both files and prints the number of entries seen and kept.
    """
    n_total, n_filtered = 0, 0
    # Write UTF-8 explicitly: open() otherwise uses the platform's locale
    # encoding, while the TSV is read back with pandas, which assumes UTF-8.
    with open(output_file_name, "w", encoding="utf-8") as ofs, \
         open(output_fasta_file_name, "w", encoding="utf-8") as ofs_fasta:
        print(*columns, sep="\t", file=ofs)
        # The per-page total reported by get_batch is not needed here.
        for batch, _total in get_batch(search_url):
            batch_json = json.loads(batch.text)
            for entry in batch_json["results"]:
                n_total += 1
                if not filter_function(entry):
                    continue
                n_filtered += 1
                fields = extract_function(entry)
                print(*fields, sep="\t", file=ofs)
                # FASTA record: ">" + accession on one line, sequence on the next.
                print(">", entry["primaryAccession"], sep="", file=ofs_fasta)
                print(entry["sequence"]["value"], file=ofs_fasta)
    print(f"Total: {n_total}, filtered: {n_filtered}")

In [12]:
# Positive dataset: entries returned by url_positive that pass filter_entry_positive.
print("--- Generazione del dataset positivo ---")
get_dataset(
    search_url=url_positive,
    filter_function=filter_entry_positive,
    extract_function=extract_fields_positive,
    columns=col_unificate,
    output_file_name="positive_dataset.tsv",
    output_fasta_file_name="positive_dataset.fasta",
)

# Negative dataset: every entry returned by url_negative (the filter accepts all).
print("--- Generazione del dataset negativo ---")
get_dataset(
    search_url=url_negative,
    filter_function=filter_entry_negative,
    extract_function=extract_fields_negative,
    columns=col_unificate,
    output_file_name="negative_dataset.tsv",
    output_fasta_file_name="negative_dataset.fasta",
)


--- Generazione del dataset positivo ---
Total: 2949, filtered: 2932
--- Generazione del dataset negativo ---
Total: 20615, filtered: 20615


In [13]:
import pandas as pd

# Reload the two tab-separated tables produced by the get_dataset calls.
# read_table uses a tab separator by default.
negative_set = pd.read_table("negative_dataset.tsv")
positive_set = pd.read_table("positive_dataset.tsv")

# Row counts (header excluded), positive first.
print(f"{positive_set.shape[0]} righe nel dataset positivo.")
print(f"{negative_set.shape[0]} righe nel dataset negativo.")

# Preview the first rows of each table.
print("\n--- Esempio di dati POSITIVI ---")
print(positive_set.head())

print("\n--- Esempio di dati NEGATIVI ---")
print(negative_set.head())

2932 righe nel dataset positivo.
20615 righe nel dataset negativo.

--- Esempio di dati POSITIVI ---
  Accession      Organism  Kingdom  Sequence length  SP cleavage  \
0    O00300  Homo sapiens  Metazoa              401           21   
1    O00478  Homo sapiens  Metazoa              584           29   
2    O00748  Homo sapiens  Metazoa              559           26   
3    O14763  Homo sapiens  Metazoa              440           55   
4    O43155  Homo sapiens  Metazoa              660           35   

  transmembrane_term  
0               Null  
1               Null  
2               Null  
3               Null  
4               Null  

--- Esempio di dati NEGATIVI ---
  Accession      Organism  Kingdom  Sequence length  SP cleavage  \
0    Q6YN16  Homo sapiens  Metazoa              418        False   
1    Q9UHK6  Homo sapiens  Metazoa              382        False   
2    A3KMH1  Homo sapiens  Metazoa             1905        False   
3    O75874  Homo sapiens  Metazoa            