In [1]:
import requests
from requests.adapters import HTTPAdapter, Retry
import json
import re
import sys
import pandas as pd

In [2]:
# Search URL for the positive set. The query filters on fragment:false,
# taxonomy_id:2759, length:[40 TO *], reviewed:true, existence:1 and
# ft_signal_exp:*, and asks for JSON pages of 500 entries.
url = "https://rest.uniprot.org/uniprotkb/search?format=json&query=%28%28fragment%3Afalse%29+AND+%28taxonomy_id%3A2759%29+AND+%28length%3A%5B40+TO+*%5D%29+AND+%28reviewed%3Atrue%29+AND+%28existence%3A1%29+AND+%28ft_signal_exp%3A*%29%29&size=500"

# Shared HTTP session that retries transient server errors with back-off, so a
# single 5xx response does not abort a download spanning many pages.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

# First page of results, kept under its original name for compatibility.
# NOTE(review): get_batch(url) fetches this page again and no cell shown here
# reads `response` before it is rebound, so this request looks redundant.
response = session.get(url)

In [3]:
def get_next_link(headers):
    """Return the URL of the next result page advertised in a ``Link`` header.

    Args:
        headers: Mapping of HTTP response headers (e.g. ``response.headers``).

    Returns:
        The target of the first link marked ``rel="next"``, or ``None`` when
        the header is missing or contains no such link.
    """
    if "Link" not in headers:
        return None
    # Search the whole header instead of splitting it on ",": a URL may itself
    # contain commas, and splitting would cut it in half and lose the match.
    # "[^<>]+" keeps the capture inside a single <...> target, so an earlier
    # link (e.g. rel="prev") can never be swallowed into the result.
    match = re.search(r'<([^<>]+)>\s*;\s*rel="next"', headers["Link"])
    return match.group(1) if match else None

def get_batch(batch_url):
    """Yield every page of a paginated search, starting at ``batch_url``.

    Each page is fetched through the module-level ``session``; an HTTP error
    status raises via ``raise_for_status()``. Iteration ends when
    ``get_next_link`` finds no further page in the response headers.

    Yields:
        ``(response, total)`` where ``total`` is the value of the
        ``x-total-results`` response header, or ``None`` if it is absent.
    """
    next_url = batch_url
    while next_url:
        page = session.get(next_url)
        page.raise_for_status()
        yield page, page.headers.get("x-total-results")
        # Resolved only after the consumer resumes the generator.
        next_url = get_next_link(page.headers)

def get_kingdom(entry):
    """Classify an entry by the kingdom name found in its organism lineage.

    Args:
        entry: Entry dict providing ``entry["organism"]["lineage"]``.

    Returns:
        ``"Fungi"``, ``"Viridiplantae"`` or ``"Metazoa"`` (checked in that
        order), or ``"Other"`` when none of them is in the lineage.
    """
    lineage = entry["organism"]["lineage"]
    # Order matters: the first kingdom present in the lineage wins.
    for kingdom in ("Fungi", "Viridiplantae", "Metazoa"):
        if kingdom in lineage:
            return kingdom
    return "Other"

In [4]:
# Collect the positive entries page by page.
# An entry is kept when its first feature ends after position 13 and has an
# empty description.
# NOTE(review): only features[0] is inspected; presumably that is the signal
# peptide for every entry returned by this query - confirm.
ok_entries = []
for response, total in get_batch(url):
    data = response.json()
    for entry in data['results']:
        first_feature = entry['features'][0]
        feature_end = first_feature['location']['end']['value']
        if feature_end > 13 and first_feature['description'] == '':
            ok_entries.append(entry)

In [5]:
# Write the accepted positive entries to a tab-separated table and a FASTA file.
# TSV columns: accession, organism name, kingdom, sequence length, end position
# of the entry's first feature.
# The encoding is explicit so the table matches what pandas.read_csv decodes by
# default when the file is loaded again.
with open("positive.tsv", "w", encoding="utf-8") as output_file, \
        open("positive.fasta", "w", encoding="utf-8") as fasta_outputfile:
    for entry in ok_entries:
        accession = entry["primaryAccession"]
        organism = entry["organism"]["scientificName"]
        fields = (
            accession,
            organism,
            get_kingdom(entry),
            entry["sequence"]["length"],
            entry["features"][0]["location"]["end"]["value"],
        )
        # Join on a tab rather than embedding '\t' inside an f-string
        # expression: a backslash there is a SyntaxError before Python 3.12.
        output_file.write("\t".join(str(field) for field in fields) + "\n")

        sequence = entry["sequence"]["value"]
        fasta_outputfile.write(f">{accession} {organism}\n{sequence}\n")

# Negatives

In [7]:
# Search URL for the negative set. The query combines fragment:false,
# taxonomy_id:2759, length:[40 TO *], reviewed:true and existence:1 with
# NOT ft_signal:* and six OR-ed cc_scl_term_exp terms, in JSON pages of 500.
# NOTE(review): the NOT and OR clauses are not grouped in their own
# parentheses, so which filters apply to the OR-ed location terms depends on
# the server's operator precedence - confirm this selects the intended entries.
urlo_neg = "https://rest.uniprot.org/uniprotkb/search?format=json&query=%28%28fragment%3Afalse%29+AND+%28taxonomy_id%3A2759%29+AND+%28length%3A%5B40+TO+*%5D%29+AND+%28reviewed%3Atrue%29+AND+%28existence%3A1%29+NOT+%28ft_signal%3A*%29+OR+%28cc_scl_term_exp%3ASL-0091%29+OR+%28cc_scl_term_exp%3ASL-0191%29+OR+%28cc_scl_term_exp%3ASL-0173%29+OR+%28cc_scl_term_exp%3ASL-0209%29+OR+%28cc_scl_term_exp%3ASL-0204%29+OR+%28cc_scl_term_exp%3ASL-0039%29%29&size=500"

# Fresh HTTP session that retries transient server errors with back-off, so a
# single 5xx response does not abort a download spanning many pages.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

# First page of results, kept under its original name for compatibility.
# NOTE(review): get_batch(urlo_neg) fetches this page again and no cell shown
# here reads `response_neg`, so this request looks redundant.
response_neg = session.get(urlo_neg)

In [42]:
def _has_early_helical_tm(protein):
    """Tell whether an entry has a helical transmembrane feature near its start.

    Returns True when any feature has type 'Transmembrane', a description
    containing "Helical", and a start position of 90 or lower. A missing
    'features' list counts as no features, and a missing start value is
    treated as infinitely far, i.e. never a hit.
    """
    return any(
        feat.get('type') == 'Transmembrane'
        and re.search("Helical", feat.get("description", ""))
        and feat.get("location", {}).get("start", {}).get("value", float('inf')) <= 90
        for feat in protein.get('features', [])
    )


# Collect every negative entry together with its transmembrane flag, so the
# boolean always travels with the entry it was computed from.
neg_entries = []
for response, total in get_batch(urlo_neg):
    data_neg = response.json()
    for entry_neg in data_neg['results']:
        tm_present = _has_early_helical_tm(entry_neg)
        neg_entries.append((entry_neg, tm_present))

In [45]:
# Write the negative entries to a tab-separated table and a FASTA file.
# TSV columns: accession, organism name, kingdom, sequence length, and the
# transmembrane flag stored alongside each entry in neg_entries.
# The encoding is explicit so the table matches what pandas.read_csv decodes by
# default when the file is loaded again.
with open("negative.tsv", "w", encoding="utf-8") as output_file, \
        open("negative.fasta", "w", encoding="utf-8") as fasta_outputfile:
    for neg_entry, has_tm in neg_entries:
        accession = neg_entry["primaryAccession"]
        organism = neg_entry["organism"]["scientificName"]
        fields = (
            accession,
            organism,
            get_kingdom(neg_entry),
            neg_entry["sequence"]["length"],
            has_tm,
        )
        # Join on a tab rather than embedding '\t' inside an f-string
        # expression: a backslash there is a SyntaxError before Python 3.12.
        output_file.write("\t".join(str(field) for field in fields) + "\n")

        sequence = neg_entry["sequence"]["value"]
        fasta_outputfile.write(f">{accession} {organism}\n{sequence}\n")

In [46]:
# Reload both tables from disk. Neither file has a header row, so column names
# are supplied here; the last column of negative.tsv is taken to be the
# tm_present boolean.
tsv_options = dict(sep="\t", header=None)
negative_set = pd.read_csv(
    "negative.tsv",
    names=['Accession', 'Organism', 'Kingdom', 'Length', 'TM_Present'],
    **tsv_options,
)
positive_set = pd.read_csv(
    "positive.tsv",
    names=['Accession', 'Organism', 'Kingdom', 'Length', 'Signal_Peptide_End'],
    **tsv_options,
)

# Report, one per line: positive rows, negative rows, and negative rows whose
# 'TM_Present' value is True.
tm_mask = negative_set['TM_Present'] == True
for row_count in (len(positive_set), len(negative_set), len(negative_set[tm_mask])):
    print(row_count)

2932
20615
2465


{'type': 'Chain', 'location': {'start': {'value': 30, 'modifier': 'EXACT'}, 'end': {'value': 215, 'modifier': 'EXACT'}}, 'description': 'CUE domain-containing protein 4, mitochondrial', 'featureId': 'PRO_0000310344'}
{'type': 'Region', 'location': {'start': {'value': 84, 'modifier': 'EXACT'}, 'end': {'value': 108, 'modifier': 'EXACT'}}, 'description': 'Disordered', 'evidences': [{'evidenceCode': 'ECO:0000256', 'source': 'SAM', 'id': 'MobiDB-lite'}]}
{'type': 'Region', 'location': {'start': {'value': 20, 'modifier': 'EXACT'}, 'end': {'value': 71, 'modifier': 'EXACT'}}, 'description': 'Disordered', 'evidences': [{'evidenceCode': 'ECO:0000256', 'source': 'SAM', 'id': 'MobiDB-lite'}]}
{'type': 'Modified residue', 'location': {'start': {'value': 200, 'modifier': 'EXACT'}, 'end': {'value': 200, 'modifier': 'EXACT'}}, 'description': 'Phosphoserine', 'evidences': [{'evidenceCode': 'ECO:0000269', 'source': 'PubMed', 'id': '18257517'}]}
{'type': 'Region', 'location': {'start': {'value': 166, 'mo

IndexError: list index out of range