# Configuração e Dependências
Importar as bibliotecas necessárias: requests, json, time e traceback.

Há outras possibilidades de exploração dos dados, incluindo a visualização das estruturas, tudo dentro de um notebook, mas elas não serão abordadas no momento.

In [56]:
# Importando as bibliotecas necessárias
import csv
import json
import time
import traceback  # For better debugging

import requests

# Certifique-se de que as bibliotecas necessárias estão instaladas; nesse caso, apenas a requests, já que json, time e traceback vêm instaladas por padrão
# pip install requests

# Recuperação de Dados da API UniProt
Funções para interagir com a API REST do UniProt, considerando os limites de requisições/requests e analisando os resultados.

In [59]:
# Function to fetch and parse data from UniProt API, minding rate limits
def get_uniprot_data(uniprot_accession: str, retries: int = 3, timeout: float = 30.0) -> dict:
    """Retrieve the EBI Proteins API entry for a UniProt primary accession.

    Network errors, HTTP 429 and HTTP 5xx responses are retried up to
    ``retries`` times. Other 4xx responses (e.g. 404 for an unknown
    accession) are not retried because repeating the same request would
    give the same answer.

    Args:
        uniprot_accession (str): UniProt accession code.
        retries (int): Maximum number of request attempts.
        timeout (float): Per-request timeout in seconds passed to ``requests``.

    Returns:
        dict: Parsed JSON response, or an empty dict when the entry could not
        be retrieved or its body could not be parsed.
    """
    url = f"https://www.ebi.ac.uk/proteins/api/proteins/{uniprot_accession}?format=json"
    default_delay = 5
    for attempt in range(1, retries + 1):
        delay = default_delay
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request for {uniprot_accession} failed: {exc}")
        else:
            if response.ok:
                try:
                    return response.json()
                except ValueError:
                    # Every JSON decode error raised by requests is a ValueError.
                    print(f"Failed to parse JSON for {uniprot_accession}")
                    return {}
            if response.status_code == 429:  # Too Many Requests
                try:
                    delay = max(0, int(response.headers.get("Retry-After", default_delay)))
                except (TypeError, ValueError):
                    # Retry-After may be an HTTP-date instead of a number of seconds.
                    delay = default_delay
                print(f"Rate limit hit. Retrying after {delay} seconds...")
            else:
                print(f"Failed to retrieve data for {uniprot_accession}, status code: {response.status_code}")
                if 400 <= response.status_code < 500:
                    # Client errors will not succeed on a retry.
                    return {}
        if attempt < retries:
            # Only wait when another attempt will actually follow.
            time.sleep(delay)
    print(f"Failed to retrieve data for {uniprot_accession} after {retries} attempts.")
    return {}

# Function to retrieve sequence features, taxonomic lineage, domain/family annotations, and evidence codes
def parse_protein_features(data: dict) -> dict:
    """Sort the annotations of a UniProt entry into a few named buckets.

    Args:
        data (dict): JSON response from UniProt API.

    Returns:
        dict: Always contains the lists ``sequence_positional_features``,
        ``domain``, ``family`` and ``protein_existence``; additionally contains
        ``taxonomic_lineage`` when the entry carries an organism lineage.
        An empty dict is returned when ``data`` is not a dict.
    """
    if not isinstance(data, dict):
        print("Invalid data type received: {}".format(type(data)))
        return {}

    # Feature types that annotate specific sequence positions.
    positional_types = (
        "DISULFID", "CROSSLNK", "CARBOHYD", "LIPID", "MOD_RES",
        "ACT_SITE", "BINDING", "DNA_BIND", "SITE", "MUTAGEN",
    )
    # Checked in order; the first group containing the feature type wins.
    routing = (
        (positional_types, "sequence_positional_features"),
        (("DOMAIN",), "domain"),
        (("FAMILY",), "family"),
    )

    parsed = {
        "sequence_positional_features": [],
        "domain": [],
        "family": [],
        "protein_existence": [],
    }

    for entry in data.get("features", ()):
        kind = entry.get("type")
        bucket = next((key for kinds, key in routing if kind in kinds), None)
        if bucket is not None:
            parsed[bucket].append(entry)

    if "organism" in data:
        organism = data["organism"]
        if "lineage" in organism:
            parsed["taxonomic_lineage"] = organism["lineage"]

    if "proteinExistence" in data:
        parsed["protein_existence"].append(data["proteinExistence"])

    return parsed

# Example usage: fetch one entry and pretty-print its parsed features
uniprot_accession = "P00520"  # Tyrosine-protein kinase ABL1_MOUSE
uniprot_data = get_uniprot_data(uniprot_accession)
if not uniprot_data:
    # An empty result means the request or the JSON parsing failed.
    print(f"Could not retrieve data for {uniprot_accession}")
else:
    protein_features = parse_protein_features(uniprot_data)
    print(json.dumps(protein_features, indent=2))

{
  "sequence_positional_features": [
    {
      "type": "ACT_SITE",
      "category": "DOMAINS_AND_SITES",
      "description": "Proton acceptor",
      "begin": "363",
      "end": "363",
      "molecule": "",
      "evidences": [
        {
          "code": "ECO:0000255",
          "source": {
            "name": "PROSITE-ProRule",
            "id": "PRU00159",
            "url": "https://prosite.expasy.org/unirule/PRU00159"
          }
        },
        {
          "code": "ECO:0000255",
          "source": {
            "name": "PROSITE-ProRule",
            "id": "PRU10028",
            "url": "https://prosite.expasy.org/unirule/PRU10028"
          }
        }
      ]
    },
    {
      "type": "BINDING",
      "category": "DOMAINS_AND_SITES",
      "description": "",
      "begin": "248",
      "end": "256",
      "molecule": "",
      "ligand": {
        "name": "ATP",
        "dbReference": {
          "name": "ChEBI",
          "id": "CHEBI:30616"
        }
      },
      "

# Carregar lista de UniProt primary accessions para buscas em lote/bulk
Podemos carregar um arquivo JSON contendo accessions de interesse de um domínio Pfam, útil para consultas em lote e subsequente escrita dos resultados em um arquivo.

In [60]:
def create_pfam_json(pfam_id: str, output_file: str = "pfam_proteins.json") -> None:
    """Create or update a JSON file mapping Pfam IDs to protein accessions and names.

    Entries already stored in ``output_file`` are kept; only the entry for
    ``pfam_id`` is added or replaced. The protein data comes from a small
    built-in demonstration table, so IDs missing from that table leave the
    stored entries unchanged (the file is still created if it did not exist).

    Args:
        pfam_id (str): Pfam family identifier to store, e.g. ``"PF08919"``.
        output_file (str): Path of the JSON file to create or update.
    """
    # Example data for PFam ID PF08919, one could get such data more naturally
    # by other means, this is for demonstration purposes.
    demo_data = {
        "PF08919": {
            "P00520": "ABL1_MOUSE",
            "P00519": "ABL1_HUMAN",
            "P42684": "ABL2_HUMAN",
        },
    }

    # Start from the current file content so that an update does not drop
    # other families; a missing or unreadable file simply starts empty.
    try:
        with open(output_file, encoding="utf-8") as f:
            pfam_data = json.load(f)
    except (FileNotFoundError, ValueError):
        pfam_data = {}
    if not isinstance(pfam_data, dict):
        pfam_data = {}

    if pfam_id in demo_data:
        pfam_data[pfam_id] = dict(demo_data[pfam_id])
    else:
        print(f"No demonstration data available for {pfam_id}; existing entries left unchanged.")

    # Write to JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(pfam_data, f, indent=2)

def get_accessions(pfam_id: str, json_file: str = "pfam_proteins.json") -> list:
    """Return the protein accessions stored for a Pfam ID in a JSON file.

    The file is expected to map Pfam IDs to ``{accession: name}`` dicts.
    An unknown Pfam ID yields an empty list; a missing file is reported on
    stdout and also yields an empty list.
    """
    try:
        with open(json_file) as handle:
            families = json.load(handle)
    except FileNotFoundError:
        print(f"Error: File not found: {json_file}")
        return []

    members = families.get(pfam_id, {})
    return list(members.keys())

def _feature_to_row(acc: str, feat: dict) -> list:
    """Flatten one positional feature dict into the TSV column order."""
    # "ligand" and "dbReference" are optional and may also be present as null.
    ligand = feat.get("ligand") or {}
    db_ref = ligand.get("dbReference") or {}
    ligand_db = f"{db_ref.get('name', '')}:{db_ref.get('id', '')}" if db_ref else ""

    # Evidences without a code would only add empty segments to the column.
    codes = [ev.get("code") for ev in feat.get("evidences") or []]
    evidence_codes = "|".join(str(code) for code in codes if code)

    return [
        acc,
        feat.get("type", ""),
        feat.get("category", ""),
        feat.get("begin", ""),
        feat.get("end", ""),
        feat.get("description", ""),
        feat.get("molecule", ""),
        ligand.get("name", ""),
        ligand_db,
        evidence_codes,
    ]


def write_features_tsv(accession_list: list, output_file: str = "protein_features.tsv") -> None:
    """Write the positional features of several UniProt entries to a TSV file.

    One row is written per feature found under ``sequence_positional_features``
    in the output of ``parse_protein_features``. Accessions that cannot be
    fetched or parsed are reported on stdout and skipped, so one bad entry
    never aborts the whole batch.

    Args:
        accession_list (list): UniProt accessions to fetch and export.
        output_file (str): Path of the TSV file to (over)write.
    """
    headers = ["Accession", "Type", "Category", "Begin", "End", "Description", "Molecule", "LigandName", "LigandDB", "EvidenceCodes"]

    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        # The csv writer stringifies non-string values and quotes fields that
        # contain tabs or newlines, which a plain join would not do.
        writer = csv.writer(f, delimiter="\t", lineterminator="\n")
        writer.writerow(headers)

        for acc in accession_list:
            try:
                data = get_uniprot_data(acc)
                if not data:
                    print(f"Skipping {acc}: no data retrieved.")
                    continue
                parsed = parse_protein_features(data)
                # Build every row before writing so that a malformed feature
                # cannot leave a partially written accession in the file.
                rows = [
                    _feature_to_row(acc, feat)
                    for feat in parsed.get("sequence_positional_features", [])
                ]
            except Exception as ex:
                # Batch boundary: report the failure and move to the next accession.
                print(f"Error processing {acc}: {ex}")
                continue
            writer.writerows(rows)

# Example usage: the calls below depend on each other and must run in this order.
# 1. Write the demonstration Pfam -> accessions mapping to the default JSON file.
create_pfam_json("PF08919")
# 2. Read back the accessions stored for that Pfam ID.
acc_list = get_accessions("PF08919")
# 3. Fetch each accession and export its positional features to the default TSV file.
write_features_tsv(acc_list)

# Análise de Códigos de Evidência
Podemos separar as anotações/features com códigos de evidência experimental (por exemplo, o ECO:0000269), distinguindo entre anotações experimentais e computacionais.

In [61]:
def _group_features_by_evidence(seq_features: list) -> dict:
    """Map each evidence code to the features citing it, listing each feature once per code."""
    evidence_map = {}
    for feat in seq_features:
        evidences = feat.get("evidences") or []
        # A feature may cite the same code through several sources;
        # dict.fromkeys de-duplicates while keeping first-seen order.
        codes = dict.fromkeys(ev.get("code", "") for ev in evidences)
        for code in codes:
            if code:
                evidence_map.setdefault(code, []).append(feat)
    return evidence_map


def analyze_evidence_codes(accessions: list) -> dict:
    """Group the positional features of each accession by evidence code.

    For every accession the UniProt data is fetched and parsed, and its
    ``sequence_positional_features`` are indexed by the ``code`` of their
    evidences. A feature appears at most once under a given code, even when
    several of its evidences share that code.

    Args:
        accessions (list): UniProt accessions to analyze.

    Returns:
        dict: ``{accession: {evidence_code: [feature, ...]}}``. Accessions
        without valid data or without positional features are reported on
        stdout and left out of the result.
    """
    results = {}
    for acc in accessions:
        # Retrieve UniProt data
        data = get_uniprot_data(acc)

        # Validate we got a non-empty dictionary
        if not isinstance(data, dict) or not data:
            print(f"Skipping {acc}: no valid data returned.")
            continue

        # Parse features
        parsed = parse_protein_features(data)
        if not isinstance(parsed, dict):
            print(f"Skipping {acc}: parse_protein_features() didn't return a dict.")
            continue

        seq_features = parsed.get("sequence_positional_features", [])
        if not isinstance(seq_features, list) or not seq_features:
            print(f"Skipping {acc}: no sequence_positional_features found.")
            continue

        results[acc] = _group_features_by_evidence(seq_features)

    return results

# Example usage
acc_list = get_accessions("PF08919")
evidence_codes_analysis = analyze_evidence_codes(acc_list)

# You can use the function above to easily retrieve features with a given evidence code of particular interest.
# .get() is used because an accession is absent from the result when its data could not be
# retrieved, and a code is absent when no feature cites it; an empty list is printed then.
print(json.dumps(evidence_codes_analysis.get("P00520", {}).get("ECO:0000269", []), indent=2))

# You could also make another variation on the common processing above to separate features by other criteria

[
  {
    "type": "MOD_RES",
    "category": "PTM",
    "description": "Phosphotyrosine; by autocatalysis",
    "begin": "226",
    "end": "226",
    "molecule": "",
    "evidences": [
      {
        "code": "ECO:0000269",
        "source": {
          "name": "PubMed",
          "id": "12748290",
          "url": "http://www.ncbi.nlm.nih.gov/pubmed/12748290",
          "alternativeUrl": "https://europepmc.org/abstract/MED/12748290"
        }
      }
    ]
  },
  {
    "type": "MOD_RES",
    "category": "PTM",
    "description": "Phosphotyrosine; by autocatalysis and SRC-type Tyr-kinases",
    "begin": "393",
    "end": "393",
    "molecule": "",
    "evidences": [
      {
        "code": "ECO:0000269",
        "source": {
          "name": "PubMed",
          "id": "10988075",
          "url": "http://www.ncbi.nlm.nih.gov/pubmed/10988075",
          "alternativeUrl": "https://europepmc.org/abstract/MED/10988075"
        }
      },
      {
        "code": "ECO:0000269",
        "sourc