## ProteinGym Processing Script
- The goal of this script is to extract metadata for all proteins found within ProteinGym
- This metadata is scraped from UniProt (covers 186/187 proteins). 

In [1]:
import pandas as pd
import requests
import time
import json

1. Load DF describing DMS assays

In [2]:
# https://raw.githubusercontent.com/OATML-Markslab/ProteinGym/main/reference_files/DMS_substitutions.csv
DMS_summary = pd.read_csv('../data/DMS_substitutions.csv')

2. We require "Accession" numbers to access the uniprot API - let's map our uniprot ID's to these

In [4]:
def get_uniprot_accessions(uniprot_ids):
    # URL for the UniProt ID mapping service
    url = "https://rest.uniprot.org/idmapping/run"
    
    # Prepare the data for the form submission
    form_data = {
        'from': 'UniProtKB_AC-ID',
        'to': 'UniProtKB',
        'ids': ",".join(uniprot_ids)  # joins the list of UniProt IDs into a single string separated by commas
    }
    
    # Send the request
    response = requests.post(url, data=form_data)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Extract the job ID from the response
        job_id = response.json().get('jobId')
    else:
        # Handle potential errors (simple print statement here, could be logging or raising an exception)
        print("Failed to submit job:", response.status_code, response.text)
        return None
    
    while True:
        url = f"https://rest.uniprot.org/idmapping/stream/{job_id}"

        # Send the GET request
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # You could return the response text directly or process it as needed
            raw_json = json.loads(response.text)
            mapping = {item['from']: item['to'] for item in raw_json['results']}

            return mapping

        time.sleep(1)

uniprot_ids = DMS_summary['UniProt_ID'].unique()
uniprot_mapping = get_uniprot_accessions(uniprot_ids)
DMS_summary['UniProt_Accession_id'] = DMS_summary['UniProt_ID'].apply(lambda x: uniprot_mapping.get(x, pd.NA))

In [5]:
def get_protein_features(protein_id):

    requestURL = f"https://www.ebi.ac.uk/proteins/api/features/{protein_id}"

    r = requests.get(requestURL, headers={ "Accept" : "application/json"})

    if not r.ok:
        return None

    features = json.loads(r.text)['features']

    #features = [feature for feature in features if feature['category'] == 'PTM' or feature['type'] == 'BINDING']

    return features

DMS_summary['uniprot_features'] = DMS_summary['UniProt_Accession_id'].apply(lambda x: get_protein_features(x))

3. Now we can process the uniprot features:

In [None]:
def process_DMS(row):

    # Initialize BINDING and PTM columns with default False values
    row['BINDING'] = []
    row['PTM'] = []
    row['HELIX'] = []
    row['STRAND'] = []
    row['TURN'] = []
    row['DISORDERED'] = []

    if row['uniprot_features'] is None:
        return row

    # Check each mutation if it falls within any binding or PTM site
    for feature in row['uniprot_features']:
        feature_type = feature['type']
        feature_category = feature['category']

        try:
            feature_begin = int(feature['begin'])
            feature_end = int(feature['end'])
        except:
            continue

        if feature_type == 'BINDING':
            row['BINDING'].append([feature_begin, feature_end])
        elif feature_category == 'PTM':
            row['PTM'].append([feature_begin, feature_end])
        elif feature_type == 'HELIX':
            row['HELIX'].append([feature_begin, feature_end])
        elif feature_type == 'STRAND':
            row['STRAND'].append([feature_begin, feature_end])
        elif feature_type == 'TURN':
            row['TURN'].append([feature_begin, feature_end])
        elif feature_type == 'REGION' and feature.get('description') == 'Disordered':
            row['DISORDERED'].append([feature_begin, feature_end])        

    return row

DMS_summary = DMS_summary.apply(process_DMS, axis=1)


In [None]:
for feat in ['BINDING', 'PTM', 'HELIX', 'STRAND', 'TURN', 'DISORDERED']:
    print(feat, DMS_summary[feat].apply(
        lambda ranges: sum([x[1] - x[0] for x in ranges])
    ).sum())

BINDING 551
PTM 12022
HELIX 20098
STRAND 13239
TURN 1773
DISORDERED 13362


In [None]:
DMS_summary = DMS_summary[['UniProt_ID', 'UniProt_Accession_id', 'target_seq', 'taxon', 'selection_type', 'BINDING', 'PTM', 'HELIX', 'STRAND', 'TURN', 'DISORDERED']]

In [None]:
DMS_summary.to_csv('../data/DMS_substitutions_with_features.csv', index=False)

Unnamed: 0,UniProt_ID,UniProt_Accession_id,taxon,selection_type,BINDING,PTM,HELIX,STRAND,TURN,DISORDERED
0,A0A140D2T1_ZIKV,A0A140D2T1,Virus,Growth,"[[2576, 2576], [2606, 2606], [2607, 2607], [26...","[[293, 320], [350, 406], [364, 395], [382, 411...",[],[],[],[]
1,A0A192B1T2_9HIV1,A0A192B1T2,Virus,Growth,[],"[[753, 753], [833, 833], [53, 73], [212, 241],...",[],[],[],"[[705, 729]]"
2,A0A1I9GEU1_NEIME,A0A1I9GEU1,Prokaryote,,[],"[[120, 154]]",[],[],[],"[[142, 161]]"
3,A0A247D711_LISMN,A0A247D711,Eukaryote,Flow cytometry,[],[],[],[],[],[]
4,A0A2Z5U3Z0_9INFA,A0A2Z5U3Z0,Virus,Growth,[],"[[554, 554], [561, 561], [564, 564], [21, 480]...",[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
212,VKOR1_HUMAN,Q9BQB6,Human,enzymatic activity,"[[80, 80], [135, 135], [139, 139], [139, 139]]","[[43, 51], [132, 135]]","[[10, 15], [19, 35], [51, 56], [58, 60], [62, ...","[[44, 46]]",[],[]
213,VRPI_BPT7,P03704,Virus,cDNA display proteolysis,[],[],"[[36, 45], [48, 50]]","[[13, 21], [26, 31], [53, 60]]",[],[]
214,YAIA_ECOLI,P0AAN5,Prokaryote,cDNA display proteolysis,[],[],"[[47, 58]]","[[11, 19], [21, 35], [39, 46]]",[],"[[35, 63]]"
215,YAP1_HUMAN,P46937,Human,Binding,[],"[[61, 61], [63, 63], [105, 105], [109, 109], [...","[[61, 73], [75, 77], [86, 88], [93, 96], [202,...","[[52, 54], [169, 171], [177, 181], [183, 185],...","[[192, 195], [242, 244], [251, 254]]","[[1, 59], [91, 114], [133, 158], [275, 309], [..."
