## ProteinGym Processing Script
- The goal of this script is to extract metadata for all proteins found within ProteinGym
- This metadata is scraped from UniProt (covers 186/187 proteins). 

In [1]:
import pandas as pd
import requests
import time
import json

1. Load the DataFrame describing the DMS assays

In [2]:
# Reference table describing the DMS assays, read from a local copy of
# https://raw.githubusercontent.com/OATML-Markslab/ProteinGym/main/reference_files/DMS_substitutions.csv
DMS_summary = pd.read_csv(filepath_or_buffer='../data/DMS_substitutions.csv')

2. We require "Accession" numbers to access the UniProt API - let's map our UniProt IDs to these

In [3]:
def get_uniprot_assessions(uniprot_ids, poll_interval=1, max_wait=300, request_timeout=30):
    """Map UniProt identifiers through the UniProt ID-mapping REST service.

    A mapping job ('UniProtKB_AC-ID' -> 'UniProtKB') is submitted for all
    identifiers at once, then the job's stream endpoint is polled until it
    answers with HTTP 200.

    Parameters
    ----------
    uniprot_ids : iterable of str
        Identifiers to map; they are sent as one comma-separated string.
    poll_interval : float, default 1
        Seconds to sleep between polling attempts.
    max_wait : float, default 300
        Maximum number of seconds to keep polling before giving up.
    request_timeout : float, default 30
        Timeout in seconds passed to each individual HTTP request.

    Returns
    -------
    dict or None
        ``{item['from']: item['to']}`` built from the ``results`` list of the
        response. An empty dict when ``uniprot_ids`` is empty (no request is
        made). ``None`` when the job could not be submitted; the failure is
        printed.

    Raises
    ------
    TimeoutError
        If the results endpoint has not returned HTTP 200 within ``max_wait``
        seconds. Previously this situation polled forever.
    """
    ids = list(uniprot_ids)
    if len(ids) == 0:
        # Nothing to map; avoid submitting an empty job.
        return {}

    # URL for the UniProt ID mapping service
    submit_url = "https://rest.uniprot.org/idmapping/run"

    # Prepare the data for the form submission
    form_data = {
        'from': 'UniProtKB_AC-ID',
        'to': 'UniProtKB',
        'ids': ",".join(ids)  # joins the list of UniProt IDs into a single string separated by commas
    }

    # Send the request
    response = requests.post(submit_url, data=form_data, timeout=request_timeout)

    # Extract the job ID; a 200 response without one is also treated as a failed submission,
    # otherwise the loop below would poll ".../stream/None" indefinitely.
    job_id = response.json().get('jobId') if response.status_code == 200 else None
    if not job_id:
        # Handle potential errors (simple print statement here, could be logging or raising an exception)
        print("Failed to submit job:", response.status_code, response.text)
        return None

    results_url = f"https://rest.uniprot.org/idmapping/stream/{job_id}"
    deadline = time.monotonic() + max_wait

    while True:
        # Send the GET request
        response = requests.get(results_url, timeout=request_timeout)

        # A 200 response carries the finished mapping
        if response.status_code == 200:
            raw_json = json.loads(response.text)
            return {item['from']: item['to'] for item in raw_json['results']}

        # Bound the wait so a failed or stuck job cannot hang the notebook.
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"UniProt ID mapping job {job_id} did not return results within {max_wait} seconds "
                f"(last status code: {response.status_code})"
            )

        time.sleep(poll_interval)

# Look up each distinct UniProt ID once, then annotate every assay row with the result;
# IDs that are absent from the returned mapping are recorded as pd.NA.
uniprot_ids = DMS_summary['UniProt_ID'].unique()
uniprot_mapping = get_uniprot_assessions(uniprot_ids)
DMS_summary['UniProt_Assession_id'] = DMS_summary['UniProt_ID'].map(
    lambda entry_name: uniprot_mapping.get(entry_name, pd.NA)
)
DMS_summary

Unnamed: 0,DMS_id,DMS_filename,UniProt_ID,taxon,source_organism,target_seq,seq_len,includes_multiple_mutants,DMS_total_number_mutants,DMS_number_single_mutants,...,raw_DMS_phenotype_name,raw_DMS_directionality,raw_DMS_mutant_column,weight_file_name,pdb_file,pdb_range,ProteinGym_version,raw_mut_offset,coarse_selection_type,UniProt_Assession_id
0,A0A140D2T1_ZIKV_Sourisseau_2019,A0A140D2T1_ZIKV_Sourisseau_2019.csv,A0A140D2T1_ZIKV,Virus,Zika virus,MKNPKKKSGGFRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...,3423,False,9576,9576,...,effect,1,mutant,A0A140D2T1_ZIKV_theta_0.01.npy,A0A140D2T1_ZIKV.pdb,291-794,0.1,,OrganismalFitness,A0A140D2T1
1,A0A192B1T2_9HIV1_Haddox_2018,A0A192B1T2_9HIV1_Haddox_2018.csv,A0A192B1T2_9HIV1,Virus,HIV,MRVKGIQMNSQHLLRWGIMILGMIMICSVAGNLWVTVYYGVPVWKD...,852,False,12577,12577,...,fitness,1,mutant,A0A192B1T2_9HIV1_theta_0.01.npy,A0A192B1T2_9HIV1.pdb,1-852,0.1,,OrganismalFitness,A0A192B1T2
2,A0A1I9GEU1_NEIME_Kennouche_2019,A0A1I9GEU1_NEIME_Kennouche_2019.csv,A0A1I9GEU1_NEIME,Prokaryote,Neisseria meningitidis,FTLIELMIVIAIVGILAAVALPAYQDYTARAQVSEAILLAEGQKSA...,161,False,922,922,...,piliation_log2_ratio,1,mutants,A0A1I9GEU1_NEIME_theta_0.2.npy,A0A1I9GEU1_NEIME.pdb,1-161,0.1,,Activity,A0A1I9GEU1
3,A0A247D711_LISMN_Stadelmann_2021,A0A247D711_LISMN_Stadelmann_2021.csv,A0A247D711_LISMN,Eukaryote,Listeria monocytogenes,MNINDLIREIKNKDYTVKLSGTDSNSITQLIIRVNNDGNEYVISES...,87,False,1653,1653,...,mean_prediction,1,mutant,A0A247D711_LISMN_b03_theta_0.2.npy,A0A247D711_LISMN.pdb,1-87,1.0,,Activity,A0A247D711
4,A0A2Z5U3Z0_9INFA_Doud_2016,A0A2Z5U3Z0_9INFA_Doud_2016.csv,A0A2Z5U3Z0_9INFA,Virus,influenza H1N1,MKAKLLVLLYAFVATDADTICIGYHANNSTDTVDTILEKNVAVTHS...,565,False,10715,10715,...,transformed_pref,1,mutant,A0A2Z5U3Z0_9INFA_theta_0.01.npy,A0A2Z5U3Z0_9INFA.pdb,1-565,0.1,,OrganismalFitness,A0A2Z5U3Z0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,VKOR1_HUMAN_Chiasson_2020_activity,VKOR1_HUMAN_Chiasson_2020_activity.csv,VKOR1_HUMAN,Human,Homo sapiens,MGSTWGSPGWVRLALCLTGLVLSLYALHVKAARARDRDYRALCDVG...,163,False,697,697,...,activity_score,1,variant,VKOR1_HUMAN_theta_0.2.npy,VKOR1_HUMAN.pdb,1-163,0.1,,Activity,Q9BQB6
213,VRPI_BPT7_Tsuboyama_2023_2WNM,VRPI_BPT7_Tsuboyama_2023_2WNM.csv,VRPI_BPT7,Virus,Escherichia phage,SLSVDNKKFWATVESSEHSFEVPIYAETLDEALELAEWQYVPAGFE...,56,False,1047,1047,...,ddG_ML_float,1,mut_type,VRPI_BPT7_theta0.01_2023-08-07_b02.npy,VRPI_BPT7.pdb,1-56,1.0,,Stability,P03704
214,YAIA_ECOLI_Tsuboyama_2023_2KVT,YAIA_ECOLI_Tsuboyama_2023_2KVT.csv,YAIA_ECOLI,Prokaryote,Escherichia coli,PREAYIVTIEKGKPGQTVTWYQLRADHPKPDSLISEHPTAQEAMDA...,52,True,1890,928,...,ddG_ML_float,1,mut_type,YAIA_ECOLI_theta0.2_2023-08-07_b03.npy,YAIA_ECOLI.pdb,1-52,1.0,,Stability,P0AAN5
215,YAP1_HUMAN_Araya_2012,YAP1_HUMAN_Araya_2012.csv,YAP1_HUMAN,Human,Homo sapiens,MDPGQQPPPQPAPQGQGQPPSQPPQGQGPPSGPGQPAPAATQAAPQ...,504,True,10075,362,...,W,1,mutant,YAP1_HUMAN_theta_0.2.npy,YAP1_HUMAN.pdb,1-504,0.1,,Binding,P46937


3. Now we can explode our data into one row per substitution found in the DMS assays:

In [4]:
def process_DMS(row):
    """Collect the mutant strings stored in one assay's substitution file.

    The CSV named by ``row['DMS_filename']`` is read from
    ``../data/DMS_ProteinGym_substitutions/`` and its ``mutant`` column is
    turned into a list.

    Returns a ``pd.Series`` with two entries: ``UniProt_Assession_id`` (copied
    from ``row``) and ``mutation`` (the list of mutant strings).
    """
    # Per-assay files come from https://marks.hms.harvard.edu/proteingym/DMS_ProteinGym_substitutions.zip
    assay_path = f"../data/DMS_ProteinGym_substitutions/{row['DMS_filename']}"
    assay_table = pd.read_csv(assay_path)
    mutants = assay_table['mutant'].tolist()

    record = {
        'UniProt_Assession_id': row['UniProt_Assession_id'],
        'mutation': mutants,
    }
    return pd.Series(record)

# One row per (assay, mutant string): process_DMS returns a list of mutants per assay,
# and explode() unrolls that list while repeating the assay's index label.
substitution_df = DMS_summary.apply(process_DMS, axis=1).explode('mutation')

# Keep only mutant strings without ':' (the original filter). na=True also drops the NaN
# rows that explode() emits for assays with no mutants, which previously broke the `~` mask.
is_excluded = substitution_df['mutation'].str.contains(':', regex=False, na=True)

# Parse "<REF><POS><ALT>" (e.g. "I291A") with vectorised string ops. Building a new frame
# via assign()/drop() avoids assigning columns onto a filtered slice and the in-place drop.
substitution_df = (
    substitution_df.loc[~is_excluded]
    .assign(
        POS=lambda df: df['mutation'].str[1:-1].astype('int64'),
        REF=lambda df: df['mutation'].str[0],
        ALT=lambda df: df['mutation'].str[-1],
    )
    .drop(columns='mutation')
)


In [5]:
# Display the per-substitution table (UniProt_Assession_id, POS, REF, ALT).
substitution_df

Unnamed: 0,UniProt_Assession_id,POS,REF,ALT
0,A0A140D2T1,291,I,A
0,A0A140D2T1,291,I,Y
0,A0A140D2T1,291,I,W
0,A0A140D2T1,291,I,V
0,A0A140D2T1,291,I,T
...,...,...,...,...
216,O31818,37,Y,R
216,O31818,37,Y,S
216,O31818,37,Y,T
216,O31818,37,Y,V
