In [2]:
import ast
import os
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
import requests
from Bio import pairwise2
from icecream import ic
from tqdm import tqdm



In [3]:
# combine a list of csv files and save it to a csv file
def combine_csv_files(file_list, output_file):
    """Stack the CSV files in ``file_list`` row-wise and write them to ``output_file``.

    Args:
        file_list: iterable of paths to CSV files, read in the given order.
        output_file: destination path; the combined frame is written
            without its index column.
    """
    frames = []
    for csv_path in file_list:
        frames.append(pd.read_csv(csv_path))
    stacked = pd.concat(frames)
    stacked.to_csv(output_file, index=False)


# Destination of the merged table, followed by the three PPBS label files
# (train, test, validation) that are concatenated into it.
combined_ppbs_file_path = '/workspace/protein_lm/evaluation/binding_site_prediction/data/combined_ppbs.csv'

ppbs_file_paths = [
    '/workspace/protein_lm/evaluation/binding_site_prediction/data/PPBS/labels_train_filtered.csv',
    '/workspace/protein_lm/evaluation/binding_site_prediction/data/PPBS/labels_test_homology_filtered.csv',
    '/workspace/protein_lm/evaluation/binding_site_prediction/data/PPBS/labels_validation_homology_filtered.csv',
]

combine_csv_files(file_list=ppbs_file_paths, output_file=combined_ppbs_file_path)

In [4]:
# Reload the merged PPBS table. The CSV round-trip stores each `labels`
# entry as text, so parse it back into a Python object with ast.literal_eval.
ppbs_df = pd.read_csv(combined_ppbs_file_path)
ppbs_df['labels'] = ppbs_df['labels'].apply(ast.literal_eval)

ic(len(ppbs_df))
ppbs_df.head()

ic| len(ppbs_df): 15719


Unnamed: 0.1,Unnamed: 0,ID,sequence,residue,labels,fold,PDB ID,Sample weight,weight
0,6838,13gs_0-A,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,[['A' '0']\n ['A' '1']\n ['A' '2']\n ['A' '3']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,13gs_A,0.166667,
1,6839,14gs_0-A,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVYQLPKFQDGDLT...,[['A' '2']\n ['A' '3']\n ['A' '4']\n ['A' '5']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,14gs_A,0.166667,
2,6840,19hc_0-A,AALEPTDSGAPSAIVMFPVGEKPNPKGAAMKPVVFNHLIHEKKIAD...,[['A' '1']\n ['A' '2']\n ['A' '3']\n ['A' '4']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,19hc_A,0.5,
3,6841,1a05_0-A,MKKIAIFAGDGIGPEIVAAARQVLDAVDQAAHLGLRCTEGLVGGAA...,[['A' '1']\n ['A' '2']\n ['A' '3']\n ['A' '4']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,1a05_A,1.0,
4,6842,1a0c_0-A,NKYFENVSKIKYEGPKSNNPYSFKFYNPEEVIDGKTMEEHLRFSIA...,[['A' '1']\n ['A' '2']\n ['A' '3']\n ['A' '4']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",labels_train.txt,1a0c_A,0.5,


In [5]:
# Work on a copy so ppbs_df keeps its original sequences.
ppbs_with_methionine_df = ppbs_df.copy()
# Prepend 'M' to every sequence that does not already start with it.
# str.startswith is used instead of indexing x[0] so that an empty
# sequence is handled (it becomes 'M') rather than raising IndexError.
ppbs_with_methionine_df['sequence'] = ppbs_with_methionine_df['sequence'].apply(
    lambda x: x if x.startswith('M') else f'M{x}'
)
ppbs_with_methionine_df.head()

Unnamed: 0.1,Unnamed: 0,ID,sequence,residue,labels,fold,PDB ID,Sample weight,weight
0,6838,13gs_0-A,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,[['A' '0']\n ['A' '1']\n ['A' '2']\n ['A' '3']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,13gs_A,0.166667,
1,6839,14gs_0-A,MPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVYQLPKFQDGDL...,[['A' '2']\n ['A' '3']\n ['A' '4']\n ['A' '5']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,14gs_A,0.166667,
2,6840,19hc_0-A,MAALEPTDSGAPSAIVMFPVGEKPNPKGAAMKPVVFNHLIHEKKIA...,[['A' '1']\n ['A' '2']\n ['A' '3']\n ['A' '4']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,19hc_A,0.5,
3,6841,1a05_0-A,MKKIAIFAGDGIGPEIVAAARQVLDAVDQAAHLGLRCTEGLVGGAA...,[['A' '1']\n ['A' '2']\n ['A' '3']\n ['A' '4']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",labels_train.txt,1a05_A,1.0,
4,6842,1a0c_0-A,MNKYFENVSKIKYEGPKSNNPYSFKFYNPEEVIDGKTMEEHLRFSI...,[['A' '1']\n ['A' '2']\n ['A' '3']\n ['A' '4']...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",labels_train.txt,1a0c_A,0.5,


In [6]:
# Load the PTM table and keep only the three columns listed below.
ptm_csv_path = '/workspace/protein_lm/evaluation/binding_site_prediction/data/ptm_data.csv'
ptm_columns = ['AC_ID', 'wt_seq', 'ptm_seq']
ptm_df = pd.read_csv(ptm_csv_path)[ptm_columns]

ic(len(ptm_df))
ptm_df.head()

ic| len(ptm_df): 79707


Unnamed: 0,AC_ID,wt_seq,ptm_seq
0,O60296,MSQSQNAIFTSPTGEENLMNSNHRDSESITDVCSNEDLPEVELVSL...,MSQSQNAIFTSPTGEENLMNSNHRDSESITDVCSNEDLPEVELVSL...
1,B5EXH1,MTSRNYLLLTPGPLTTSRTVKEAMLFDSCTWDDDYNIGVVEQIRQQ...,MTSRNYLLLTPGPLTTSRTVKEAMLFDSCTWDDDYNIGVVEQIRQQ...
2,Q9BXI9,MQWLRVRESPGEATGHRVTMGTAALGPVWAALLLFLLMCEIPMVEL...,MQWLRVRESPGEATGHRVTMGTAALGPVWAALLLFLLMCEIPMVEL...
3,Q6ZFW0,MAASKGNAAAAACALVLVLLAVGAEAQGGGGGECVPQLNRLLACRA...,MAASKGNAAAAACALVLVLLAVGAEAQGGGGGECVPQLNRLLACRA...
4,Q3SYP2,MLGITVLAAILACASSCGDPTFPPNLSARVVGGEDAVPNSWPWQVS...,MLGITVLAAILACASSCGDPTFPP<N-linked (GlcNAc...) ...


In [7]:
# Count the distinct PPBS sequences (after the leading-'M' step) that are
# exactly equal to some `wt_seq` in the PTM table.
ppbs_seqs_with_m = set(ppbs_with_methionine_df['sequence'])
ptm_seqs = set(ptm_df['wt_seq'])

matches = len(ppbs_seqs_with_m & ptm_seqs)
ic(matches)

ic| matches: 105


105

In [8]:
# Distinct accession IDs from the PTM table.
ac_ids = {accession for accession in ptm_df['AC_ID']}
# Distinct lower-cased prefixes of 'PDB ID' (the text before the first '_').
pdb_ids = {chain_id.split('_')[0].lower() for chain_id in ppbs_df['PDB ID']}

In [9]:
def get_pdb_ids(uniprot_accession, timeout=30):
    """Fetch the UniProt text entry for an accession and collect its PDB cross-references.

    Args:
        uniprot_accession: accession ID inserted into the UniProt URL.
        timeout: seconds passed to ``requests.get``. Without a timeout a
            stalled connection would block the calling thread indefinitely.

    Returns:
        A ``(uniprot_accession, pdb_ids)`` tuple, where ``pdb_ids`` is the list
        of identifiers found on lines starting with ``'DR   PDB;'``. The list
        is empty when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_accession}.txt"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve data for accession ID {uniprot_accession}. Status code: {response.status_code}")
        return uniprot_accession, []

    # Cross-reference lines look like "DR   PDB; <id>; ..."; the ID is the
    # second ';'-separated field.
    pdb_ids = [
        line.split(';')[1].strip()
        for line in response.text.split('\n')
        if line.startswith('DR   PDB;')
    ]
    return uniprot_accession, pdb_ids

def get_pdb_ids_for_accession_list(
    accession_list,
    max_workers=10,
    checkpoint_every=100,
    checkpoint_path='/workspace/protein_lm/evaluation/binding_site_prediction/data/nested_pdb_ids.pkl',
):
    """Look up PDB IDs for many accessions concurrently, checkpointing to a pickle.

    Args:
        accession_list: iterable of accession IDs; each is passed to ``get_pdb_ids``.
        max_workers: size of the thread pool.
        checkpoint_every: write a checkpoint each time the number of collected
            results reaches a multiple of this value; a falsy value disables
            the periodic checkpoints.
        checkpoint_path: pickle file that receives the results dictionary.

    Returns:
        Dict mapping accession ID to its list of PDB IDs. Accessions whose
        lookup raised an exception are reported on stdout and omitted.
    """
    def _write_checkpoint(snapshot):
        with open(checkpoint_path, 'wb') as f:
            pickle.dump(snapshot, f)

    results_dict = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_pdb_ids, accession) for accession in accession_list]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching PDB IDs"):
            try:
                accession, pdb_ids = future.result()
            except Exception as exc:
                print(f"Task generated an exception: {exc}")
                # A failure adds nothing, so skip the checkpoint test; otherwise
                # the same snapshot would be rewritten for every failed task
                # while the result count sits on a multiple of checkpoint_every.
                continue

            results_dict[accession] = pdb_ids
            if checkpoint_every and len(results_dict) % checkpoint_every == 0:
                _write_checkpoint(results_dict)

    # Final checkpoint so the file also holds the results gathered after the
    # last periodic write. Skipped when nothing was fetched, to avoid
    # replacing an existing checkpoint with an empty dict.
    if results_dict:
        _write_checkpoint(results_dict)

    return results_dict

# accession_list = ac_ids
# nested_pdb_ids_dict = get_pdb_ids_for_accession_list(accession_list)

# print("Nested list of PDB IDs for each UniProt accession ID:")
# for accession, pdb_list in zip(accession_list, nested_pdb_ids):
#     print(f"{accession}: {pdb_list}")

# # save the nested_pdb_ids
# with open('/workspace/protein_lm/evaluation/binding_site_prediction/data/nested_pdb_ids_dict.pkl', 'wb') as f:
#     pickle.dump(nested_pdb_ids_dict, f)

In [10]:
# this is because I accidentally saved a list instead of a dictionary
def parse_file_to_dict(filename):
    """Parse a text file of ``key: [item, item, ...]`` lines into a dict of lists.

    Each non-blank line is split at its first ':'. The text after it is
    stripped, its first and last characters (the brackets) are dropped, and
    the remainder is split on ','. Items are stripped of whitespace and of
    surrounding single/double quotes; empty items are discarded.

    Args:
        filename: path of the text file to read.

    Returns:
        Dict mapping each key to its list of string items.

    Raises:
        ValueError: if a non-blank line contains no ':' separator.
    """
    data_dict = {}
    with open(filename, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            if ':' not in line:
                raise ValueError(f"{filename}:{line_number}: expected 'key: [values]', got {line!r}")
            # Split on the first ':' only, so a value containing ':' is kept intact.
            key, value_str = line.split(':', 1)
            key = key.strip()
            values = [item.strip().strip('\'"') for item in value_str.strip()[1:-1].split(',') if item.strip()]
            data_dict[key] = values
    return data_dict

file_path = '/workspace/protein_lm/evaluation/nonptm_vs_ptm_classification/data/ac_to_pdb.txt'
ac_to_associated_pdb_ids = parse_file_to_dict(file_path)

# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project. The `with` block closes the file handle, which
# the bare pickle.load(open(...)) form left open.
with open('/workspace/protein_lm/evaluation/binding_site_prediction/data/nested_pdb_ids.pkl', 'rb') as f:
    associated_pdb_ids = pickle.load(f)

# Accession ID -> wt_seq (for a repeated AC_ID the last row wins).
ac_id_to_seq = dict(zip(ptm_df['AC_ID'], ptm_df['wt_seq']))

# Lower-cased 'PDB ID' prefix -> list of (sequence, lower-cased full 'PDB ID').
pdb_id_to_seqs = {}
for chain_sequence, raw_chain_id in zip(ppbs_df['sequence'], ppbs_df['PDB ID']):
    pdb_id_with_chain_name = raw_chain_id.lower()
    pdb_id = raw_chain_id.split('_')[0].lower()
    pdb_id_to_seqs.setdefault(pdb_id, []).append((chain_sequence, pdb_id_with_chain_name))

In [11]:
# Spot check for one accession: list its associated PDB IDs and report which
# of them (lower-cased) are present in pdb_ids.
ac_id = 'P09211'
ic(ac_id in ac_ids)
associate_pdb_ids = ac_to_associated_pdb_ids[ac_id]
ic(associate_pdb_ids)

for pdb_id in (entry.lower() for entry in associate_pdb_ids):
    if pdb_id not in pdb_ids:
        continue
    ic(pdb_id)

ic| ac_id in ac_ids: True
ic| associate_pdb_ids: ['10GS',
                        '11GS',
                        '12GS',
                        '13GS',
                        '14GS',
                        '16GS',
                        '17GS',
                        '18GS',
                        '19GS',
                        '1AQV',
                        '1AQW',
                        '1AQX',
                        '1EOG',
                        '1EOH',
                        '1GSS',
                        '1KBN',
                        '1LBK',
                        '1MD3',
                        '1MD4',
                        '1PGT',
                        '1PX6',
                        '1PX7',
                        '1ZGN',
                        '20GS',
                        '22GS',
                        '2A2R',
                        '2A2S',
                        '2GSS',
                        '2J9H',
                        '2PGT',
              

In [12]:
# ac_to_alignment = {}

# for accession_id in tqdm(ac_ids):
#     associated_pdb_ids = ac_to_associated_pdb_ids[accession_id]
#     associated_pdb_ids = [pdb_id.lower() for pdb_id in associated_pdb_ids]
#     alignments = []
#     for pdb_id in associated_pdb_ids:
#         if pdb_id not in pdb_ids: continue
#         ppbs_seqs = pdb_id_to_seqs[pdb_id]
#         ptm_seq = ac_id_to_seq[accession_id]
#         for ppbs_seq, pdb_id_with_chain_name in ppbs_seqs:
#             alignment = pairwise2.align.globalxx(ppbs_seq, ptm_seq)
#             alignment_score = alignment[0].score
#             alignments.append((accession_id, pdb_id, pdb_id_with_chain_name, alignment))
#     if not alignments: continue
#     best_alignment = max(alignments, key=lambda x: x[3][0].score)
#     ac_to_alignment[best_alignment[0]] = best_alignment

# with open('/workspace/protein_lm/evaluation/binding_site_prediction/data/ac_to_alignment_dict.pkl', 'wb') as f:
# 	pickle.dump(ac_to_alignment, f)

In [13]:
# Load the cached accession -> best-alignment mapping.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project. The `with` block closes the file handle.
with open('/workspace/protein_lm/evaluation/binding_site_prediction/data/ac_to_alignment_dict.pkl', 'rb') as f:
    ac_to_alignment = pickle.load(f)

In [14]:
# Display the first (key, value) item of ac_to_alignment.
ic(list(ac_to_alignment.items())[0])

ic| list(ac_to_alignment.items())[0]: ('P19483',
                                       ('P19483',
                                        '2wss',
                                        '2wss_a',
                                        [Alignment(seqA='-------------------------------------------QKTGTAEVSSILEERILGADTSVDLEETGRVLSIGDGIARVHGLRNVQAEEMVEFSSGLKGMSLNLEPDNVGVVVFGNDKLIKEGDIVKRTGAIVDVPVGEELLGRVVDALGNAIDGKGPIGSKARRRVGLKAPGIIPRISVREPMQTGIKAVDSLVPIGRGQRELIIGDRQTGKTSIAIDTIINQKRFNDGTDEKKKLYCIYVAIGQKRSTVAQLVKRLTDADAMKYTIVVSATASDAAPLQYLAPYSGCSMGEYFRDNGKHALIIYDDLSKQAVAYRQMSLLLRRPPGREAYPGDVFYLHSRLLERAAKMNDAFGGGSLTALPVIETQAGDVSAYIPTNVISITDGQIFLETELFYKGIRPAINVGLSVSRVGSAAQTRAMKQVAGTMKLELAQYREVAAFAQFGSDLDAATQQLLSRGVRLTELLKQGQYSPMAIEEQVAVIYAGVRGYLDKLEPSKITKFENAFLSHVISQHQALLG-KIRTDGKISEESDAKLKEIVTNFLAGFEA', seqB='MLSVRVAAAVARALPRRAGLVSKNALGSSFIAARNLHASNSRLQKTGTAEVSSILEERILGADTSVDLEETGRVLSIGDGIARVHGLRNVQAEEMVEFSSGLKGMSLNLEPDNVGVVVFGNDKLIKEGDIVKRTGAIVDVPVGEELLGRVVDALGNAIDGKGPIGSKARRRVGLKAPGIIPRI

('P19483',
 ('P19483',
  '2wss',
  '2wss_a',
  [Alignment(seqA='-------------------------------------------QKTGTAEVSSILEERILGADTSVDLEETGRVLSIGDGIARVHGLRNVQAEEMVEFSSGLKGMSLNLEPDNVGVVVFGNDKLIKEGDIVKRTGAIVDVPVGEELLGRVVDALGNAIDGKGPIGSKARRRVGLKAPGIIPRISVREPMQTGIKAVDSLVPIGRGQRELIIGDRQTGKTSIAIDTIINQKRFNDGTDEKKKLYCIYVAIGQKRSTVAQLVKRLTDADAMKYTIVVSATASDAAPLQYLAPYSGCSMGEYFRDNGKHALIIYDDLSKQAVAYRQMSLLLRRPPGREAYPGDVFYLHSRLLERAAKMNDAFGGGSLTALPVIETQAGDVSAYIPTNVISITDGQIFLETELFYKGIRPAINVGLSVSRVGSAAQTRAMKQVAGTMKLELAQYREVAAFAQFGSDLDAATQQLLSRGVRLTELLKQGQYSPMAIEEQVAVIYAGVRGYLDKLEPSKITKFENAFLSHVISQHQALLG-KIRTDGKISEESDAKLKEIVTNFLAGFEA', seqB='MLSVRVAAAVARALPRRAGLVSKNALGSSFIAARNLHASNSRLQKTGTAEVSSILEERILGADTSVDLEETGRVLSIGDGIARVHGLRNVQAEEMVEFSSGLKGMSLNLEPDNVGVVVFGNDKLIKEGDIVKRTGAIVDVPVGEELLGRVVDALGNAIDGKGPIGSKARRRVGLKAPGIIPRISVREPMQTGIKAVDSLVPIGRGQRELIIGDRQTGKTSIAIDTIINQKRFNDGTDEKKKLYCIYVAIGQKRSTVAQLVKRLTDADAMKYTIVVSATASDAAPLQYLAPYSGCSMGEYFRDNGKHALIIYDDLSKQAVAYRQMSLLLRRPPGREAYPGDVFYLHSRLLERAAKMNDAFGGGSLTALPVIETQ

In [17]:
def align_residue_labels(alignment, labels):
    """Project per-residue labels of ``alignment.seqA`` onto the residues of ``alignment.seqB``.

    Args:
        alignment: object with gapped strings ``seqA`` and ``seqB``
            (gaps written as '-').
        labels: iterable whose first element is the label sequence for the
            ungapped ``seqA``; only that first element is used.

    Returns:
        One entry per non-gap character of ``seqB``: the label of the ``seqA``
        residue in the same column, or -1 where ``seqA`` has a gap.
    """
    structure_row = alignment.seqA
    target_row = alignment.seqB
    per_residue = next(iter(labels))

    projected = []
    cursor = 0  # index into per_residue
    for column, target_char in enumerate(target_row):
        if target_char == '-':
            # Gap in seqB: step past one label without emitting anything.
            cursor += 1
            continue
        if structure_row[column] == '-':
            # seqB residue with no seqA counterpart.
            projected.append(-1)
            continue
        projected.append(per_residue[cursor])
        cursor += 1
    return projected

# Index the PPBS labels by 'PDB ID' once, keeping the first row for each ID.
# This replaces a full boolean-mask scan of ppbs_df per matched accession and
# selects the same row that next(iter(filtered_labels)) would have returned.
labels_by_chain = {}
for chain_key, chain_labels in zip(ppbs_df['PDB ID'], ppbs_df['labels']):
    labels_by_chain.setdefault(chain_key, chain_labels)

final_rows = []
for i, row in tqdm(ptm_df.iterrows(), total=len(ptm_df)):
    accession_id = row['AC_ID']
    if accession_id not in ac_to_alignment:
        continue
    accession_id, pdb_id, pdb_id_with_chain_name, alignment = ac_to_alignment[accession_id]
    # Each stored record holds a list of alignments; only the first is used.
    alignment = alignment[0]
    # The cached chain identifier is lower-case; upper-case the chain part so
    # it can be compared with the 'PDB ID' column.
    # NOTE(review): a chain whose original name is lower-case would not
    # round-trip through this step; confirm against the data.
    id_parts = pdb_id_with_chain_name.split('_')
    pdb_id_with_chain_name = id_parts[0] + '_' + id_parts[1].upper()
    if pdb_id_with_chain_name not in labels_by_chain:
        # Previously this surfaced as a bare StopIteration.
        raise KeyError(
            f"No PPBS labels found for chain {pdb_id_with_chain_name!r} (accession {accession_id})"
        )
    labels = labels_by_chain[pdb_id_with_chain_name]
    # align_residue_labels reads the first element of its `labels` argument.
    aligned_labels = align_residue_labels(alignment, [labels])
    new_row = row.copy()
    new_row['pdb_id_with_chain_name'] = pdb_id_with_chain_name
    new_row['aligned_labels_with_gaps'] = aligned_labels
    # Binary version: anything that is not exactly 1 (including -1) becomes 0.
    new_row['aligned_labels'] = [1 if label == 1 else 0 for label in aligned_labels]
    new_row['labels'] = labels
    new_row['ppbs_seq_alignment'] = alignment.seqA
    new_row['ptm_seq_alignment'] = alignment.seqB
    final_rows.append(new_row)

residue_seqs_df = pd.DataFrame(final_rows)
residue_seqs_df.head()

79707it [00:12, 6178.11it/s]


Unnamed: 0,AC_ID,wt_seq,ptm_seq,pdb_id_with_chain_name,aligned_labels_with_gaps,aligned_labels,labels,ppbs_seq_alignment,ptm_seq_alignment
50,C4YMW2,MSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPSNPS...,MSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPSNPS...,4esw_A,"[1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, ...",GSHMSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPS...,---MSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPS...
62,P97291,MPERLAETLMDLWTPLIILWITLPSCVYTAPMNQAHVLTTGSPLEL...,MPERLAETLMDLWTPLIILWITLPSCVYTAPMNQAHVLTTGSPLEL...,1zxk_A,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, ...",------------------------S---------------------...,MPERLAETLMDLWTPLIILWITLPSCVYTAPMNQAHVLTTGSPLEL...
103,P93114,MAVPMDTISGPWGNNGGNFWSFRPVNKINQIVISYGGGGNNPIALT...,M<N-acetylalanine>VPMDTISGPWGNNGGNFWSFRPVNKINQ...,1ouw_B,"[-1, -1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1...","[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...","[1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",--VPMDTISGPWGNNGGNFWSFRPVNKINQIVISYGGGGNNPIALT...,MAVPMDTISGPWGNNGGNFWSFRPVNKINQIVISYGGGGNNPIALT...
105,P42262,MQKIMHISVLLSPVLWGLIFGVSSNSIQIGGLFPRGADQEYSAFRV...,MQKIMHISVLLSPVLWGLIFGVSSNSIQIGGLFPRGADQEYSAFRV...,2xhd_A,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",------------------------N---------------------...,MQKIMHISVLLSPVLWGLIFGVSSNSIQIGGLFPRGADQEYSAFRV...
106,O75208,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...,4rhp_B,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",----------------------------------------------...,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...


In [18]:
# Write the aligned table to CSV without the index column. A later cell
# writes to this same path again after adding the is_test / is_val /
# is_train columns.
residue_seqs_df.to_csv('/workspace/protein_lm/evaluation/binding_site_prediction/data/residue_seqs.csv', index=False)

## Separate rows into train, validation, and test sets

In [22]:
# Paths of the three PPBS split files.
test_csv_path = '/workspace/protein_lm/evaluation/binding_site_prediction/data/PPBS/labels_test_homology_filtered.csv'
val_csv_path = '/workspace/protein_lm/evaluation/binding_site_prediction/data/PPBS/labels_validation_homology_filtered.csv'
train_csv_path = '/workspace/protein_lm/evaluation/binding_site_prediction/data/PPBS/labels_train_filtered.csv'

# Load each split, then collect the distinct 'PDB ID' values it contains.
test_df, val_df, train_df = [
    pd.read_csv(split_path)
    for split_path in (test_csv_path, val_csv_path, train_csv_path)
]
test_pdb_ids, val_pdb_ids, train_pdb_ids = [
    set(split_df['PDB ID'])
    for split_df in (test_df, val_df, train_df)
]

# Distinct chain identifiers present in the aligned table.
residue_seqs_pdb_ids = set(residue_seqs_df['pdb_id_with_chain_name'])

In [23]:
# Sizes of the chain-identifier sets: the aligned table, then each PPBS split.
ic(len(residue_seqs_pdb_ids))
ic(len(test_pdb_ids))
ic(len(val_pdb_ids))
ic(len(train_pdb_ids))

# Number of identifiers each split shares with the aligned table.
ic(len(test_pdb_ids.intersection(residue_seqs_pdb_ids)))
ic(len(val_pdb_ids.intersection(residue_seqs_pdb_ids)))
ic(len(train_pdb_ids.intersection(residue_seqs_pdb_ids)))

ic| len(residue_seqs_pdb_ids): 2782
ic| len(test_pdb_ids): 1488
ic| len(val_pdb_ids): 1458
ic| len(train_pdb_ids): 12773
ic| len(test_pdb_ids.intersection(residue_seqs_pdb_ids)): 229
ic| len(val_pdb_ids.intersection(residue_seqs_pdb_ids)): 241
ic| len(train_pdb_ids.intersection(residue_seqs_pdb_ids)): 2312


2312

In [25]:
# Add one 0/1 membership column per split, keyed on the chain identifier.
for split_column, split_ids in (
    ('is_test', test_pdb_ids),
    ('is_val', val_pdb_ids),
    ('is_train', train_pdb_ids),
):
    # Bind the current set as a default argument so each lambda tests its own split.
    residue_seqs_df[split_column] = residue_seqs_df['pdb_id_with_chain_name'].apply(
        lambda chain, ids=split_ids: 1 if chain in ids else 0
    )

Unnamed: 0,AC_ID,wt_seq,ptm_seq,pdb_id_with_chain_name,aligned_labels_with_gaps,aligned_labels,labels,ppbs_seq_alignment,ptm_seq_alignment,is_test,is_val,is_train
50,C4YMW2,MSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPSNPS...,MSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPSNPS...,4esw_A,"[1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, ...",GSHMSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPS...,---MSTNKITFLLNWEAAPYHIPVYLANIKGYFKDENLDIAILEPS...,0,1,0
62,P97291,MPERLAETLMDLWTPLIILWITLPSCVYTAPMNQAHVLTTGSPLEL...,MPERLAETLMDLWTPLIILWITLPSCVYTAPMNQAHVLTTGSPLEL...,1zxk_A,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, ...",------------------------S---------------------...,MPERLAETLMDLWTPLIILWITLPSCVYTAPMNQAHVLTTGSPLEL...,0,0,1
103,P93114,MAVPMDTISGPWGNNGGNFWSFRPVNKINQIVISYGGGGNNPIALT...,M<N-acetylalanine>VPMDTISGPWGNNGGNFWSFRPVNKINQ...,1ouw_B,"[-1, -1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1...","[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...","[1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",--VPMDTISGPWGNNGGNFWSFRPVNKINQIVISYGGGGNNPIALT...,MAVPMDTISGPWGNNGGNFWSFRPVNKINQIVISYGGGGNNPIALT...,1,0,0
105,P42262,MQKIMHISVLLSPVLWGLIFGVSSNSIQIGGLFPRGADQEYSAFRV...,MQKIMHISVLLSPVLWGLIFGVSSNSIQIGGLFPRGADQEYSAFRV...,2xhd_A,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",------------------------N---------------------...,MQKIMHISVLLSPVLWGLIFGVSSNSIQIGGLFPRGADQEYSAFRV...,0,0,1
106,O75208,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...,4rhp_B,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",----------------------------------------------...,MAAAAVSGALGRAGWRLLQLRCLPVARCRQALVPRAFHASAVGLRS...,0,0,1


In [26]:
# Write residue_seqs_df, which at this point includes the is_test / is_val /
# is_train columns, to the same CSV path used by the earlier export.
residue_seqs_df.to_csv('/workspace/protein_lm/evaluation/binding_site_prediction/data/residue_seqs.csv', index=False)