# Extract and Convert System IDs to UniProt IDs

This notebook loads the annotation_table_filtered.parquet file and converts system IDs to UniProt IDs.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

## Load the Parquet File

In [2]:
# Load the annotation table
df = pd.read_parquet('data/annotation_table_filtered.parquet')

print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
df.head()

Shape: (114537, 743)

Columns: ['entry_pdb_id', 'entry_release_date', 'entry_oligomeric_state', 'entry_determination_method', 'entry_keywords', 'entry_pH', 'entry_resolution', 'entry_validation_resolution', 'entry_validation_rfree', 'entry_validation_r', 'entry_validation_clashscore', 'entry_validation_percent_rama_outliers', 'entry_validation_percent_rota_outliers', 'entry_validation_data_completeness', 'entry_validation_percent_RSRZ_outliers', 'entry_validation_atom_count', 'entry_validation_molprobity', 'entry_validation_mean_b_factor', 'entry_validation_median_b_factor', 'entry_validation_pdbx_resolution', 'entry_validation_pdbx_reflns_resolution', 'entry_validation_meanI_over_sigI_obs', 'entry_validation_r_minus_rfree', 'entry_pass_validation_criteria', 'system_biounit_id', 'system_protein_chains_asym_id', 'system_id_no_biounit', 'system_ligand_chains', 'system_num_pocket_residues', 'system_proper_num_pocket_residues', 'system_num_interactions', 'system_proper_num_interactions', '

Unnamed: 0,entry_pdb_id,entry_release_date,entry_oligomeric_state,entry_determination_method,entry_keywords,entry_pH,entry_resolution,entry_validation_resolution,entry_validation_rfree,entry_validation_r,...,system_ligand_has_fragment,system_ligand_has_oligo,system_ligand_has_artifact,system_ligand_has_other,system_ligand_has_covalent,system_ligand_has_invalid,system_ligand_has_ion,system_protein_chains_total_length,system_unique_ccd_codes,system_proper_unique_ccd_codes
0,8grd,2022-09-01,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,5.1,2.699,2.7,0.2418,0.194,...,False,False,False,False,False,False,False,691,ADP,ADP
1,2grj,2006-04-24,monomeric,X-RAY DIFFRACTION,TRANSFERASE,5.2,2.6,2.6,0.24,0.19,...,False,False,False,False,False,False,False,192,ADP,ADP
2,2grj,2006-04-24,monomeric,X-RAY DIFFRACTION,TRANSFERASE,5.2,2.6,2.6,0.24,0.19,...,False,False,False,False,False,False,False,192,ADP,ADP
3,2grj,2006-04-24,monomeric,X-RAY DIFFRACTION,TRANSFERASE,5.2,2.6,2.6,0.24,0.19,...,False,False,False,False,False,False,False,192,ADP,ADP
4,2grj,2006-04-24,monomeric,X-RAY DIFFRACTION,TRANSFERASE,5.2,2.6,2.6,0.24,0.19,...,False,False,False,False,False,False,False,192,ADP,ADP


## Extract System IDs

In [3]:
# Get unique system IDs
system_ids = df['system_id'].unique()

print(f"Total unique system IDs: {len(system_ids)}")
print(f"\nSample system IDs:")
print(system_ids[:10])

Total unique system IDs: 104106

Sample system IDs:
['8grd__1__1.A_1.B__1.D' '2grj__1__1.A__1.I' '2grj__2__1.B__1.K'
 '2grj__3__1.C__1.N' '2grj__4__1.D__1.P' '2grj__5__1.E__1.T'
 '2grj__6__1.F__1.V' '2grj__7__1.G__1.X' '2grj__8__1.H__1.AA'
 '4grb__1__1.A__1.C']


## Convert System ID to UniProt ID

System IDs in PLINDER typically contain PDB codes. We'll extract the PDB ID and map it to UniProt IDs.

## Understanding PLINDER System IDs

PLINDER system IDs have the format: `<PDB ID>__<assembly>__<receptor chain>__<ligand chain>`

Example: `7eek__1__1.A__1.I`
- PDB ID: `7eek`
- Assembly: `1`
- Receptor chain: `1.A` (assembly 1, chain A)
- Ligand chain: `1.I` (assembly 1, chain I)

**Important**: We need to map the **specific receptor chain** to its UniProt ID, not just the PDB ID!
A PDB structure can have multiple chains with different proteins.

In [4]:
def parse_plinder_system_id(system_id):
    """
    Parse PLINDER system ID into components.
    Format: <PDB ID>__<assembly>__<receptor chain>__<ligand chain>
    
    Example: 7eek__1__1.A__1.I
    Returns: {
        'pdb_id': '7eek',
        'assembly': '1',
        'receptor_chain': '1.A',
        'ligand_chain': '1.I',
        'receptor_chain_only': 'A',  # Just the chain letter
        'assembly_id': '1'
    }
    """
    parts = system_id.split('__')
    
    if len(parts) == 4:
        pdb_id, assembly, receptor_chain, ligand_chain = parts
        # Extract just the chain letter (after the dot)
        receptor_chain_only = receptor_chain.split('.')[-1] if '.' in receptor_chain else receptor_chain
        
        return {
            'pdb_id': pdb_id.lower(),
            'assembly': assembly,
            'receptor_chain': receptor_chain,
            'ligand_chain': ligand_chain,
            'receptor_chain_only': receptor_chain_only,
            'assembly_id': assembly
        }
    else:
        # Fallback for non-standard format
        return {
            'pdb_id': system_id.split('__')[0].lower() if '__' in system_id else system_id.lower(),
            'assembly': None,
            'receptor_chain': None,
            'ligand_chain': None,
            'receptor_chain_only': None,
            'assembly_id': None
        }

# Test parsing
sample_system_ids = df['system_id'].head(10).tolist()
print("Parsing PLINDER system IDs:\n")
for sys_id in sample_system_ids[:5]:
    parsed = parse_plinder_system_id(sys_id)
    print(f"{sys_id}")
    print(f"  → PDB: {parsed['pdb_id']}, Receptor Chain: {parsed['receptor_chain_only']}, Ligand Chain: {parsed['ligand_chain']}")
    print()

Parsing PLINDER system IDs:

8grd__1__1.A_1.B__1.D
  → PDB: 8grd, Receptor Chain: B, Ligand Chain: 1.D

2grj__1__1.A__1.I
  → PDB: 2grj, Receptor Chain: A, Ligand Chain: 1.I

2grj__2__1.B__1.K
  → PDB: 2grj, Receptor Chain: B, Ligand Chain: 1.K

2grj__3__1.C__1.N
  → PDB: 2grj, Receptor Chain: C, Ligand Chain: 1.N

2grj__4__1.D__1.P
  → PDB: 2grj, Receptor Chain: D, Ligand Chain: 1.P



In [5]:
# Parse all system IDs and extract components
parsed_data = df['system_id'].apply(parse_plinder_system_id)

df['pdb_id'] = parsed_data.apply(lambda x: x['pdb_id'])
df['receptor_chain'] = parsed_data.apply(lambda x: x['receptor_chain_only'])
df['ligand_chain'] = parsed_data.apply(lambda x: x['ligand_chain'])
df['assembly_id'] = parsed_data.apply(lambda x: x['assembly_id'])

print(f"Sample parsed data:")
print(df[['system_id', 'pdb_id', 'receptor_chain', 'ligand_chain']].head(10))

Sample parsed data:
               system_id pdb_id receptor_chain ligand_chain
0  8grd__1__1.A_1.B__1.D   8grd              B          1.D
1      2grj__1__1.A__1.I   2grj              A          1.I
2      2grj__2__1.B__1.K   2grj              B          1.K
3      2grj__3__1.C__1.N   2grj              C          1.N
4      2grj__4__1.D__1.P   2grj              D          1.P
5      2grj__5__1.E__1.T   2grj              E          1.T
6      2grj__6__1.F__1.V   2grj              F          1.V
7      2grj__7__1.G__1.X   2grj              G          1.X
8     2grj__8__1.H__1.AA   2grj              H         1.AA
9      4grb__1__1.A__1.C   4grb              A          1.C


In [6]:
# Get unique PDB ID + Chain combinations (what we actually need to map)
unique_pdb_chain_combos = df[['pdb_id', 'receptor_chain']].drop_duplicates()
print(f"Total unique PDB ID + Chain combinations: {len(unique_pdb_chain_combos)}")
print(f"Total unique PDB IDs: {df['pdb_id'].nunique()}")
print(f"\nFirst 20 PDB ID + Chain combinations:")
print(unique_pdb_chain_combos.head(20))

Total unique PDB ID + Chain combinations: 67487
Total unique PDB IDs: 39436

First 20 PDB ID + Chain combinations:
   pdb_id receptor_chain
0    8grd              B
1    2grj              A
2    2grj              B
3    2grj              C
4    2grj              D
5    2grj              E
6    2grj              F
7    2grj              G
8    2grj              H
9    4grb              A
10   8grw              B
12   6gra              A
13   3grr              A
14   5grn              A
15   1grn              B
16   4gra              A
17   4gra              B
18   6gri              C
20   3gr7              A
21   3gr7              B


## Map PDB ID + Chain to UniProt IDs (Chain-Specific Mapping)

**Key difference**: We need to map specific chains to their UniProt IDs, not just the PDB ID.

The RCSB API provides chain-specific UniProt mappings.

In [7]:
import requests
import time
from collections import defaultdict

def map_pdb_chain_to_uniprot(pdb_id, chain_id):
    """
    Map PDB ID + specific chain to UniProt accessions using RCSB PDB API.
    
    This is the CORRECT way to map PLINDER systems to ChEMBL!
    
    Parameters:
    -----------
    pdb_id : str
        4-letter PDB code
    chain_id : str
        Chain identifier (e.g., 'A', 'B')
    
    Returns:
    --------
    list : UniProt IDs for this specific chain
    """
    url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id.upper()}/{chain_id.upper()}"
    
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            uniprot_ids = []
            
            # Check for UniProt accessions in the polymer entity data
            if 'rcsb_polymer_entity_container_identifiers' in data:
                identifiers = data['rcsb_polymer_entity_container_identifiers']
                if 'uniprot_ids' in identifiers:
                    uniprot_ids = identifiers['uniprot_ids']
            
            return list(set(uniprot_ids))
    except Exception as e:
        pass
    
    # Fallback: try the old endpoint
    try:
        url = f"https://data.rcsb.org/rest/v1/core/uniprot/{pdb_id.upper()}"
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            uniprot_ids = []
            
            for entry in data:
                # Filter by chain ID
                if 'rcsb_uniprot_container_identifiers' in entry:
                    chain_ids = entry.get('rcsb_uniprot_container_identifiers', {}).get('auth_asym_id')
                    if chain_ids and chain_id.upper() in [c.upper() for c in chain_ids]:
                        if 'rcsb_uniprot_protein' in entry:
                            uniprot_info = entry['rcsb_uniprot_protein']
                            if 'name' in uniprot_info:
                                uniprot_ids.append(uniprot_info['name']['value'])
            
            return list(set(uniprot_ids))
    except Exception as e:
        pass
    
    return []

def map_pdb_chain_to_uniprot_verbose(pdb_id, chain_id, verbose=False):
    """
    Map PDB ID + chain to UniProt accessions with optional verbose output.
    """
    if verbose:
        print(f"Mapping {pdb_id.upper()} chain {chain_id} → UniProt...")
    
    uniprot_ids = map_pdb_chain_to_uniprot(pdb_id, chain_id)
    
    if uniprot_ids:
        if verbose:
            print(f"  ✓ Found {len(uniprot_ids)} UniProt ID(s): {', '.join(uniprot_ids)}")
        return {
            'pdb_id': pdb_id.upper(),
            'chain_id': chain_id,
            'uniprot_ids': uniprot_ids,
            'method': 'RCSB_chain_specific'
        }
    else:
        if verbose:
            print(f"  ✗ No UniProt mapping found")
        return {
            'pdb_id': pdb_id.upper(),
            'chain_id': chain_id,
            'uniprot_ids': [],
            'method': None
        }

print("Chain-specific mapping functions loaded.")

Chain-specific mapping functions loaded.


In [8]:
# Test chain-specific mapping on a few examples
test_examples = unique_pdb_chain_combos.head(5)

print("Testing PDB + Chain to UniProt mapping:\n")
for idx, row in test_examples.iterrows():
    pdb_id = row['pdb_id']
    chain_id = row['receptor_chain']
    result = map_pdb_chain_to_uniprot_verbose(pdb_id, chain_id, verbose=True)
    time.sleep(0.3)  # Rate limiting

Testing PDB + Chain to UniProt mapping:

Mapping 8GRD chain B → UniProt...
  ✗ No UniProt mapping found
  ✗ No UniProt mapping found
Mapping 2GRJ chain A → UniProt...
Mapping 2GRJ chain A → UniProt...
  ✗ No UniProt mapping found
  ✗ No UniProt mapping found
Mapping 2GRJ chain B → UniProt...
Mapping 2GRJ chain B → UniProt...
  ✗ No UniProt mapping found
  ✗ No UniProt mapping found
Mapping 2GRJ chain C → UniProt...
Mapping 2GRJ chain C → UniProt...
  ✗ No UniProt mapping found
  ✗ No UniProt mapping found
Mapping 2GRJ chain D → UniProt...
Mapping 2GRJ chain D → UniProt...
  ✗ No UniProt mapping found
  ✗ No UniProt mapping found


## Map All PDB + Chain Combinations to UniProt IDs

This creates a **chain-specific** mapping that correctly associates each receptor chain with its UniProt ID.

In [None]:
# Map all unique PDB + Chain combinations to UniProt
import json

# Try to load existing progress
progress_file = 'data/pdb_chain_to_uniprot_mapping_progress.json'
if Path(progress_file).exists():
    with open(progress_file, 'r') as f:
        pdb_chain_to_uniprot_mapping = json.load(f)
    print(f"Loaded {len(pdb_chain_to_uniprot_mapping)} existing mappings from {progress_file}")
else:
    pdb_chain_to_uniprot_mapping = {}
    print("Starting fresh mapping...")

print(f"Mapping {len(unique_pdb_chain_combos)} PDB + Chain combinations to UniProt...\n")

for i, (idx, row) in enumerate(unique_pdb_chain_combos.iterrows(), 1):
    pdb_id = row['pdb_id']
    chain_id = row['receptor_chain']
    key = f"{pdb_id}_{chain_id}"
    
    # Skip if already mapped
    if key in pdb_chain_to_uniprot_mapping:
        continue
    
    if i % 10 == 0:
        print(f"Progress: {i}/{len(unique_pdb_chain_combos)}")
    
    result = map_pdb_chain_to_uniprot_verbose(pdb_id, chain_id, verbose=False)
    pdb_chain_to_uniprot_mapping[key] = result['uniprot_ids']
    
    # Save progress every 20 entries
    if i % 20 == 0:
        with open(progress_file, 'w') as f:
            json.dump(pdb_chain_to_uniprot_mapping, f, indent=2)
        print(f"  ✓ Progress saved ({len(pdb_chain_to_uniprot_mapping)} mappings)")
    
    time.sleep(0.3)  # Rate limiting

# Final save
with open(progress_file, 'w') as f:
    json.dump(pdb_chain_to_uniprot_mapping, f, indent=2)
print(f"\n✓ Final progress saved to {progress_file}")

print(f"\nMapping complete!")
print(f"Total PDB+Chain combinations mapped: {len(pdb_chain_to_uniprot_mapping)}")
print(f"Combinations with UniProt IDs: {sum(1 for uniprots in pdb_chain_to_uniprot_mapping.values() if uniprots)}")
print(f"Combinations without UniProt IDs: {sum(1 for uniprots in pdb_chain_to_uniprot_mapping.values() if not uniprots)}")

Mapping 67487 PDB + Chain combinations to UniProt...

Progress: 10/67487
Progress: 10/67487
Progress: 20/67487
Progress: 20/67487
Progress: 30/67487
Progress: 30/67487
Progress: 40/67487
Progress: 40/67487
Progress: 50/67487
Progress: 50/67487
Progress: 60/67487
Progress: 60/67487
Progress: 70/67487
Progress: 70/67487
Progress: 80/67487
Progress: 80/67487
Progress: 90/67487
Progress: 90/67487
Progress: 100/67487
Progress: 100/67487
Progress: 110/67487
Progress: 110/67487
Progress: 120/67487
Progress: 120/67487
Progress: 130/67487
Progress: 130/67487
Progress: 140/67487
Progress: 140/67487
Progress: 150/67487
Progress: 150/67487
Progress: 160/67487
Progress: 160/67487
Progress: 170/67487
Progress: 170/67487
Progress: 180/67487
Progress: 180/67487
Progress: 190/67487
Progress: 190/67487
Progress: 200/67487
Progress: 200/67487
Progress: 210/67487
Progress: 210/67487
Progress: 220/67487
Progress: 220/67487
Progress: 230/67487
Progress: 230/67487
Progress: 240/67487
Progress: 240/67487
Prog

KeyboardInterrupt: 

## Add UniProt IDs to DataFrame

In [10]:
# Add chain-specific UniProt IDs to the dataframe
df['pdb_chain_key'] = df['pdb_id'] + '_' + df['receptor_chain']
df['uniprot_ids'] = df['pdb_chain_key'].map(pdb_chain_to_uniprot_mapping)

# Convert list to string for easier viewing
df['uniprot_ids_str'] = df['uniprot_ids'].apply(lambda x: ', '.join(x) if x else '')

print("Sample data with chain-specific UniProt IDs:")
df[['system_id', 'pdb_id', 'receptor_chain', 'uniprot_ids_str']].head(20)

TypeError: can only join an iterable

In [None]:
# Summary statistics
print("Summary Statistics:")
print(f"Total systems: {len(df)}")
print(f"Systems with UniProt IDs: {df['uniprot_ids_str'].astype(bool).sum()}")
print(f"Systems without UniProt IDs: {(~df['uniprot_ids_str'].astype(bool)).sum()}")
print(f"\nUnique PDB IDs: {df['pdb_id'].nunique()}")
print(f"Unique PDB + Chain combinations: {len(unique_pdb_chain_combos)}")
print(f"Unique UniProt IDs found: {len(set([uid for uids in df['uniprot_ids'].dropna() for uid in uids if uids]))}")

# Show some examples where the same PDB has different chains with different UniProt IDs
print("\n" + "="*60)
print("Example: Same PDB, Different Chains → Different UniProt IDs")
print("="*60)
sample_pdb = df[df['uniprot_ids_str'] != '']['pdb_id'].iloc[0]
same_pdb_different_chains = df[df['pdb_id'] == sample_pdb][['pdb_id', 'receptor_chain', 'uniprot_ids_str']].drop_duplicates()
print(same_pdb_different_chains.head(10))

## Save Results

In [None]:
# Save the enhanced dataframe
output_file = 'data/annotation_table_with_uniprot.parquet'
df.to_parquet(output_file)
print(f"✓ Saved to: {output_file}")

# Save the PDB+Chain to UniProt mapping as JSON
import json
mapping_file = 'data/pdb_chain_to_uniprot_mapping.json'
with open(mapping_file, 'w') as f:
    json.dump(pdb_chain_to_uniprot_mapping, f, indent=2)
print(f"✓ Saved chain-specific mapping to: {mapping_file}")

# Also save a summary CSV for quick reference
summary_df = unique_pdb_chain_combos.copy()
summary_df['pdb_chain_key'] = summary_df['pdb_id'] + '_' + summary_df['receptor_chain']
summary_df['uniprot_ids'] = summary_df['pdb_chain_key'].map(pdb_chain_to_uniprot_mapping)
summary_df['uniprot_ids_str'] = summary_df['uniprot_ids'].apply(lambda x: ', '.join(x) if x else '')
summary_csv = 'data/pdb_chain_uniprot_summary.csv'
summary_df[['pdb_id', 'receptor_chain', 'uniprot_ids_str']].to_csv(summary_csv, index=False)
print(f"✓ Saved summary to: {summary_csv}")

In [None]:
# Display final summary
print("\n" + "="*70)
print("FINAL SUMMARY - Chain-Specific UniProt Mapping")
print("="*70)
print(f"Total PLINDER systems: {len(df)}")
print(f"Unique PDB IDs: {df['pdb_id'].nunique()}")
print(f"Unique PDB + Chain combinations: {len(unique_pdb_chain_combos)}")
print(f"Combinations with UniProt mapping: {sum(1 for uniprots in pdb_chain_to_uniprot_mapping.values() if uniprots)}")
print(f"Total unique UniProt IDs found: {len(set([uid for uids in df['uniprot_ids'].dropna() for uid in uids if uids]))}")

print("\n" + "="*70)
print("WHY CHAIN-SPECIFIC MAPPING MATTERS FOR ChEMBL")
print("="*70)
print("✓ Each PLINDER system specifies a receptor CHAIN")
print("✓ Different chains in the same PDB can be different proteins")
print("✓ Each protein (chain) has its own UniProt ID")
print("✓ ChEMBL data is linked to specific UniProt IDs")
print("✓ Therefore: Must map PDB+Chain → UniProt, not just PDB → UniProt")

print("\nOutput files:")
print(f"  1. {output_file}")
print(f"  2. {mapping_file}")
print(f"  3. {summary_csv}")

print("\n" + "="*70)
print("NEXT STEPS FOR ChEMBL RETRIEVAL")
print("="*70)
print("Use the 'uniprot_ids' column (chain-specific) to query ChEMBL,")
print("not just the PDB ID. This ensures you get activities for the")
print("correct protein target that corresponds to your receptor chain.")