# BIO510: Protein visualization
-----------------------
###### cc. Klevia Dishnica, Daniela Pampanin , Enrico Riccardi  - UiS 2025
-----------------------


#### Reminders :
#### * Every protein has a unique Accession ID in UniProt.
#### * Accession IDs never change, even if the data is updated.
#### * Accession IDs are used to retrieve protein sequences, structures, and functions.
#### * You can find them on UniProt or using API requests.
 
### Fetch protein sequences in FASTA format from the UniProt database using the UniProt REST API:
#### TIP: Use [request](https://pypi.org/project/requests/) library

In [1]:
import requests

# Set the base URL for the UniProt REST API
base_url = "https://rest.uniprot.org/uniprotkb"

# Set the accession IDs for the proteins you want to download
accession_ids = ["P0DTC2", "Q98765", "A12345"] # List of UniProt accession IDs

# Initialize an empty string to store sequences
protein_sequences = ""  # This will store all sequences as a single string

# Loop through each accession ID and fetch the sequence in FASTA format
for accession in accession_ids:
    url = base_url+"/"+accession+".fasta" #https://rest.uniprot.org/uniprotkb/P0DTC2.fasta
    response = requests.get(url)  # Send an HTTP GET request to fetch the FASTA sequence
    #print(type(response))
    # Check if the request was successful (HTTP status code 200)
    if response.status_code == 200: # Status codes are documented in the requests.models.Response class
        
        protein_sequences += response.text + "\n" # Append the response text to the string with a newline
        
    else:
        print(f"Failed to retrieve {accession}, Status Code: {response.status_code}")

# Print the retrieved protein sequences
print(protein_sequences)

Failed to retrieve A12345, Status Code: 400
>sp|P0DTC2|SPIKE_SARS2 Spike glycoprotein OS=Severe acute respiratory syndrome coronavirus 2 OX=2697049 GN=S PE=1 SV=1
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFS
NVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIV
NNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLE
GKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQT
LLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETK
CTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISN
CVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIAD
YNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPC
NGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVN
FNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITP
GTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSY
ECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTI
SVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQE
VFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFN

### From UniProt we can get access to PDB. 
### Manually, we can load the info directly as follows:
 * Go to the UniProt Website → https://www.uniprot.org/
 * Search for a protein by name or UniProt Accession ID (e.g., "P0DTC2" for SARS-CoV-2 Spike Protein).
 * Open the protein entry and scroll to the "Structure" section.
 * If available, click on the [PDB cross-references](https://www.uniprot.org/help/cross-references_in_uniprotkb) to view the 3D structure in RCSB PDB
### In Python, we can instead:
 

In [2]:
uniprot_id = "P0DTC2"  # Example: Spike protein of SARS-CoV-2

# UniProt API to fetch PDB cross-references
uniprot_api_url = "https://rest.uniprot.org/uniprotkb/"+uniprot_id+".json"

response = requests.get(uniprot_api_url)
if response.status_code == 200:
    data = response.json()  # We are reading a json, we need a json interpreter
    
    # Extract PDB ID if available
    pdb_ids = []
    if "uniProtKBCrossReferences" in data: 
        for entry in data["uniProtKBCrossReferences"]:   
            if entry["database"] == "PDB":   # Check if the database is PDB (Protein Data Bank)
                pdb_ids.append(entry["id"])  # Extract and store the PDB ID

    if pdb_ids:
        print(f"PDB ID(s) linked to {uniprot_id}: {pdb_ids}")
    else:
        print(f"No PDB structure found for UniProt ID {uniprot_id}.")
else:
    print(f"Failed to retrieve UniProt entry. Status code: {response.status_code}")



PDB ID(s) linked to P0DTC2: ['6LVN', '6LXT', '6LZG', '6M0J', '6M17', '6M1V', '6VSB', '6VW1', '6VXX', '6VYB', '6W41', '6WPS', '6WPT', '6X29', '6X2A', '6X2B', '6X2C', '6X45', '6X6P', '6X79', '6XC2', '6XC3', '6XC4', '6XC7', '6XCM', '6XCN', '6XDG', '6XE1', '6XEY', '6XF5', '6XF6', '6XKL', '6XKP', '6XKQ', '6XLU', '6XM0', '6XM3', '6XM4', '6XM5', '6XR8', '6XRA', '6XS6', '6YLA', '6YM0', '6YOR', '6YZ5', '6YZ7', '6Z2M', '6Z43', '6Z97', '6ZB4', '6ZB5', '6ZBP', '6ZCZ', '6ZDG', '6ZDH', '6ZER', '6ZFO', '6ZGE', '6ZGG', '6ZGI', '6ZH9', '6ZHD', '6ZLR', '6ZOW', '6ZOX', '6ZOY', '6ZOZ', '6ZP0', '6ZP1', '6ZP2', '6ZP5', '6ZP7', '6ZWV', '6ZXN', '7A25', '7A29', '7A4N', '7A5R', '7A5S', '7A91', '7A92', '7A93', '7A94', '7A95', '7A96', '7A97', '7A98', '7AD1', '7AKD', '7B0B', '7B14', '7B17', '7B18', '7B3O', '7B62', '7BEH', '7BEI', '7BEJ', '7BEK', '7BEL', '7BEM', '7BEN', '7BEO', '7BEP', '7BH9', '7BNM', '7BNN', '7BNO', '7BNV', '7BWJ', '7BYR', '7BZ5', '7C01', '7C2L', '7C53', '7C8D', '7C8J', '7C8V', '7C8W', '7CAB', '7C

 # Why so many different PDB Structures for a Single Accession ID ?

## 1. Different Experimental Methods  
### Proteins are studied using different structural biology techniques, such as:

* X-ray Crystallography → High-resolution structure but requires crystallization.
* Cryo-Electron Microscopy (Cryo-EM)  → Captures proteins in more native states.
* Nuclear Magnetic Resonance (NMR) Spectroscopy  → Useful for small, flexible proteins in solution.
* Each method can yield slightly different structures, leading to multiple PDB entries for the same UniProt protein.

## 2.Multiple Conformations  
### For example, the SARS-CoV-2 Spike Protein (P0DTC2) has:
* Open conformation (binding to human ACE2 receptor).
* Closed conformation (before binding).
* Different functional states lead to multiple PDB structures

## 3.Mutations and Variants 
## Example: the SARS-CoV-2 Spike Protein (P0DTC2) has: 
* The Omicron variant spike protein
* Wuhan strain
* More

## 4.Protein Interactions
### PDB structures can contain the protein alone (monomeric form) or bunded to:
* Other proteins (e.g., receptor binding complexes).
* Drugs or inhibitors (important in drug discovery).

# Exercize: 
### 1. Downloads a single [PDB](https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format)) structure file from the [RCSB Protein Data Bank](https://www.rcsb.org/) after retrieving the PDB ID from UniProt.

In [3]:

pdb_database = "https://files.rcsb.org/download/"

# Select the first PDB ID found
if len(pdb_ids)>0: 
    pdb_id = pdb_ids[1]
else:
    None 
 
if pdb_id:
    pdb_url = pdb_database + pdb_id + ".pdb"
    response = requests.get(pdb_url)
    # Check if the request was successful
    if response.status_code == 200:
      # Open a new file in write mode and save the PDB content
        with open(pdb_id + ".pdb", "w") as f:
            f.write(response.text)  # Write the response text (PDB file content) to the file
        print(f"PDB file {pdb_id}.pdb downloaded successfully!")
    else:
      # Print an error message if the PDB file could not be retrieved
        print(f"Failed to download PDB file for {pdb_id}. Status code: {response.status_code}")


PDB file 6LXT.pdb downloaded successfully!


### 2. Visualize the protein's 3D structure using [py3Dmol](https://www.insilicochemistry.io/tutorials/foundations/chemistry-visualization-with-py3dmol#h.7bo0u5g308as), a library for rendering molecular structures in an interactive 3D view.

In [4]:
import py3Dmol  # Import the py3Dmol library for 3D molecular visualization

# Load PDB content
if pdb_id:
    pdb_file = f"{pdb_id}.pdb"
    
    with open(pdb_file, "r") as f:
        pdb_data = f.read()

    # Create 3D viewer
    view = py3Dmol.view(width=800, height=600)
    view.addModel(pdb_data, "pdb")

    # Apply cartoon style
    view.setStyle({"cartoon": {"color": "spectrum"}})

    # Adjust the zoom level to fit the structure within the view
    view.zoomTo()
    # Render and display the 3D molecular structure
    view.show()


## What Is This?
 SARS-CoV-2 Spike Glycoprotein (S2 Subunit). It plays a key role in viral fusion and entry into human cells. The displayed structure represents the post-fusion state, meaning it shows the conformation after the virus has fused with the host cell.
 The S2 subunit is mainly composed of Alpha-helices (helical coiled-coil region, which provides structural stability.


In [5]:
import py3Dmol  # Import the py3Dmol library for 3D molecular visualization

# Define PDB ID (Change this to your desired protein)
pdb_id = "6LXT"  # Example: SARS-CoV-2 Spike protein PDB ID

# Construct the URL to fetch the PDB file from the RCSB PDB database
pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"

# Send an HTTP GET request to download the PDB file
response = requests.get(pdb_url)

# Check if the request was successful (HTTP status code 200)
if response.status_code == 200:
    pdb_data = response.text  # Store the downloaded PDB file as a string
    print(f"PDB file {pdb_id} retrieved successfully!")  # Confirm successful retrieval
else:
    print(f"Failed to download PDB file for {pdb_id}. Status code: {response.status_code}")  
    exit()  # Terminate the script if the PDB file cannot be retrieved



PDB file 6LXT retrieved successfully!


### 3. Processes a PDB file, extracts atomic coordinates, stores them in a list of dictionaries, and visualizes the protein structure using py3Dmol.

In [6]:
# Initialize an empty list to store atom information
atoms = []

# Process each line in the PDB file
for line in pdb_data.split("\n"):  
    if line.startswith("ATOM"):  # Extract only lines that describe atomic coordinates
        atom_number = int(line[6:11].strip())  # Extract the atom serial number
        atom_name = line[12:16].strip()  # Extract the atom name (e.g., "CA", "N", "O")
        residue_name = line[17:20].strip()  # Extract the residue name (e.g., "ALA", "GLY")
        chain_id = line[21].strip()  # Extract the chain ID (e.g., "A", "B")
        x = float(line[30:38].strip())  # Extract the X-coordinate of the atom
        y = float(line[38:46].strip())  # Extract the Y-coordinate of the atom
        z = float(line[46:54].strip())  # Extract the Z-coordinate of the atom

       # Append the extracted atomic information as a dictionary to the atoms list
atoms.append({
    "atom_number": atom_number,  # Unique serial number of the atom
    "atom_name": atom_name,  # Atom type (e.g., "CA" for alpha carbon, "O" for oxygen)
    "residue_name": residue_name,  # Residue to which the atom belongs (e.g., "ALA", "GLY")
    "chain_id": chain_id,  # Protein chain identifier (e.g., "A", "B")
    "x": x,  # X-coordinate in 3D space
    "y": y,  # Y-coordinate in 3D space
    "z": z   # Z-coordinate in 3D space
})

# Print the first 5 atoms in the protein structure
print("First 5 atoms in the protein:")
for atom in atoms[:5]:  # Slice the list to print only the first 5 atoms
    print(atom)
    view.setStyle({"stick": {"colorscheme": "element"}})  # Color atoms based on element type



# Zoom and show visualization
view.zoomTo()
view.show()


First 5 atoms in the protein:
{'atom_number': 5210, 'atom_name': 'OE2', 'residue_name': 'GLU', 'chain_id': 'F', 'x': -13.639, 'y': 3.225, 'z': -59.926}


### Example: How This Atom Appears in a PDB File
## ATOM  22814  OG  SER C 134     220.152  209.659  117.730  1.00 20.00           O
## 22814 → Atom serial number.
## OG → Oxygen gamma (part of Serine's functional group).
## SER → Residue (Serine).
## C → Chain identifier.
## 134 → Residue sequence number.
## 220.152, 209.659, 117.730 → X, Y, Z 3D coordinates.

In [7]:
### Color code 

#Carbon (C) → Gray
#Oxygen (O) → Red
#Nitrogen (N) → Blue


## Exercise
### Pack everything to functions and use a for looop to show all the protein of the pdb database.


In [8]:
if pdb_ids:
    pdb_url = pdb_database + pdb_ids[0] + ".pdb" 
    
    response = requests.get(pdb_url)
    # Check if the request was successful
  
    # Create 3D viewer
    view = py3Dmol.view(width=800, height=600)
    view.addModel(response.text, "pdb")

    # Apply cartoon style
    view.setStyle({"cartoon": {"color": "spectrum"}})

    # Adjust the zoom level to fit the structure within the view
    view.zoomTo()
     
    # Render and display the 3D molecular structure
    view.show() 
 

In [9]:

def molecular_show(mol, style={"cartoon": {"color": "spectrum"}}): 
    # Create 3D viewer
    view = py3Dmol.view(width=800, height=600)
    view.addModel(mol, "pdb")

    # Apply cartoon style
    view.setStyle(style)

    # Adjust the zoom level to fit the structure within the view
    view.zoomTo()
     
    # Render and display the 3D molecular structure
    return view
    

pdb_url = pdb_database + pdb_ids[0] + ".pdb" 
response = requests.get(pdb_url)

viewer = molecular_show(response.text) 
viewer.show() 

    

In [11]:
from ipywidgets import interact, IntSlider, Dropdown

n_pdbs = 16  # max is len(pdb_ids)

structures = []
for i, pdb_id in enumerate(pdb_ids[:n_pdbs]):
    print(f'pdd_id {i}', end="\r")
    pdb_file_url = pdb_database + pdb_id + ".pdb" 
    structures.append(requests.get(pdb_file_url).text) 

def conf_viewer(idx, style):
    viewer = molecular_show(structures[idx], style) 
    print(pdb_ids[idx]) 
    return viewer.show()

interact(conf_viewer, idx=IntSlider(min=0,max=len(structures)-1, step=1), 
         style=Dropdown(options=['line', 'stick', 'sphere', 'cartoon'],
                        value='line', description='Style:'))


pdd_id 15

interactive(children=(IntSlider(value=0, description='idx', max=15), Dropdown(description='Style:', options=('…

<function __main__.conf_viewer(idx, style)>