# Scop3P

A comprehensive database of human phosphosites within their full context. Scop3P integrates sequences (UniProtKB/Swiss-Prot), structures (PDB), and uniformly reprocessed phosphoproteomics data (PRIDE) to annotate all known human phosphosites. 

Scop3P, available at https://iomics.ugent.be/scop3p, presents a unique resource for visualization and analysis of phosphosites and for understanding of phosphosite structure–function relationships.

Please cite: https://doi.org/10.1021/acs.jproteome.0c00306

## Install Dependencies

In [1]:
%%capture
!pip install bokeh pandas matplotlib b2btools==3.0.7b2

In [1]:
%%capture
import tempfile
import requests
import pandas as pd 
import py3Dmol
from b2bTools import SingleSeq, constants
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool

## Target protein

In [2]:
# Accession of the protein to analyse; the cells below pass it to both the
# UniProt sequence request and the Scop3P modifications request.
TARGET_PROTEIN_ID = "P07949" # Write here the Protein ID of your protein of interest
# NOTE(review): no cell visible in this notebook export reads PDB_ID —
# presumably intended for a py3Dmol structure view; confirm before relying on it.
PDB_ID = "2IVS" # Write here the PDB ID of your protein of interest

## API Request

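The functions below make GET requests for a given protein accession ID: `fetch_sequence_aminoacids` downloads the FASTA record from UniProt and returns its header line and the protein sequence as strings, and `fetch_protein_modifications` queries the Scop3P API and returns the annotated modifications as parsed JSON.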

In [3]:
def fetch_sequence_aminoacids(accession, timeout=30):
    """
    Download the FASTA record of a UniProtKB entry and split it into header and sequence.

    Parameters:
    accession (str): UniProt accession of the protein.
    timeout (float): Seconds to wait for the server before requests raises a
        timeout error instead of blocking the notebook indefinitely.

    Returns:
    tuple: (protein_id, amino_acids). protein_id is the first line of the FASTA
    record (the header, including its leading ">"); amino_acids is the remaining
    lines joined into one string. Both are empty strings when the server does not
    answer with HTTP 200 or returns an empty body.
    """
    # The accession is already part of the path, so no query string is needed.
    url = f"https://rest.uniprot.org/uniprotkb/{accession}.fasta"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return "", ""

    # splitlines() + strip() also copes with "\r\n" line endings, which would
    # otherwise leave stray "\r" characters inside the sequence.
    lines = [line.strip() for line in response.content.decode("utf-8").splitlines()]
    if not lines:
        return "", ""

    protein_id = lines[0]
    amino_acids = "".join(lines[1:])

    return protein_id, amino_acids

def fetch_protein_modifications(accession, timeout=30):
    """
    Fetches protein modifications for a given UniProt ID from the Scop3P API.

    Parameters:
    accession (str): UniProt ID of the protein.
    timeout (float): Seconds to wait for the server before requests raises a
        timeout error instead of blocking the notebook indefinitely.

    Returns:
    dict or None: The parsed JSON body of the response when the server answers
    with HTTP 200, otherwise None. Callers must check for None before indexing.
    """
    BASE_URL = "https://iomics.ugent.be/scop3p/api/modifications"
    # Let requests build (and URL-encode) the query string.
    response = requests.get(
        BASE_URL,
        params={'accession': accession},
        headers={'accept': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        return None
    return response.json()


## Data parsing

To parse the JSON response into a table, we'll use the pandas library.

In [4]:
scop3P_results = fetch_protein_modifications(TARGET_PROTEIN_ID)
_protein_id, sequence = fetch_sequence_aminoacids(TARGET_PROTEIN_ID)

# Both fetchers report a failed request with an empty value (None / "") rather
# than raising, so stop here with a clear message instead of failing later with
# an unrelated-looking error.
if scop3P_results is None:
    raise RuntimeError(f"Scop3P returned no data for accession {TARGET_PROTEIN_ID!r}.")
if not sequence:
    raise RuntimeError(f"UniProt returned no sequence for accession {TARGET_PROTEIN_ID!r}.")

# Fields of the Scop3P JSON response used by the rest of the notebook.
protein_name = scop3P_results['proteinName']
entry_name = scop3P_results['entryName']
accession = scop3P_results['accession']
url = scop3P_results['url']
modifications = scop3P_results['modifications']

print("""
--------------------------------------------------------------------------------
Scop3P: A Comprehensive Resource of Human Phosphosites within Their Full Context
--------------------------------------------------------------------------------

{0}:{1}

Phospho-sites found: {3} entries.

Full entry available on SCOP3P website: {2}
""".format(entry_name, protein_name, url, len(modifications)))


--------------------------------------------------------------------------------
Scop3P: A Comprehensive Resource of Human Phosphosites within Their Full Context
--------------------------------------------------------------------------------

RET_HUMAN:Proto-oncogene tyrosine-protein kinase receptor Ret (EC 2.7.10.1) (Cadherin family member 12) (Proto-oncogene c-Ret) [Cleaved into: Soluble RET kinase fragment; Extracellular cell-membrane anchored RET cadherin 120 kDa fragment]

Phospho-sites found: 9 entries.

Full entry available on SCOP3P website: https://iomics.ugent.be/scop3p/index?protein=P07949



In [22]:
def get_modifications_table(modifications):
    """
    Builds a pandas DataFrame holding selected fields of the protein modifications.

    Parameters:
    modifications (list): A list of dictionaries, each representing a protein modification.

    Returns:
    pandas.DataFrame: One row per modification with the columns 'residue',
    'name', 'evidence', 'position', 'source', 'reference', 'functionalScore'
    and 'specificSinglyPhosphorylated', in that order. Any other keys are
    dropped. An empty input yields an empty table that still has these columns,
    and a field missing from the records becomes a column of missing values.
    """
    columns = [
        'residue',
        'name',
        'evidence',
        'position',
        'source',
        'reference',
        'functionalScore',
        'specificSinglyPhosphorylated',
    ]
    # Selecting the columns in the constructor (instead of indexing afterwards)
    # keeps the call from raising KeyError when there are no modifications.
    return pd.DataFrame(modifications, columns=columns)
    
# Build the table once; later cells read its 'position' column to mark the
# phosphosites on the sequence and in the plots.
modifications_table = get_modifications_table(modifications)
# NOTE(review): display is only imported further down the notebook; this relies
# on the name that IPython/Jupyter sessions provide by default.
display(modifications_table)

## Predict biophysical features

In [6]:
%%capture
def predict_biophysical_features(accession, sequence):
    """
    Run the b2bTools predictors (DynaMine, DisoMine, EFoldMine) on one sequence.

    Parameters:
    accession (str): Identifier written as the FASTA header of the temporary input file.
    sequence (str): Amino-acid sequence to predict on.

    Returns:
    The object returned by ``get_all_predictions()``; the notebook reads it as
    ``result['proteins'][accession]``.
    """
    import os  # local import: os is not among the notebook's top-level imports

    # Write the FASTA file into a private temporary directory and close it
    # before handing the path to b2bTools. Reopening a still-open
    # NamedTemporaryFile by name is platform dependent (it fails on Windows).
    with tempfile.TemporaryDirectory(prefix="seq_") as tmp_dir:
        fasta_path = os.path.join(tmp_dir, "input.fasta")
        with open(fasta_path, "w") as fasta_file:
            fasta_file.write(f">{accession}\n{sequence}\n")

        tools = [constants.TOOL_DYNAMINE, constants.TOOL_DISOMINE, constants.TOOL_EFOLDMINE]
        # Predict while the directory still exists; it is removed on exit.
        pred = SingleSeq(fasta_path).predict(tools=tools).get_all_predictions()

    return pred

# Predict on the unmodified (wild-type) sequence fetched from UniProt.
biophysical_features = predict_biophysical_features(TARGET_PROTEIN_ID, sequence)

# TARGET_PROTEIN_ID was written as the FASTA header, and the results are looked
# up under that same key (presumably b2bTools keys proteins by header — confirm).
biophysical_features_target_protein = biophysical_features['proteins'][TARGET_PROTEIN_ID]

In [7]:
# Positions of the annotated phosphosites (compared against 1-based residue numbers below).
phosphorylated = list(modifications_table['position'])
# Set copy for constant-time membership tests while walking the whole sequence.
phosphosite_positions = set(phosphorylated)

wt_features = biophysical_features_target_protein  # alias of the same dict, not a copy
n_residues = len(wt_features['seq'])

# Backbone dynamics class: 1 (> 1), 2 (> 0.8), 3 (> 0.69), otherwise 4.
wt_features['BD_label'] = [
    1 if value > 1 else 2 if value > 0.8 else 3 if value > 0.69 else 4
    for value in wt_features['backbone'][:n_residues]
]

# Disorder flag: 1 when the DisoMine value exceeds 0.5.
wt_features['DO_label'] = [
    1 if value > 0.5 else 0
    for value in wt_features['disoMine'][:n_residues]
]

# Early-folding flag: 1 when the value exceeds 0.169.
wt_features['EF_label'] = [
    1 if value > 0.169 else 0
    for value in wt_features['earlyFolding'][:n_residues]
]

# Phosphosite flag per residue; residue numbers start at 1.
wt_features['psites'] = [
    1 if position in phosphosite_positions else 0
    for position in range(1, n_residues + 1)
]
    

### Get the dynamic properties in a pandas DataFrame object

In [8]:
# Tabulate the wild-type predictions and labels held in the dict.
dynamic_properties = pd.DataFrame(biophysical_features_target_protein)
# Add the 1-based sequence position; the plots use it as the x axis.
wt_row_count = len(dynamic_properties)
dynamic_properties['seqpos'] = range(1, wt_row_count + 1)

## Plotting results

To visualize the dynamic properties, we will use the Bokeh interactive visualization package.

In [9]:
# Send Bokeh output to the notebook.
output_notebook()

# Wide, short figure with the toolbar below the plot.
p = figure(width=1000, height=300,tools = "pan,box_zoom,reset",toolbar_location="below",
           toolbar_sticky=False)
p.title.text = 'Biophysical properties'


# One line per predictor, plotted against the 1-based sequence position.
# muted_color/muted_alpha define how a line looks once muted via the legend.
l1=p.line(dynamic_properties.seqpos, dynamic_properties.backbone, line_width=2, color='blue', alpha=0.8,
        muted_color='blue', muted_alpha=0.2, legend_label='backbone_dynamics')

l2=p.line(dynamic_properties.seqpos, dynamic_properties.disoMine, line_width=2, color='red', alpha=0.8,
        muted_color='red', muted_alpha=0.2, legend_label='disorder')

l3=p.line(dynamic_properties.seqpos, dynamic_properties.earlyFolding, line_width=2, color='grey', alpha=0.8,
        muted_color='grey', muted_alpha=0.2, legend_label='earlyFolding')

# Phosphosites are drawn as dots at a constant y of 0.5; only their x position
# (the modified residue) carries information.
l4=p.scatter(modifications_table.position,[0.5]*len(modifications_table.position),\
             fill_alpha=0.6, color='grey',size=10,legend_label='P-sites')

# Two hover tools: the lines report position and value, the P-site dots only
# the position (their y value is the fixed 0.5).
p.add_tools(HoverTool(tooltips="Seqpos:@x, value:@y", renderers=[l1,l2,l3]))
p.add_tools(HoverTool(tooltips="Seqpos:@x", renderers=[l4]))
# p.add_tools(BoxZoomTool(match_aspect=False))


# Clicking a legend entry mutes its glyph; the legend is laid out to the right
# of the plot.
legend=p.legend[0]
p.legend.click_policy="mute"
p.add_layout(legend, 'right')


## Visualize the plots and interpret them

In [23]:
from bokeh.models import Div
from bokeh.layouts import column

# HTML reading guide shown under the figure. The cut-offs quoted in the text are
# the same ones used to compute BD_label / DO_label / EF_label in this notebook.
# NOTE(review): newer Bokeh releases may expect `styles=` rather than `style=`
# on Div — confirm against the installed version.
notes = Div(
    width=1000,
    style={"padding": "10px"},
    text="""
    <div style="font-family: Arial; font-size: 12px; line-height: 1.4;">

      <b>How to read the tracks</b><br><br>

      <b style="color: blue;">Backbone dynamics</b>:
      <span style="color: blue;">
        &gt;1.0 membrane-spanning,
        0.8–1.0 rigid,
        0.69–0.80 context-dependent,
        &lt;0.69 flexible
      </span>
      <br><br>

      <b style="color: red;">Disorder (DisoMine)</b>:
      <span style="color: red;">
        values &gt;0.50 indicate disordered regions
      </span>
      <br><br>

      <b style="color: grey;">Early folding</b>:
      <span style="color: grey;">
        values &gt;0.169 suggest early-folding propensity
      </span>
      <br><br>

      <b style="color: grey;">P-sites</b>:
      <span style="color: grey;">
        phosphorylation positions (grey dots)
      </span>

    </div>
    """
)

# Stack the wild-type figure above the notes and render both.
show(column(p, notes))


## Mutation analysis on Target protein

We will mutate amino acids at one or more positions and
predict the biophysical properties of the mutated sequence.

In [11]:
# Global store (used later by inference tables)
MUTATIONS = []

def mutateSeq_and_store(sequence, store_global=True):
    """
    Ask once for mutation positions + mutant AAs and apply them to ``sequence``.

    Parameters:
      sequence (str): the original amino-acid sequence.
      store_global (bool): when True, also store the mutations in the global
        MUTATIONS so later cells can reuse them without asking again.

    Returns:
      mutated_sequence (str)
      mutations (list of tuples): [(pos_1based, mutAA), ...] in the order entered.
        A position entered more than once appears more than once in this list;
        in the mutated sequence the last entry for that position wins.

    Raises:
      ValueError: if the number of positions and amino acids differ, an amino
        acid is not a standard 1-letter code, a position is not an integer, or
        a position lies outside 1..len(sequence).
    """
    global MUTATIONS

    pos_raw = input("Please enter the positions to mutate (comma-separated, 1-based): ").strip()
    aa_raw  = input("Please enter the aminoacid(s) to mutate TO (comma-separated): ").strip()

    # Empty items (e.g. from a trailing comma) are ignored.
    poslis = [p.strip() for p in pos_raw.split(",") if p.strip()]
    aalis  = [a.strip().upper() for a in aa_raw.split(",") if a.strip()]

    if len(poslis) != len(aalis):
        raise ValueError("Number of positions and amino acids must match.")

    standardAA = set("ARNDCEQGHILKMFPSTWYV")
    bad = [a for a in aalis if a not in standardAA]
    if bad:
        raise ValueError(f"Some amino acid(s) not recognized: {bad}. Use standard 1-letter codes.")

    # Collect every non-numeric entry so the user gets one readable message
    # instead of int()'s "invalid literal" error for the first offender.
    positions = []
    not_integers = []
    for token in poslis:
        try:
            positions.append(int(token))
        except ValueError:
            not_integers.append(token)
    if not_integers:
        raise ValueError(f"Position(s) must be integers: {not_integers}")

    L = len(sequence)

    # Validate positions
    out_of_range = [p for p in positions if p < 1 or p > L]
    if out_of_range:
        raise ValueError(f"Position(s) out of range (1..{L}): {out_of_range}")

    # Apply mutations (supports multiple)
    mut_seq_list = list(sequence)
    mutations = list(zip(positions, aalis))

    # If user repeats a position, last one wins
    for pos, aa in mutations:
        mut_seq_list[pos - 1] = aa

    mutated_sequence = "".join(mut_seq_list)

    if store_global:
        MUTATIONS = mutations

    # Quick summary; the wild-type residue is read from the original sequence.
    summary = ", ".join([f"{sequence[p-1]}{p}{aa}" for p, aa in mutations])
    print("Applied mutations:", summary)

    return mutated_sequence, mutations




## Mutate here
Now enter the positions you want to mutate and the amino acids that should replace the residues at those positions.

In [24]:
# Prompts for positions and target amino acids. With the default
# store_global=True the function already stores the list in the global
# MUTATIONS; the assignment here binds the same returned list again.
mutated_sequence, MUTATIONS = mutateSeq_and_store(sequence)

### Predict Biophysical properties for the mutated sequence

In [13]:
%%capture

# Re-run the predictors on the mutated sequence, using the same FASTA header as
# the wild-type run so the result is found under the same key.
biophysical_features_mut = predict_biophysical_features(TARGET_PROTEIN_ID, mutated_sequence)

biophysical_features_mut_protein = biophysical_features_mut['proteins'][TARGET_PROTEIN_ID]

mut_features = biophysical_features_mut_protein  # alias of the same dict, not a copy
n_mut_residues = len(mut_features['seq'])

# Backbone dynamics class: 1 (> 1), 2 (> 0.8), 3 (> 0.69), otherwise 4.
mut_features['BD_label'] = [
    1 if value > 1 else 2 if value > 0.8 else 3 if value > 0.69 else 4
    for value in mut_features['backbone'][:n_mut_residues]
]

# Disorder flag: 1 when the DisoMine value exceeds 0.5.
mut_features['DO_label'] = [
    1 if value > 0.5 else 0
    for value in mut_features['disoMine'][:n_mut_residues]
]

# Early-folding flag: 1 when the value exceeds 0.169.
mut_features['EF_label'] = [
    1 if value > 0.169 else 0
    for value in mut_features['earlyFolding'][:n_mut_residues]
]
    

### Convert biophysical properties into a pandas DataFrame

In [14]:
# Tabulate the mutant predictions and labels held in the dict.
dynamic_properties_mut = pd.DataFrame(biophysical_features_mut_protein)
# Add the 1-based sequence position, matching the wild-type table's 'seqpos'.
mut_row_count = len(dynamic_properties_mut)
dynamic_properties_mut['seqpos'] = range(1, mut_row_count + 1)

### Plot results of mutated sequence
Let's plot the Biophysical properties for the Original and Mutated sequences

To visualize the dynamic properties, we will use the Bokeh interactive visualization package.

In [25]:
# Send Bokeh output to the notebook.
output_notebook()

# Same layout as the wild-type figure; this one overlays wild type and mutant.
p_mut = figure(width=1000, height=300,tools = "pan,box_zoom,reset",toolbar_location="below",
           toolbar_sticky=False)
p_mut.title.text = 'Biophysical properties'


# Each predictor is drawn twice: the wild type (dynamic_properties) in a light
# shade and the mutant (dynamic_properties_mut) in a darker shade.
b1=p_mut.line(dynamic_properties.seqpos, dynamic_properties.backbone, line_width=2, color='skyblue', alpha=0.8,
        muted_color='skyblue', muted_alpha=0.2, legend_label='backbone_dynamics')

b2=p_mut.line(dynamic_properties_mut.seqpos, dynamic_properties_mut.backbone, line_width=2, color='blue', alpha=0.8,
        muted_color='blue', muted_alpha=0.2, legend_label='backbone_mut')


d1=p_mut.line(dynamic_properties.seqpos, dynamic_properties.disoMine, line_width=2, color='salmon', alpha=0.8,
        muted_color='salmon', muted_alpha=0.2, legend_label='disorder')
d2=p_mut.line(dynamic_properties_mut.seqpos, dynamic_properties_mut.disoMine, line_width=2, color='red', alpha=0.8,
        muted_color='red', muted_alpha=0.2, legend_label='disorder_mut')

e1=p_mut.line(dynamic_properties.seqpos, dynamic_properties.earlyFolding, line_width=2, color='grey', alpha=0.8,
        muted_color='grey', muted_alpha=0.2, legend_label='earlyFolding')
e2=p_mut.line(dynamic_properties_mut.seqpos, dynamic_properties_mut.earlyFolding, line_width=2, color='black', alpha=0.8,
        muted_color='black', muted_alpha=0.2, legend_label='earlyFolding_mut')

# Phosphosites as dots at a constant y of 0.5; only the x position is meaningful.
# Note: this rebinds l4, the name used for the P-site renderer of the first figure.
l4=p_mut.scatter(modifications_table.position,[0.5]*len(modifications_table.position),\
             fill_alpha=0.6, color='grey',size=10,legend_label='P-sites')

# Lines report position and value on hover; the P-site dots only the position.
p_mut.add_tools(HoverTool(tooltips="Seqpos:@x, value:@y", renderers=[b1,b2,d1,d2,e1,e2]))
p_mut.add_tools(HoverTool(tooltips="Seqpos:@x", renderers=[l4]))


# Clicking a legend entry mutes its glyph; the legend is laid out to the right
# of the plot.
legend=p_mut.legend[0]
p_mut.legend.click_policy="mute"
p_mut.add_layout(legend, 'right')


In [16]:
from bokeh.models import Div
from bokeh.layouts import column

# HTML reading guide for the comparison figure. The cut-offs quoted in the text
# are the same ones used to compute BD_label / DO_label / EF_label in this notebook.
# NOTE(review): newer Bokeh releases may expect `styles=` rather than `style=`
# on Div — confirm against the installed version.
notes = Div(
    width=1000,
    style={"padding": "10px"},
    text="""
    <div style="font-family: Arial; font-size: 12px; line-height: 1.4;">

      <b>How to read the tracks</b><br><br>

      <b style="color: blue;">Backbone dynamics</b>:
      <span style="color: blue;">
        &gt;1.0 membrane-spanning,
        0.8–1.0 rigid,
        0.69–0.80 context-dependent,
        &lt;0.69 flexible
      </span>
      <br><br>

      <b style="color: red;">Disorder (DisoMine)</b>:
      <span style="color: red;">
        values &gt;0.50 indicate disordered regions
      </span>
      <br><br>

      <b style="color: grey;">Early folding</b>:
      <span style="color: grey;">
        values &gt;0.169 suggest early-folding propensity
      </span>
      <br><br>

      <b style="color: grey;">P-sites</b>:
      <span style="color: grey;">
        phosphorylation positions (grey dots)
      </span>

    </div>
    """
)



In [26]:
# Stack the wild-type vs. mutant figure above the notes and render both.
show(column(p_mut,notes))

### Save the plot as svg, png, or jpg

In [None]:
# export_svgs is not among the notebook's earlier imports, so import it here.
from bokeh.io import export_svgs

# Export the wild-type vs. mutant comparison figure (p_mut). Use `p` instead to
# export the wild-type-only figure.
# NOTE: Bokeh's export functions need selenium and a browser webdriver; for a
# PNG, bokeh.io.export_png(p_mut, filename="plot.png") works the same way.
p_mut.output_backend = "svg"
export_svgs(p_mut, filename="plot.svg")

### Inference (What do the mutation predictions tell us about the biophysical properties?)

In [18]:
import pandas as pd
from IPython.display import display

# --- Label functions based on your cutoffs ---

def label_backbone(x: float) -> str:
    """Return the backbone-dynamics class name for a predicted value.

    Cut-offs are exclusive lower bounds checked from the highest down:
    above 1.0 "membrane-spanning", above 0.8 "rigid", above 0.69
    "context-dependent", anything else "flexible".
    """
    thresholds = (
        (1.0, "membrane-spanning"),
        (0.8, "rigid"),
        (0.69, "context-dependent"),
    )
    for cutoff, class_name in thresholds:
        if x > cutoff:
            return class_name
    return "flexible"

def label_disorder(x: float) -> str:
    """Return "disordered" for values strictly above 0.50, otherwise "ordered"."""
    if x > 0.50:
        return "disordered"
    return "ordered"

def label_earlyfold(x: float) -> str:
    """Return "early-folding" for values strictly above 0.169, otherwise "low"."""
    if x > 0.169:
        return "early-folding"
    return "low"

# Maps a feature column name of the prediction tables to a pair:
# (function turning a value into a class label, display name used in the
# generated column names and inference text).
LABEL_FUNCS = {
    "backbone": (label_backbone, "Backbone"),
    "disoMine": (label_disorder, "Disorder"),
    "earlyFolding": (label_earlyfold, "EarlyFolding"),
}

def mutation_effect_table_with_label_shift(
    wt_df: pd.DataFrame,
    mut_df: pd.DataFrame,
    feature: str,
    window: int = 5,
    mutations=None,   # uses MUTATIONS by default
):
    """
    Summarise, per mutation, how one predicted feature and its class label change.

    Parameters:
    wt_df (pd.DataFrame): wild-type table; must contain 'seqpos', 'seq' and ``feature``.
    mut_df (pd.DataFrame): mutant table with the same columns.
    feature (str): one of the keys of LABEL_FUNCS.
    window (int): number of positions on each side of the mutated site included
        in the window mean (clipped to 1..highest wild-type seqpos).
    mutations (list of tuples): [(pos_1based, mutAA), ...]; when None the
        global MUTATIONS is used.

    Returns:
    pd.DataFrame: one row per mutation with the value and label at the site, the
    window means and their labels, the differences (mutant minus wild type), a
    text summary and a note. Numeric columns are rounded to 3 decimals. A
    position missing from either table yields a placeholder row whose inference
    is "Position out of range".

    Raises:
    ValueError: if ``feature`` is not a key of LABEL_FUNCS.
    """
    if mutations is None:
        mutations = MUTATIONS

    if feature not in LABEL_FUNCS:
        raise ValueError(f"feature must be one of {list(LABEL_FUNCS.keys())}")
    classify, display_name = LABEL_FUNCS[feature]

    def describe_shift(before, after):
        # "a → b" when the class changes, otherwise the class plus a marker.
        if before == after:
            return f"{before} (no class change)"
        return f"{before} → {after}"

    def window_mean(table, first, last):
        # .loc slices by seqpos label here, so both ends are inclusive.
        return float(table.loc[first:last, feature].astype(float).mean())

    # Index both tables by sequence position but keep 'seqpos' as a column too.
    wild_type = wt_df.set_index("seqpos", drop=False)
    mutant = mut_df.set_index("seqpos", drop=False)
    last_position = int(wild_type["seqpos"].max())

    records = []
    for position, new_aa in mutations:
        present_in_both = position in wild_type.index and position in mutant.index
        if not present_in_both:
            records.append({
                "pos": position, "WT_AA": "", "Mut_AA": new_aa,
                "mutation": f"?{position}{new_aa}", "window": "",
                "WT_label@pos": "", "Mut_label@pos": "",
                "WT_label_mean": "", "Mut_label_mean": "",
                "Δ@pos": None, "Δ_mean": None,
                "inference": "Position out of range", "note": ""
            })
            continue

        original_aa = str(wild_type.loc[position, "seq"])
        window_start = max(1, position - window)
        window_end = min(last_position, position + window)

        # Value at the mutated site.
        wt_site = float(wild_type.loc[position, feature])
        mut_site = float(mutant.loc[position, feature])
        delta_site = mut_site - wt_site

        # Mean over the surrounding window.
        wt_window = window_mean(wild_type, window_start, window_end)
        mut_window = window_mean(mutant, window_start, window_end)
        delta_window = mut_window - wt_window

        wt_site_label = classify(wt_site)
        mut_site_label = classify(mut_site)
        wt_window_label = classify(wt_window)
        mut_window_label = classify(mut_window)

        # The inference text focuses on label transitions.
        site_shift = describe_shift(wt_site_label, mut_site_label)
        window_shift = describe_shift(wt_window_label, mut_window_label)
        inference = (
            f"{display_name}: {site_shift} at site (Δ {delta_site:+.3f}); "
            f"window mean: {window_shift} (Δ {delta_window:+.3f})"
        )

        # Sanity check: the mutant table should carry the requested residue.
        residue_in_mutant = str(mutant.loc[position, "seq"])
        if residue_in_mutant.upper() != new_aa.upper():
            note = f"Mut seq AA={residue_in_mutant} (expected {new_aa})"
        else:
            note = ""

        records.append({
            "pos": position,
            "WT_AA": original_aa,
            "Mut_AA": new_aa,
            "mutation": f"{original_aa}{position}{new_aa}",
            "window": f"{window_start}-{window_end}",

            f"{display_name}_WT@pos": wt_site,
            f"{display_name}_Mut@pos": mut_site,
            "WT_label@pos": wt_site_label,
            "Mut_label@pos": mut_site_label,
            "Δ@pos": delta_site,

            f"{display_name}_WT_mean(±{window})": wt_window,
            f"{display_name}_Mut_mean(±{window})": mut_window,
            "WT_label_mean": wt_window_label,
            "Mut_label_mean": mut_window_label,
            "Δ_mean": delta_window,

            "inference": inference,
            "note": note
        })

    df = pd.DataFrame(records)

    # Round every numeric column (recognised by its name) to 3 decimals.
    numeric_markers = ("_WT@", "_Mut@", "_WT_mean", "_Mut_mean", "Δ@pos", "Δ_mean")
    numeric_columns = [
        column for column in df.columns
        if any(marker in column for marker in numeric_markers)
    ]
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce").round(3)

    return df


In [27]:
# Backbone label shifts
# window=5 averages positions pos-5..pos+5, clipped to the sequence range.
mutation_effect_table_with_label_shift(dynamic_properties, dynamic_properties_mut, feature="backbone", mutations=MUTATIONS, window=5)


In [28]:
# Disorder label shifts
# window=5 averages positions pos-5..pos+5, clipped to the sequence range.
mutation_effect_table_with_label_shift(dynamic_properties, dynamic_properties_mut, feature="disoMine", mutations=MUTATIONS, window=5)

In [29]:
# Early-folding label shifts
# window=5 averages positions pos-5..pos+5, clipped to the sequence range.
mutation_effect_table_with_label_shift(dynamic_properties, dynamic_properties_mut, feature="earlyFolding", mutations=MUTATIONS, window=5)