In [1]:
from pyProBound.scoring import ProBoundModel

# Load model

### Load model from motifcentral. 

These models generally have only one binding mode.

In [2]:
# Alternative loader: fetch a model from motifcentral instead of reading a local fit file.
# Left commented out; the rest of this notebook uses the model loaded from "fit.sox2.json".
# NOTE(review): 1000 is presumably the motifcentral model identifier -- TODO confirm.
# model = ProBoundModel(1000, motifcentral=True) 

### Load model from probound training output.

Training is available from the ProBound web server. The following model contains three binding modes (the first being non-specific binding).

In [3]:
# Load the complete model from a ProBound training output file (fit JSON).
# No binding mode is selected here, so the scoring cells below report every
# binding mode of the model (three for this file, the first being non-specific).
# NOTE(review): the relative path presumably resolves against the notebook's
# working directory -- confirm that fit.sox2.json is shipped next to the notebook.
model = ProBoundModel("fit.sox2.json", fitjson=True)

### Load model from probound training output and select a particular binding mode

In [4]:
# Alternative: restrict the model to a single binding mode (here mode 1) at load time.
# Left commented out so that `model` keeps all binding modes for the cells below.
# model = ProBoundModel("fit.sox2.json", fitjson=True, bindingMode=1)

It is also possible to load the entire model and select a binding mode afterwards, or to change the selected binding mode later. However, once a binding mode has been selected, it is not possible to return to scoring all binding modes at once or to score a larger subset of binding modes.

In [5]:
# Alternative: load the full model first, then narrow it to one binding mode.
# Left commented out so that the cells below still score all binding modes.
# model = ProBoundModel("fit.sox2.json", fitjson=True)  
# model.select_binding_mode(1)

# Define input sequences.

We will now define some test sequences. The input can be a list or a numpy array, but not a pandas Series (not supported yet).

In [6]:
# Test DNA sequences, written one per line; str.split() with no arguments
# splits on the newlines and yields a plain Python list of 12 strings.
# The set mixes lengths (25 to 33 nt) and contains one sequence three times.
seqs = """
AAAAGACGACTGCGGTCACTGAGGTGTAAA
ACTGTTTGCTCTATGCGGAGGAGCCCCTTA
AAAAGACGACTGCGGTCACTGAGGTGTA
TTAACTGGGTATAGGGGCGAATATGGCGAC
TTAGCCGGGAGGGGGCGCTCCGTAGTGGAT
TTAGCCGGGAGGGGGCGCTCCGTAGTGGATAAA
ATAGTAGTCGTGCGCCCCCACTGGTGACAA
TGTTCCTTGCTTTTATAAGGTAAATGCAGG
ATAGTAGTCGTGCGCCCCCACTGGTGACAAAAA
AAAAGACGACTGCGGTCACTGAGGTGTA
AAAAGACGACTGCGGTCACTGAGGTGTA
AAAAGACGACTGCGGTCACTGTGTA
""".split()

# Calculate affinity sum for each sequence

Returns a numpy array of size (number of sequences) X 1

In [7]:
# Score every sequence with the loaded model; one affinity sum per sequence.
# The result is a numpy array with one row per input sequence and a single column,
# so identical input sequences (entries 2, 9 and 10 of `seqs`) get identical rows.
result = model.score_affinity_sum(seqs)
# Bare last expression so the notebook displays the array.
result

array([[0.01215323],
       [0.01563752],
       [0.00787506],
       [0.01265492],
       [0.00506125],
       [0.00732187],
       [0.00685346],
       [0.07234593],
       [0.00948732],
       [0.00787506],
       [0.00787506],
       [0.01180993]])

# Calculate a total binding mode score for each sequence and selected binding mode

Available score_format options: "sum", "max", "mean"

Returns a numpy array of size (number of sequences) X (number of binding modes)

In [8]:

# Per-binding-mode total score for each sequence. score_format="sum" is one of the
# options listed above ("sum", "max", "mean").
# The result has one row per sequence and one column per binding mode of the model.
result = model.score_binding_mode_scores(seqs, score_format="sum")
# Bare last expression so the notebook displays the array.
result

array([[3.80000000e+01, 1.87767557e-03, 2.07118962e-01],
       [3.80000000e+01, 2.51163174e-03, 2.64799981e-01],
       [3.40000000e+01, 1.75157922e-03, 1.23181782e-01],
       [3.80000000e+01, 1.52430415e-03, 2.24422594e-01],
       [3.80000000e+01, 8.66304463e-04, 8.40678798e-02],
       [4.40000000e+01, 9.35627296e-04, 1.28282413e-01],
       [3.80000000e+01, 7.83029277e-04, 1.22021354e-01],
       [3.80000000e+01, 3.98743258e-02, 6.56291982e-01],
       [4.40000000e+01, 7.91526068e-04, 1.75019956e-01],
       [3.40000000e+01, 1.75157922e-03, 1.23181782e-01],
       [3.40000000e+01, 1.75157922e-03, 1.23181782e-01],
       [2.80000000e+01, 5.59357338e-03, 1.25191203e-01]])

# Analyze input sequences using a PSAM

Using the PSAM of the model, slide through each input sequence and get an array of scores.

Parameter profile_aggregate ("sum"/"max"/"mean"/"forward"/None) specifies how to aggregate the scores from each DNA strand at the same position. If None, scores are not aggregated.

Returns a list of arrays.
- without aggregation: (no. of sequences) X (no. of binding modes) X slides X 2 (forward, reverse)
- with aggregation: (no. of sequences) X (no. of binding modes) X slides

In [9]:
# Positional score profile: slide the model along each sequence instead of
# collapsing it to one number. profile_aggregate="max" controls how the scores
# of the two DNA strands at the same position are combined.
# The result is a list with one array per input sequence.
result = model.score_binding_mode_scores(
    seqs,
    score_format="profile",
    profile_aggregate="max",
)
# Bare last expression so the notebook displays the list of arrays.
result

[array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
         1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
         1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
         1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
         1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
        [2.20362971e-05, 8.74388562e-05, 1.13578652e-08, 2.08892892e-04,
         5.70876605e-05, 1.92854680e-06, 1.96429909e-05, 9.67967193e-08,
         1.52329223e-05, 8.05396263e-07, 3.95909749e-06, 5.12811568e-08,
         2.39034605e-04, 8.44623174e-04, 2.28682124e-06, 1.83143196e-04,
         1.32499644e-06, 1.20042237e-04, 7.97231402e-08],
        [3.21790978e-03, 2.12704465e-02, 1.24631838e-03, 5.56818299e-03,
         5.69560228e-03, 2.01225577e-03, 1.62436022e-03, 7.10758325e-04,
         1.15242364e-02, 8.61105588e-03, 2.90278024e-03, 2.16180544e-03,
         1.06587300e-02, 3.23369867e-03, 2.58635452e-03, 4.61506643e-03,
        