In [22]:
import pandas
import numpy
import seaborn
import logging
from matplotlib import pyplot

import mhcflurry

# Report which MHCflurry release this notebook was executed against.
print("MHCflurry version: {}".format(mhcflurry.__version__))

MHCflurry version: 1.0.0


# Download data and models

In [2]:
# Shell out to the mhcflurry-downloads command-line tool to fetch downloads for
# the installed release; its report lists each download and whether it is
# already present locally.
!mhcflurry-downloads fetch

Fetching 0/6 downloads from release 1.0.0
DOWNLOAD NAME                             ALREADY DOWNLOADED?    WILL DOWNLOAD NOW?    URL                  
models_class1                             YES                    NO                    http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1.tar.bz2 
models_class1_experiments1                NO                     NO                    http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2 
cross_validation_class1                   NO                     NO                    http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2 
data_iedb                                 NO                     NO                    https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 
data_kim2014                              NO                     NO                    http://github.com/hammerlab/mhcflurry/re

# Making predictions with `Class1AffinityPredictor`

In [3]:
# Print the built-in documentation for the high-level predictor class.
help(mhcflurry.Class1AffinityPredictor)

Help on class Class1AffinityPredictor in module mhcflurry.class1_affinity_predictor:

class Class1AffinityPredictor(builtins.object)
 |  High-level interface for peptide/MHC I binding affinity prediction.
 |  
 |  This is the class most users will want to use.
 |  
 |  This class delegates to one or more `Class1NeuralNetwork` instances.
 |  It supports prediction across multiple alleles using ensembles of single-
 |  or pan-allele predictors.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, allele_to_pseudosequence=None, manifest_df=None, allele_to_percent_rank_transform=None)
 |      Parameters
 |      ----------
 |      allele_to_allele_specific_models : dict of string -> list of Class1NeuralNetwork
 |          Ensemble of single-allele models to use for each allele. 
 |      
 |      class1_pan_allele_models : list of Class1NeuralNetwork
 |          Ensemble of pan-allele models.
 |      
 |      allele_to_p

In [4]:
# Load a predictor without naming a models directory, i.e. from the default
# downloaded models.
downloaded_predictor = mhcflurry.Class1AffinityPredictor.load()

In [5]:
# Predict for two peptides against a single allele; the result is an array with
# one value per peptide.
downloaded_predictor.predict(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])

Using TensorFlow backend.


array([ 4899.04784343,  5685.25682682])

In [6]:
# Same query, returned as a DataFrame that also carries prediction_low,
# prediction_high and prediction_percentile columns.
downloaded_predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high,prediction_percentile
0,HLA-A0201,SIINFEKL,4899.047843,2767.763654,7269.683643,6.509787
1,HLA-A0201,SIINFEQL,5685.256827,3815.923563,7476.714466,7.436687


In [7]:
# With `alleles` (plural) each peptide is paired with the allele at the same
# position, as the output rows show.
downloaded_predictor.predict_to_dataframe(alleles=["HLA-A0201", "HLA-B*57:01"], peptides=["SIINFEKL", "SIINFEQL"])

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high,prediction_percentile
0,HLA-A0201,SIINFEKL,4899.047942,2767.763654,7269.683643,6.509787
1,HLA-B*57:01,SIINFEQL,26704.220115,23198.059394,30635.11461,47.611925


In [8]:
# Also emit one column per ensemble member (the model_single_N columns in the
# output) next to the combined prediction.
downloaded_predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SIINFEQL"],
    allele="HLA-A0201",
    include_individual_model_predictions=True,
)

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high,model_single_0,model_single_1,model_single_2,model_single_3,model_single_4,model_single_5,model_single_6,model_single_7,prediction_percentile
0,HLA-A0201,SIINFEKL,4899.047843,2767.763654,7269.683643,5245.313773,4131.368053,4599.034976,7350.344042,2230.774145,6754.462616,4220.768251,7122.22673,6.509787
1,HLA-A0201,SIINFEQL,5685.256827,3815.923563,7476.714466,5711.583293,5718.509616,5459.967973,7709.914896,3394.800793,7062.179615,4741.49566,6983.180148,7.436687


In [9]:
# Without throw=False, you'll get a ValueError for invalid peptides or alleles.
# With it, the unsupported 16-mer below is kept as a row with empty predictions.
downloaded_predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SIINFEQL", "TAAAALANGGGGGGGG"],
    allele="HLA-A0201",
    throw=False,
)

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high,prediction_percentile
0,HLA-A0201,SIINFEKL,4899.047843,2767.763654,7269.683643,6.509787
1,HLA-A0201,SIINFEQL,5685.256827,3815.923563,7476.714466,7.436687
2,HLA-A0201,TAAAALANGGGGGGGG,,,,100.0


# Instantiating a `Class1AffinityPredictor` from a saved model on disk

In [10]:
# Resolve the on-disk location of the "models" entry inside the "models_class1"
# download, then display it.
models_dir = mhcflurry.downloads.get_path("models_class1", "models")
models_dir

'/Users/tim/Library/Application Support/mhcflurry/4/1.0.0/models_class1/models'

In [11]:
# This will be the same predictor we instantiated above. We're just being explicit about what models to load.
downloaded_predictor = mhcflurry.Class1AffinityPredictor.load(models_dir)
# Score three peptides against a single allele with the explicitly loaded predictor.
downloaded_predictor.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"], allele="HLA-A0301")

array([ 28227.29890915,  26568.72745054,  39043.95304442])

# Fit a model: first load some data

In [12]:
# This is the data the downloaded models were trained on.
# Resolve the path of the curated training CSV inside the "data_curated"
# download, then display it.
data_path = mhcflurry.downloads.get_path("data_curated", "curated_training_data.csv.bz2")
data_path

'/Users/tim/Library/Application Support/mhcflurry/4/1.0.0/data_curated/curated_training_data.csv.bz2'

In [13]:
# Read the curated training data; pandas infers the bz2 compression from the
# file extension.
data_df = pandas.read_csv(data_path)
# Preview the first rows only instead of asking the notebook to render the
# entire training frame.
data_df.head(10)

Unnamed: 0,allele,peptide,measurement_value,measurement_type,measurement_source,original_allele
0,BoLA-1*21:01,AENDTLVVSV,7817.0,quantitative,Barlow - purified MHC/competitive/fluorescence,BoLA-1*02101
1,BoLA-1*21:01,NQFNGGCLLV,1086.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-1*02101
2,BoLA-2*08:01,AAHCIHAEW,21.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
3,BoLA-2*08:01,AAKHMSNTY,1299.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
4,BoLA-2*08:01,DSYAYMRNGW,2.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
5,BoLA-2*08:01,HTTNTQNNDW,40.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
6,BoLA-2*08:01,KVYANIAPTY,10000.0,quantitative,Barlow - purified MHC/competitive/fluorescence,BoLA-2*00801
7,BoLA-2*08:01,KVYNPPRTNY,393.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
8,BoLA-2*08:01,LAAKHMSNT,1380.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
9,BoLA-2*08:01,LLVAMVPEW,2.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801


# Fit a model: Low level `Class1NeuralNetwork` interface

In [14]:
# We'll use mostly the default hyperparameters here. Could also specify them as kwargs.
# Only layer_sizes is overridden in this call.
new_model = mhcflurry.Class1NeuralNetwork(layer_sizes=[16])
# Display the full hyperparameter dict the model ended up with.
new_model.hyperparameters

{'activation': 'relu',
 'batch_normalization': False,
 'dense_layer_l1_regularization': 0.001,
 'dense_layer_l2_regularization': 0.0,
 'dropout_probability': 0.0,
 'early_stopping': True,
 'embedding_init_method': 'glorot_uniform',
 'embedding_input_dim': 21,
 'embedding_output_dim': 8,
 'init': 'glorot_uniform',
 'kmer_size': 15,
 'layer_sizes': [16],
 'left_edge': 4,
 'locally_connected_layers': [{'activation': 'tanh',
   'filters': 8,
   'kernel_size': 3}],
 'loss': 'mse',
 'max_epochs': 500,
 'min_delta': 0,
 'minibatch_size': 128,
 'mode': 'auto',
 'monitor': 'val_loss',
 'optimizer': 'rmsprop',
 'output_activation': 'sigmoid',
 'patience': 10,
 'peptide_amino_acid_encoding': 'one-hot',
 'pseudosequence_use_embedding': False,
 'random_negative_affinity_max': 50000.0,
 'random_negative_affinity_min': 20000.0,
 'random_negative_constant': 25,
 'random_negative_distribution_smoothing': 0.0,
 'random_negative_match_distribution': True,
 'random_negative_rate': 0.0,
 'right_edge': 4,
 

In [16]:
train_data = data_df.loc[
    (data_df.allele == "HLA-B*57:01") &
    (data_df.peptide.str.len() >= 8) &
    (data_df.peptide.str.len() <= 15)
]
% time new_model.fit(train_data.peptide.values, train_data.measurement_value.values)

Train on 2491 samples, validate on 623 samples
Epoch 1/1
Epoch   0 / 500: loss=0.0173307. Min val loss (None) at epoch None
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Early stopping at epoch  12 / 500: loss=0.0168712. Min val loss (0.0261514389179) at epoch 1
CPU times: user 1.92 s, sys: 167 ms, total: 2.09 s
Wall time: 1.72 s


In [17]:
# Predict for a single peptide with the freshly fit network; no allele argument
# is passed at this level.
new_model.predict(["SYNPEPII"])

array([ 26802.58186135])

# Fit a model: high level `Class1AffinityPredictor` interface

In [18]:
# Start from a predictor constructed with no models and train into it.
affinity_predictor = mhcflurry.Class1AffinityPredictor()

# This can be called any number of times, for example on different alleles, to build up the ensembles.
# Here a single model (n_models=1) is fit for one allele, capped at 10 epochs;
# the call returns the list of networks it trained (see the output).
affinity_predictor.fit_allele_specific_predictors(
    n_models=1,
    architecture_hyperparameters={"layer_sizes": [16], "max_epochs": 10},
    peptides=train_data.peptide.values,
    affinities=train_data.measurement_value.values,
    allele="HLA-B*57:01",
)

Train on 2491 samples, validate on 623 samples
Epoch 1/1
Epoch   0 /  10: loss=0.251054. Min val loss (None) at epoch None
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1
Train on 2491 samples, validate on 623 samples
Epoch 1/1


[<mhcflurry.class1_neural_network.Class1NeuralNetwork at 0x124ad7d30>]

In [19]:
# Query the newly trained predictor for the allele it was just fit on.
affinity_predictor.predict(["SYNPEPII"], allele="HLA-B*57:01")

array([ 17405.26823281])

# Save and restore the fit model

In [20]:
!mkdir /tmp/saved-affinity-predictor
affinity_predictor.save("/tmp/saved-affinity-predictor")
!ls /tmp/saved-affinity-predictor

manifest.csv
weights_HLA-B*57:01-0-7be58b1094489f2d.npz


In [21]:
# Reload the saved predictor from disk into a second object and repeat the
# earlier query; the output matches the value predicted before saving.
affinity_predictor2 = mhcflurry.Class1AffinityPredictor.load("/tmp/saved-affinity-predictor")
affinity_predictor2.predict(["SYNPEPII"], allele="HLA-B*57:01")

array([ 17405.26823281])