In [1]:
import pandas
import numpy
import seaborn
import logging
from matplotlib import pyplot

import mhcflurry

# Record the installed MHCflurry release so the outputs below can be matched to it.
print("MHCflurry version: %s" % mhcflurry.__version__)

Using Theano backend.


MHCflurry version: 0.9.0


# Download data and models

In [2]:
# Shell out to the mhcflurry-downloads command-line tool to fetch the default
# downloads (the models and datasets used in the rest of this notebook).
!mhcflurry-downloads fetch

Using Theano backend.
Fetching 0/4 downloads from release 1.0.0
DOWNLOAD NAME                             ALREADY DOWNLOADED?    WILL DOWNLOAD NOW?    URL                  
models_class1                             YES                    NO                    http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/models_class1.tar.bz2 
data_curated                              YES                    NO                    https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_curated.tar.bz2 
data_kim2014                              YES                    NO                    http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/data_kim2014.tar.bz2 
data_iedb                                 YES                    NO                    https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_iedb.tar.bz2 


# Making predictions with `Class1AffinityPredictor`

In [3]:
# Print the built-in documentation for the high-level predictor class.
help(mhcflurry.Class1AffinityPredictor)

Help on class Class1AffinityPredictor in module mhcflurry.class1_affinity_prediction.class1_affinity_predictor:

class Class1AffinityPredictor(builtins.object)
 |  High-level interface for peptide/MHC I binding affinity prediction.
 |  
 |  This is the class most users will want to use.
 |  
 |  This class delegates to one or more `Class1NeuralNetwork` instances.
 |  It supports prediction across multiple alleles using ensembles of single-
 |  or pan-allele predictors.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, allele_to_pseudosequence=None, manifest_df=None)
 |      Parameters
 |      ----------
 |      allele_to_allele_specific_models : dict of string -> list of Class1NeuralNetwork
 |          Ensemble of single-allele models to use for each allele. 
 |      
 |      class1_pan_allele_models : list of Class1NeuralNetwork
 |          Ensemble of pan-allele models.
 |      
 |      allele_to_pseudosequenc

In [4]:
# load() with no arguments: no model directory is given here.
# NOTE(review): presumably this falls back to the downloaded "models_class1" models - confirm.
downloaded_predictor = mhcflurry.Class1AffinityPredictor.load()

In [5]:
# Predict for two peptides against a single allele; the result is a float array
# with one entry per peptide.
downloaded_predictor.predict(
    peptides=["SIINFEKL", "SIINFEQL"],
    allele="HLA-A0201",
)

array([ 6029.07861328,  4798.79443359], dtype=float32)

In [6]:
# Same query as above, returned as a table: one row per peptide with the
# prediction plus prediction_low / prediction_high columns.
downloaded_predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SIINFEQL"],
    allele="HLA-A0201",
)

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high
0,HLA-A0201,SIINFEKL,6029.078613,4474.103253,7771.292885
1,HLA-A0201,SIINFEQL,4798.794434,3089.979654,6757.660606


In [7]:
# With `alleles=` (plural) the lists are matched position by position: the
# first peptide is scored against the first allele, the second against the
# second. The result has one row per (allele, peptide) pair, not a cross product.
downloaded_predictor.predict_to_dataframe(alleles=["HLA-A0201", "HLA-B*57:01"], peptides=["SIINFEKL", "SIINFEQL"])

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high
0,HLA-A0201,SIINFEKL,6029.080248,4474.103332,7771.29555
1,HLA-B*57:01,SIINFEQL,26494.172574,23963.167585,28407.840921


In [8]:
# Ask for the per-model predictions as well: the table gains one
# model_single_<i> column for each model behind the combined prediction.
downloaded_predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SIINFEQL"],
    allele="HLA-A0201",
    include_individual_model_predictions=True,
)

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high,model_single_0,model_single_1,model_single_2,model_single_3,model_single_4,model_single_5,model_single_6,model_single_7
0,HLA-A0201,SIINFEKL,6029.078613,4474.103253,7771.292885,6342.672852,5768.515625,6045.880371,6485.166016,4922.885742,4249.643066,7165.508301,8118.428711
1,HLA-A0201,SIINFEQL,4798.794434,3089.979654,6757.660606,4940.594727,5556.735352,4746.480469,4656.201172,3591.689453,2849.514893,6637.148926,6823.45459


In [9]:
# The third peptide cannot be scored. With throw=False its row is still
# returned, but the prediction columns are left empty instead of an error
# being raised.
# NOTE(review): presumably it is rejected for its length (16 characters) - confirm.
downloaded_predictor.predict_to_dataframe(
    allele="HLA-A0201",
    peptides=["SIINFEKL", "SIINFEQL", "TAAAALANGGGGGGGG"],
    throw=False)  # Without throw=False, you'll get a ValueError for invalid peptides or alleles

Unnamed: 0,allele,peptide,prediction,prediction_low,prediction_high
0,HLA-A0201,SIINFEKL,6029.07975,4474.103332,7771.292208
1,HLA-A0201,SIINFEQL,4798.795518,3089.980068,6757.66013
2,HLA-A0201,TAAAALANGGGGGGGG,,,


# Instantiating a `Class1AffinityPredictor` from a saved model on disk

In [10]:
# Resolve the local path of the "models" entry inside the "models_class1"
# download, then display it.
models_dir = mhcflurry.downloads.get_path("models_class1", "models")
models_dir

'/Users/tim/Library/Application Support/mhcflurry/4/1.0.0/models_class1/models'

In [11]:
# Loads the same predictor as before; the only difference is that the model
# directory is now passed explicitly instead of relying on the default.
downloaded_predictor = mhcflurry.Class1AffinityPredictor.load(models_dir)
downloaded_predictor.predict(
    peptides=["SIINFEKL", "SIQNPEKP", "SYNFPEPI"],
    allele="HLA-A0301",
)

array([ 25589.67773438,  29587.88476562,  35768.203125  ], dtype=float32)

# Fit a model: first load some data

In [12]:
# Path to the curated training data (a bz2-compressed CSV) from the
# "data_curated" download. This is the data the downloaded models were trained on.
data_path = mhcflurry.downloads.get_path("data_curated", "curated_training_data.csv.bz2")
data_path

'/Users/tim/Library/Application Support/mhcflurry/4/1.0.0/data_curated/curated_training_data.csv.bz2'

In [13]:
# Read the curated training measurements into a DataFrame.
data_df = pandas.read_csv(data_path)
# Preview only the first rows rather than rendering the whole table.
data_df.head(10)

Unnamed: 0,allele,peptide,measurement_value,measurement_type,measurement_source,original_allele
0,BoLA-1*21:01,AENDTLVVSV,7817.0,quantitative,Barlow - purified MHC/competitive/fluorescence,BoLA-1*02101
1,BoLA-1*21:01,NQFNGGCLLV,1086.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-1*02101
2,BoLA-2*08:01,AAHCIHAEW,21.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
3,BoLA-2*08:01,AAKHMSNTY,1299.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
4,BoLA-2*08:01,DSYAYMRNGW,2.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
5,BoLA-2*08:01,HTTNTQNNDW,40.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
6,BoLA-2*08:01,KVYANIAPTY,10000.0,quantitative,Barlow - purified MHC/competitive/fluorescence,BoLA-2*00801
7,BoLA-2*08:01,KVYNPPRTNY,393.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
8,BoLA-2*08:01,LAAKHMSNT,1380.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801
9,BoLA-2*08:01,LLVAMVPEW,2.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-2*00801


# Fit a model: low-level `Class1NeuralNetwork` interface

In [14]:
# Only layer_sizes is overridden here; every other hyperparameter keeps its
# default. Others could be passed as keyword arguments in the same way.
new_model = mhcflurry.Class1NeuralNetwork(layer_sizes=[16])
# Display the full set of hyperparameters in effect for this network.
new_model.hyperparameters

{'activation': 'relu',
 'batch_normalization': False,
 'dense_layer_l1_regularization': 0.001,
 'dense_layer_l2_regularization': 0.0,
 'dropout_probability': 0.0,
 'early_stopping': True,
 'embedding_init_method': 'glorot_uniform',
 'embedding_input_dim': 21,
 'embedding_output_dim': 8,
 'init': 'glorot_uniform',
 'kmer_size': 15,
 'layer_sizes': [16],
 'left_edge': 4,
 'locally_connected_layers': [{'activation': 'tanh',
   'filters': 8,
   'kernel_size': 3},
  {'activation': 'tanh', 'filters': 8, 'kernel_size': 3}],
 'loss': 'mse',
 'max_epochs': 500,
 'min_delta': 0,
 'mode': 'auto',
 'monitor': 'val_loss',
 'optimizer': 'rmsprop',
 'output_activation': 'sigmoid',
 'patience': 10,
 'pseudosequence_use_embedding': False,
 'random_negative_affinity_max': 50000.0,
 'random_negative_affinity_min': 20000.0,
 'random_negative_constant': 25,
 'random_negative_distribution_smoothing': 0.0,
 'random_negative_match_distribution': True,
 'random_negative_rate': 0.0,
 'right_edge': 4,
 'take_bes

In [15]:
train_data = data_df.ix[
    (data_df.allele == "HLA-B*57:01") &
    (data_df.peptide.str.len() >= 8) &
    (data_df.peptide.str.len() <= 15)
]
% time new_model.fit(train_data.peptide.values, train_data.measurement_value.values)

Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate

In [16]:
# The low-level network takes no allele argument here: it is called with
# peptides only, after being fit on data for a single allele.
new_model.predict(["SYNPEPII"])

array([ 25132.52929688], dtype=float32)

# Fit a model: high-level `Class1AffinityPredictor` interface

In [17]:
# Start from a predictor constructed without any models.
affinity_predictor = mhcflurry.Class1AffinityPredictor()

# Fit one small network for HLA-B*57:01. Repeat this call as often as needed
# (for example once per allele) to keep growing the ensembles.
affinity_predictor.fit_allele_specific_predictors(
    allele="HLA-B*57:01",
    peptides=train_data.peptide.values,
    affinities=train_data.measurement_value.values,
    architecture_hyperparameters=dict(layer_sizes=[16], max_epochs=10),
    n_models=1,
)

Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1
Train on 2489 samples, validate on 623 samples
Epoch 1/1


<generator object Class1AffinityPredictor._fit_predictors at 0x1252b5fc0>

In [18]:
# Query the freshly fit predictor; unlike the low-level network, the
# high-level interface needs to be told which allele to predict for.
affinity_predictor.predict(["SYNPEPII"], allele="HLA-B*57:01")

array([ 25200.29882812], dtype=float32)

# Save and restore the fitted model

In [19]:
!mkdir /tmp/saved-affinity-predictor
affinity_predictor.save("/tmp/saved-affinity-predictor")
!ls /tmp/saved-affinity-predictor

mkdir: /tmp/saved-affinity-predictor: File exists
manifest.csv
weights_HLA-B*57:01-0-7d2f64641ccdd312.npz
weights_HLA-B*57:01-0-93498abc9bbd5291.npz
weights_HLA-B*57:01-0-9e5317fef54dc1f6.npz


In [20]:
# Round-trip check: reload the predictor from the saved directory and repeat
# the earlier query. The prediction should match the one from the in-memory
# predictor.
affinity_predictor2 = mhcflurry.Class1AffinityPredictor.load("/tmp/saved-affinity-predictor")
affinity_predictor2.predict(["SYNPEPII"], allele="HLA-B*57:01")

array([ 25200.29882812], dtype=float32)