# GPT2 for Directed or Undirected Antibody Generation

In [None]:
from helpers import api_caller

# NOTE(review): empty placeholder -- no later visible cell reads TOKEN (or api_caller);
# confirm whether this cell is still needed. Never commit a real token here.
TOKEN = ''

In [36]:
from pandarallel import pandarallel
# Set up pandarallel (provides the `parallel_apply` used further down):
# 4 workers, with a progress bar shown while the work runs.
pandarallel.initialize(progress_bar=True, nb_workers=4)

import pandas as pd
import os, sys
import requests
import json
import datetime
import urllib3

# Make the repository's src/ directory (two levels above the working directory)
# importable; it holds the utility module for interacting with the BioLM API.
repo_root = os.path.join(os.getcwd(), os.pardir, os.pardir)
src_dir = os.path.join(repo_root, 'src')
if not any(entry == src_dir for entry in sys.path):
    sys.path.append(src_dir)
    
import biolm_util

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [57]:
# Fetch API tokens via biolm_util.
# Have env vars 'BIOLM_USER' and 'BIOLM_PASS' set for this to work!
tok = biolm_util.get_api_token()

# Keep both tokens in the environment so later cells can read them back.
for env_var, tok_field in (('BIOLM_ACCESS', 'access'), ('BIOLM_REFRESH', 'refresh')):
    os.environ[env_var] = tok[tok_field]

In [6]:
M = 10  # Minute delay to refresh token
tmp_now = datetime.datetime.now()
# Deadline after which generate_gpt2_cv2_hchain() fetches a fresh API token.
# This is already a module-level name, so no `global` declaration is needed
# here; declaring it after the assignment is rejected with a SyntaxError by
# Python 3.6+ when the cell is compiled as one unit (e.g. exported to a script).
refresh_in = tmp_now + datetime.timedelta(minutes=M)


def generate_gpt2_cv2_hchain(seed_seq=None):
    """POST to create new GPT2 antibodies from fine-tuned SARS-CoV-2 immune responses.

    Before the request is sent, the access/refresh tokens kept in the
    'BIOLM_ACCESS' / 'BIOLM_REFRESH' environment variables are renewed if the
    module-level `refresh_in` deadline has passed.

    Args:
        seed_seq: Sequence prefix sent as the generation prompt. Any falsy
            value (None, '') results in an empty prompt being sent.

    Returns:
        The value stored under ['predictions']['generated'] in the JSON
        response. In the sample call shown in this notebook that is a list of
        dicts with 'text' and 'perplexity' keys.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    global refresh_in

    url = "https://biolm.ai/api/v1/models/gpt2_sarscovd_heavy/generate/"

    seed_seq = seed_seq or ''

    # Tokens are renewed every M minutes; push the deadline forward afterwards.
    if datetime.datetime.now() > refresh_in:
        tok = biolm_util.get_api_token()
        os.environ['BIOLM_ACCESS'] = tok['access']
        os.environ['BIOLM_REFRESH'] = tok['refresh']
        refresh_in = datetime.datetime.now() + datetime.timedelta(minutes=M)

    payload = json.dumps({
        "instances": [
            {"data": {"text": seed_seq}}
        ]
    })

    # The tokens are passed to the API as cookies.
    headers = {
        'Cookie': 'access={};refresh={}'.format(os.environ['BIOLM_ACCESS'], os.environ['BIOLM_REFRESH']),
        'Content-Type': 'application/json'
    }

    response = requests.post(url, headers=headers, data=payload, timeout=480)

    # Fail loudly on auth/server errors instead of hitting an opaque KeyError
    # (or JSON decode error) on the error body below.
    response.raise_for_status()

    resp_json = response.json()

    return resp_json['predictions']['generated']

In [7]:
# Smoke test: a single request seeded with the prefix 'EVQ'.
resp = generate_gpt2_cv2_hchain('EVQ')

# Display what came back for this one request.
resp

[{'text': 'EVQMELSSLRSEDTAVYYCARLPPTAMANYFDYWGQGTLVTVSS',
  'perplexity': 546.3818136109178},
 {'text': 'EVQISGYAQGGSSLWIWGGFSVDKATLTRLISSLGHYYSLKASMERPWVFSRDHP',
  'perplexity': 359.5568987430205},
 {'text': 'EVQTFSGMTRGYFQHWGQGTLVTVSS', 'perplexity': 10547.178905954976},
 {'text': 'EVQVDTSEDTAVYYCARAKYDSSGYYDFDLWGQGTQVTVSS',
  'perplexity': 794.4173610839085},
 {'text': 'EVQMDPSLKSRVTISADKSKNQFSLRLTSVTAADTAVYYCARPHPYCYSYGSGEYNWFDPWGQGTLVTVSS',
  'perplexity': 153.0960441547317},
 {'text': 'EVQMELSCAASGFAFTNSAMHWVRQAPGKGLEWVSGIRYVGKSGTYYADSVKGRFTISRDNAKDSGYLFLQVNSLRAEDTAVYYCARVSGYYDILTGNSLFDYWGQGTLVTVSS',
  'perplexity': 80.10001054464183},
 {'text': 'EVQTLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAADTAVYFCVRSDSSGWNAFDIWGLGRGTVVTVSS',
  'perplexity': 28.18934858102672},
 {'text': 'EVQIIPVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGWVTMTRDTSTMTLQMNNMDPVDTATYYCARNYGYYFDTYWGQGTLVTVSS',
  'perplexity': 26.263950567275604},
 {'text': 'EVQVSSSDSLVVISVDTSKAHEIRA

In [8]:
# One row per planned API call; every row carries the same 'EVQ' seed.
df = pd.DataFrame({'seed_seq': ['EVQ'] * 10000})

In [9]:
def apply_gen_abs(seed_seq, max_perplexity=125.0, max_len=256):
    """Generate antibodies for one seed and keep the low-perplexity, short-enough ones.

    Args:
        seed_seq: Prompt passed through to `generate_gpt2_cv2_hchain`.
        max_perplexity: Keep rows whose 'perplexity' is at most this value.
        max_len: Keep rows whose 'text' is at most this many characters long.

    Returns:
        A DataFrame with the surviving generations, re-indexed from 0. If the
        generator returns nothing, an empty frame with 'text' and 'perplexity'
        columns is returned so that a later `pd.concat` still works.
    """
    g = generate_gpt2_cv2_hchain(seed_seq)
    _d = pd.DataFrame(g)
    if _d.empty:
        # No rows means no columns either; filtering on them would raise.
        return pd.DataFrame(columns=['text', 'perplexity'])
    keep = (_d['perplexity'] <= max_perplexity) & (_d['text'].str.len() <= max_len)
    return _d.loc[keep, :].reset_index(drop=True)

In [10]:
# Only the first 2,500 seeds are used; each one is passed to apply_gen_abs,
# so the result holds one filtered DataFrame per seed.
generated_seq_dfs = df.seed_seq.iloc[:2500].parallel_apply(apply_gen_abs)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=625), Label(value='0 / 625'))), HB…

In [11]:
# Stack the per-seed frames into one table and record each sequence's length.
generated_seqs = (
    pd.concat(generated_seq_dfs.tolist(), axis=0)
    .assign(len=lambda frame: frame['text'].str.len())
)

In [12]:
# The 10 generations with the lowest perplexity.
generated_seqs.sort_values('perplexity', ascending=True).head(10)

Unnamed: 0,text,perplexity,len
0,EVQTLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGST...,10.49419,113
0,EVQVSSSGPTLVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKA...,10.755682,119
1,EVQVSSPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALE...,10.948971,121
1,EVQVSSASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYN...,11.02318,119
0,EVQVSSPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALE...,11.135759,116
2,EVQVSSPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALE...,11.754165,122
1,EVQVSSPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALE...,11.781453,120
3,EVQTVSRDNSKPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPP...,11.869919,128
3,EVQFDPWGQGTLVTVSSASVKVSCKASGYTFTGYYMHWVRQAPGQG...,11.978776,120
4,EVQVSSKNVAPNWTRPAYFSVPLDKSFRKYTAFTIPSTNNETPGIR...,12.084831,141


In [13]:
# Random peek at 10 rows; no random_state is set, so the rows differ on every run.
generated_seqs.sample(10)

Unnamed: 0,text,perplexity,len
1,EVQVSSPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRRPPGKALE...,45.633365,117
0,EVQTLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGST...,29.09335,112
5,EVQMELSCAASGFTFSSYWMSWIRQAPGQGLEWVSVIYSGGDTYYA...,28.239773,100
3,EVQTLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGST...,20.990061,108
0,EVQTLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGST...,17.170283,109
3,EVQTLTLTCTVSGGSISSSNWWSWVRQPPGKGLEWIGEIGHSGSTN...,31.193839,103
1,EVQTLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNY...,28.300893,116
0,EVQTLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGEIYHSGST...,17.034665,103
5,EVQMSSDSLRLSCAASGFIFSNSAIHWVRQAPGKGLEWVSTVYAST...,74.415393,109
3,EVQVSSASVKVSCKASGYTFTDNWIGWVRQAPGQGLEWVGWINPNT...,20.603018,111


In [14]:
# Save the table to a CSV in the working directory; the row index is not written.
generated_seqs.to_csv('generated_sars_cov2_ab_seqs.csv', index=False)

In [62]:
# (number of rows, number of columns) of the filtered generations.
generated_seqs.shape

(9506, 3)

The `perplexity` measure is correlated with similarity to known molecules: the lower the value, the more likely the sequence folds into something real. There are ~9.5k sequences with a `perplexity <= 125.0`, to be further ranked and selected using _other_ models now.

## Rank with ESM-1v & Other Evaluations

In order to pull out likely functional sequences, we could also score these with ESM-1v - or any ESM flavor - since those models were trained on functional sequences only. See [In silico Deep Mutational Scan](../inference/Deep_Mutational_Scan_Proteins.ipynb) for more info.


We could also see how close the low-perplexity generated sequences are to those in the test set, for example by aligning them to, or calculating Levenshtein distances from, the antibodies in the test set. We could also number the antibodies so we can assess their CDR loops compared to known SARS-CoV-2 antibodies. And of course there are many other evaluations we could make, which will be up to you.