# Predict 5'UTR TISs

Here, we'll use the fine-tuned `uBERTa_classifier` to predict TISs in 5'UTR sequences of the protein-coding genes of the human genome.

## Prerequisites

This notebook requires:
- [DS_BASE.tsv](https://drive.google.com/file/d/1gPjOoxWOAPpfPmKFbQlVT0hncjIpst5E/view?usp=sharing) (`../data/DS_BASE.tsv`)
- [dataset_labeling.tsv](https://drive.google.com/file/d/1z_dQtERIPvf_ZqnGLCNGLx_l6GcKHJZ1/view?usp=sharing) (`../data/dataset_labeling.tsv`)
- [trained_model](https://drive.google.com/file/d/1YAiyZXNzu49GoLaLEw4ocKX_45dRwC3e/view?usp=sharing) (`../models/ws100_step25_ACG_ATC_ATG_ATT_CTG_GTG_pretrain_tokenlevel_signal`)

One can either download or obtain the requirements manually: `prepare_base_dataset.ipynb` for the first two, and `train_uBERTa.ipynb` for the fine-tuned model.

For instance, starting from the project's root:
```bash
gdown --fuzzy https://drive.google.com/file/d/1gPjOoxWOAPpfPmKFbQlVT0hncjIpst5E/view?usp=sharing
gdown --fuzzy https://drive.google.com/file/d/1z_dQtERIPvf_ZqnGLCNGLx_l6GcKHJZ1/view?usp=sharing
tar -xzf DS_BASE.tsv.tar.gz
tar -xzf dataset_labeling.tsv.tar.gz
mkdir -p ../models
cd ../models
gdown --fuzzy https://drive.google.com/file/d/1YAiyZXNzu49GoLaLEw4ocKX_45dRwC3e/view?usp=sharing
tar -xzf ws100_step25_ACG_ATC_ATG_ATT_CTG_GTG_pretrain_tokenlevel_signal.tar.gz
```

In [1]:
import logging
import operator as op
from itertools import chain
from math import ceil
from pathlib import Path

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from more_itertools import sliced
from scipy.stats import pearsonr
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from tqdm.auto import tqdm
from transformers import DistilBertConfig

from uBERTa.loader import uBERTaLoader
from uBERTa.model import uBERTa_classifier, WeightedDistilBertClassifier
from uBERTa.tokenizer import DNATokenizer
from uBERTa.utils import split_values, kmerize

In [2]:
MIN_SEQ_SIZE = 10
WINDOW = 100
STEP = WINDOW // 4
BASE = Path('../data')
BASE.mkdir(exist_ok=True)
MODEL_PATH = Path('../models/ws100_step25_ACG_ATC_ATG_ATT_CTG_GTG_pretrain_tokenlevel_signal/')
DS = BASE / 'DS_BASE.tsv'
DS_LABELS = BASE / 'dataset_labeling.tsv'
STARTS = ('ACG', 'ATC', 'ATG', 'ATT', 'CTG', 'GTG')

In [3]:
logging.basicConfig(level=logging.DEBUG)

In [4]:
def parse_base(path_base, path_labels, min_seq_size):
    """
    Read base dataset, merge gene labels (splitting genes into Train, 
        Test, and Valiation) and filter by seq size
    """
    df = pd.read_csv(path_base, sep='\t')
    df['SeqSize'] = df['Seq'].apply(len)
    print(f'Initial ds: {len(df)}')
    df_labels = pd.read_csv(path_labels, sep='\t')
    df = df.merge(df_labels, on='GeneID', how='left')
    df = df[df.SeqSize >= min_seq_size]
    print(f'Conforming to size threshold: {len(df)}')
    split_values(df, 'SeqEnum')
    split_values(df, 'SeqEnumPositive')
    split_values(df, 'Classes')
    split_values(df, 'Signal', dtype=float)
    return df

def aggregate_predictions(predictions):
    """
    Detach and concatenate raw batch predictions
    """
    y_prob = [x[1].detach().cpu().numpy() for x in predictions]
    y_true = [x[2].detach().cpu().numpy() for x in predictions]
    
    return np.concatenate(y_prob), np.concatenate(y_true)

def safe_take_fst(xs):
    assert len(xs.unique()) == 1
    return xs.iloc[0]

def take_fst(xs):
    return xs.iloc[0]

def unravel_base_ds(path, keep_cols=('GeneID', 'TranscriptID')):
    """
    Unravel sequence data of the base dataset into the codon-per-row format.
    """
    
    def unravel_row(row):
        keep_values = [row[col] for col in keep_cols]
        for i, (codon, en, cls) in enumerate(
            zip(row.Seq.split(), row.SeqEnum, row.Classes)
        ):
            if cls != -100:
                yield (row.Chrom, row.Strand, en, codon, *keep_values)
    
    df = pd.read_csv(path, sep='\t')
    split_values(df, 'SeqEnum')
    df['Seq'] = df['Seq'].apply(lambda s: kmerize(s, 3))
    unraveled = chain.from_iterable(
        map(unravel_row, map(op.itemgetter(1), df.iterrows())))
    # print(next(unraveled))

    return pd.DataFrame(
        unraveled,
        columns=['Chrom', 'Strand', 'Start', 'Codon'] + list(keep_cols))

def unravel_and_group(df, y_prob, y_true, threshold=0.5, pred_agg='mean'):
    """
    Unravel sequences in the dataset to restructure as codon-per-row.
    Aggregate predictions for the same codon.
    """
    def unravel_row(row):
        for i, (codon, en, cls, sig) in enumerate(
            zip(row.Seq.split(), row.SeqEnum, row.Classes, row.Signal)
        ):
            if cls != -100:
                yield row.Chrom, row.Strand, en, codon, cls, sig, row.Dataset
    
    # Unravel the dataset with predictions
    unraveled = map(unravel_row, map(op.itemgetter(1), df.iterrows()))
    _df = pd.DataFrame(
        chain.from_iterable(unraveled), 
        columns=['Chrom', 'Strand', 'Start', 'Codon', 
                 'Label', 'Signal', 'Dataset'])
    
    # This serves as additional sanity check
    # working iff the number of codons match
    _df['y_prob'] = np.squeeze(y_prob[:, 1])  # Take the probability of the positive class
    _df['y_true'] = np.squeeze(y_true)
    
    _df = _df[_df.Start != 0]
    
    # Average the predictions for each start codon across transcripts and windows
    _df = _df.groupby(
        ['Chrom', 'Strand', 'Start', 'Codon'], 
        as_index=False
    ).agg({
        'y_prob': pred_agg, 
        'y_true': safe_take_fst,
        'Signal': take_fst,
        'Dataset': take_fst,
    })
    
    _df['y_pred'] = (_df['y_prob'] > threshold).astype(int)
    
    return _df

def calc_pred_scores(df, threshold=0.5):
    y_prob = df['y_prob'].values
    y_pred = (df['y_prob'].values > threshold).astype(int)
    y_true = df['y_true'].values
    return {
        'f1': f1_score(y_true, y_pred), 
        'prc': precision_score(y_true, y_pred), 
        'rec': recall_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_prob)}

def score(df, threshold):
    scores = {
        codon: calc_pred_scores(group, threshold) 
        for codon, group in df.groupby('Codon')}
    scores['All'] = calc_pred_scores(df, threshold)
    return scores

def split_into_loaders(tds, chunk_size, batch_size = 2 ** 8):
    for tensors in sliced(tds, chunk_size):
        _tds = TensorDataset(*tensors)
        yield DataLoader(
            _tds, 
            sampler=SequentialSampler(_tds), 
            batch_size=2**7,
            num_workers=4)
        
def split_df(df, chunk_size):
    n = ceil(len(df) // chunk_size)
    for i in range(n):
        yield df.iloc[i * chunk_size: (i + 1) * chunk_size]

def predict(loader, ds, model, trainer, threshold, agg_fn):
    predictions = trainer.predict(model, loader)
    y_prob, y_true = aggregate_predictions(predictions)
    df = unravel_and_group(ds, y_prob, y_true, threshold, agg_fn)
    return df

def unravel_scores(scores):
    for ds_name, ds_vs in scores.items():
        for codon_name, codon_scores in ds_vs.items():
            for score_name, score_val in codon_scores.items():
                yield ds_name, codon_name, score_name, score_val
                
def get_color(y_pred, y_true, dataset):
    green, blue, red, black = (
        '0,255,0', '0,0,255', '255,0,0', '0,0,0')
    if dataset == 'Inference':
        if y_pred == 1:
            return green
        return blue
    if y_pred == 1 and y_true == 1:
        return green
    if y_pred == 0 and y_true == 0:
        return blue
    if y_pred == 0 and y_true == 1:
        return red
    return black

def wrap_row(row, ts=0.5):
    label = row.Dataset
    y_pred = int(row.y_prob >= 0.5)
    color = get_color(y_pred, row.y_true, row.Dataset)
    start = row.Start if row.Strand == '+' else row.Start - 2
    end = start + 3
    return (f'{row.Chrom} {start} {end} {label} '
            f'{int(row.y_prob * 1000)} {row.Strand} {start} {end} {color}')

def pred2bed(df, out_path):
    with open(out_path, 'w') as f:
        print('track name="uBERTa predictions v.1" '
              'itemRgb="On"', file=f)
        for _, row in tqdm(df.iterrows()):
            print(wrap_row(row), file=f)

## Prepare data

We'll initalize `uBERTaLoader` without base dataset and use its methods to prepare the sequence data for predictions. This will take care of encoding inputs and sliding the window over the sequence data. Be careful to use the same setup as in `train_uBERTa.ipynb`, especially wrt window parameters and experimental signal bounds.

In [5]:
tokenizer = DNATokenizer(kmer=3)
loader = uBERTaLoader(
    None, WINDOW, STEP, tokenizer, 
    scale_signal_bounds=(0.0, 10.0),
    is_mlm_task=False,
    valid_start_codons=STARTS,
    batch_size=2 ** 6)

In [6]:
ds = parse_base(DS, DS_LABELS, MIN_SEQ_SIZE)

Initial ds: 79677
Conforming to size threshold: 78702


In [7]:
ds = loader._prep_token_level(ds, 'Main')

INFO:uBERTa.loader:Preparing Main with 78702 records for token-level task
INFO:uBERTa.loader:Using kmer 3 on ('Seq', 'SeqEnum', 'Signal', 'Classes')
DEBUG:uBERTa.loader:Reducing kmers for Main
  return asarray(a).ndim
DEBUG:uBERTa.loader:Filtering to ('ACG', 'ATC', 'ATG', 'ATT', 'CTG', 'GTG') for Main
DEBUG:uBERTa.loader:Capping and scaling signal for Main
DEBUG:uBERTa.loader:Capped signal in (0.1, 5000.0)
DEBUG:uBERTa.loader:Scaled signal between 0 and 1. Min 0.1, Max 5000.0
INFO:uBERTa.loader:Rolling window with size 98, step 25


In [12]:
tds = loader._prep_tds_cls(ds)

Wrap the tensor dataset into `DataLoader`. Adjust the batch size and the number of processes as needed.

In [13]:
predict_loader = DataLoader(
    tds, 
    sampler=SequentialSampler(tds), 
    batch_size=2**7,
    num_workers=4)

`idsmap`, obtained below, associates gene and transcript IDs with all putative TISs.

In [14]:
idsmap = unravel_base_ds(DS)

## Load model

In [15]:
config = DistilBertConfig.from_pretrained(MODEL_PATH)
model = uBERTa_classifier(
    model=WeightedDistilBertClassifier,
    config=config,
)
model.model = model.model.from_pretrained(MODEL_PATH)
model.model.config.use_signal = True

## Predict

In some cases, using the whole dataset may overflow RAM, hence we'll split loaders into sizeable chunks and use them with the `Trainer` API in `predict` function.

In [16]:
gpus = [1]
trainer = pl.Trainer(
    accelerator="gpu",
    precision=16,
    gpus=gpus,
)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [18]:
chunk_size = 20000
loaders = split_into_loaders(tds, chunk_size)
dss = split_df(ds, chunk_size)

dfs = (predict(l, d, model, trainer, 0.5, 'mean') for l, d in zip(loaders, dss))

In [19]:
df_pred = pd.concat(dfs)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Missing logger folder: /home/ivan/code/uBERTa/notebooks/lightning_logs


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

## Parse and dump results

Explicitly mark the dataset for data not present in the modeling dataset.

In [20]:
df_pred.loc[df_pred.Dataset.isna(), 'Dataset'] = 'Inference'

Due to splitting loader into chunks, we'll aggregate the probabilities the second time.

In [21]:
df_pred = df_pred.groupby(
        ['Chrom', 'Strand', 'Start', 'Codon'], 
        as_index=False
    ).agg({
        'y_prob': 'mean',
        'y_true': safe_take_fst,
        'Signal': take_fst,
        'Dataset': take_fst,
    })

Finally, we'll incorporate gene and transcript IDs for each start codon. If a TIS is associated with multiple transcripts, they'll be concatenated with ";".

In [22]:
df_pred = df_pred.merge(
    idsmap, 
    on=['Chrom', 'Strand', 'Start', 'Codon'], 
    how='left'
).groupby(
    ['Chrom', 'Strand', 'Start', 'Codon'], as_index=False
).agg({
    'y_prob': take_fst, 
    'y_true': take_fst, 
    'Signal': take_fst, 
    'Dataset': take_fst,
    'GeneID': take_fst,
    'TranscriptID': lambda vs: ';'.join(vs)
})

Score predictions separately for each part of the modeling dataset. The scores may be slightly different compared to the paper due to different seed used in splitting the modeling dataset in `prepare_base_dataset.ipynb` (the part of creating the `dataset_labeling.tsv`).

In [23]:
scores = {ds_name: score(df_pred[df_pred.Dataset == ds_name], 0.5) for ds_name in ('Train', 'Val', 'Test')}

In [24]:
scores

{'Train': {'ACG': {'f1': 0.6203208556149733,
   'prc': 0.6397058823529411,
   'rec': 0.6020761245674741,
   'roc_auc': 0.9867718790285803},
  'ATC': {'f1': 0.5767790262172285,
   'prc': 0.6062992125984252,
   'rec': 0.55,
   'roc_auc': 0.9919805535703257},
  'ATG': {'f1': 0.6751543209876543,
   'prc': 0.6396198830409356,
   'rec': 0.7148692810457516,
   'roc_auc': 0.9589901790567034},
  'ATT': {'f1': 0.5490196078431373,
   'prc': 0.6086956521739131,
   'rec': 0.5,
   'roc_auc': 0.9916290452351153},
  'CTG': {'f1': 0.6693629929221436,
   'prc': 0.645224171539961,
   'rec': 0.6953781512605042,
   'roc_auc': 0.9852017486769029},
  'GTG': {'f1': 0.617351598173516,
   'prc': 0.6462715105162524,
   'rec': 0.5909090909090909,
   'roc_auc': 0.9863136101826635},
  'All': {'f1': 0.6532247641204602,
   'prc': 0.6407200811359026,
   'rec': 0.6662272607434748,
   'roc_auc': 0.9843558318976546}},
 'Val': {'ACG': {'f1': 0.626865671641791,
   'prc': 0.6,
   'rec': 0.65625,
   'roc_auc': 0.991214698331

In [25]:
df_scores = pd.DataFrame(
    unravel_scores(scores), 
    columns=['Dataset', 'Codon', 'ScoreType', 'ScoreVal'])

Dump score, predictions, and predictions in bed format.

In [26]:
df_scores.to_csv(BASE / 'prediction_scores.tsv', sep='\t', index=False)

In [27]:
df_pred.to_csv(BASE / 'predictions_5UTR.tsv', sep='\t', index=False)

In [28]:
pred2bed(df_pred, BASE / 'predictions_5UTR.tsv')

0it [00:00, ?it/s]

## Format tables

### Check correlations of predictions with signal values

In [29]:
(pearsonr(df_pred.Signal.values, df_pred.y_prob), 
 pearsonr(df_pred[df_pred.Dataset != 'Inference'].Signal.values, 
          df_pred[df_pred.Dataset != 'Inference'].y_prob))

((0.535706693063442, 0.0), (0.532731281037158, 0.0))

### Format scores as latex table

In [30]:
def format_scores(df_scores):
    
    def format_codon_scores(group):
        return group.groupby('ScoreType').apply(
            lambda gg: ','.join(map(str, gg['ScoreVal'])))
        
    df = df_scores.copy()
    df['ScoreVal'] = df['ScoreVal'].round(2)
    return df.groupby('Codon').apply(format_codon_scores) 

In [31]:
_df_scores = format_scores(df_scores)

In [32]:
print(_df_scores.to_latex())

\begin{tabular}{lllll}
\toprule
ScoreType &              f1 &             prc &             rec &         roc\_auc \\
Codon &                 &                 &                 &                 \\
\midrule
ACG   &  0.62,0.63,0.49 &   0.64,0.6,0.42 &   0.6,0.66,0.58 &  0.99,0.99,0.98 \\
ATC   &  0.58,0.68,0.57 &  0.61,0.65,0.56 &  0.55,0.71,0.59 &   0.99,1.0,0.99 \\
ATG   &   0.68,0.7,0.64 &  0.64,0.67,0.61 &  0.71,0.74,0.68 &  0.96,0.96,0.96 \\
ATT   &  0.55,0.59,0.56 &  0.61,0.55,0.47 &   0.5,0.65,0.69 &  0.99,0.99,0.99 \\
All   &  0.65,0.66,0.62 &   0.64,0.64,0.6 &  0.67,0.67,0.64 &  0.98,0.98,0.98 \\
CTG   &  0.67,0.64,0.64 &  0.65,0.63,0.63 &   0.7,0.66,0.64 &  0.99,0.98,0.98 \\
GTG   &  0.62,0.57,0.63 &   0.65,0.6,0.65 &   0.59,0.54,0.6 &  0.99,0.99,0.99 \\
\bottomrule
\end{tabular}



  print(_df_scores.to_latex())


### Format codon counts for positive and negative classes across datasets

In [33]:
def format_counts(df_counts):
    
    def format_codon_counts(group):
        return group.groupby('Dataset').apply(
            lambda gg: '/'.join(map(str, list(gg['Start'])[::-1])))
        
    df = df_counts.copy()
    return df.groupby('Codon').apply(format_codon_counts) 

In [34]:
counts = df_pred.loc[
    df_pred.Dataset != 'Inference', 
    ['Dataset', 'Codon', 'y_true', 'Start']
].groupby(
    ['Dataset', 'Codon', 'y_true'], as_index=False).count()
counts = pd.concat([counts[counts.Dataset == ds_name] for ds_name in ['Train', 'Val', 'Test']])

In [35]:
format_counts(counts)

Dataset,Test,Train,Val
Codon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACG,24/1326,289/11131,32/1558
ATC,17/2022,140/16191,21/2080
ATG,151/2079,1224/16511,198/2161
ATT,13/2542,140/19176,17/2709
CTG,171/5623,1428/46792,177/6354
GTG,83/3751,572/30837,67/3976


### Check overall codon counts in modeling dataset

In [36]:
counts_total = counts.groupby(
    ['Codon', 'y_true'], as_index=False
).agg({'Start': 'sum'})

In [37]:
def infer_balance(gg):
    neg, pos = gg['Start'].values
    total = neg + pos
    return neg, pos, total, pos / total * 100

counts_total.groupby('Codon').apply(infer_balance)

Codon
ACG      (14015, 345, 14360, 2.402506963788301)
ATC     (20293, 178, 20471, 0.8695227394851253)
ATG     (20751, 1573, 22324, 7.046228274502778)
ATT     (24427, 170, 24597, 0.6911411960808228)
CTG    (58769, 1776, 60545, 2.9333553555206873)
GTG      (38564, 722, 39286, 1.837804815964975)
dtype: object