In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os
import typing as t
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from toolz import curry
from transformers import BertConfig, DNATokenizer
from uBERTa.base import ModelSetup, RunSetup, OptSetup, StopSetup
from uBERTa.datasets_generator import DatasetGenerator, prepare_sequences, train_test_split
from uBERTa.model import uBERTa, BertCentralPooler
from uBERTa.utils import train, setup_logger

In [3]:
# Default to GPU 1, but keep any device selection already made by the caller
# (shell, job scheduler): an existing CUDA_VISIBLE_DEVICES value is left as is.
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [4]:
# Reference genome, log file and directory of the trained model
REF = Path('hg38.fa')
LOG = Path('log.txt')
MODEL = Path('models/trained/3-new-12w-0/')

# Full dataset, evaluation dataset and training dataset
DS = Path('DS_BASE.tsv')
DEV = Path('DEV_BASE.tsv')
TRAIN = Path('TRAIN_BASE.tsv')

# k-mer size, flank size and the fraction of examples used for evaluation
KMER = 3
FLANK = 100
DEV_FRAC = 0.2

In [5]:
# Notebook-wide logger with one level per destination (file, stdout, stderr);
# how each level is applied is decided inside `setup_logger`.
# NOTE(review): the log path is hard-coded here rather than taken from `LOG`.
LOGGER = setup_logger(
    './log2.txt',
    file_level=logging.DEBUG,
    stdout_level=logging.INFO,
    stderr_level=logging.CRITICAL,
)

# Prepare the data

In [6]:
# Configure the dataset generator over the full dataset and the reference
dsg = DatasetGenerator(
    DS,
    REF,
    neg_multiplier=4,
    # Fractions of the negative-sample categories, in order:
    #   completely random samples,
    #   random samples centered on start codons,
    #   valid uORFs without experimental support,
    #   valid uORFs with experimental support.
    neg_fractions=(0.00, 0.05, 0.70, 0.25),
    # Fractions of the positive-sample categories, in order:
    #   u  samples (uORF start codons),
    #   ma samples (alternative start codons),
    #   m  samples (CDS start codons).
    pos_fractions=(1.0, 1.0, 1.0),
    # A flank of 1 fetches just the start codon
    flank_size=1,
    kmer_size=KMER,
)

In [7]:
def prepare_datasets(dev_path, train_path, dsg, flank_size, kmer_size, dev_fraction=0.2):
    """
    Obtain the evaluation and training datasets with prepared sequences.

    When both files exist they are read as tab-separated tables; otherwise
    the generator is called and its output is split via ``train_test_split``.
    In either case each dataset is then passed through ``prepare_sequences``.

    :param dev_path: Path to the evaluation dataset file.
    :param train_path: Path to the training dataset file.
    :param dsg: Callable dataset generator exposing a ``ref`` attribute.
    :param flank_size: Forwarded to ``prepare_sequences``.
    :param kmer_size: Forwarded to ``prepare_sequences``.
    :param dev_fraction: Forwarded to ``train_test_split``; only used when
        the datasets are generated rather than read from disk.
    :return: A ``(dev, train)`` tuple of prepared datasets.
    """
    both_on_disk = dev_path.exists() and train_path.exists()
    if not both_on_disk:
        dev_df, train_df = train_test_split(dsg(), dev_fraction)
    else:
        dev_df = pd.read_csv(dev_path, sep='\t')
        train_df = pd.read_csv(train_path, sep='\t')
    # Bind every argument except the dataset itself.
    # NOTE(review): datasets read from disk are prepared again here; confirm
    # that `prepare_sequences` is safe to apply to already prepared tables.
    prepare = curry(prepare_sequences)(
        ref=dsg.ref, flank_size=flank_size, kmer_size=kmer_size)
    return prepare(dev_df), prepare(train_df)

In [8]:
# Read the datasets from disk or generate them, then report the class balance
dev_ds, train_ds = prepare_datasets(
    dev_path=DEV, train_path=TRAIN, dsg=dsg,
    flank_size=FLANK, kmer_size=KMER, dev_fraction=DEV_FRAC)
print(*(ds.IsPositive.value_counts() for ds in (dev_ds, train_ds)), sep='\n')

0    5192
1    1278
Name: IsPositive, dtype: int64
0    23399
1     5872
Name: IsPositive, dtype: int64


In [9]:
# Optional step: write both datasets as tab-separated files, without the
# index column, to the DEV and TRAIN paths.
dev_ds.to_csv(DEV, index=False, sep='\t')
train_ds.to_csv(TRAIN, index=False, sep='\t')

# Set up the model

In [10]:
# Index of the middle k-mer, derived from the first evaluation sequence
# (number of space-separated k-mers = number of spaces + 1)
central_codon = (dev_ds.Seq.iloc[0].count(' ') + 1) // 2
# Visual check: tally the k-mer found at that index across all sequences,
# training set first, evaluation set second
print(
    *(Counter(seq.split(' ')[central_codon] for seq in ds.Seq)
      for ds in (train_ds, dev_ds)),
    sep='\n')
# Token position is the k-mer index shifted by one
# NOTE(review): presumably accounts for a leading special token — confirm
central_token = 1 + central_codon

Counter({'ATG': 14510, 'CTG': 7316, 'GTG': 2878, 'ACG': 1355, 'TTG': 1348, 'ATC': 717, 'ATT': 684, 'ATA': 193, 'AGG': 184, 'AAG': 86})
Counter({'ATG': 3254, 'CTG': 1563, 'GTG': 636, 'ACG': 314, 'TTG': 296, 'ATC': 167, 'ATT': 141, 'ATA': 51, 'AGG': 30, 'AAG': 18})


In [11]:
# Config class, model class, tokenizer class and the model path
model_setup = ModelSetup(BertConfig, uBERTa, DNATokenizer, str(MODEL))

# Alternative schedule kept for reference: RunSetup(16, 30, 0.1)
run_setup = RunSetup(
    16,   # batch size
    2,    # number of epochs
    0.1,  # fraction of warmup steps
)
opt_setup = OptSetup(
    5e-6,          # learning rate
    1e-8,          # epsilon
    (0.9, 0.999),  # betas
    0.1,           # weight decay
)
stop_setup = StopSetup(
    10,    # number of rounds
    0.01,  # improvement
)

In [12]:
# Extra keyword arguments passed when loading the configuration
config_kwargs = {
    'finetuning_task': 'uORF',
    'hidden_dropout_prob': 0.2,
}
# No extra keyword arguments for the model or the tokenizer
model_kwargs, tokenizer_kwargs = {}, {}

In [13]:
# Load the stored configuration, passing the extras from `config_kwargs`
config = model_setup.Config.from_pretrained(
    model_setup.ModelPath, **config_kwargs)
# Record the central token position on the config before the model is built
config.central_position = central_token

# Tokenizer and model are both loaded from the same model path
tokenizer = model_setup.Tokenizer.from_pretrained(
    model_setup.ModelPath, **tokenizer_kwargs)
model = model_setup.Model.from_pretrained(
    model_setup.ModelPath, config=config, **model_kwargs)
# Replace the pooler of the loaded model with a `BertCentralPooler`
# NOTE(review): presumably pools at `config.central_position` — confirm
model.bert.pooler = BertCentralPooler(config)
# Move the model to the GPU; the trailing `;` suppresses the cell output
model.to('cuda');

2021-12-11 16:31:40,830 INFO [configuration_utils--get_config_dict]: loading configuration file models/trained/3-new-12w-0/config.json
2021-12-11 16:31:40,831 INFO [configuration_utils--from_dict]: Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": "uORF",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "num_rnn_layer": 1,
  "output_attentions": false,
  "output_hidden_s

# Fine-tune the model

In [14]:
# Quick demonstration run on a random subset of each dataset.
# To train on everything, pass `train_ds` and `dev_ds` unsampled instead.
SAMPLE_FRAC = 0.1  # fraction of each dataset used in the demonstration run
SAMPLE_SEED = 42   # fixes the drawn rows so that re-runs use the same subset

scores_train, scores_eval = train(
    model, tokenizer,
    train_ds.sample(int(SAMPLE_FRAC * len(train_ds)), random_state=SAMPLE_SEED),
    dev_ds.sample(int(SAMPLE_FRAC * len(dev_ds)), random_state=SAMPLE_SEED),
    run_setup, opt_setup, stop_setup,
    Path('checkpoint_cache'))

Running epochs:   0%|          | 0/2 [00:00<?, ?it/s]

Running train batches:   0%|          | 0/183 [00:00<?, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /opt/conda/conda-bld/pytorch_1634272115665/work/torch/csrc/utils/python_arg_parser.cpp:1050.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


2021-12-11 16:32:44,756 INFO [utils--train]: Finished epoch 1 with loss 115.26578903198242 and (unweighted) scores Scores(acc=0.5490263067987701, roc_auc=0.4665075059112488, f1=0.23787528868360278, prec=0.1840929401251117, rec=0.3360522022838499)


Running predict batches:   0%|          | 0/41 [00:00<?, ?it/s]

2021-12-11 16:32:48,784 INFO [utils--train]: Fininshed evaluation with loss 23.58084124326706 and scores Scores(acc=0.7002032520325203, roc_auc=0.7827996028051883, f1=0.6844919786096262, prec=0.7223476297968409, rec=0.6504065040650407)
2021-12-11 16:32:48,787 INFO [utils--__call__]: The model has improved the score by 0.7827996028051883-0=0.7827996028051883
2021-12-11 16:32:48,789 INFO [configuration_utils--save_pretrained]: Configuration saved in checkpoint_cache/config.json
2021-12-11 16:32:51,006 INFO [modeling_utils--save_pretrained]: Model weights saved in checkpoint_cache/pytorch_model.bin
2021-12-11 16:32:55,353 INFO [utils--train]: Dumped the current state to checkpoint_cache


Running train batches:   0%|          | 0/183 [00:00<?, ?it/s]

2021-12-11 16:33:50,522 INFO [utils--train]: Finished epoch 2 with loss 98.80516465008259 and (unweighted) scores Scores(acc=0.5937820293816194, roc_auc=0.5147100914921726, f1=0.2709993868792152, prec=0.21709233791748528, rec=0.3605220228384992)


Running predict batches:   0%|          | 0/41 [00:00<?, ?it/s]

2021-12-11 16:33:54,558 INFO [utils--train]: Fininshed evaluation with loss 23.305896788835526 and scores Scores(acc=0.7080850865760565, roc_auc=0.7881524235089679, f1=0.6928671123191074, prec=0.7309739085507634, rec=0.6585365853658538)
2021-12-11 16:33:54,560 INFO [utils--__call__]: The model hasn't improved the score 0.7827996028051883 for 1 rounds out of 10


In [15]:
# Show the training scores returned by `train` as a table
pd.DataFrame(scores_train)

Unnamed: 0,acc,roc_auc,f1,prec,rec
0,0.549026,0.466508,0.237875,0.184093,0.336052
1,0.593782,0.51471,0.270999,0.217092,0.360522


In [16]:
# Show the evaluation scores returned by `train` as a table
pd.DataFrame(scores_eval)

Unnamed: 0,acc,roc_auc,f1,prec,rec
0,0.700203,0.7828,0.684492,0.722348,0.650407
1,0.708085,0.788152,0.692867,0.730974,0.658537
