# run_expt.py contents

## 1) Preamble

In [123]:
import os, psutil; print(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)

ModuleNotFoundError: No module named 'psutil'

In [1]:
import os, csv
import time
import argparse
import numpy as np, pandas as pd
import torch
import torch.nn as nn
import torchvision
import sys
from collections import defaultdict

from wilds.common.data_loaders import get_train_loader, get_eval_loader
from wilds.common.grouper import CombinatorialGrouper

from utils import set_seed, Logger, BatchLogger, log_config, ParseKwargs, load, initialize_wandb, log_group_data, parse_bool
from train import train, evaluate
from algorithms.initializer import initialize_algorithm
from transforms import initialize_transform
from configs.utils import populate_defaults
import configs.supported as supported

In [2]:
''' set default hyperparams in default_hyperparams.py '''
parser = argparse.ArgumentParser()

# Required arguments
parser.add_argument('-d', '--dataset', choices=supported.datasets, required=True)
parser.add_argument('--algorithm', required=True, choices=supported.algorithms)
parser.add_argument('--root_dir', required=True,
                    help='The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).')

# Dataset
parser.add_argument('--split_scheme', help='Identifies how the train/val/test split is constructed. Choices are dataset-specific.')
parser.add_argument('--dataset_kwargs', nargs='*', action=ParseKwargs, default={})
parser.add_argument('--download', default=False, type=parse_bool, const=True, nargs='?',
                    help='If true, tries to downloads the dataset if it does not exist in root_dir.')
parser.add_argument('--frac', type=float, default=1.0,
                    help='Convenience parameter that scales all dataset splits down to the specified fraction, for development purposes.')

# Loaders
parser.add_argument('--loader_kwargs', nargs='*', action=ParseKwargs, default={})
parser.add_argument('--train_loader', choices=['standard', 'group'])
parser.add_argument('--uniform_over_groups', type=parse_bool, const=True, nargs='?')
parser.add_argument('--distinct_groups', type=parse_bool, const=True, nargs='?')
parser.add_argument('--n_groups_per_batch', type=int)
parser.add_argument('--batch_size', type=int)
parser.add_argument('--eval_loader', choices=['standard'], default='standard')

# Model
parser.add_argument('--model', choices=supported.models)
parser.add_argument('--model_kwargs', nargs='*', action=ParseKwargs, default={},
    help='keyword arguments for model initialization passed as key1=value1 key2=value2')

# Transforms
parser.add_argument('--train_transform', choices=supported.transforms)
parser.add_argument('--eval_transform', choices=supported.transforms)
parser.add_argument('--target_resolution', nargs='+', type=int, help='target resolution. for example --target_resolution 224 224 for standard resnet.')
parser.add_argument('--resize_scale', type=float)
parser.add_argument('--max_token_length', type=int)

# Objective
parser.add_argument('--loss_function', choices = supported.losses)

# Algorithm
parser.add_argument('--groupby_fields', nargs='+')
parser.add_argument('--group_dro_step_size', type=float)
parser.add_argument('--coral_penalty_weight', type=float)
parser.add_argument('--irm_lambda', type=float)
parser.add_argument('--irm_penalty_anneal_iters', type=int)
parser.add_argument('--algo_log_metric')

# Model selection
parser.add_argument('--val_metric')
parser.add_argument('--val_metric_decreasing', type=parse_bool, const=True, nargs='?')

# Optimization
parser.add_argument('--n_epochs', type=int)
parser.add_argument('--optimizer', choices=supported.optimizers)
parser.add_argument('--lr', type=float)
parser.add_argument('--weight_decay', type=float)
parser.add_argument('--max_grad_norm', type=float)
parser.add_argument('--optimizer_kwargs', nargs='*', action=ParseKwargs, default={})

# Scheduler
parser.add_argument('--scheduler', choices=supported.schedulers)
parser.add_argument('--scheduler_kwargs', nargs='*', action=ParseKwargs, default={})
parser.add_argument('--scheduler_metric_split', choices=['train', 'val'], default='val')
parser.add_argument('--scheduler_metric_name')

# Evaluation
parser.add_argument('--evaluate_all_splits', type=parse_bool, const=True, nargs='?', default=True)
parser.add_argument('--eval_splits', nargs='+', default=[])
parser.add_argument('--eval_only', type=parse_bool, const=True, nargs='?', default=False)
parser.add_argument('--eval_epoch', default=None, type=int)

# Misc
parser.add_argument('--device', type=int, default=0)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--log_dir', default='./logs')
parser.add_argument('--log_every', default=50, type=int)
parser.add_argument('--save_step', type=int)
parser.add_argument('--save_best', type=parse_bool, const=True, nargs='?', default=True)
parser.add_argument('--save_last', type=parse_bool, const=True, nargs='?', default=True)
parser.add_argument('--no_group_logging', type=parse_bool, const=True, nargs='?')
parser.add_argument('--use_wandb', type=parse_bool, const=True, nargs='?', default=False)
parser.add_argument('--progress_bar', type=parse_bool, const=True, nargs='?', default=False)
parser.add_argument('--resume', type=parse_bool, const=True, nargs='?', default=False)

_StoreAction(option_strings=['--resume'], dest='resume', nargs='?', const=True, default=False, type=<function parse_bool at 0x7f34c944bee0>, choices=None, help=None, metavar=None)

In [3]:
argstr_camelyon = "--dataset camelyon17 --algorithm ERM --root_dir data"
config_camelyon = parser.parse_args(argstr_camelyon.split())
config_camelyon = populate_defaults(config_camelyon)

argstr_encode = "--dataset encode-tfbs --algorithm ERM --root_dir data"
config_encode = parser.parse_args(argstr_encode.split())
config_encode = populate_defaults(config_encode)

config = config_camelyon
# config = config_encode

In [4]:
# set device
config.device = torch.device("cuda:" + str(config.device)) if torch.cuda.is_available() else torch.device("cpu")

## Initialize logs
if os.path.exists(config.log_dir) and config.resume:
    resume=True
    mode='a'
elif os.path.exists(config.log_dir) and config.eval_only:
    resume=False
    mode='a'
else:
    resume=False
    mode='w'

if not os.path.exists(config.log_dir):
    os.makedirs(config.log_dir)
logger = Logger(os.path.join(config.log_dir, 'log.txt'), mode)

# Record config
log_config(config, logger)

# Set random seed
set_seed(config.seed)

# Data
full_dataset = supported.datasets[config.dataset](
    root_dir=config.root_dir,
    download=config.download,
    split_scheme=config.split_scheme,
    **config.dataset_kwargs)

# To implement data augmentation (i.e., have different transforms
# at training time vs. test time), modify these two lines:
train_transform = initialize_transform(
    transform_name=config.train_transform,
    config=config,
    dataset=full_dataset)
eval_transform = initialize_transform(
    transform_name=config.eval_transform,
    config=config,
    dataset=full_dataset)



Dataset: camelyon17
Algorithm: ERM
Root dir: data
Split scheme: official
Dataset kwargs: {}
Download: False
Frac: 1.0
Loader kwargs: {'num_workers': 4, 'pin_memory': True}
Train loader: standard
Uniform over groups: False
Distinct groups: None
N groups per batch: 2
Batch size: 32
Eval loader: standard
Model: densenet121
Model kwargs: {'pretrained': False}
Train transform: image_base
Eval transform: image_base
Target resolution: (224, 224)
Resize scale: None
Max token length: None
Loss function: cross_entropy
Groupby fields: ['hospital']
Group dro step size: None
Coral penalty weight: 0.1
Irm lambda: 1.0
Irm penalty anneal iters: None
Algo log metric: accuracy
Val metric: acc_avg
Val metric decreasing: False
N epochs: 5
Optimizer: SGD
Lr: 0.001
Weight decay: 0.01
Max grad norm: None
Optimizer kwargs: {'momentum': 0.9}
Scheduler: None
Scheduler kwargs: {}
Scheduler metric split: val
Scheduler metric name: None
Evaluate all splits: True
Eval splits: []
Eval only: False
Eval epoch: None
De

In [5]:
import copy
full_dataset_camelyon17 = copy.deepcopy(full_dataset)

# supported.datasets[config_encode.dataset]
# print(config_camelyon.train_transform, config_encode.train_transform)

## 2) Initialize dataset object

In [6]:
import os, time
import torch
import pandas as pd
import numpy as np
from wilds.datasets.wilds_dataset import WILDSDataset
from wilds.common.grouper import CombinatorialGrouper
from wilds.common.metrics.all_metrics import Accuracy

root_dir='data'
download=False
split_scheme='official'

itime = time.time()
_dataset_name = 'encode-tfbs'
_version = '1.0'
_download_url = 'https://worksheets.codalab.org/rest/bundles/0x8b3255e21e164cd98d3aeec09cd0bc26/contents/blob/'
_data_dir = 'data/encode-tfbs_v1.0'
_y_size = 1
_n_classes = 2

# _train_chroms = ['chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr22', 'chrX']
_train_chroms = ['chr2', 'chr9', 'chr11']
_test_chroms = ['chr1', 'chr8', 'chr21']
_transcription_factor = 'MAX'
_train_celltypes = ['H1-hESC', 'HCT116', 'HeLa-S3', 'HepG2', 'K562']
_val_celltype = ['A549']
_test_celltype = ['GM12878']
_all_chroms = _train_chroms + _test_chroms
_all_celltypes = _train_celltypes + _val_celltype + _test_celltype

_metadata_map = {}
_metadata_map['chr'] = _all_chroms
_metadata_map['celltype'] = _all_celltypes

# Get the splits
if split_scheme=='official':
    split_scheme = 'standard'

_split_scheme = split_scheme
_split_dict = {
    'train': 0,
    'id_val': 1,
    'test': 2,
    'val': 3
}
_split_names = {
    'train': 'Train',
    'id_val': 'Validation (ID)',
    'test': 'Test',
    'val': 'Validation (OOD)',
}

# Load sequence and DNase features
sequence_filename = os.path.join(_data_dir, 'sequence.npz')
seq_arr = np.load(sequence_filename)
_seq_bp = {}
for chrom in _all_chroms: #seq_arr:
    _seq_bp[chrom] = seq_arr[chrom]
    print(chrom, time.time() - itime)

_dnase_allcelltypes = {}
for ct in _all_celltypes:
    """
    dnase_filename = os.path.join(_data_dir, '{}_dnase.npz'.format(ct))
    dnase_npz_contents = np.load(dnase_filename)
    _dnase_allcelltypes[ct] = {}
    for chrom in _all_chroms: #_seq_bp:
        _dnase_allcelltypes[ct][chrom] = dnase_npz_contents[chrom]
    """
    _dnase_allcelltypes[ct] = 'DNASE.{}.fc.signal.bigwig'
    print(ct, time.time() - itime)

chr2 3.764267683029175
chr9 5.914910078048706
chr11 7.964999675750732
chr1 11.748822927474976
chr8 14.01279878616333
chr21 14.737261772155762
H1-hESC 14.73790693283081
HCT116 14.737961292266846
HeLa-S3 14.737993240356445
HepG2 14.738024950027466
K562 14.73805570602417
A549 14.738086223602295
GM12878 14.738116979598999


In [7]:
itime = time.time()

# Read in metadata dataframe from training+validation data
train_regions_labeled = pd.read_csv(os.path.join(_data_dir, 'labels/{}.train.labels.tsv.gz'.format(_transcription_factor)), sep='\t')
val_regions_labeled = pd.read_csv(os.path.join(_data_dir, 'labels/{}.val.labels.tsv.gz'.format(_transcription_factor)), sep='\t')
training_df = train_regions_labeled[np.isin(train_regions_labeled['chr'], _train_chroms)]
val_df = val_regions_labeled[np.isin(val_regions_labeled['chr'], _test_chroms)]
all_df = pd.concat([training_df, val_df])

print(time.time() - itime)

66.32568740844727


In [59]:
itime = time.time()

# Get the y values, and remove ambiguous labels by default.
pd_list = []
for ct in _all_celltypes:
    tc_chr = all_df[['chr', 'start', 'stop', ct]]
    tc_chr.columns = ['chr', 'start', 'stop', 'y']
    y_array = tc_chr['y'].replace({'U': 0, 'B': 1, 'A': -1}).values
    
    # Now filter out ambiguous labels
    non_ambig_mask = (y_array != -1)
    tc_chr['y'] = y_array
    tc_chr = tc_chr[non_ambig_mask]
    
    tc_chr.insert(len(tc_chr.columns), 'celltype', ct)
    pd_list.append(tc_chr)
    print(time.time() - itime)
metadata_df = pd.concat(pd_list)

print(time.time() - itime)

# y_array = metadata_df['y'].replace({'U': 0, 'B': 1, 'A': -1}).values
# non_ambig_mask = (y_array != -1)
# metadata_df['y'] = y_array
# _metadata_df = metadata_df[non_ambig_mask]

# print(time.time() - itime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tc_chr['y'] = y_array


11.363114833831787
21.872379302978516
32.51760506629944
42.88175559043884
53.35902285575867
63.94557332992554
74.44822382926941
92.237633228302


In [42]:
# window_size = 12800
# window_interval = window_size/2
# trl_mask = (train_regions_labeled['start']%window_interval == 0)
# train_regions_labeled[trl_mask]

In [108]:
(metadata_df['y'] == 1).sum()
# pd_list[0][non_ambig_mask]

686900

In [88]:
# Downsample negatives to balance each celltype
samp_ndces = []
itime = time.time()
neg_msk = (_metadata_df['y'] == 0)
pos_msk = (_metadata_df['y'] == 1)
for ct in _all_celltypes:
    celltype_msk = (_metadata_df['celltype'] == ct)
    print(ct, time.time() - itime)
    neg_ct_msk = np.logical_and(celltype_msk, neg_msk)
    pos_ct_msk = np.logical_and(celltype_msk, pos_msk)
    print(ct, time.time() - itime)
    neg_ndces = np.where(neg_ct_msk)[0]
    pos_ndces = np.where(pos_ct_msk)[0]
    np.random.seed(42)
    samp_neg_ndces = np.random.choice(neg_ndces, size=len(pos_ndces), replace=False)
    samp_ndces.extend(samp_neg_ndces)
    samp_ndces.extend(pos_ndces)
    print(ct, time.time() - itime)
_metadata_df = _metadata_df.iloc[samp_ndces, :]

train_regions_mask = np.isin(_metadata_df['chr'], _train_chroms)
val_regions_mask = np.isin(_metadata_df['chr'], _test_chroms)
train_celltype_mask = np.isin(_metadata_df['celltype'], _train_celltypes)
val_celltype_mask = np.isin(_metadata_df['celltype'], _val_celltype)
test_celltype_mask = np.isin(_metadata_df['celltype'], _test_celltype)

split_array = -1*np.ones(_metadata_df.shape[0]).astype(int)
split_array[np.logical_and(train_regions_mask, train_celltype_mask)] = _split_dict['train']
split_array[np.logical_and(val_regions_mask, test_celltype_mask)] = _split_dict['test']
# Validate using test chr, either using a designated validation cell line ('val') or a training cell line ('id_val')
split_array[np.logical_and(val_regions_mask, val_celltype_mask)] = _split_dict['val']
split_array[np.logical_and(val_regions_mask, train_celltype_mask)] = _split_dict['id_val']

if _split_scheme=='standard':
    _metadata_df.insert(len(_metadata_df.columns), 'split', split_array)
else:
    raise ValueError(f'Split scheme {_split_scheme} not recognized')

_metadata_df = _metadata_df[_metadata_df['split'] != -1]
_split_array = _metadata_df['split'].values

chr_ints = _metadata_df['chr'].replace(dict( [(y, x) for x, y in enumerate(_metadata_map['chr'])] )).values
celltype_ints = _metadata_df['celltype'].replace(dict( [(y, x) for x, y in enumerate(_metadata_map['celltype'])] )).values
_y_array = torch.LongTensor(np.array(_metadata_df['y']))

_metadata_array = torch.stack(
    (torch.LongTensor(chr_ints), 
     torch.LongTensor(celltype_ints), 
     _y_array),
    dim=1)
_metadata_fields = ['chr', 'celltype', 'y']

H1-hESC 8.10781979560852
H1-hESC 8.47616195678711
H1-hESC 9.822284698486328
HCT116 17.048683881759644
HCT116 17.41142964363098
HCT116 18.752415657043457
HeLa-S3 26.464386463165283
HeLa-S3 26.860748291015625
HeLa-S3 28.151614665985107
HepG2 35.439460039138794
HepG2 35.83507966995239
HepG2 37.079824924468994
K562 44.71583318710327
K562 45.092923164367676
K562 46.389798402786255
A549 53.895429372787476
A549 54.27841639518738
A549 55.64506816864014
GM12878 63.17967939376831
GM12878 63.545384883880615
GM12878 64.84915113449097


# Initialize dataset object

In [106]:
import os, time
import torch
import pandas as pd
import numpy as np
from wilds.datasets.wilds_dataset import WILDSDataset
from wilds.common.grouper import CombinatorialGrouper
from wilds.common.metrics.all_metrics import Accuracy

class EncodeTFBSDataset(WILDSDataset):
    """
    ENCODE-DREAM-wilds dataset of transcription factor binding sites. 
    This is a subset of the dataset from the ENCODE-DREAM in vivo Transcription Factor Binding Site Prediction Challenge. 
    
    Input (x):
        1000-base-pair regions of sequence with a quantified chromatin accessibility readout.

    Label (y):
        y is binary. It is 1 if the central 200bp region is bound by the transcription factor MAX, and 0 otherwise.

    Metadata:
        Each sequence is annotated with the celltype of origin (a string) and the chromosome of origin (a string).
    
    Website:
        https://www.synapse.org/#!Synapse:syn6131484
    """

    def __init__(self, root_dir='data', download=False, split_scheme='official'):
        itime = time.time()
        self._dataset_name = 'encode-tfbs'
        self._version = '1.0'
        self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x8b3255e21e164cd98d3aeec09cd0bc26/contents/blob/'
        self._data_dir = self.initialize_data_dir(root_dir, download)
        self._y_size = 1
        self._n_classes = 2
        
        # self._train_chroms = ['chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr22', 'chrX']
        self._train_chroms = ['chr2', 'chr9', 'chr11']
        self._test_chroms = ['chr1', 'chr8', 'chr21']
        self._transcription_factor = 'MAX'
        self._train_celltypes = ['H1-hESC', 'HCT116', 'HeLa-S3', 'HepG2', 'K562']
        self._val_celltype = ['A549']
        self._test_celltype = ['GM12878']
        self._all_chroms = self._train_chroms + self._test_chroms
        self._all_celltypes = self._train_celltypes + self._val_celltype + self._test_celltype
        
        self._metadata_map = {}
        self._metadata_map['chr'] = self._all_chroms
        self._metadata_map['celltype'] = self._all_celltypes
        
        # Get the splits
        if split_scheme=='official':
            split_scheme = 'standard'
        
        self._split_scheme = split_scheme
        self._split_dict = {
            'train': 0,
            'id_val': 1,
            'test': 2,
            'val': 3
        }
        self._split_names = {
            'train': 'Train',
            'id_val': 'Validation (ID)',
            'test': 'Test',
            'val': 'Validation (OOD)',
        }
        
        # Load sequence and DNase features
        sequence_filename = os.path.join(self._data_dir, 'sequence.npz')
        seq_arr = np.load(sequence_filename)
        self._seq_bp = {}
        for chrom in self._all_chroms: #seq_arr:
            self._seq_bp[chrom] = seq_arr[chrom]
            print(chrom, time.time() - itime)
        
        self._dnase_allcelltypes = {}
        for ct in self._all_celltypes:
            dnase_filename = os.path.join(self._data_dir, '{}_dnase.npz'.format(ct))
            dnase_npz_contents = np.load(dnase_filename)
            self._dnase_allcelltypes[ct] = {}
            for chrom in self._all_chroms: #self._seq_bp:
                self._dnase_allcelltypes[ct][chrom] = dnase_npz_contents[chrom]
            print(ct, time.time() - itime)
        
        # Read in metadata dataframe from training+validation data
        train_regions_labeled = pd.read_csv(os.path.join(self._data_dir, 'labels/{}.train.labels.tsv.gz'.format(self._transcription_factor)), sep='\t')
        val_regions_labeled = pd.read_csv(os.path.join(self._data_dir, 'labels/{}.val.labels.tsv.gz'.format(self._transcription_factor)), sep='\t')
        training_df = train_regions_labeled[np.isin(train_regions_labeled['chr'], self._train_chroms)]
        val_df = val_regions_labeled[np.isin(val_regions_labeled['chr'], self._test_chroms)]
        all_df = pd.concat([training_df, val_df])
        
        # Filter by start/stop coordinate if needed (TODO: remove for final version)
        # filter_msk = all_df['start'] >= 0
        # filter_msk = all_df['start']%1000 == 0
        # all_df = all_df[filter_msk]
        
        pd_list = []
        for ct in self._all_celltypes:
            tc_chr = all_df[['chr', 'start', 'stop', ct]]
            tc_chr.columns = ['chr', 'start', 'stop', 'y']
            tc_chr.insert(len(tc_chr.columns), 'celltype', ct)
            pd_list.append(tc_chr)
        metadata_df = pd.concat(pd_list)
        
        # Get the y values, and remove ambiguous labels by default.
        y_array = metadata_df['y'].replace({'U': 0, 'B': 1, 'A': -1}).values
        non_ambig_mask = (y_array != -1)
        metadata_df['y'] = y_array
        self._metadata_df = metadata_df[non_ambig_mask]
        
        # Downsample negatives to balance each celltype
        samp_ndces = []
        itime = time.time()
        neg_msk = (self._metadata_df['y'] == 0)
        pos_msk = (self._metadata_df['y'] == 1)
        for ct in _all_celltypes:
            celltype_msk = (self._metadata_df['celltype'] == ct)
            print(ct, time.time() - itime)
            neg_ct_msk = np.logical_and(celltype_msk, neg_msk)
            pos_ct_msk = np.logical_and(celltype_msk, pos_msk)
            print(ct, time.time() - itime)
            neg_ndces = np.where(neg_ct_msk)[0]
            pos_ndces = np.where(pos_ct_msk)[0]
            np.random.seed(42)
            samp_neg_ndces = np.random.choice(neg_ndces, size=len(pos_ndces), replace=False)
            samp_ndces.extend(samp_neg_ndces)
            samp_ndces.extend(pos_ndces)
            print(ct, time.time() - itime)
        self._metadata_df = self._metadata_df.iloc[samp_ndces, :]
        
        train_regions_mask = np.isin(self._metadata_df['chr'], self._train_chroms)
        val_regions_mask = np.isin(self._metadata_df['chr'], self._test_chroms)
        train_celltype_mask = np.isin(self._metadata_df['celltype'], self._train_celltypes)
        val_celltype_mask = np.isin(self._metadata_df['celltype'], self._val_celltype)
        test_celltype_mask = np.isin(self._metadata_df['celltype'], self._test_celltype)
        
        split_array = -1*np.ones(self._metadata_df.shape[0]).astype(int)
        split_array[np.logical_and(train_regions_mask, train_celltype_mask)] = self._split_dict['train']
        split_array[np.logical_and(val_regions_mask, test_celltype_mask)] = self._split_dict['test']
        # Validate using test chr, either using a designated validation cell line ('val') or a training cell line ('id_val')
        split_array[np.logical_and(val_regions_mask, val_celltype_mask)] = self._split_dict['val']
        split_array[np.logical_and(val_regions_mask, train_celltype_mask)] = self._split_dict['id_val']
        
        if self._split_scheme=='standard':
            self._metadata_df.insert(len(self._metadata_df.columns), 'split', split_array)
        else:
            raise ValueError(f'Split scheme {self._split_scheme} not recognized')
        
        self._metadata_df = self._metadata_df[self._metadata_df['split'] != -1]
        self._split_array = self._metadata_df['split'].values
        
        chr_ints = self._metadata_df['chr'].replace(dict( [(y, x) for x, y in enumerate(self._metadata_map['chr'])] )).values
        celltype_ints = self._metadata_df['celltype'].replace(dict( [(y, x) for x, y in enumerate(self._metadata_map['celltype'])] )).values
        self._y_array = torch.LongTensor(np.array(self._metadata_df['y']))
        
        self._metadata_array = torch.stack(
            (torch.LongTensor(chr_ints), 
             torch.LongTensor(celltype_ints), 
             self._y_array),
            dim=1)
        self._metadata_fields = ['chr', 'celltype', 'y']
        
        self._eval_grouper = CombinatorialGrouper(
            dataset=self,
            groupby_fields=['celltype'])
        
        self._metric = Accuracy()
        
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        """
        Returns x for a given idx.
        Computes this from: 
        (1) sequence features in self._seq_bp
        (2) DNase features in self._dnase_allcelltypes
        (3) Metadata for the index (location along the genome with 200bp window width)
        """
        this_metadata = self._metadata_df.iloc[idx, :]
        flank_size = 400
        interval_start = this_metadata['start'] - flank_size
        interval_end = this_metadata['stop'] + flank_size
        dnase_this = self._dnase_allcelltypes[this_metadata['celltype']][this_metadata['chr']][interval_start:interval_end]
        seq_this = self._seq_bp[this_metadata['chr']][interval_start:interval_end]
        return torch.tensor(np.column_stack([seq_this, dnase_this]))

    def eval(self, y_pred, y_true, metadata):
        return self.standard_group_eval(
            self._metric,
            self._eval_grouper,
            y_pred, y_true, metadata)

In [107]:
full_dataset_encode = EncodeTFBSDataset(
    root_dir=config.root_dir,
    download=config.download,
    split_scheme=config.split_scheme,
    **config.dataset_kwargs)

chr2 3.962329387664795
chr9 6.259538888931274
chr11 8.446826934814453
chr1 12.49940538406372
chr8 14.91869592666626
chr21 15.700694799423218
H1-hESC 23.95099449157715
HCT116 31.26502823829651
HeLa-S3 39.382277488708496
HepG2 47.24500226974487
K562 55.079211711883545
A549 62.405343532562256
GM12878 70.00356984138489
H1-hESC 8.160386562347412
H1-hESC 8.546203374862671
H1-hESC 9.868412971496582
HCT116 17.121587991714478
HCT116 17.524660110473633
HCT116 18.90956425666809
HeLa-S3 26.98938488960266
HeLa-S3 27.376858234405518
HeLa-S3 28.7989022731781
HepG2 36.29348182678223
HepG2 36.668752908706665
HepG2 38.151512145996094
K562 45.96789216995239
K562 46.33995985984802
K562 47.87280249595642
A549 55.380892276763916
A549 55.75924301147461
A549 57.22686314582825
GM12878 65.09361720085144
GM12878 65.50619888305664
GM12878 66.9196424484253


In [118]:
# full_dataset = copy.deepcopy(full_dataset_encode)
full_dataset = copy.deepcopy(full_dataset_camelyon17)
# full_dataset_camelyon17.split_dict

In [39]:
a = np.random.choice(1210796, size=128)
seta = [full_dataset_encode.get_input(x) for x in a]
seta[0].shape

In [111]:
full_dataset.metadata_fields
config = config_camelyon
#config_encode.groupby_fields

train_grouper = CombinatorialGrouper(
    dataset=full_dataset,
    groupby_fields=config.groupby_fields)

In [104]:
full_dataset

<wilds.datasets.camelyon17_dataset.Camelyon17Dataset at 0x7f68ac718f70>

# Initialize algorithm

In [113]:
datasets = defaultdict(dict)
for split in full_dataset.split_dict.keys():
    if split=='train':
        transform = train_transform
        verbose = True
    elif split == 'val':
        transform = eval_transform
        verbose = True
    else:
        transform = eval_transform
        verbose = False
    # Get subset
    datasets[split]['dataset'] = full_dataset.get_subset(
        split,
        frac=config.frac,
        transform=transform)

    if split == 'train':
        datasets[split]['loader'] = get_train_loader(
            loader=config.train_loader,
            dataset=datasets[split]['dataset'],
            batch_size=config.batch_size,
            uniform_over_groups=config.uniform_over_groups,
            grouper=train_grouper,
            distinct_groups=config.distinct_groups,
            n_groups_per_batch=config.n_groups_per_batch,
            **config.loader_kwargs)
    else:
        datasets[split]['loader'] = get_eval_loader(
            loader=config.eval_loader,
            dataset=datasets[split]['dataset'],
            grouper=train_grouper,
            batch_size=config.batch_size,
            **config.loader_kwargs)

    # Set fields
    datasets[split]['split'] = split
    datasets[split]['name'] = full_dataset.split_names[split]
    datasets[split]['verbose'] = verbose
    # Loggers
    # Loggers
    datasets[split]['eval_logger'] = BatchLogger(
        os.path.join(config.log_dir, f'{split}_eval.csv'), mode=mode, use_wandb=(config.use_wandb and verbose))
    datasets[split]['algo_logger'] = BatchLogger(
        os.path.join(config.log_dir, f'{split}_algo.csv'), mode=mode, use_wandb=(config.use_wandb and verbose))

    if config.use_wandb:
        initialize_wandb(config)

# Logging dataset info
if config.no_group_logging and full_dataset.is_classification and full_dataset.y_size==1:
    log_grouper = CombinatorialGrouper(
        dataset=full_dataset,
        groupby_fields=['y'])
elif config.no_group_logging:
    log_grouper = None
else:
    log_grouper = train_grouper
log_group_data(datasets, log_grouper, logger)

## Initialize algorithm
algorithm = initialize_algorithm(
    config=config,
    datasets=datasets,
    train_grouper=train_grouper)

Train data...
    hospital = 0: n = 53425
    hospital = 1: n = 0
    hospital = 2: n = 0
    hospital = 3: n = 116959
    hospital = 4: n = 132052
Validation (ID) data...
    hospital = 0: n = 6011
    hospital = 1: n = 0
    hospital = 2: n = 0
    hospital = 3: n = 12879
    hospital = 4: n = 14670
Test data...
    hospital = 0: n = 0
    hospital = 1: n = 0
    hospital = 2: n = 85054
    hospital = 3: n = 0
    hospital = 4: n = 0
Validation (OOD) data...
    hospital = 0: n = 0
    hospital = 1: n = 34904
    hospital = 2: n = 0
    hospital = 3: n = 0
    hospital = 4: n = 0
Dout: 2


RuntimeError: CUDA error: out of memory

In [91]:
# algorithm.device
_metadata_df
# datasets['train']['loader']

Unnamed: 0,chr,start,stop,y,celltype,split
3831225,chr1,191799250,191799450,0,H1-hESC,1
4190052,chr1,209740600,209740800,0,H1-hESC,1
7241915,chr8,66306500,66306700,0,H1-hESC,1
21449377,chr2,38487450,38487650,0,H1-hESC,0
45876013,chr9,5697700,5697900,0,H1-hESC,0
...,...,...,...,...,...,...
8841297,chr8,146277750,146277950,1,GM12878,2
8841298,chr8,146277800,146278000,1,GM12878,2
8841299,chr8,146277850,146278050,1,GM12878,2
8841300,chr8,146277900,146278100,1,GM12878,2


In [90]:
for batch in datasets['train']['loader']:
    x, y_true, metadata = batch
    break

NameError: name 'datasets' is not defined

In [134]:
# datasets['train']['dataset'].size()
algorithm.model(x.to(algorithm.device))

RuntimeError: CUDA out of memory. Tried to allocate 14.00 MiB (GPU 0; 11.93 GiB total capacity; 10.94 GiB already allocated; 5.06 MiB free; 11.32 GiB reserved in total by PyTorch)

# Train

In [None]:
if not config.eval_only:
    ## Load saved results if resuming
    resume_success = False
    if resume:
        save_path = os.path.join(config.log_dir, 'last_model.pth')
        if not os.path.exists(save_path):
            epochs = [
                int(file.split('_')[0])
                for file in os.listdir(config.log_dir) if file.endswith('.pth')]
            if len(epochs) > 0:
                latest_epoch = max(epochs)
                save_path = os.path.join(config.log_dir, f'{latest_epoch}_model.pth')
        try:
            prev_epoch, best_val_metric = load(algorithm, save_path)
            epoch_offset = prev_epoch + 1
            logger.write(f'Resuming from epoch {epoch_offset} with best val metric {best_val_metric}')
            resume_success = True
        except FileNotFoundError:
            pass

    if resume_success == False:
        epoch_offset=0
        best_val_metric=None


    train(
        algorithm=algorithm,
        datasets=datasets,
        general_logger=logger,
        config=config,
        epoch_offset=epoch_offset,
        best_val_metric=best_val_metric)
else:
    if config.eval_epoch is None:
        eval_model_path = os.path.join(config.log_dir, 'best_model.pth')
    else:
        eval_model_path = os.path.join(config.log_dir, f'{config.eval_epoch}_model.pth')
    best_epoch, best_val_metric = load(algorithm, eval_model_path)
    if config.eval_epoch is None:
        epoch = best_epoch
    else:
        epoch = config.eval_epoch
    evaluate(
        algorithm=algorithm,
        datasets=datasets,
        epoch=epoch,
        general_logger=logger,
        config=config)

logger.close()
for split in datasets:
    datasets[split]['eval_logger'].close()
    datasets[split]['algo_logger'].close()

In [126]:
for b in full_dataset:
    break

<wilds.datasets.camelyon17_dataset.Camelyon17Dataset at 0x7f0332cdd520>

In [33]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class Beagle(nn.Module):
    """
    Neural net models over genomic sequence.
    Input:
        - sequence_length: int (default 1000) 
        - Shape: (N, 5, sequence_length, 1) with batch size N.
    
    Output:
        - prediction (Tensor): float torch tensor of shape (N, )
    
    TODO: Finish docstring.
    """
    def __init__(self):
        """
        Parameters
        ----------
        sequence_length : int
        n_genomic_features : int
        """
        super(Beagle, self).__init__()

        self.dropout = 0.3
        self.num_cell_types = 1
        self.conv1 = nn.Conv2d(5, 50, (19, 1), stride = (1, 1), padding=(9,0))
        self.conv2 = nn.Conv2d(50, 50, (11, 1), stride = (1, 1), padding = (5,0))
        self.conv3 = nn.Conv2d(50, 50, (7, 1), stride = (1, 1), padding = (4,0))
        self.bn1 = nn.BatchNorm2d(50)
        self.bn2 = nn.BatchNorm2d(50)
        self.bn3 = nn.BatchNorm2d(50)
        self.maxpool1 = nn.MaxPool2d((3, 1))
        self.maxpool2 = nn.MaxPool2d((4, 1))
        self.maxpool3 = nn.MaxPool2d((4, 1))

        self.fc1 = nn.Linear(4200, 1000)
        self.bn4 = nn.BatchNorm1d(1000)

        self.fc2 = nn.Linear(1000, 1000)
        self.bn5 = nn.BatchNorm1d(1000)

        self.fc3 = nn.Linear(1000, self.num_cell_types)

    def forward(self, s):
        s = s.permute(0, 2, 1).contiguous()                          # batch_size x 5 x 1000
        s = s.view(-1, 5, 1000, 1)                                   # batch_size x 5 x 1000 x 1 [5 channels]
        s = self.maxpool1(F.relu(self.bn1(self.conv1(s))))           # batch_size x 300 x 333 x 1
        s = self.maxpool2(F.relu(self.bn2(self.conv2(s))))           # batch_size x 200 x 83 x 1
        s = self.maxpool3(F.relu(self.bn3(self.conv3(s))))           # batch_size x 200 x 21 x 1
        s = s.view(-1, 4200)
        conv_out = s

        s = F.dropout(F.relu(self.bn4(self.fc1(s))), p=self.dropout, training=self.training)  # batch_size x 1000
        s = F.dropout(F.relu(self.bn5(self.fc2(s))), p=self.dropout, training=self.training)  # batch_size x 1000
        
        s = self.fc3(s)

        return s, conv_out

In [100]:
def double_conv(in_channels, out_channels):    
    return nn.Sequential(
        nn.Conv1d(in_channels, out_channels, 7, padding=2), 
        nn.BatchNorm1d(out_channels), 
        nn.ReLU(inplace=True),
        nn.Conv1d(out_channels, out_channels, 7, padding=3), 
        nn.BatchNorm1d(out_channels), 
        nn.ReLU(inplace=True)
    )


class UNet(nn.Module):

    def __init__(self, n_class):
        super().__init__()
        
        self.dconv_down1 = double_conv(6, 15)
        self.dconv_down2 = double_conv(15, 22)
        self.dconv_down3 = double_conv(22, 33)
        self.dconv_down4 = double_conv(33, 49)
        self.dconv_down5 = double_conv(49, 73)
        self.dconv_down6 = double_conv(73, 109)

        self.maxpool = nn.MaxPool1d(2)
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)        
        
        self.dconv_up5 = double_conv(73 + 109, 73)
        self.dconv_up4 = double_conv(49 + 73, 49)
        self.dconv_up3 = double_conv(33 + 49, 33)
        self.dconv_up2 = double_conv(22 + 33, 22)
        self.dconv_up1 = double_conv(15 + 22, 15)
        
        self.conv_last = nn.Conv2d(15, n_class, 1)
        
        
    def forward(self, x):
        conv1 = self.dconv_down1(x)
        x = self.maxpool(conv1)

        conv2 = self.dconv_down2(x)
        x = self.maxpool(conv2)
        
        conv3 = self.dconv_down3(x)
        x = self.maxpool(conv3)
        
        conv4 = self.dconv_down4(x)
        x = self.maxpool(conv4)
        
        conv5 = self.dconv_down5(x)
        x = self.maxpool(conv5)
        
        x = self.dconv_down6(x)
        
        x = self.upsample(x)        
        x = torch.cat([x, conv5], dim=1)
        
        x = self.dconv_up5(x)
        x = self.upsample(x)        
        x = torch.cat([x, conv4], dim=1)
        
        x = self.dconv_up4(x)
        x = self.upsample(x)        
        x = torch.cat([x, conv3], dim=1)
        
        x = self.dconv_up3(x)
        x = self.upsample(x)        
        x = torch.cat([x, conv2], dim=1)       

        x = self.dconv_up2(x)
        x = self.upsample(x)        
        x = torch.cat([x, conv1], dim=1)   
        
        x = self.dconv_up1(x)
        
        out = self.conv_last(x)
        
        return out

In [101]:
model = UNet(2)
#model = DanQ(50, 5)

lst = [(x[0], x[1].numel()) for x in model.named_parameters()]
#np.sum([x[1] for x in lst])
count_parameters(model)

485773

In [102]:
model

UNet(
  (dconv_down1): Sequential(
    (0): Conv1d(6, 15, kernel_size=(7,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(15, 15, kernel_size=(7,), stride=(1,), padding=(3,))
    (4): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
  (dconv_down2): Sequential(
    (0): Conv1d(15, 22, kernel_size=(7,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(22, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(22, 22, kernel_size=(7,), stride=(1,), padding=(3,))
    (4): BatchNorm1d(22, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
  (dconv_down3): Sequential(
    (0): Conv1d(22, 33, kernel_size=(7,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(33, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True

In [96]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

NameError: name 'Beagle' is not defined