# Inspection: extraction and analysis of patterns

### Imports and paths

In [None]:
## imports

import numpy as np
from sklearn import linear_model as lm
import pickle
import os
from utils import *

In [None]:
## paths

# Every location below is built by string concatenation from this root,
# which is relative to the directory the notebook is run from.
root_path = './../'

# path to data, random splits, cell list
data_base_path = root_path+'data/'
split_base_path = data_base_path+'- splits/'
cell_list_path = data_base_path+'cells.csv'
name_base_path = data_base_path+'names.csv'
gene_info_path = data_base_path+'GeneFile.txt'

# path to folder where to write results
# Create it only when it is missing: an explicit check (rather than
# swallowing every OSError) lets real failures such as a missing parent
# folder or insufficient permissions surface right here.
score_base_path = root_path+'scores/'
if not os.path.isdir(score_base_path):
    os.mkdir(score_base_path)

# suffixes
data_suffix = 'stripped/'
split_suffix = 'iteration_'
X_tail = 'X.csv'
T_tail = 'T.csv'

# prepare list of data folders where to read
# so to feed all the considered combinations
# of input formats
formats = ['- broad', '- broad narrow', '- gapped', '- gapped broad', '- gapped narrow', '- narrow']
# folders_by_format[f] holds what load_cell_paths returns as folders for
# formats[f]; it is later indexed as folders_by_format[f][c].
folders_by_format = list()
for form in formats:
    base = data_base_path + form + '/'
    # NOTE: `cells` is rebound on every iteration; the value returned for the
    # last format is the one left in scope for the following cells.
    folders, cells = load_cell_paths(cell_list_path, base, suffix='/'+data_suffix)
    folders_by_format.append(folders)

### Set target epigenomes and gene

Here we will extract and inspect _ShallowChrome_ regulative patterns for gene `PAX5` across epigenomes `H1(-hESC)`, `GM12878` and `K562`. Just change the target values below to extract patterns for different genes and epigenomes.

In [None]:
## targets

# target epigenome(s)
target_cells = [
    'E003',  # H1(-hESC)
    'E116',  # GM12878
    'E123']  # K562
# Position of each target epigenome inside `cells`.
# NOTE(review): `cells == cell` must compare elementwise for np.where to
# work, so `cells` is presumably a NumPy array of identifiers - confirm
# against load_cell_paths. An identifier that is absent raises IndexError.
cs = [np.where(cells==cell)[0][0] for cell in target_cells]

# target gene
target = 'ENSG00000196092_PAX5'

# where is the target gene?
# The gene name is read from the last tab-separated field of each line;
# `position` is the 0-based index of the first line whose name matches.
with open(gene_info_path) as handle:
    lines = handle.readlines()
position = None
for l, line in enumerate(lines):
    if line.strip().split('\t')[-1] == target:
        position = l
        break
# Fail loudly with a readable message instead of a bare assert (asserts are
# stripped when Python runs with -O and give no hint about what went wrong).
if position is None:
    raise ValueError('target gene {0} not found in {1}'.format(target, gene_info_path))

### Set data split, along with model and fitting hyperparameters

Here we will use the standard data split employed in _DeepChrome_: train, validation and test sets are simply obtained by splitting the list of genes sequentially into three parts, without reshuffling. This split suits our needs, as target gene `PAX5` belongs to the test set, so it is not used to estimate _ShallowChrome_ parameters.

NB: in scikit-learn, `C` is the inverse of the l2 regularization strength, so `C = +∞` effectively enforces no penalty.

In [None]:
# split: `ref_iter` selects which split is loaded, and is also embedded in
# the file names of the artefacts written by the following cells
ref_iter = 0
split = load_split(split_base_path, split_suffix, ref_iter)

# model and fitting parameters, forwarded as keyword arguments to
# lm.LogisticRegression in the training cell
random_state = 666
multi_class = 'multinomial'
max_iter = 6000
solver = 'lbfgs'
# infinite C: see the note in the markdown cell above
C = np.inf
penalty = 'l2'

### Train _ShallowChrome_ on the selected split

... and select the best input format. Set `target_only = True` to train only on the target epigenomes. With `target_only = False`, training and input format selection are performed on _all_ epigenomes; this is required to perform the comparison with _ChromHMM_ emission patterns in notebook `model validation`.

In [None]:
## training!
    
target_only = True
if target_only:
    loop_cells = target_cells
    loop_indexes = cs
else:
    loop_cells = cells
    loop_indexes = list(range(len(cells)))

bests = dict()
for c, cell in zip(loop_indexes, loop_cells):
    
    # print advancement message
    print '\n\n>> cell {0}...'.format(cell)

    # set score folder(s)
    score_folder = score_base_path+str(cell)
    try:
        os.mkdir(score_folder)
    except OSError:
        pass

    # define score matrices
    S_val = np.ndarray((len(formats), 1))
    S_test = np.ndarray((len(formats), 1))

    # loop for data formats
    for f, form in enumerate(formats):

        # print advancement message
        print '\r\tformat: {0}       '.format(form),

        # set score folder for format
        score_folder_format = score_folder+'/'+form+'/'
        try:
            os.mkdir(score_folder_format)
        except OSError:
            pass

        # retrieve data
        folder = folders_by_format[f][c]
        X = np.loadtxt(folder+X_tail, delimiter=',')
        T = binarize_target(np.loadtxt(folder+T_tail, delimiter=','))

        # instantiate model
        model = lm.LogisticRegression(penalty=penalty, C=C, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class)

        # fit model
        cache_model_at = score_folder_format+'model_C_'+str(C)+'_iter_'+str(ref_iter)+'.pkl'
        S_val[f,0], S_test[f,0] = fit_and_score(model, X, T, split, cache_model_at)

    val_scores = S_val[:, 0]
    best = np.argmax(val_scores)
    bests[cell] = best
    
best_formats = {cell: formats[bests[cell]] for cell in loop_cells}

# save best formats for future analyses
with open(score_base_path+'bests_{}.pkl'.format(0), 'wb') as handle:
    pickle.dump(best_formats, handle)
    
print(' done.')

### Extract _ShallowChrome_ patterns

First, let's reload the best trained models and extract their weights.

In [None]:
# restrict to only target cells and best found input format
folders, cells = load_cell_paths(cell_list_path, data_base_path, suffix='/'+data_suffix, best_formats=best_formats, cells=np.asarray(target_cells))
data_dict = dict()
for c, cell in enumerate(cells):
    X = np.loadtxt(folders[c]+X_tail, delimiter=',')
    T = np.loadtxt(folders[c]+T_tail, delimiter=',')
    data_dict[cell] = (X, T)

# load models for the target cells and retrieve weights
models = dict()
weights = dict()
print 'extracting model weights... '
for c, cell in enumerate(cells):
    
    print '\r\t>> cell {0}...  '.format(cell),
    score_folder = score_base_path+str(cell)
    score_folder_format = score_folder+'/'+best_formats[cell]+'/'
    cache_model_at = score_folder_format+'model_C_'+str(C)+'_iter_'+str(0)+'.pkl'
    with open(cache_model_at, 'rb') as model_file:
        model = pickle.load(model_file)
        
    # retrieve model and weights
    models[cell] = model
    weights[cell] = model.coef_

print ' done.'

Now let's extract patterns!

In [None]:
# extract patterns for target gene

inputs = dict()
labels = dict()
patterns = dict()
predictions = dict()

print 'extracting pattern for gene {0}... '.format(target)
for c, cell in enumerate(cells):
    
    print '\r\t>> cell {0}...   '.format(cell),
    
    # get score folder
    score_folder = score_base_path+str(cell)+'/'
    
    # retrieve data
    x = data_dict[cell][0][position]
    t = data_dict[cell][1][position]
    inputs[cell] = x
    labels[cell] = int(t >= np.median(data_dict[cell][1]))
    predictions[cell] = models[cell].predict(x.reshape([1, -1]))
    
    # compute and dump pattern
    pattern = weights[cell] * x
    patterns[cell] = pattern
    with open(score_folder + 'pattern_{0}_ref_iter_{1}.csv'.format(target, ref_iter), 'wb') as handle:
        pickle.dump(pattern, handle)

print ' done.'

Finally, let's plot the extracted patterns, shading out irrelevant predictors – a statistical `z-test` is internally performed over the train set in routine `draw_specific_pattern`.

In [None]:
# predictor names, as read from file
name_array = np.loadtxt(name_base_path, delimiter=',', dtype=str)
# same names keyed by 1-based index, with key 0 reserved for the intercept
name_dict = {k: name for k, name in enumerate(name_array, 1)}
name_dict[0] = 'intercept'

# Display labels, listed in the same order as `target_cells`.
# NOTE: they are hard-coded for the default targets and must be edited when
# the target gene or epigenomes are changed.
cell_names = ['H1-hESC (OFF)', 'GM12878 (ON)', 'K562 (OFF)']
# Look labels up by cell identifier rather than by position in `cells`, so a
# different ordering returned by load_cell_paths cannot mislabel the plots.
cell_labels = dict(zip(target_cells, cell_names))

# Reload the split the models were trained on (`ref_iter`, not a hard-coded
# 0); it does not depend on the cell, so it is loaded once.
split = load_split(split_base_path, split_suffix, ref_iter)

for cell in cells:
    X, T = data_dict[cell]
    # split[0] is used to select the training rows
    # NOTE(review): T is not binarized here, unlike in the training cell -
    # confirm this is what draw_specific_pattern expects.
    X_train = X[split[0]]
    y_train = T[split[0]]
    draw_specific_pattern(models[cell],
                          patterns[cell],
                          name_array,
                          X_train,
                          y_train,
                          show_intercept=True,
                          absolute=True,
                          norm=True,
                          label=cell_labels[cell],
                          y_bounds=None,
                          legend=True,
                          cache_at='./{}_{}.pdf'.format(target, cell),
                          show=True)