# Fitting: solving gene transcription state classification with _ShallowChrome_ and 'valley' thresholding

### Imports and paths

In [None]:
## imports

import numpy as np
from sklearn import linear_model as lm
import os
import matplotlib
from utils import *

In [None]:
## paths

root_path = './../'

# input locations: data, random splits and the list of cells
data_base_path = root_path+'data/'
split_base_path = data_base_path+'- splits/'
cell_list_path = data_base_path+'cells.csv'

# file-name suffixes
data_suffix = 'stripped/'
X_tail = 'X.csv'
T_tail = 'T.csv'
deepchrome_suffix = 'DeepChrome_scores.txt'

# output location for results; an OSError raised by mkdir
# (e.g. because the folder is already there) is ignored
score_base_path = root_path+'scores/'
try:
    os.mkdir(score_base_path)
except OSError:
    pass

# collect, for every input format, the data folders to read from,
# so that all the considered combinations of input formats can be fed
formats = [
    '- broad',
    '- broad narrow',
    '- gapped',
    '- gapped broad',
    '- gapped narrow',
    '- narrow',
]
folders_by_format = []
for form in formats:
    base = '{0}{1}/'.format(data_base_path, form)
    folders, cells = load_cell_paths(cell_list_path, base, suffix='/{0}'.format(data_suffix))
    folders_by_format.append(folders)

### Set target epigenomes, model and fitting parameters

Here we will train and evaluate _ShallowChrome_ across all 56 epigenomes. To restrict the analysis to specific epigenomes of interest, set the variable `target_cells` accordingly. For example, `target_cells = ['E003', 'E123', 'E116']` selects the epigenomes for which pattern analyses are made in notebook `model inspection`.

NB: l2-penalty coefficient `C = +∞` effectively enforces no penalty.

In [None]:
## target epigenome(s)

# to restrict the run, list the epigenomes of interest instead, e.g.
# target_cells = ['E003', 'E123', 'E116']
target_cells = cells

# position of each target epigenome within the full `cells` listing
cs = []
for cell in target_cells:
    matches = np.where(cells==cell)[0]
    cs.append(matches[0])

In [None]:
## model and fitting parameters

# logistic regression settings; an infinite C
# effectively switches the l2 penalty off
penalty = 'l2'
C = np.inf
solver = 'lbfgs'
multi_class = 'multinomial'
max_iter = 6000

# seed and number of random splits to repeat the procedure on
random_state = 666
iterations = 10

### Compute 'valley' binary thresholds

Here we compute binary thresholds with both the 'median' and 'valley' approach and compare the two.

In [None]:
%%capture out

# per-epigenome statistics, filled in the same order as `target_cells`
diffs = list()      # median threshold minus valley threshold
medians = list()    # median of the raw targets
threshs = list()    # valley threshold, on the raw (non-log) scale

for c, cell in zip(cs, target_cells):

    # print advancement message
    print '\r>> cell {0}...'.format(cell),

    # retrieve the targets only: the input features play no role
    # in the thresholds, so X is not read from disk here
    folder = folders_by_format[0][c]
    T_raw = np.loadtxt(folder+T_tail, delimiter=',')

    # perform binarization based
    # on valley thresholding; the valley is searched on log(1 + T)
    # and exp(.) - 1 brings the result back to the raw scale
    T_tr = np.log(1 + T_raw)
    freqs, edges, _ = plt.hist(T_tr, bins=100, color='cadetblue')
    thresh = np.exp(find_valley(T_tr, freqs, edges)) - 1

    # the histogram is only needed for its counts and edges: drop the
    # scratch figure so bars do not pile up on the same axes from one
    # epigenome to the next
    plt.close()

    median = np.median(T_raw)
    diff = median - thresh
    medians.append(median)
    threshs.append(thresh)
    diffs.append(diff)

In [None]:
## show the distribution with largest gap

# `diffs`, `medians` and `threshs` are ordered like `target_cells`,
# whereas `folders_by_format` is ordered like the full `cells` list:
# the position of the largest gap is mapped back through `cs` before
# it is used to pick a data folder
largest = np.argmax(diffs)
median_largest = medians[largest]
thresh_largest = threshs[largest]

# retrieve data
name = "Set2"
cmap = matplotlib.cm.get_cmap(name)
colors = cmap.colors
folder = folders_by_format[0][cs[largest]]
T_raw = np.loadtxt(folder+T_tail, delimiter=',')

# the histogram lives on the log(1 + T) scale, so both
# thresholds are transformed the same way before being drawn
T_tr = np.log(1 + T_raw)
log_median = np.log(1 + median_largest)
log_thresh = np.log(1 + thresh_largest)

fig = plt.figure(dpi=300)
plt.hist(T_tr, bins=100, color='cadetblue')
plt.axvline(x=log_median, color=colors[1], linewidth=1, label='median threshold')
plt.axvline(x=log_thresh, color=colors[2], linewidth=1, label='valley threshold')
# ticks sit at the log-scale positions but are labelled with raw values
plt.xticks([log_thresh, log_median], ['{:.3f}'.format(t) for t in [thresh_largest, median_largest]], fontsize=10)
plt.xlabel('RPKMs (log scale)', fontsize=10)
plt.ylabel('Bin frequency', fontsize=10)
plt.xlim([-0.2,6])
plt.title('Epigenome '+target_cells[largest])
plt.legend(fontsize=10)
fig.tight_layout()
plt.savefig('./valley.pdf', format='pdf')
plt.show()
plt.close()

### Train _ShallowChrome_

Here we train, select and evaluate _ShallowChrome_ models for each target epigenome. In order to get some statistical confidence on model performance, the procedure is repeated `iterations` times; each time a different, randomly generated dataset split is employed.

NB: The use of a threshold different from the median causes the two classes to be unbalanced. Here we simply adopt a subsampling strategy to guarantee balanced training.

In [None]:
## training!

val_scores_by_cell = dict()
test_scores_by_cell = dict()

# `t` is the position within `target_cells` (the order in which `threshs`
# was filled); `c` is the position within the full `cells` list (the order
# of `folders_by_format`). The two differ when only a subset is targeted.
for t, (c, cell) in enumerate(zip(cs, target_cells)):

    # print advancement message
    print '\n\n>> cell {0}...'.format(cell)

    # set score folder(s)
    score_folder = score_base_path+str(cell)
    try:
        os.mkdir(score_folder)
    except OSError:
        pass

    # define score matrices (formats x splits);
    # every entry is written in the loops below
    S_val = np.empty((len(formats), iterations))
    S_test = np.empty((len(formats), iterations))

    # loop for data formats
    for f, form in enumerate(formats):

        # set score folder for format
        score_folder_format = score_folder+'/'+form+'/'
        try:
            os.mkdir(score_folder_format)
        except OSError:
            pass

        # retrieve data
        folder = folders_by_format[f][c]
        X = np.loadtxt(folder+X_tail, delimiter=',')
        T_raw = np.loadtxt(folder+T_tail, delimiter=',')

        # perform binarization based
        # on valley thresholding
        thresh = threshs[t]
        T = binarize_target(T_raw, thresh)

        # re-balance by subsampling and
        # recompute random split
        X_v, T_v = balance_by_subsampling(X, T, random_state=random_state)
        splits = random_splits(len(T_v), iterations=iterations, random_state=random_state)

        # monitoring
        if f == 0:
            print '\n\tthreshold for epigenome {} is {:.3f} (was {:.3f})'.format(cell, thresh, np.median(T_raw))
            print '\tnew dataset length is {}'.format(len(T_v))

        # print advancement message
        print '\r\tformat: {0}        '.format(form),

        # loop over random splits
        for i in range(iterations):

            # instantiate a fresh model for every split
            model = lm.LogisticRegression(penalty=penalty, C=C, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class)

            # get split
            split = splits[i]

            # fit model on the balanced data: the splits were drawn for
            # len(T_v) samples, so they must be applied to X_v / T_v
            # and not to the original, unbalanced X / T
            cache_model_at = score_folder_format+'model_C_'+str(C)+'_iter_'+str(i+1)+'.pkl'
            S_val[f,i], S_test[f,i] = fit_and_score(model, X_v, T_v, split, cache_model_at)

    # save scores to disk
    np.savetxt(score_folder+'/val_aucs_valley.csv', S_val, delimiter=',', fmt='%.4f')
    np.savetxt(score_folder+'/test_aucs_valley.csv', S_test, delimiter=',', fmt='%.4f')

    # store them into dict
    val_scores_by_cell[cell] = S_val
    test_scores_by_cell[cell] = S_test

print ' done.'

### Print test results

In [None]:
final_val_scores = {}
final_test_scores = {}
final_selection = {}

for cell in target_cells:

    # score matrices of this epigenome: one row per format, one column per split
    S_val = val_scores_by_cell[cell]
    S_test = test_scores_by_cell[cell]

    # model selection: for every split, pick the format
    # with the highest validation score
    winners = [np.argmax(S_val[:, k]) for k in range(iterations)]

    # scores and format name of the selected model, split by split
    final_val_scores[cell] = [S_val[b, k] for k, b in enumerate(winners)]
    final_test_scores[cell] = [S_test[b, k] for k, b in enumerate(winners)]
    final_selection[cell] = [formats[b] for b in winners]

    # print test performance
    selected = final_test_scores[cell]
    print '>> cell {0}: {1:.2f} ± {2:.2f} %'.format(cell, 100 * np.mean(selected), 100 * np.std(selected))

In [None]:
##  print aggregated stats

# per-epigenome mean and standard deviation of the selected test scores
data = []
data_std = []
for cell in target_cells:
    cell_scores = final_test_scores[cell]
    data.append(np.mean(cell_scores))
    data_std.append(np.std(cell_scores))

# summary statistics of the per-epigenome means
for label, reduce_fn in [('mean', np.mean), ('median', np.median), ('max', np.max), ('min', np.min)]:
    print '{0}:\t{1:.4f}'.format(label, reduce_fn(data))


In [None]:
## dump all results

shallowchrome_valley_results_path = score_base_path+'ShallowChrome_valley_scores.txt'

# one line per target epigenome: "<cell>: <mean> +/- <std>"
with open(shallowchrome_valley_results_path, 'w') as score_file:
    score_file.writelines(
        cell+': {0} +/- {1}\n'.format(avg, dev)
        for cell, avg, dev in zip(target_cells, data, data_std)
    )

In [None]:
## compare with other methods

# locations of the score files written by the other runs
deepchrome_results_path = score_base_path+deepchrome_suffix
shallowchrome_results_path = score_base_path+'ShallowChrome_scores.txt'

# parse the scores of each method
deepchrome_scores = parse_scores(deepchrome_results_path, std=False)
shallowchrome_scores = parse_scores(shallowchrome_results_path, std=True)
shallowchrome_valley_scores = parse_scores(shallowchrome_valley_results_path, std=True)

target_scores = [deepchrome_scores, shallowchrome_scores, shallowchrome_valley_scores]
y_labels_ = ['DeepChrome', 'ShallowChrome', 'ShallowChrome (valley)']

# epigenomes are displayed by decreasing DeepChrome score
sorter = np.argsort(-deepchrome_scores[:,0])

# bar width, distance between consecutive groups of bars, palette
w = 1.75
s = 7
name = "Set2"
cmap = get_cmap(name)
colors = cmap.colors

n_cells = len(target_cells)

fig = plt.figure(dpi=300, figsize=(10,6))
for rank, (scores, label) in enumerate(zip(target_scores, y_labels_)):
    scores_mean = scores[:,0]
    # a second column, when present, is used as error bar;
    # otherwise the bars are drawn with zero error
    std = scores[:,1] if scores.shape[-1] == 2 else np.asarray([0]*n_cells)
    # each method is shifted by one bar width within a group
    x = s*np.arange(n_cells) + w*rank
    plt.bar(x, scores_mean[sorter], width=w, yerr=std[sorter], align='center', ecolor='grey', capsize=0, color=colors[rank], label=label)

# ticks and limits rely on `x` as left by the last method drawn
plt.yticks(np.arange(0.5, 1.0, 0.05), fontsize=11)
plt.xticks(x, target_cells[sorter], fontsize=11, rotation=90)
plt.legend(loc='lower right', fontsize=13)
plt.xlabel("epigenome", fontsize=14)
plt.ylabel("AUROC", fontsize=12)
plt.ylim([0.5, 0.93])
plt.xlim([x[0]-3*w, x[-1]+w])
fig.tight_layout()
plt.savefig('./vsValley.pdf', format='pdf')
plt.show()
