In [None]:
import h5py
import numpy as np
import pandas as pd
import modisco
import tqdm
import io
import base64
import urllib
import vdom.helpers as vdomh
import deepdish
import pyBigWig
import nexusformat.nexus as nx
import os

# Utility functions

In [None]:
# Uniform background frequency assigned to each of the 4 base columns
BACKGROUND_FREQS = np.array([0.25, 0.25, 0.25, 0.25])
def info_content(track, pseudocount=0.001):
    """
    Computes the per-position information content of an L x 4 track.

    Every row is smoothed with `pseudocount`, rescaled into probabilities
    along the base axis, and scored against `BACKGROUND_FREQS` as
    sum(p * log2(p / background)). The result is an L-array holding one
    value per position.
    """
    n_cols = track.shape[1]
    # Smoothed row totals: each of the n_cols entries receives one
    # pseudocount, so the normalized row still sums to 1
    row_totals = np.sum(track, axis=1, keepdims=True) + (n_cols * pseudocount)
    probs = (track + pseudocount) / row_totals
    # Per-base contribution relative to the background distribution
    background = BACKGROUND_FREQS[np.newaxis, :]
    per_base = probs * np.log2(probs / background)
    return np.sum(per_base, axis=1)

In [None]:
def trim_motif(pfm, motif, min_ic=0.2, pad=0):
    """
    Trims the low-information flanks off `motif`, using `pfm` to decide
    where to cut.

    `pfm` and `motif` are both L x 4 arrays (they may be the same array).
    The positions of `pfm` whose information content is at least `min_ic`
    are located, and `motif` is cut to run from the first to the last of
    them, widened by `pad` bases on each side (clipped to the bounds of
    `pfm`).
    If no position reaches `min_ic`, `motif` is returned untrimmed.
    """
    # Indices of the positions that meet the information-content threshold
    keep = np.flatnonzero(info_content(pfm) >= min_ic)
    if keep.size == 0:
        return motif

    # Widen the kept window by `pad` without running past either end
    first = max(0, keep.min() - pad)
    last = min(len(pfm), keep.max() + pad + 1)
    return motif[first:last]

In [None]:
def pfm_to_pwm(pfm):
    """
    Scales each position (row) of an L x 4 PFM by that position's
    information content, as computed by `info_content`, and returns the
    resulting L x 4 array.
    """
    per_position_ic = info_content(pfm)
    # Turn the L-array into a column so it broadcasts across the base axis
    return pfm * per_position_ic[:, np.newaxis]

In [None]:
def extractMotifsFromMoDIScoH5( basedir, task ):
    """
    Exports trimmed PFMs and IC-scaled PWMs for every sub-directory of
    `basedir`.

    For each sub-directory `<name>`, the file
    `<basedir>/<name>/<name>_<task>_modisco_results.h5` is opened and every
    entry of its "pos_patterns" group is processed: the pattern's
    "sequence" array is trimmed with `trim_motif` and then converted with
    `pfm_to_pwm`.

    Two files are written into each sub-directory, `pfms_<task>.npy` and
    `pwms_<task>.npy`, each holding a dict keyed by pattern name. Because
    the saved object is a dict, NumPy stores it pickled; reload it with
    `np.load(path, allow_pickle=True).item()`.

    Entries of `basedir` that are not directories, as well as
    `.ipynb_checkpoints` and `__pycache__`, are skipped. Errors raised
    while opening the results file, or by a missing "pos_patterns" group,
    are not caught.
    """
    skipped_dirs = ('.ipynb_checkpoints', '__pycache__')
    for folder in os.listdir(basedir):
        ctdir = os.path.join(basedir, folder)
        if folder in skipped_dirs or not os.path.isdir(ctdir):
            continue

        pfms = {}
        pwms = {}
        tfm_results_path = os.path.join(ctdir, folder + f'_{task}_modisco_results.h5')
        with h5py.File(tfm_results_path, "r") as f:
            pos_patterns = f["pos_patterns"]
            for pattern_key in pos_patterns.keys():
                # Only the PFM is needed; `[:]` copies it into memory so it
                # remains usable after the file is closed
                pfm = pos_patterns[pattern_key]["sequence"][:]

                pfm = trim_motif(pfm, pfm)
                pfms[pattern_key] = pfm
                pwms[pattern_key] = pfm_to_pwm(pfm)

        # The dicts hold in-memory arrays only, so they can be written once
        # the HDF5 handle has been released
        np.save(os.path.join(ctdir, f"pfms_{task}.npy"), pfms)
        np.save(os.path.join(ctdir, f"pwms_{task}.npy"), pwms)

In [None]:
def extractNSeqletsFromMoDIScoH5( basedir, task ):
    """
    Exports the `n_seqlets` value of every positive pattern for each
    sub-directory of `basedir`.

    For each sub-directory `<name>`, the file
    `<basedir>/<name>/<name>_<task>_modisco_results.h5` is opened and, for
    every entry of its "pos_patterns" group, the first element of the
    pattern's "seqlets/n_seqlets" dataset is collected.

    The result is written into the sub-directory as
    `pfms_<task>_n_seqlets.npy`, holding a dict keyed by pattern name.
    Because the saved object is a dict, NumPy stores it pickled; reload it
    with `np.load(path, allow_pickle=True).item()`.

    Entries of `basedir` that are not directories, as well as
    `.ipynb_checkpoints` and `__pycache__`, are skipped. Errors raised
    while opening the results file, or by a missing "pos_patterns" group,
    are not caught.
    """
    skipped_dirs = ('.ipynb_checkpoints', '__pycache__')
    for folder in os.listdir(basedir):
        ctdir = os.path.join(basedir, folder)
        if folder in skipped_dirs or not os.path.isdir(ctdir):
            continue

        seqlets = {}
        tfm_results_path = os.path.join(ctdir, folder + f'_{task}_modisco_results.h5')
        with h5py.File(tfm_results_path, "r") as f:
            pos_patterns = f["pos_patterns"]
            for pattern_key in pos_patterns.keys():
                # The dataset is read in full and only its first element kept
                n_seqlets = pos_patterns[pattern_key]["seqlets"]['n_seqlets'][:]
                seqlets[pattern_key] = n_seqlets[0]

        # Values were copied out of the file above, so saving after the
        # HDF5 handle is closed is safe
        np.save(os.path.join(ctdir, f"pfms_{task}_n_seqlets.npy"), seqlets)

# Extract data from all models

In [None]:
# Mlig: export motifs for both tasks first, then the seqlet counts for both tasks
for _extract in (extractMotifsFromMoDIScoH5, extractNSeqletsFromMoDIScoH5):
    for _task in ('profile', 'counts'):
        _extract( 'MoDISco_PFMs/Mlig/', _task )

In [None]:
# Smed: export motifs for both tasks first, then the seqlet counts for both tasks
for _extract in (extractMotifsFromMoDIScoH5, extractNSeqletsFromMoDIScoH5):
    for _task in ('profile', 'counts'):
        _extract( 'MoDISco_PFMs/Smed/', _task )

In [None]:
# Sman: export motifs for both tasks first, then the seqlet counts for both tasks
for _extract in (extractMotifsFromMoDIScoH5, extractNSeqletsFromMoDIScoH5):
    for _task in ('profile', 'counts'):
        _extract( 'MoDISco_PFMs/Sman/', _task )