In [66]:
"""Generate maps of features for each sequence in data set."""

'Generate maps of features for each sequence in data set.'

In [67]:
import os
import src.utils as utils

In [68]:
# define feature functions
def get_window(seq, pos, window_size):
    """Return the slice of seq covering window_size symbols around pos.

    The window is clipped to the bounds of seq, so near either end
    fewer than window_size symbols are returned: at the start of seq
    the window only extends to the right of pos, and at the end of
    seq it only extends to the left.

    Parameters
    ----------
        seq : string
            Protein sequence as string.
        pos : int
            Index of the center position of the window. For even
            window sizes, pos is the left of the two middle positions.
        window_size : int
            Total number of symbols in an unclipped window, including
            the center symbol.

    Returns
    -------
        window : string
            Slice of seq around pos holding at most window_size
            symbols.

    Raises
    ------
        ValueError
            If pos is not a valid index into seq.
    """
    if not 0 <= pos <= len(seq) - 1:
        raise ValueError('Pos is outside the bounds of seq.')

    # Number of symbols taken on each side of pos. For an odd size both
    # spans are equal; for an even size the right span is one longer,
    # which makes pos the left of the two middle positions.
    left_span = (window_size - 1) // 2
    right_span = window_size // 2

    start = max(pos - left_span, 0)
    stop = min(pos + right_span + 1, len(seq))
    return seq[start:stop]

# Raw per-residue hydropathy values, keyed by one-letter amino acid code.
# NOTE(review): the values appear to follow the Kyte-Doolittle scale; confirm.
_RAW_HYDROPHOBICITY = {'I': 4.5, 'V': 4.2, 'L': 3.8, 'F': 2.8, 'C': 2.5,
                       'M': 1.9, 'A': 1.8, 'W': -0.9, 'G': -0.4, 'T': -0.7,
                       'S': -0.8, 'Y': -1.3, 'P': -1.6, 'H': -3.2, 'N': -3.5,
                       'D': -3.5, 'Q': -3.5, 'E': -3.5, 'K': -3.9, 'R': -4.5}

# Rescale the raw range [-4.5, 4.5] onto [0, 1]. Built once here rather
# than on every call, since the function runs once per residue per window.
_HYDROPHOBICITY_SCORES = {sym: (raw + 4.5) / 9
                          for sym, raw in _RAW_HYDROPHOBICITY.items()}


def get_hydrophobicity(seq):
    """Return average hydrophobicity of symbols in seq.

    Parameters
    ----------
        seq : string
            Protein sequence as string.

    Returns
    -------
        hydrophobicity : float
            Mean per-symbol score, with most hydrophobic at 1 and
            most hydrophilic at 0. Symbols missing from the scale
            contribute 0. An empty seq returns 0.0 instead of raising
            ZeroDivisionError, in line with get_pair_repeat_frac
            returning 0 for sequences too short to score.
    """
    scores = [_HYDROPHOBICITY_SCORES.get(sym, 0) for sym in seq]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

# Per-residue polarity: 0 for nonpolar symbols, 1 for polar symbols.
# Built once here rather than on every call, since the function runs
# once per residue per window.
_POLARITY_SCORES = {**dict.fromkeys('IVLFCMAWGP', 0),
                    **dict.fromkeys('TSYHNDQEKR', 1)}


def get_polarity(seq):
    """Return average polarity of symbols in seq.

    Parameters
    ----------
        seq : string
            Protein sequence as string.

    Returns
    -------
        polarity : float
            Mean per-symbol polarity ranging from 0 to 1, where 1 is
            polar and 0 is nonpolar. Symbols missing from the table
            contribute a neutral 0.5. An empty seq returns 0.0 instead
            of raising ZeroDivisionError, in line with
            get_pair_repeat_frac returning 0 for sequences too short
            to score.
    """
    scores = [_POLARITY_SCORES.get(sym, 0.5) for sym in seq]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

def get_X_frac(seq, X):
    """Return fraction of symbols in seq that are contained in X.

    Parameters
    ----------
        seq : string
            Protein sequence as string.
        X : string or list
            Symbols to count as string or list.

    Returns
    -------
        X_frac : float
            Number of symbols of seq found in X divided by the length
            of seq. An empty seq returns 0.0 instead of raising
            ZeroDivisionError, in line with get_pair_repeat_frac
            returning 0 for sequences too short to score.
    """
    if len(seq) == 0:
        return 0.0
    return sum(1 for sym in seq if sym in X) / len(seq)

def get_pair_repeat_frac(seq, XY):
    """Return fraction of positions in seq that hold a symbol from XY
    and sit next to another symbol from XY.

    Parameters
    ----------
        seq : string
            Protein sequence as string.
        XY : string or list
            Pair symbols to count for repeats. Must contain
            at least two symbols.

    Returns
    -------
        pair_repeat_frac : float
            Count of positions whose symbol is in XY and which have
            at least one adjacent symbol in XY, divided by the length
            of seq. Sequences of length 0 or 1 return the integer 0.

    Raises
    ------
        ValueError
            If XY holds fewer than two symbols.
    """
    if len(XY) < 2:
        raise ValueError('Requires at least two symbols.')

    n_symbols = len(seq)
    if n_symbols <= 1:
        return 0

    # Membership of each position in XY, computed once up front
    in_pair = [sym in XY for sym in seq]

    repeat_total = 0
    for idx, is_member in enumerate(in_pair):
        if not is_member:
            continue
        # The first and last positions have only one neighbor to check
        has_left = idx > 0 and in_pair[idx - 1]
        has_right = idx < n_symbols - 1 and in_pair[idx + 1]
        if has_left or has_right:
            repeat_total += 1

    return repeat_total / n_symbols

In [69]:
# Feature name -> function scoring a single sequence window.
# Iteration follows insertion order, so the order of entries here is the
# order in which the features are computed and written out.
def _fraction_of(symbols):
    """Return a function giving the fraction of a window's symbols found in symbols."""
    return lambda seq: get_X_frac(seq, symbols)


feature_dict = {
    "hydrophobicity": lambda seq: get_hydrophobicity(seq),
    "polarity": lambda seq: get_polarity(seq),
    "disorder_promoting_frac": _fraction_of("TAGRDHQKSEP"),
    "acidic_frac": _fraction_of("DE"),
    "basic_frac": _fraction_of("RK"),
    "aliphatic_frac": _fraction_of("ALMIV"),
    "polar_frac": _fraction_of("QNSTGCH"),
    "chain_expanding_frac": _fraction_of("EDRKP"),
    "aromatic_frac": _fraction_of("FYW"),
}
# Disabled feature, kept for reference:
# "Q_repeat": lambda seq: get_pair_repeat_frac(seq, "QQ")

In [70]:
# Window sizes (in symbols) for which a feature map is generated
window_sizes = [10, 20, 30, 40, 50]

In [71]:
# Load the sequence records from the combined FASTA file.
# Presumably yields (header, sequence) pairs, given how the records are
# unpacked when the feature maps are generated; confirm against utils.read_fasta.
fasta_path = '../../mobidb-pdb_validation/split_data/out/all_seqs.fasta'
records = utils.read_fasta(fasta_path)

In [72]:
# Create the output directory and one sub-directory per window size up
# front. exist_ok makes re-runs safe without a separate existence check.
os.makedirs('out/', exist_ok=True)
for window in window_sizes:
    os.makedirs(f"out/window_size{window}", exist_ok=True)

# Generate one TSV feature map per sequence and window size:
# one row per sequence position, one column per entry of feature_dict.
for name, seq in records:
    # Text before the first "|" in the record name, minus its first character
    # (presumably a leading ">" from the FASTA header; confirm against utils.read_fasta)
    acc = name.split("|")[0][1:]
    for window in window_sizes:
        # Slice the window for each position once and reuse it for every
        # feature, instead of re-slicing it per feature.
        windows = [get_window(seq, i, window) for i in range(len(seq))]
        feature_maps = [[feature_func(win) for win in windows]
                        for feature_func in feature_dict.values()]

        # Write outputs for window. Every value is followed by a tab,
        # including the last one on each row; kept as-is so the file
        # format stays unchanged for existing readers.
        with open(f"out/window_size{window}/{acc}_feature_map{window}.tsv", "w") as file:
            for i in range(len(seq)):
                row = "".join(f"{feature_map[i]}\t" for feature_map in feature_maps)
                file.write(row + "\n")

KeyboardInterrupt: 