# Class 11/03/2019

In [18]:
import re

# Find the largest putative protein from an aa sequence using a RE
# Proteins start with a 'M' and end with '_'
# Accepts arguments minimum and maximum length
def max_protein(aa, minsize=2, maxsize=100):
    if (minsize < 2 or maxsize < minsize):
        raise Exception
        
    regex = re.compile('M.{' + str(minsize - 2) + ',' + str(maxsize - 2) + '}?_')
    return max(re.findall(regex, aa.upper()), key = lambda el: len(el))

# Test
print(max_protein('ACGTAGCAMA_NAEDIAEDMAGCTAGCTAGCTAGC_ACTGMYAECAEC_ONAE'))

MAGCTAGCTAGCTAGC_


In [25]:
# Convert a Prosite Pattern to a Regex Pattern
def prosite_to_regex(prosite):
    converter = {
        '-': '',
        'x': '.',
        '(': '{',
        ')': '}',
        '{': '[^',
        '}': ']'
    }
    
    return "".join([converter[el] if el in converter.keys() else el for el in prosite])

# Test
print(prosite_to_regex("C-x-H-x-[LIVMFY]-C-x(2)-C-[LIVMYA](3,4)-{XK}"))

C.H.[LIVMFY]C.{2}C[LIVMYA]{3,4}[^XK]


In [24]:
# Test a sequence with th given prosite pattern
def find_prosite(aa, prosite):
    return re.findall(re.compile(prosite_to_regex(prosite)), aa.upper())

# Test
print(find_prosite("ATTTCGGC", "x-T(2)-[CG]-x"))

['TTTCG']


# Class 13/3/2019

In [35]:
# Convert a REBASE Pattern to a Regex Pattern
def rebase_to_regex(rebase):
    converter = {
        'R': '[AG]',
        'Y': '[CT]',
        'S': '[GC]',
        'W': '[AT]',
        'K': '[GT]',
        'M': '[AC]',
        'B': '[CGT]',
        'D': '[AGT]',
        'H': '[ACT]',
        'V': '[ACG]',
        'N': '[ACGT]',
        '-': '.',
        '^': ''
    }
    
    return (rebase.find('^'),\
            "".join([converter[el] if el in converter.keys() else el for el in rebase]))

# Test
rebase_to_regex("MBDKWA^T|CG..AEJNN.AENUHE")

(6, '[AC][CGT][AGT][GT][AT]AT|CG..AEJ[ACGT][ACGT].AE[ACGT]U[ACT]E')

In [36]:
import re

# Given a sequence and a restriction enzyme pattern in the REBASE syntax,
# i.e. including IUPAC positions, determines the cut positions of
# the enzyme in the sequence
def cut_positions(enzyme_pattern, seq):
    cut_offset, rebase_regex = rebase_to_regex(enzyme_pattern)
    
    regex = re.compile(rebase_regex)
    return [match.start() + cut_offset for match in re.finditer(regex, seq)]

# Test
cut_positions("V^HD", "ACTAATGSTGAACCTGA")

[1, 4, 10, 13]

In [38]:
def cut_subsequences(enz, seq):
    cut_pos = [0] + cut_positions(enz, seq) + [len(seq) -1]
    
    return [seq[cut_pos[i]: cut_pos[i+1]] for i in range(0, len(cut_pos) - 1)]

# Test
cut_subsequences("G^AANTC", "ATGAAAGAAGTCTTATGAATGAGCCTCAGCTGAAGAANTCCATCGCGCAGAANTCCTACGCTCAGACTCAGACTCAGCATTATAGTGAATTCTTAATAAATAAAATAA")

['ATGAAAG',
 'AAGTCTTATGAATGAGCCTCAGCTGAAGAANTCCATCGCGCAGAANTCCTACGCTCAGACTCAGACTCAGCATTATAGTG',
 'AATTCTTAATAAATAAAATA']

In [24]:
# Given a DNA sequence, allows to detect if there are repeated sequences
# of size k (k as argument), ordered by descreasing order of frequency
# Returns a dic with subseqs as keys and their frequency as values 
def find_repeated_subseqs(seq, k, top=3):
    regex = re.compile('.{' + str(k) + '}')
    res = {}
    
    for match in re.findall(regex, seq):
        if match in res:
            res[match] += 1
        else:
            res[match] = 1
    
    return {k: v for (k, v) in\
        sorted(res.items(), key = lambda x: x[1], reverse = True)[0:top]}

# Test
find_repeated_subseqs("ACTGTGCACAAATGTG", 2)

{'TG': 4, 'CA': 2, 'AC': 1}