## Solve the String Composition Problem.

In [17]:
def kmer_to_num(kmer):
    num = 0
    dic = {"A": 0, "C": 1, "G": 2, "T": 3}
    for i in range(len(pattern)):
        num += dic[pattern[i]] * 4 ** (len(pattern) - i - 1)
    return num

def num_to_kmer(k, num):
    slovar = {0: "A", 1: "C", 2: "G", 3: "T"}
    pattern = ''
    while num > 3:
        pattern += slovar[num % 4]
        num = num // 4
        if num < 4:
            pattern += slovar[num]
    return 'A' * (k - len(pattern)) + pattern[::-1]

In [18]:
def composition(k, text):
    'turning text in list of reads in lexicografic order'
    comp = []
    for i in range(len(text) - k + 1):
        comp.append(text[i:i + k])
    return sorted(comp)

In [19]:
with open('dataset_197_3.txt', 'r+') as f:
    k = int(f.readline().strip())
    text = f.readline().strip()
    f.write('\n'.join(composition(k, text)))

## String Spelled

In [20]:
import sys

if __name__ == "__main__":
    Input = sys.stdin.readlines()
    patternList = [pattern.strip() for pattern in Input]
    ans = composition(4, patternList)
    print(ans)

[]


In [21]:
def spell(lines):
    string = lines[0]
    k = len(lines[0])
    for i in range(1, len(lines)):
        string += lines[i][k - 1]
    return string

In [22]:
with open('dataset_198_3.txt', 'r') as f:
    pattern_list = list(map(lambda x: x.strip(), f.readlines()))
with open('answer_198_3.txt', 'w') as f:
    f.write(spell(pattern_list))

## Overlap Graph Problem

In [26]:
def reads_to_graph(reads):
    graph = dict()
    for read1 in reads:
        for read2 in reads:
            if read1[1:] == read2[:-1]:
                if read1 in graph:
                    graph[read1].append(read2)
                else:
                    graph[read1] = [read2]
    return graph

In [34]:
with open('dataset_198_10.txt', 'r') as f:
    patternList = list(map(lambda x: x.strip(), f.readlines()))
overlapList = reads_to_graph(patternList)
with open('answer_198_10.txt', 'w') as f:
    for pattern, adjacencies in overlapList.items():
        if len(adjacencies) > 0:
            f.write(pattern + '->' + ','.join(adjacencies) + '\n')

In [28]:
reads = ['ATGCG', 'GCATG', 'CATGC', 'AGGCA', 'GGCAT', 'GGCAC']
reads_to_graph(reads)

{'GCATG': ['CATGC'],
 'CATGC': ['ATGCG'],
 'AGGCA': ['GGCAT', 'GGCAC'],
 'GGCAT': ['GCATG']}

## De Bruijn Graph from a String

In [42]:
def debru_from_string(k, string):
    graph = dict()
    for i in range(len(string) - k + 1):
        if string[i:i + k - 1] in graph:
            graph[string[i:i + k - 1]].append(string[i + 1:i + k])
        else:
            graph[string[i:i + k - 1]] = [string[i + 1:i + k]]
    return graph

In [80]:
def count(string):
    a = ['000', '001', '010', '011', '100', '101', '110', '111']
    dic = dict((d, 0) for d in a)
    print(dic)
    for i in range(len(string) - 2):
        dic[string[i:i+3]] += 1
    return dic

In [108]:
count('1001101100')

{'000': 0, '001': 0, '010': 0, '011': 0, '100': 0, '101': 0, '110': 0, '111': 0}


{'000': 0,
 '001': 1,
 '010': 0,
 '011': 2,
 '100': 2,
 '101': 1,
 '110': 2,
 '111': 0}

In [90]:
debru_from_string(4, '0101010100')

{'010': ['101', '101', '101', '100'], '101': ['010', '010', '010']}

In [43]:
k = 4
string = 'AAGATTCTCTAAGATC'
debru_from_string(k, string)

{'AAG': ['AGA', 'AGA'],
 'AGA': ['GAT', 'GAT'],
 'GAT': ['ATT', 'ATC'],
 'ATT': ['TTC'],
 'TTC': ['TCT'],
 'TCT': ['CTC', 'CTA'],
 'CTC': ['TCT'],
 'CTA': ['TAA'],
 'TAA': ['AAG']}

In [46]:
with open('dataset_199_6.txt', 'r') as f:
    k = int(f.readline())
    string = f.readline().strip()
graph = debru_from_string(k, string)
with open('answer_199_6.txt', 'w') as f:
    for pattern, adjacencies in graph.items():
        if len(adjacencies) > 0:
            f.write(pattern + ' -> ' + ','.join(adjacencies) + '\n')

In [21]:
def de_bruijn_graph(k, seq):
    from collections import defaultdict
    graph_dict = defaultdict(list)
    k = k - 1
    for pos in range(len(seq) - k):
        graph_dict[seq[pos:pos+k]].append(seq[pos+1: pos+k+1])
    return graph_dict


with open('dataset_199_6.txt', 'r') as f:
    k = int(f.readline())
    string = f.readline()
graph = de_bruijn_graph(k, string)
with open('answer_199_6.txt', 'w') as f:
    for pattern, adjacencies in graph.items():
        if len(adjacencies) > 0:
            f.write(pattern + ' -> ' + ','.join(adjacencies) + '\n')

## DeBruijn Graph from k-mers 

In [60]:
def graph_from_kmers(kmers):
    pre_su = set()
    for kmer in kmers:
        pre_su.add(kmer[1:])
        pre_su.add(kmer[:-1])
    pre_su = list(pre_su)
    graph = dict((p_s, list()) for p_s in pre_su)
    for kmer in kmers:
        graph[kmer[:-1]].append(kmer[1:])
    return graph

In [66]:
def printing_dict(dic):
    for pattern, adjacencies in dic.items():
        if len(adjacencies) > 0:
            print(pattern + ' -> ' + ','.join(adjacencies))

In [67]:
kmers = ['GAGG', 'CAGG', 'GGGG', 'GGGA', 'CAGG', 'AGGG', 'GGAG']
printing_dict(graph_from_kmers(kmers))

GAG -> AGG
GGG -> GGG,GGA
GGA -> GAG
AGG -> GGG
CAG -> AGG,AGG


In [68]:
with open('dataset_200_8.txt', 'r') as f:
    strings = list(map(lambda x: x.strip(), f.readlines()))
graph = graph_from_kmers(strings)
with open('answer_200_8.txt', 'w') as f:
    for pattern, adjacencies in graph.items():
        if len(adjacencies) > 0:
            f.write(pattern + ' -> ' + ','.join(adjacencies) + '\n')