In [1]:
def DeBruijn(kmers):
    """Build the De Bruijn adjacency mapping of a collection of k-mers.

    Every k-mer contributes one edge that goes from its prefix (the k-mer
    without its last character) to its suffix (the k-mer without its first
    character). For instance the k-mer ``AAGA`` adds ``AGA`` to the list
    stored under the key ``AAG``.

    Input : iterable of str (the k-mers).
    Output: dict whose keys are prefixes (str) and whose values are LISTS of
            the suffixes (str) reached from that prefix. A k-mer that occurs
            several times adds its suffix the same number of times, so
            repeated edges are preserved.
    """
    adjacency_dic = {}

    for kmer in kmers:
        prefix, suffix = kmer[:-1], kmer[1:]
        # setdefault creates the empty edge list the first time a prefix is seen
        adjacency_dic.setdefault(prefix, []).append(suffix)

    return adjacency_dic

In [2]:
def Contiguous_and_Endpoint_Nodes(adjacency_dic):
    """Split the nodes of a directed graph into pass-through and endpoint nodes.

    Input : dict with str (or int) nodes as keys and, as values, the LIST of
            nodes each key points to. A node repeated inside a list counts
            as that many parallel edges.
    Output: list of two lists, ``[continue_nodes, endpoint_nodes]``.
            * ``continue_nodes``: keys whose indegree AND outdegree are both
              exactly 1, in key order.
            * ``endpoint_nodes``: every other key (indegree or outdegree != 1)
              in key order, followed by the nodes that only ever appear as a
              successor and never as a key. Each such node is listed once,
              in order of first appearance.
    """
    # Outdegree of a key is simply the number of successors it lists.
    outdegreeSum = {node: len(successors) for node, successors in adjacency_dic.items()}
    indegreeSum = dict.fromkeys(adjacency_dic, 0)

    successor_only_nodes = []          # nodes with incoming edges but no key of their own
    seen_successor_only = set()        # guards against listing such a node more than once

    for successors in adjacency_dic.values():
        for node in successors:
            if node in indegreeSum:
                indegreeSum[node] += 1                 # one more edge arrives at this key
            elif node not in seen_successor_only:
                seen_successor_only.add(node)
                successor_only_nodes.append(node)

    continue_nodes = []
    start_or_stop_nodes = []
    for node, outdegree in outdegreeSum.items():
        if outdegree == 1 and indegreeSum[node] == 1:
            continue_nodes.append(node)
        else:
            start_or_stop_nodes.append(node)

    return [continue_nodes, start_or_stop_nodes + successor_only_nodes]

In [3]:
def MaximalNonBranchingPaths(adjacency_dic):
    """Collect every maximal non-branching path of a directed graph.

    Input : dict with str (or int) nodes as keys and, as values, the LIST of
            nodes each key points to. The input dict is not modified; the
            function works on a deep copy.
    Output: list of paths, each path being a list of nodes. Paths that start
            at an endpoint node come first; isolated cycles made only of
            pass-through nodes follow, written with the start node repeated
            at the end.
    """
    import copy

    unused_edges = copy.deepcopy(adjacency_dic)          # edges are consumed (popped) as paths are built
    intermediates, endpoints = Contiguous_and_Endpoint_Nodes(adjacency_dic)
    All_paths = []

    # Phase 1: one path per outgoing edge of every endpoint node.
    for start_node in endpoints:
        if start_node not in unused_edges:
            # node appears only as a successor, so no path can start from it
            continue

        # the edge count is fixed before any edge is popped
        for _ in range(len(unused_edges[start_node])):
            node = unused_edges[start_node].pop()
            path = [start_node, node]
            # keep walking through pass-through nodes until an endpoint is hit
            while node not in endpoints:
                # a visited pass-through node cannot belong to an isolated cycle
                intermediates.remove(node)
                node = unused_edges[node].pop()
                path.append(node)
            All_paths.append(path)

    # Phase 2: whatever pass-through nodes remain were never reached from an
    # endpoint; follow their edges until no unused edge is left.
    for start_node in intermediates:
        node = start_node
        path = [start_node]
        while unused_edges[node]:
            node = unused_edges[node].pop()
            path.append(node)
        # keep the walk only if it came back to where it started; len > 1
        # rules out a node whose edge was already consumed by an earlier walk
        if node == start_node and len(path) > 1:
            All_paths.append(path)

    return All_paths

In [4]:
def PathToGenome(reads):
    """Join an ordered genome path of reads into one genome string.

    Input : list with strings as elements (reads), already placed in the
            right succession: each read overlaps the previous one in all
            but its last character.
    Output: string corresponding to the whole genome. An empty list of
            reads gives an empty string.

    The genome is the first read followed by the last character of every
    subsequent read. Example:

        ACCGA -> CCGAA -> CGAAG -> GAAGC   gives   ACCGAAGC

    (the overlap between ACCGA and CCGAA is CCGA, so only the final A of
    CCGAA is new).
    """
    if not reads:
        return ""

    # Only the last character of each later read is new; the rest overlaps.
    return reads[0] + "".join(read[-1] for read in reads[1:])