# Notebook for development of the script - 2

In [3]:
import random as rd
import numpy as np
import matplotlib.pyplot as plt
from Bio.Seq import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA, IUPACUnambiguousDNA
import reprlib

In [4]:
class Genome(Seq):
    """Classe Genome"""
    def __init__(self, seq, circular=True):
        Seq.__init__(self, seq, alphabet=IUPACUnambiguousDNA())
        self.circular=circular
        
#     def __str__(self):
#         """To show the attributes and values of the instance"""
#         out=''
#         for key, value in self.__dict__.items():
#             out+='{:20s}  {}\n'.format(key, reprlib.repr(value))
#         return out

    def sequencing(self, read_length=100, reads_nb=5000):
        reads=[]
        for _ in range(reads_nb):
            start = rd.randint(0, len(self)-1)
            read_seq = self._data[start:min(start+100, len(self))] + self._data[0:max(start+100-len(self), 0)]
            read = Read(read_seq)
            reads.append(read)
        return reads


In [5]:
class Read(Seq):
    """Classe Read"""
    def __init__(self, seq, circular=True):
        Seq.__init__(self, seq, alphabet=IUPACUnambiguousDNA())
    
#     def __str__(self):
#         """To show the attributes and values of the instance"""
#         out=''
#         for key, value in self.__dict__.items():
#             out+='{:20s}  {}\n'.format(key, reprlib.repr(value))
#         return out
    
    def generate_kmers(self, kmers_length=30):
        """Returns a list of the k-mers included in the read"""
        kmers=[]
        for i in range(len(self) - kmers_length):
            kmer_seq = self._data[i:i+kmers_length]
            kmer = Kmer(kmer_seq)
            kmers.append(kmer)
        return kmers


In [6]:
class Kmer(Seq):
    """Classe Kmer"""
    def __init__(self, seq):
        Seq.__init__(self, seq, alphabet=IUPACUnambiguousDNA())
        self.prefix = seq[:-1]
        self.suffix = seq[1:]
    
#     def __str__(self):
#         """To show the attributes and values of the instance"""
#         out=''
#         for key, value in self.__dict__.items():
#             out+='{:20s}  {}\n'.format(key, reprlib.repr(value))
#         return out
    

In [8]:
class Graph:
    def __init__(self, km1mers):
        self.nodes = tuple(km1mers)
        self.matrix = np.zeros((len(self.nodes), len(self.nodes)))
        self.eulerian = None
    
    def fill_matrix(self, kmers):
        n = 0
        for kmer in kmers:
            print('K-mer {:5d}'.format(n), end='\r')
            i = self.nodes.index(kmer.prefix)
            j = self.nodes.index(kmer.suffix)
            self.matrix[i, j]+=1
            n+=1
    
    def test_eulerian(self):
        for i in range(len(self.nodes)):
            if self.matrix[i, :].sum() != self.matrix[:, i].sum():
                print("PROBLEM : This graph is not Eulerian.")
                self.eulerian = False
                break
        if self.eulerian != False:
            print('SUCCESS : This graph is Eulerian !!')
            self.eulerian = True
        

In [222]:
# Trouve 1 cycle
def find_eulerian_cycle():
    # attention, ce code ne gere pas les sommets connectés avec eux-memes
    cycle = []
    start, s = np.array(np.where(graph.matrix))[:,1]
    cycle.append(start)
    graph.matrix[start, s]-=1
    while s!= start:
        cycle.append(s)
        t = np.where(graph.matrix[s, :])[0][0]     # s comme sommet, t comme temporary
        graph.matrix[s, t]-=1
        s = t
    return cycle

In [223]:
# Test fin
def test_fin():
    if graph.matrix.sum()==0: return True
    else: return False

In [247]:
# PROGRAMME PRINCIPAL
# Parameters
genome_length= 10
read_length=7
reads_nb=6
kmers_length=3

# Genome generation
# genome= Genome(''.join(rd.choices(["A", "T", "G", "C"], k=genome_length)), circular=True)
genome = Genome('ATGGCGTGCA')

# Reads generation
reads = genome.sequencing(read_length=read_length, reads_nb=reads_nb)

# K-mers generation
kmers=set()
for read in reads:
    kmers.update(read.generate_kmers(kmers_length=kmers_length))

# (K-1)-mers generation
km1mers=set()
for kmer in kmers:
    km1mers.update([kmer.prefix, kmer.suffix])

# Graph generation
graph = Graph(km1mers)
graph.fill_matrix(kmers)
graph.test_eulerian()

# Trouve tous les cycles
cycles = []
while test_fin() == False:
    cycles.append(find_eulerian_cycle())
cycles

K-mer     0K-mer     1K-mer     2K-mer     3K-mer     4K-mer     5K-mer     6K-mer     7K-mer     8K-mer     9SUCCESS : This graph is Eulerian !!




[[1, 4, 3, 5, 2], [1, 7, 4, 6, 0]]

In [248]:
# Assemble les cycles
if len(cycles)==1:
    seq = cycles[0]
else:
    cycle1 = cycles.pop(0)
    print(cycles)
    for s in cycle1:
        seq.append(s)
        for cycle in cycles:
            if s in cycle:
                i = cycle.index(s)
                c = cycle[i+1:] + cycle[:i]
                seq.append(*c)

            

[[1, 7, 4, 6, 0]]


TypeError: append() takes exactly one argument (4 given)

In [245]:
seq

[1,
 4,
 3,
 5,
 2,
 1,
 [4, 3, 5, 2],
 7,
 4,
 [3, 5, 2, 1],
 6,
 0,
 1,
 [4, 3, 5, 2],
 7,
 4,
 [3, 5, 2, 1],
 6,
 0,
 1,
 [7, 4, 6, 0],
 4,
 [6, 0, 1, 7],
 3,
 5,
 2]

In [None]:
# Recréé la sequence
genome_assembly = ''
for s in seq:
    genome_assembly+=graph.nodes[s][0]

In [135]:
# test genomme de départ == genome assemblé
offset = genome_assembly.index(genome._data[:40])
genome_assembly[offset:]+genome_assembly[:offset] == genome._data

True

In [237]:
a = [2, 3, 4, 5]
b = [6, 7, 8, 9]

In [238]:
a+b

[2, 3, 4, 5, 6, 7, 8, 9]

In [242]:
a+b

[2, 3, 4, 5, 6, 7, 8, 9]

In [249]:
dir(a)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [None]:
seq=[tab[0][0]]
assemble(tab, 0, 0)

In [None]:
assemble(tab, cycle, I, i):
#     for j, s in enumerate(tab[I][i+1:]+tab[I][:i]):
    for j, s in enumerate(cycle[i+1:]+cycle[:i]):
        seq.append(s)
        for J, c in enumerate(tab[:I]+tab[I+1:]):
            if s in c:
                assemble(tab[:I]+tab[I+1:], 