# DVPT 5

In [1]:
import random as rd
import reprlib
import time
from sys import getsizeof

In [4]:
class Genome:
    """A genome sequence, treated as circular, that can be sampled into reads.

    Attributes:
        sequence: the full genome as a string.
    """

    def __init__(self, sequence):
        self.sequence = sequence

    def _circular_slice(self, start, length):
        """Return `length` characters starting at `start`, wrapping to the
        beginning of the sequence when the end is reached.

        The wrap-around happens at most once: the wrapped part is a plain
        slice from index 0, so it can never be longer than the genome itself.
        """
        genome_size = len(self.sequence)
        end = start + length
        head = self.sequence[start:min(end, genome_size)]
        # Whatever runs past the end of the genome is taken from its start.
        tail = self.sequence[:max(end - genome_size, 0)]
        return head + tail

    def sequencing(self, read_length=100, reads_nb=5000):
        """Draw random reads from the genome.

        Args:
            read_length: number of characters per read.
            reads_nb: number of reads to draw.

        Returns:
            A list of `reads_nb` Read objects, each starting at a uniformly
            random position of the genome.

        Raises:
            ValueError: from `random.randint` when the genome is empty and
                at least one read is requested.
        """
        genome_size = len(self.sequence)
        reads = []
        for _ in range(reads_nb):
            start = rd.randint(0, genome_size - 1)
            # Honour the requested read length (it used to be fixed at 100).
            reads.append(Read(self._circular_slice(start, read_length)))
        return reads


    
class Read:
    """A single sequencing read (a substring sampled from a genome).

    Attributes:
        sequence: the read as a string.
    """

    # Class-level containers shared by all reads. `reads` is assigned by the
    # main script; the two others are not used by the code visible here.
    reads = []
    all_kmers = []
    all_km1mers = []

    def __init__(self, sequence):
        self.sequence = sequence

    def generate_kmers(self, kmers_length):
        """Return the list of all k-mers (as strings) contained in the read.

        Args:
            kmers_length: length k of each k-mer; must be at least 1.

        Returns:
            The `len(sequence) - k + 1` overlapping substrings of length k,
            in order of position. Empty when the read is shorter than k.

        Raises:
            ValueError: if `kmers_length` is smaller than 1.
        """
        if kmers_length < 1:
            raise ValueError("kmers_length must be a positive integer")
        # The last valid start index is len - k, hence the "+ 1" in the range
        # (without it the final k-mer of the read was silently dropped).
        last_start = len(self.sequence) - kmers_length
        return [
            self.sequence[i:i + kmers_length]
            for i in range(last_start + 1)
        ]


    
class Graph:
    """Directed graph of vertices and edges, both indexed by their sequence.

    Attributes:
        vertices: dict mapping a sequence to its Vertex.
        edges: dict mapping a sequence to its Edge. `find_cycle` removes
            entries from it as edges are walked.
        euler_cycles: list of cycles (lists of vertices) found so far.
        euler_biggest: the vertices of the merged cycle built by
            `assemble_cycles`.
    """

    def __init__(self):
        self.vertices = {}
        self.edges = {}
        self.euler_cycles = []
        self.euler_biggest = []

    def test_eulerian(self):
        """Check that every vertex has as many incoming as outgoing edges.

        Only the in/out balance is checked; connectivity is not. A message
        is printed in both cases.

        Returns:
            True if all vertices are balanced, False otherwise.
        """
        for vertex in self.vertices.values():
            if len(vertex.edges_in) != len(vertex.edges_out):
                print('PROBLEM : This graph is not Eulerian.')
                return False
        print('SUCCESS : This graph is Eulerian !')
        return True

    def find_cycle(self):
        """Walk one closed trail through unused edges and return its vertices.

        The walk is destructive: every edge followed is removed from
        `self.edges` and from the `edges_out` set of its source vertex
        (`edges_in` sets are left untouched).

        Returns:
            The list of vertices visited, in order. The return to the first
            vertex is implicit (it is not repeated at the end).

        Raises:
            KeyError: if `self.edges` is empty, or if the walk reaches a
                vertex with no outgoing edge left (`set.pop` on an empty set).
        """
        # Take an arbitrary starting edge. It stays in its source vertex's
        # `edges_out` until the walk pops it, which is the stop condition.
        edge = self.edges.popitem()[1]
        cycle = [edge.vertex_from]
        e = edge.vertex_to.edges_out.pop()
        while e != edge:
            self.edges.pop(e.sequence)
            cycle.append(e.vertex_from)
            e = e.vertex_to.edges_out.pop()
        return cycle

    def find_all_cycles(self):
        """Extract cycles with `find_cycle` until no edge is left, appending
        each of them to `self.euler_cycles`."""
        while self.edges:
            self.euler_cycles.append(self.find_cycle())

    def assemble_cycles(self, cycle, tab):
        """Merge `cycle` and the cycles of `tab` into `self.euler_biggest`.

        The vertices of `cycle` are appended one by one. Whenever a vertex
        also belongs to a cycle still waiting in `tab`, that cycle is rotated
        so that it ends on this vertex and is spliced in right after it
        (recursively, so cycles reachable from the spliced one are merged
        too).

        Args:
            cycle: list of vertices to walk.
            tab: list of the cycles not merged yet. It is modified in place:
                each cycle is removed when it gets spliced.
        """
        for v in cycle:
            self.euler_biggest.append(v)
            for i, c in enumerate(tab):
                if v in c:
                    pivot = c.index(v) + 1
                    # Rotation of c that starts right after v and ends on v.
                    nextcycle = c[pivot:] + c[:pivot]
                    # Remove c from the shared list *before* recursing, and
                    # hand the same list down: a cycle consumed at a deeper
                    # level must not remain available to the callers, or it
                    # would be spliced (and its vertices emitted) twice.
                    del tab[i]
                    self.assemble_cycles(nextcycle, tab)
                    break

    def add_vertex(self, seq):
        """Return the vertex for `seq`, creating and registering it first if
        it does not exist yet."""
        if seq not in self.vertices:
            # The vertex does not exist yet: create it.
            v = Vertex(seq)
            self.vertices[seq] = v
            return v
        else:
            # It already exists: just look it up.
            return self.vertices[seq]

    def add_edge(self, seq, v_from, v_to):
        """Return the edge for `seq`, creating and registering it first if it
        does not exist yet.

        `v_from` and `v_to` are only used when the edge is created; an
        existing edge with the same sequence is returned unchanged.
        """
        if seq not in self.edges:
            e = Edge(seq, v_from, v_to)
            self.edges[seq] = e
            return e
        else:
            return self.edges[seq]
            
    
    
    
    
class Edge:
    """Directed edge labelled by a sequence, going from one vertex to another."""

    def __init__(self, sequence, vertex_from, vertex_to):
        # Label of the edge, then its two endpoints (source, destination).
        self.sequence, self.vertex_from, self.vertex_to = (
            sequence,
            vertex_from,
            vertex_to,
        )

            
        
class Vertex:
    """Graph vertex labelled by a sequence, with its incoming/outgoing edges."""

    def __init__(self, sequence):
        self.sequence = sequence
        # Two independent, initially empty sets of edges.
        self.edges_in, self.edges_out = set(), set()


In [6]:
# MAIN PROGRAM
# Parameters
genome_length= 600000
read_length=100
reads_nb=200000
kmers_length=55


# Every timing printed below is the elapsed time since this point.
t = time.time()
# Genome generation
genome= Genome(''.join(rd.choices(["A", "T", "G", "C"], k=genome_length)))
# For a quick manual check, a tiny genome can be used instead:
# genome = Genome('ATGGCGTGCA')
print(time.time()-t)


# Reads generation
Read.reads = genome.sequencing(read_length=read_length, reads_nb=reads_nb)
print(time.time()-t)

# Graph creation (empty)
graph = Graph()


# Vertices & Edges generation
for r in Read.reads:
    for kmer in r.generate_kmers(kmers_length=kmers_length):
        # Vertices
        v_from = graph.add_vertex(kmer[:-1])  # prefix
        v_to = graph.add_vertex(kmer[1:])   # suffix
        # Edges
        e = graph.add_edge(kmer, v_from, v_to)
        # add this edge into the vertices attributes
        v_from.edges_out.add(e)
        v_to.edges_in.add(e)
print(time.time()-t)


# Eulerian test
graph.test_eulerian()
print(time.time()-t)


# Find all the cycles
graph.find_all_cycles()
print(f'{len(graph.euler_cycles)} cycle found')
print(time.time()-t)


# Assemble the cycles
graph.assemble_cycles(graph.euler_cycles[0], graph.euler_cycles[1:])
print(time.time()-t)

# Rebuild the sequence: the first letter of each vertex along the merged
# cycle. join() builds the string in one pass instead of repeated "+=".
genome_assembly = ''.join(v.sequence[0] for v in graph.euler_biggest)
print(time.time()-t)


# Check that the starting genome == the assembled genome.
# The assembly is circular and may start anywhere, so the original genome is
# searched in the assembly repeated twice.
if genome.sequence in genome_assembly*2:
    print('BRAVO : The re-assembled genome matches to the starting genome !!')
else :  
    print('SORRY : The re-assembled genome does not match the starting genome')
print(time.time()-t)



0.21599054336547852
1.521254539489746
39.27867245674133
SUCCESS : This graph is Eulerian !
39.444478273391724
1 cycle found
39.95925307273865
40.07105112075806
40.29884719848633
BRAVO : The re-assembled genome matches to the starting genome !!
40.30382466316223


In [2]:
# List the attribute and method names available on a dict.
dir({})

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [8]:
# Sample count used to try out a pluralisation expression.
n = 1

In [9]:
# Append an 's' to the word only when n is greater than 1.
'cycle{}'.format('s' if n > 1 else '')

'cycle'

In [13]:
# Same pluralisation expression inside a full message, with the count hard-coded to 1.
print('{} Euler cycle{} found in the graph'.format(1, 's' if 1 > 1 else ''))


1 Euler cycle found in the graph


In [16]:
# Short sample string for the slicing experiment.
a='je mandge'

In [17]:
# A slice whose end is past the end of the string just returns what exists.
a[0:80]

'je mandge'