# DVPT 4

In [1]:
import random as rd
# import numpy as np
# import matplotlib.pyplot as plt
# from Bio.Seq import *
# from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA, IUPACUnambiguousDNA
import reprlib
import time
from sys import getsizeof

In [74]:
class Genome:
    def __init__(self, sequence):
        self.sequence = sequence
        
    def sequencing(self, read_length=100, reads_nb=5000):
        reads=[]
        for _ in range(reads_nb):
            start = rd.randint(0, len(self.sequence)-1)
            read_seq = self.sequence[start:min(start+100, len(self.sequence))] + self.sequence[0:max(start+100-len(self.sequence), 0)]
            read = Read(read_seq)
            reads.append(read)
        return reads


    
class Read:
    reads = []
    all_kmers = []
    all_km1mers = []
    
    def __init__(self, sequence):
        self.sequence = sequence
        
    def generate_kmers(self, kmers_length):
        """Returns a list of the k-mers (as strings only) included in the read"""
        kmers=[]
        for i in range(len(self.sequence) - kmers_length):
            kmer = self.sequence[i:i+kmers_length]
            kmers.append(kmer)
        return kmers
    
    def generate_all_kmers(cls, kmers_length):
        all_kmers = set()
        for read in cls.reads:
            kmers = read.generate_kmers(kmers_length)
            all_kmers.update(kmers)
        return list(all_kmers)
    generate_all_kmers = classmethod(generate_all_kmers)
    
    def generate_all_km1mers(cls):
        all_km1mers = set()
        for kmer in cls.all_kmers:
            all_km1mers.add(kmer[:-1]) #prefix
            all_km1mers.add(kmer[1:])  #suffix
        return list(all_km1mers)
    generate_all_km1mers = classmethod(generate_all_km1mers)

    
    
class Graph:
    def __init__(self, vertices, edges):
        self.vertices = vertices
        self.edges = edges
        self.euler_cycles = []
        self.euler_biggest = []
        
    def get_in_edges(self, vertex):
        in_edges=[]
        for edge in self.edges:
            if edge.vertex_to == vertex:
                in_edges.append(edge)
        return in_edges
        
    def get_out_edges(self, vertex):
        out_edges=[]
        for edge in self.edges:
            if edge.vertex_from == vertex:
                out_edges.append(edge)
        return out_edges
        
    def test_eulerian(self):
        for vertex in self.vertices:
            if len(self.get_in_edges(vertex)) != len(self.get_out_edges(vertex)):
                print('PROBLEM : This graph is not Eulerian.')
                return False
        print('SUCCESS : This graph is Eulerian !')
        return True
    
    def find_cycle(self):
        edge = self.edges[1]
        cycle = [edge.vertex_to]
        e = self.get_out_edges(edge.vertex_to)[0]
        while e != edge:
            cycle.append(e.vertex_to)
            self.edges.remove(e)
            e = self.get_out_edges(e.vertex_to)[0]
        self.edges.remove(e)
        return cycle
    
    def find_all_cycles(self):
        while len(self.edges) > 0:
            self.euler_cycles.append(self.find_cycle())
            
    def assemble_cycles(self, cycle, tab):
        # PROBLEM : contrairement a la version precedente, ici j'assemble des edges
        for v in cycle:
            self.euler_biggest.append(v)
            for c in tab:
                if v in c:
                    nextcycle = c[c.index(v)+1:] + c[:c.index(v)+1]
                    nexttab = tab[tab.index(c)+1:] + tab[:tab.index(c)]
                    self.assemble_cycles(nextcycle, nexttab)
                    tab.remove(c)
                    break
    
    
    
    
class Edge:
    edges = {}
    
    def __init__(self, sequence):
        self.sequence = sequence
        self.vertex_from = None
        self.vertex_to = None
    
#     def get_next_edges(self):
#         return self.vertex_to.edges_out
            
        
class Vertex:
    vertices = {}
    
    def __init__(self, sequence):
        self.sequence = sequence
#         self.edges_in = []
#         self.edges_out = []
    
    def get_vertex(cls, sequence):
        for vertex in cls.vertices:
            if vertex.sequence == sequence:
                return vertex
    get_vertex = classmethod(get_vertex)
    

In [18]:
# PROGRAMME PRINCIPAL
# Parameters
genome_length= 10000
read_length=100
reads_nb=5000
kmers_length=30


t = time.time()
# Genome generation
genome= Genome(''.join(rd.choices(["A", "T", "G", "C"], k=genome_length)))
# genome = Genome('ATGGCGTGCA')


print(time.time()-t)
# Reads generation
Read.reads = genome.sequencing(read_length=read_length, reads_nb=reads_nb)


# Vertices & Edges generation
for r in Read.reads:
    for kmer in r.generate_kmer(kmers_length=kmers_length):
        # Edges
        if kmer not in Edge.edges:
            Edge.edges[kmer] = Edge(kmer)
        # Vertices
        prefix, suffix = kmer[:-1], kmer[1:]
        if prefix not in Vertex.vertices:
            Vertex.vertices[prefix] = Vertex(prefix)
        if suffix not in Vertex.vertices:
            Vertex.vertices[suffix] = Vertex(suffix)

# 







# print(time.time()-t)
# # K-mers generation
# Read.all_kmers = Read.generate_all_kmers(kmers_length=kmers_length)


# print(time.time()-t)
# # (K-1)-mers generation
# Read.all_km1mers = Read.generate_all_km1mers()


# print(time.time()-t)
# # Vertices generation
# Vertex.vertices = {}
# for km1mer in Read.all_km1mers:
#     v = Vertex(km1mer)
#     Vertex.vertices.append(v)


# print(time.time()-t)



# Edges generation
Edge.edges = []
for kmer in Read.all_kmers:
    e = Edge(kmer)
    e.vertex_from = Vertex.get_vertex(e.sequence[:-1])     # prefix
    e.vertex_to = Vertex.get_vertex(e.sequence[1:])        # suffix
    Edge.edges.append(e)


    
print(len(Edge.edges))


print(time.time()-t)
# Graph generation
graph = Graph(Vertex.vertices, Edge.edges)


print(time.time()-t)
# Test Eulerien
graph.test_eulerian()


print(time.time()-t)
# Trouve tous les cycles
graph.find_all_cycles()
print(str(len(graph.euler_cycles))+' cycle found')


print(time.time()-t)
# Assemble les cycles
graph.assemble_cycles(graph.euler_cycles[0], graph.euler_cycles[1:])


print(time.time()-t)
# Recréé la sequence
genome_assembly = ''
for v in graph.euler_biggest:
    genome_assembly += v.sequence[0]


# Test genome de départ == genome assemblé
if genome.sequence in genome_assembly*2:
    print('BRAVO : The re-assembled genome matches to the starting genome !!')
else :  
    print('SORRY : The re-assembled genome does not match the starting genome')

print(time.time()-t)



0.002931833267211914
0.16936874389648438
0.3273284435272217
0.33681297302246094
0.34792089462280273
10000
12.799936294555664
12.800384998321533
SUCCESS : This graph is Eulerian !
34.7986581325531
1 cycle found
40.64784359931946
40.650317907333374
BRAVO : The re-assembled genome matches to the starting genome !!
40.656861305236816


In [38]:
dic = dict.fromkeys(range(100000))

In [68]:
help(dic)

Help on dict object:

class dict(object)
 |  dict() -> new empty dictionary
 |  dict(mapping) -> new dictionary initialized from a mapping object's
 |      (key, value) pairs
 |  dict(iterable) -> new dictionary initialized as if via:
 |      d = {}
 |      for k, v in iterable:
 |          d[k] = v
 |  dict(**kwargs) -> new dictionary initialized with the name=value pairs
 |      in the keyword argument list.  For example:  dict(one=1, two=2)
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key, /)
 |      True if D has a key k, else False.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize s

In [61]:
class Chiffre:
    chiffres = []
    def __init__(self, chiffre):
        self.ch = chiffre
    
    def get_chiffre(cls, i):
        for c in cls.chiffres:
            if c.ch == i:
                return c
        return False
    get_chiffre = classmethod(get_chiffre)

In [62]:
Chiffre.chiffres = [Chiffre(i) for i in range(100000)]

In [63]:
rdindex = [rd.randint(0, len(dic)-1) for _ in range(100)]

In [67]:
t = time.time()
for i in rdindex:
#     elt = dic[i]
#     elt = Chiffre.get_chiffre(i)
    elt = Chiffre.chiffres[i]
print(time.time() - t)

0.00015974044799804688


In [69]:
a = {1:2}

In [72]:
a.update({1:3})

In [73]:
a

{1: 3}

---
## Tests

In [58]:
a = {2:True, 4:True, 5:False}

In [None]:
# class Graph:
#     def __init__(self):
#         self.edges = {} #Key : edge object, value : bool(True "is in graph", False "has been visited already")
#         self.vertices = []
#         self.cycles = []
        
#     def test_eulerian(self):
#         for vertex in self.vertices:
#             if len(vertex.edges_in) != len(vertex.edges_out):
#                 return False
#         return True
        
#     def choose_edge(self, list_edges):
#         for e in list_edges:
#             if self.edges[e]:
#                 return e
    
#     def find_cycle(self, edge):
#         cycle = [edge]
#         self.edges[edge] = False
#         le = edge.get_next_edges()
#         e = self.choose_edge(le)
#         while e != edge:
#             cycle.append(e)
#             self.edges[e] = False
#             le = e.get_next_edges()
#             e = self.choose_edge(le)
#         return cycle
    
#     def find_all_cycles(self):
#         while sum(self.edges.values()) > 0:
#             i = iter(self.edges.items())
#             edge, is_still_here = next(i)
#             while not is_still_here:
#                 edge, is_still_here = next(i)
#             cycle = self.find_cycle(edge)
#             self.cycles.append(cycle)