# DVPT 4

In [78]:
import random as rd
import numpy as np
import matplotlib.pyplot as plt
from Bio.Seq import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA, IUPACUnambiguousDNA
import reprlib
import time
from sys import getsizeof

In [121]:
class Genome:
    """A genome sequence that can be randomly sampled into reads.

    The sequence is treated as circular when sampling: a read that would
    run past the end continues from position 0.
    """

    def __init__(self, sequence):
        """Store the genome.

        Args:
            sequence: the genome content; it must support ``len``, slicing
                and ``+`` (for example a ``str``).
        """
        self.sequence = sequence

    def sequencing(self, read_length=100, reads_nb=5000):
        """Draw random reads from the genome.

        Args:
            read_length: number of bases requested per read.
            reads_nb: number of reads to draw.

        Returns:
            A list of ``reads_nb`` ``Read`` objects. Each read starts at a
            uniformly random position and wraps around the end of the
            genome at most once, so a read is never longer than the tail
            after its start plus one full copy of the genome.

        Raises:
            ValueError: if reads are requested from an empty genome.
        """
        genome_size = len(self.sequence)
        if reads_nb > 0 and genome_size == 0:
            raise ValueError("cannot draw reads from an empty genome")

        reads = []
        for _ in range(reads_nb):
            start = rd.randint(0, genome_size - 1)
            end = start + read_length
            # Part of the read that fits before the end of the genome.
            tail = self.sequence[start:min(end, genome_size)]
            # Part that wraps around to the beginning (empty when end <= size).
            wrapped = self.sequence[:max(end - genome_size, 0)]
            reads.append(Read(tail + wrapped))
        return reads


    
class Read:
    """A sequencing read, plus class-level registries shared by all reads.

    The class attributes below are plain containers; the class methods read
    them but never assign them, so the caller is expected to store the
    results back (e.g. ``Read.all_kmers = Read.generate_all_kmers(3)``).
    """

    # Reads scanned by generate_all_kmers().
    reads = []
    # Distinct k-mers scanned by generate_all_km1mers().
    all_kmers = []
    # Distinct (k-1)-mers; not read by any method of this class.
    all_km1mers = []

    def __init__(self, sequence):
        """Store the read content (a sliceable sequence such as a ``str``)."""
        self.sequence = sequence

    def generate_kmers(self, kmers_length):
        """Return every k-mer of the read, in order of position.

        Args:
            kmers_length: the k-mer size; must be at least 1.

        Returns:
            A list of ``len(sequence) - kmers_length + 1`` slices of the
            read (duplicates kept), or an empty list when the read is
            shorter than ``kmers_length``.

        Raises:
            ValueError: if ``kmers_length`` is smaller than 1.
        """
        if kmers_length < 1:
            raise ValueError("kmers_length must be at least 1")
        # The last valid start index is len - k, hence the +1 on the range
        # bound; without it the final k-mer of every read is lost.
        starts = range(len(self.sequence) - kmers_length + 1)
        return [self.sequence[i:i + kmers_length] for i in starts]

    @classmethod
    def generate_all_kmers(cls, kmers_length):
        """Return the distinct k-mers found across ``cls.reads``.

        Args:
            kmers_length: the k-mer size, forwarded to ``generate_kmers``.

        Returns:
            A list without duplicates; its order is not defined because the
            k-mers are collected through a set.
        """
        all_kmers = set()
        for read in cls.reads:
            all_kmers.update(read.generate_kmers(kmers_length))
        return list(all_kmers)

    @classmethod
    def generate_all_km1mers(cls):
        """Return the distinct (k-1)-mers of the k-mers in ``cls.all_kmers``.

        Each k-mer contributes its prefix (all but the last item) and its
        suffix (all but the first item).

        Returns:
            A list without duplicates; its order is not defined because the
            (k-1)-mers are collected through a set.
        """
        all_km1mers = set()
        for kmer in cls.all_kmers:
            all_km1mers.add(kmer[:-1])  # prefix
            all_km1mers.add(kmer[1:])   # suffix
        return list(all_km1mers)
    
    
    



class Graph:
    """Directed graph whose edges can be consumed into edge-disjoint walks.

    Vertices are expected to expose ``in_edges`` and ``out_edges`` lists,
    and edges a ``to_vertex`` attribute; edges must be hashable because
    they are used as dictionary keys.
    """

    def __init__(self):
        # Key: edge object; value: True while the edge is still available,
        # False once it has been consumed by find_cycle().
        self.edges = {}
        self.vertices = []
        # Result of the last test_eulerian() call.
        self.eulerian = False
        # Walks accumulated by find_all_cycles().
        self.cycles = []

    def test_eulerian(self):
        """Set ``self.eulerian`` from the degree balance of every vertex.

        The flag is True when each vertex has as many incoming as outgoing
        edges (trivially True for a graph without vertices). Connectivity
        is not checked.
        """
        self.eulerian = all(
            len(vertex.in_edges) == len(vertex.out_edges)
            for vertex in self.vertices
        )

    def choose_edge(self, list_edges):
        """Return the first still-available edge of ``list_edges``.

        Args:
            list_edges: candidate edges, tried in order.

        Returns:
            The first candidate flagged True in ``self.edges``, or None if
            there is none. Candidates unknown to this graph are skipped.
        """
        for candidate in list_edges:
            if self.edges.get(candidate, False):
                return candidate
        return None

    def find_cycle(self, edge):
        """Consume a walk of available edges starting with ``edge``.

        From the head vertex of the current edge, the first available
        outgoing edge is taken, until no outgoing edge is available. When
        every vertex is balanced (see ``test_eulerian``) the walk can only
        get stuck at the tail vertex of ``edge``, so it is a closed cycle;
        otherwise it is a dead-ending trail.

        Args:
            edge: the starting edge; it is consumed unconditionally.

        Returns:
            The list of edges of the walk, in traversal order.
        """
        cycle = [edge]
        self.edges[edge] = False
        # The start edge is already consumed, so it can never be chosen
        # again: the walk ends when no continuation exists, not when the
        # start edge is met a second time.
        following = self.choose_edge(edge.to_vertex.out_edges)
        while following is not None:
            cycle.append(following)
            self.edges[following] = False
            following = self.choose_edge(following.to_vertex.out_edges)
        return cycle

    def find_all_cycles(self):
        """Consume every available edge, appending each walk to ``self.cycles``.

        Edges are visited in dictionary order; an edge already consumed by
        an earlier walk is skipped.
        """
        # Snapshot the keys: find_cycle() rewrites values while we iterate.
        for edge in list(self.edges):
            if self.edges[edge]:
                self.cycles.append(self.find_cycle(edge))
    
    
    
    
class Edge:
    """Directed edge labelled with a sequence, linking two vertices."""

    # Class-level registry of edges, filled by the calling script.
    edges = []

    def __init__(self, sequence):
        """Create an unconnected edge.

        Args:
            sequence: the label carried by the edge.
        """
        self.sequence = sequence
        # Both ends stay None until the caller connects the edge.
        self.from_vertex = None
        self.to_vertex = None

    def get_next_edges(self):
        """Return the edges leaving this edge's head vertex.

        Returns:
            The ``out_edges`` list of ``to_vertex`` (the list itself, not a
            copy), or an empty list while the edge has no head vertex.
        """
        if self.to_vertex is None:
            return []
        return self.to_vertex.out_edges
            
        
class Vertex:
    """Graph node labelled with a sequence, tracking its incident edges."""

    # Class-level registry searched by get_vertex().
    vertices = []

    def __init__(self, sequence):
        """Create a vertex with no incoming and no outgoing edge."""
        self.sequence = sequence
        self.in_edges, self.out_edges = [], []

    @classmethod
    def get_vertex(cls, sequence):
        """Look up a registered vertex by its sequence.

        Returns the first entry of ``cls.vertices`` whose ``sequence``
        equals the argument, or None when no entry matches.
        """
        matching = (node for node in cls.vertices if node.sequence == sequence)
        return next(matching, None)
    

In [123]:
# MAIN PROGRAM
# Parameters
genome_length= 1000
read_length=7
reads_nb=20
kmers_length=3

# Genome generation
genome = Genome('ATGGCGTGCA')
print(genome.sequence)

# Reads generation
Read.reads = genome.sequencing(read_length=read_length, reads_nb=reads_nb)
print(Read.reads)

# K-mers generation
Read.all_kmers = Read.generate_all_kmers(kmers_length=kmers_length)
print(Read.all_kmers)

# (K-1)-mers generation
Read.all_km1mers = Read.generate_all_km1mers()
print(Read.all_km1mers)

# Edges generation: one edge per distinct k-mer.
# The registry is rebound rather than appended to, so running this cell
# again does not pile duplicate edges on top of the previous run's.
Edge.edges = [Edge(kmer) for kmer in Read.all_kmers]
print(Edge.edges)

# Vertex generation: one vertex per distinct (k-1)-mer (rebound for the
# same reason as the edges).
Vertex.vertices = [Vertex(km1mer) for km1mer in Read.all_km1mers]
print(Vertex.vertices)

# Edge - Vertex connecting
for edge in Edge.edges:
    edge.from_vertex = Vertex.get_vertex(edge.sequence[:-1])  # prefix
    edge.from_vertex.out_edges.append(edge)
    edge.to_vertex = Vertex.get_vertex(edge.sequence[1:])     # suffix
    edge.to_vertex.in_edges.append(edge)

# Graph generation



# Find all the cycles


# Merge the cycles


# Rebuild the sequence


# Check that the assembled genome equals the original genome




ATGGCGTGCA
[<__main__.Read object at 0x7fe271c7d470>, <__main__.Read object at 0x7fe271c107b8>, <__main__.Read object at 0x7fe271c107f0>, <__main__.Read object at 0x7fe271c10828>, <__main__.Read object at 0x7fe271c10860>, <__main__.Read object at 0x7fe271c10898>, <__main__.Read object at 0x7fe271c108d0>, <__main__.Read object at 0x7fe271c10908>, <__main__.Read object at 0x7fe271c10940>, <__main__.Read object at 0x7fe271c10978>, <__main__.Read object at 0x7fe271c109b0>, <__main__.Read object at 0x7fe271c109e8>, <__main__.Read object at 0x7fe271c10a20>, <__main__.Read object at 0x7fe271c10a58>, <__main__.Read object at 0x7fe271c10a90>, <__main__.Read object at 0x7fe271c10ac8>, <__main__.Read object at 0x7fe271c10b00>, <__main__.Read object at 0x7fe271c10b38>, <__main__.Read object at 0x7fe271c10b70>, <__main__.Read object at 0x7fe271c10ba8>]
['AAT', 'GCA', 'GTG', 'CAA', 'GGC', 'GCG', 'CGT', 'TGC', 'ATG', 'TGG']
['CG', 'AT', 'TG', 'GC', 'GT', 'CA', 'GG', 'AA']
[<__main__.Edge object at 0x

---
## Tests

In [58]:
# Scratch mapping of key -> bool flag (same value shape as Graph.edges);
# presumably used to try out expressions such as sum(a.values()) - confirm.
a = {
    2: True,
    4: True,
    5: False,
}