# Notebook for development of the script 3

In [4]:
import random as rd
import numpy as np
import matplotlib.pyplot as plt
from Bio.Seq import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA, IUPACUnambiguousDNA
import reprlib
import time
from sys import getsizeof

In [41]:
class Genome(Seq):
    """Classe Genome"""
    def __init__(self, seq, circular=True):
        Seq.__init__(self, seq, alphabet=IUPACUnambiguousDNA())
        self.circular=circular
        
    def sequencing(self, read_length=100, reads_nb=5000):
        reads=[]
        for _ in range(reads_nb):
            start = rd.randint(0, len(self)-1)
            read_seq = self._data[start:min(start+100, len(self))] + self._data[0:max(start+100-len(self), 0)]
            read = Read(read_seq)
            reads.append(read)
        return reads


In [42]:
class Read(Seq):
    """Classe Read"""
    reads=[]
    
    def __init__(self, seq, circular=True):
        Seq.__init__(self, seq, alphabet=IUPACUnambiguousDNA())
    
    def generate_kmers(self, kmers_length=30):
        """Returns a list of the k-mers included in the read"""
        kmers=[]
        for i in range(len(self) - kmers_length):
            kmer_seq = self._data[i:i+kmers_length]
            kmer = Kmer(kmer_seq)
            kmers.append(kmer)
        return kmers


In [43]:
class Kmer(Seq):
    """Classe Kmer"""
    kmers=set()
    
    def __init__(self, seq):
        Seq.__init__(self, seq, alphabet=IUPACUnambiguousDNA())
        self.prefix = None
        self.suffix = None
    

In [44]:
class Sommet(Seq):
    """Classe Sommet"""
    sommets = {}
    
    def __init__(self, seq):
        Seq.__init__(self, seq, alphabet=IUPACUnambiguousDNA())
        

In [45]:
class Graph:
    def __init__(self, list_sommets):
        self.network = {}.fromkeys(list_sommets, [])
        self.eulerian = None
    
    def fill_network(self, kmers):
        for kmer in kmers:
            self.network[kmer.prefix].append(kmer.suffix)
    
    def test_eulerian(self):
        for i in range(len(self.nodes)):
            if self.matrix[i, :].sum() != self.matrix[:, i].sum():
                print("PROBLEM : This graph is not Eulerian.")
                self.eulerian = False
                break
        if self.eulerian != False:
            print('SUCCESS : This graph is Eulerian !!')
            self.eulerian = True
        

In [46]:
# Trouve 1 cycle
def find_eulerian_cycle():
    # attention, ce code ne gere pas les sommets connectés avec eux-memes
    cycle = []
    start, s = np.array(np.where(graph.matrix))[   :,0]
    cycle.append(start)
    graph.matrix[start, s]-=1
    while s!= start:
        cycle.append(s)
        t = np.where(graph.matrix[s, :])[0][0]     # s comme sommet, t comme temporary
        graph.matrix[s, t]-=1
        s = t
    return cycle

In [47]:
# Test fin
def test_fin():
    if graph.matrix.sum()==0: return True
    else: return False

In [48]:
# Assemble les cycles
def assemble(cycle, tab):      # cycle a parcourir pour avancer 
                           # & tableau des cycles restant a parcourir
    for s in cycle:        # prenons un sommet du cycle
        seq.append(s)        # ajoutons le a la sequence
        for c in tab:          # puis testons s'il est également connecté à un des cycles restant
            if s in c:
                nextcycle = c[c.index(s)+1:] + c[:c.index(s)+1]     # Si oui, recadrons le prochain cycle pour que le parcours de la liste commence bien au début
                nexttab = tab[tab.index(c)+1:] + tab[:tab.index(c)]   # enlevons également ce cycle du tableau pour le prochain appel de la fonction
                assemble(nextcycle, nexttab)
                tab.remove(c)
                break
                

In [49]:
# PROGRAMME PRINCIPAL
# Parameters
genome_length= 1000
read_length=7
reads_nb=20
kmers_length=3

# Genome generation
# genome= Genome(''.join(rd.choices(["A", "T", "G", "C"], k=genome_length)), circular=True)
genome = Genome('ATGGCGTGCA')

# Reads generation
Read.reads = genome.sequencing(read_length=read_length, reads_nb=reads_nb)
print(reprlib.repr(Read.reads))

# K-mers generation
for read in Read.reads:
    Kmer.kmers.update(read.generate_kmers(kmers_length=kmers_length))
print(reprlib.repr(Kmer.kmers))

# (K-1)-mers generation
for kmer in Kmer.kmers:
    #prefix
    pref = kmer[:-1]
    if pref not in Sommet.sommets:
        s = Sommet(str(pref))
        Sommet.sommets[pref] = s
        kmer.prefix = s
    else:
        kmer.prefix = Sommet.sommets[pref]
    #suffix
    suff = kmer[1:]
    if suff not in Sommet.sommets:
        s = Sommet(str(suff))
        Sommet.sommets[suff] = s
        kmer.suffix = s
    else:
        kmer.suffix = Sommet.sommets[suff]
print(reprlib.repr(len(Sommet.sommets)))
    
    
    
# Graph generation
graph = Graph(Sommet.sommets.values())
print(reprlib.repr(graph.network))
graph.fill_network(Kmer.kmers)
print(reprlib.repr(graph.network))

graph.test_eulerian()
print(reprlib.repr(graph.nodes))

# Trouve tous les cycles
cycles = []
while test_fin() == False:
    cycles.append(find_eulerian_cycle())
print(str(len(cycles))+' cycles trouvés')

# Assemble les cycles
if len(cycles) == 1:
    seq = cycles[0]
else:
    seq=[]
    assemble(cycles[0], cycles[1:])
print(reprlib.repr(seq))

# Recréé la sequence
genome_assembly = ''
for s in seq:
    genome_assembly+=graph.nodes[s][0]
genome_assembly

# test genome de départ == genome assemblé
offset = (genome_assembly*2).index(genome._data)
genome_assembly[offset:]+genome_assembly[:offset] == genome._data

[Read('TGGCGTG...mbiguousDNA()), Read('CAATGGC...mbiguousDNA()), Read('CAATGGC...mbiguousDNA()), Read('GCAATGG...mbiguousDNA()), Read('AATGGCG...mbiguousDNA()), Read('GCAATGG...mbiguousDNA()), ...]
{Kmer('AAT', I...mbiguousDNA()), Kmer('ATG', I...mbiguousDNA()), Kmer('CAA', I...mbiguousDNA()), Kmer('CGT', I...mbiguousDNA()), Kmer('GCA', I...mbiguousDNA()), Kmer('GCG', I...mbiguousDNA()), ...}
8
{Sommet('AA', ...mbiguousDNA()): [], Sommet('AT', ...mbiguousDNA()): [], Sommet('CA', ...mbiguousDNA()): [], Sommet('CG', ...mbiguousDNA()): [], ...}
{Sommet('AA', ...mbiguousDNA()): [Sommet('GC', ...mbiguousDNA()), Sommet('AT', ...mbiguousDNA()), Sommet('GG', ...mbiguousDNA()), Sommet('GT', ...mbiguousDNA()), Sommet('TG', ...mbiguousDNA()), Sommet('CA', ...mbiguousDNA()), ...], Sommet('AT', ...mbiguousDNA()): [Sommet('GC', ...mbiguousDNA()), Sommet('AT', ...mbiguousDNA()), Sommet('GG', ...mbiguousDNA()), Sommet('GT', ...mbiguousDNA()), Sommet('TG', ...mbiguousDNA()), Sommet('CA', ...mbiguousDNA



AttributeError: 'Graph' object has no attribute 'nodes'

In [50]:
len(Read.reads)

20

In [51]:
len(Kmer.kmers)

10

In [52]:
len(Sommet.sommets)

8

In [53]:
Sommet.sommets



{Seq('GG', IUPACUnambiguousDNA()): Sommet('GG', IUPACUnambiguousDNA()),
 Seq('GC', IUPACUnambiguousDNA()): Sommet('GC', IUPACUnambiguousDNA()),
 Seq('AA', IUPACUnambiguousDNA()): Sommet('AA', IUPACUnambiguousDNA()),
 Seq('AT', IUPACUnambiguousDNA()): Sommet('AT', IUPACUnambiguousDNA()),
 Seq('TG', IUPACUnambiguousDNA()): Sommet('TG', IUPACUnambiguousDNA()),
 Seq('CG', IUPACUnambiguousDNA()): Sommet('CG', IUPACUnambiguousDNA()),
 Seq('GT', IUPACUnambiguousDNA()): Sommet('GT', IUPACUnambiguousDNA()),
 Seq('CA', IUPACUnambiguousDNA()): Sommet('CA', IUPACUnambiguousDNA())}

---
## Tests

In [None]:
def assemble(cycle, tab):      # cycle a parcourir pour avancer 
                           # & tableau des cycles restant a parcourir
    for s in cycle:        # prenons un sommet du cycle
        seq.append(s)        # ajoutons le a la sequence
        for c in tab:          # puis testons s'il est également connecté à un des cycles restant
            if s in c:
                nextcycle = c[c.index(s)+1:] + c[:c.index(s)]     # Si oui, recadrons le prochain cycle pour que le parcours de la liste commence bien au début
                nexttab = tab[tab.index(c)+1:] + tab[:tab.index(c)]   # enlevons également ce cycle du tableau pour le prochain appel de la fonction
                
                

In [None]:
def assemble(cycle, tab):      # cycle a parcourir pour avancer 
                           # & tableau des cycles restant a parcourir
    for s in cycle:        # prenons un sommet du cycle
        seq.append(s)        # ajoutons le a la sequence
        for j, c in enumerate(tab):          # puis testons s'il est également connecté à un des cycles restant
            for i, s1 in enumerate(c):
                if s == s1:
                    nextcycle = c[i+1:] + c[:i]     # Si oui, recadrons le prochain cycle pour que le parcours de la liste commence bien au début
                    nexttab = tab[j+1:] + tab[:j]   # enlevons également ce cycle du tableau pour le prochain appel de la fonction
            
            
            
            
            if s in c:
                nextcycle = c[c.index(s)+1:] + c[:c.index(s)]     # Si oui, recadrons le prochain cycle pour que le parcours de la liste commence bien au début
                nexttab = tab[j+1:] + tab[:j]   # enlevons également ce cycle du tableau pour le prochain appel de la fonction
                
                

In [11]:
a = set(['AT', 'RE', 'GK'])

In [17]:
class Sommet:
    def __init__(self, seq):
        self.seq=seq

In [23]:
s = Sommet('ATA')

In [25]:
s.__hash__()

8733618324814

In [26]:
b = s

In [27]:
hash(b)

8733618324814

In [28]:
c = Sommet('ATA')

In [29]:
hash(c)

8733618327304

{'seq': 'ATA'}

In [35]:
a = {}

In [36]:
dir(a)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [None]:
a[]