In [1]:
from Bio import SeqIO # Permet l'import de la fonction parse
from Bio.SeqIO.QualityIO import FastqGeneralIterator # Permet un parse plus rapide lorsque beaucoup de séquences (fastaq uniquement sinon voir site biopython pour fasta)
import functools as ft # Permet de regrouper des listes pour radix sort
import numpy as np

from tqdm import tqdm # Permet d'estimer le temps d'éxécution sur un boucle
from datetime import datetime # Permet de comparer la vitesse de 2 programmes 

### Import des listes reads et genome

In [3]:
def genome_import() :
    """
    Fonction qui importe depuis un fichier fasta la séquence du génome dans une liste contenant ["seq", "n°k"]
    """
    list_genom=[]
    for record in SeqIO.parse("GCF_000002765.5_GCA_000002765_genomic.fna","fasta"):
        list_genom.append([str(record.seq).upper(),record.description[-14:]]) # attention le marquage est à changer pour la dernière séquence
    
    return list_genom

def reads_import_cuts(k) :
    """
    Fonction qui importe depuis un fichier fasta les séquences des reads et les coupes en les rangeant
    dans une liste contenant ["seq", "nom", "n°kmer"]
    
    Entrée : k, int : longueur du kmer
    
    Sortie : list_reads : liste de tous les read découpés en kmers
    """
    list_reads=[]
    with open("single_Pfal_dat.fq") as in_handle:
        for title, seq, qual in tqdm(FastqGeneralIterator(in_handle), desc = "Import sequence"):
            i = 1    # Incrémenteur du nombre de k-mer
            while len(seq) >= 1 : # On parcoure toute la séquence
                if len(seq) > k : # Cas où le k-mer est entier
                    list_reads.append([str(seq[0:k]).upper(),title, i])
                    i += 1
                    seq = seq[k:]
                else : # Cas où le dernier k-mer n'est pas entier
                    list_reads.append([str(seq).upper(),title, i])
                    seq = ""
            
    return list_reads

In [4]:
now = datetime.now()

list_genom = genome_import()

list_reads= reads_import_cuts(10)

time_imp = datetime.now() - now
print("Temps total =", time_imp)
print(list_reads[0:6])

Import sequence: 1500000it [00:43, 34197.32it/s]

Temps total = 0:00:44.382836
[['TTTCCTTTTT', 'NC_004325.2-100000', 1], ['AAGCGTTTTA', 'NC_004325.2-100000', 2], ['TTTTTTAATA', 'NC_004325.2-100000', 3], ['AAAAAAATAT', 'NC_004325.2-100000', 4], ['AGTATTATAT', 'NC_004325.2-100000', 5], ['AGTAACGGGT', 'NC_004325.2-100000', 6]]





### Import de la DC3

In [5]:
def import_file(filename, prog) :
    
    if prog == "DC3" :
            filename = "DC3_save/" + filename
    elif prog == "Map" :
            filename = "Mapping_save/" + filename
    
    with open(filename, 'r') as f:
        
        # Partie DC3:
        if prog == "DC3" :
            text = f.readline()
            data = text.split(" ")[:-1]
            print(data)
            for i in range(len(data)) :
                data[i] = int(data[i])
            return(data)
        
        elif prog == "Map" :
            n = int(f.readline())
            data = []
            for i in range(n):
                temp = f.readline()
                temp = temp.split("&")
                temp_lis = temp[3][1:-2].split(",") # Car il reste les []
                lis = []
                for j in range(len(temp_lis)) :
                    lis.append(int(temp_lis[j]))
                data.append([temp[0], temp[1], int(temp[2]), lis])
            return data

In [10]:
DC3_chrom1 = import_file("DC3_chrom1.txt", "DC3")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Fonction de string search

In [6]:
def BWT(text,suffix_table):
    """
    Compute the BWT from the suffix table

    Args:
        T (str): string
        end_of_string (char): end of string character to append

    Return:
        bwt (str): BWT
    """
    bwt = ""
    sf_tab = suffix_table
    for i in range(len(sf_tab)):
        crt = sf_tab[i]
        bwt += text[crt-1]
    return(bwt)

In [7]:
def pattern_matching_BWT(S,pattern,bwt,index,somme):
    """
    Search a pattern in a String using the BWT

    Args:
        S (str): string
        pattern (str): pattern
        bwt : the bwt of the text (to not compute it each time)

    Return:
        bool: true if the pattern is in the string
        int : position of the first occurence of the pattern in the ordered text
        int : position of the last occurence of the pattern in the ordered text
    """
    pattern_in_S = False
    L = list(bwt)
    lpattern = list(pattern)
    start_string = -1
    end_string = -1
    ##init des valeurs utiles pour la substring search
    e = 0
    f = len(L)
    i = len(lpattern)-1
    ##début de la boucle de recherche
    while e < f and i > 0 :
        X = lpattern[i]
        Y = lpattern[i-1]
        r = 0
        s = 0
        suite_impos = True
        for tpl in somme :
            if tpl[0]<X:
                r = tpl[1] ##donne place du premier char dans la liste ordonnée
            if tpl[0]==X:
                s = tpl[1]-1 ##donne place du dernier char
        if e>r:
            r = e
        if f<s:
            s = f
        for u in range(r,s+1):
            if(suite_impos==False):##we use the boolean to exit the loop early
                break
            if L[u]==Y:
                suite_impos= False
                prev = 0
                id = index[u]
                start_string = u
                for tpl in somme :
                    if tpl[0]<Y:
                        prev = tpl[1]
                e = prev + id[1]-1
        char_found = False  ##this allows to exit the loop early if the char has been found
        for u in reversed(range(r,s+1)):
            if(char_found or suite_impos):##on sort de la boucle si on trouve le char, ou si on sait déja qu'il n'est pas la
                break
            if L[u]==Y:
                char_found = True
                prev = 0
                id = index[u]
                end_string = u
                for tpl in somme :
                    if tpl[0]<Y:
                        prev = tpl[1]
                f = prev + id[1]-1
        if suite_impos :## this will stop the loop if no char has been found in the previous one
            break
        i-=1

    if suite_impos:
        i = 0
        pattern_in_S = False
        return pattern_in_S,start_string,end_string
    ##dans le cas où e = f, il faut vérifier que le reste du substring est bon
    if i > 0 :
        while i > 0 :
            if L[e] != lpattern[i-1]:
                break
            prev = 0
            id=index[e]
            start_string = e
            end_string = e
            for tpl in somme :
                if tpl[0]<L[e]:
                    prev = tpl[1]

            e = prev + id[1]
            i -= 1
    if i < 1 :
        pattern_in_S = True

    return pattern_in_S,start_string,end_string

In [8]:
def string_location(text,string,matches,suffix_table):
    '''
    Gives the position of each occurence of the substring in the text

    Args :
        text (string) : the text to search in
        string (string) : the substring to be searched

    Return :
        ???
    '''
    result = matches
    sft = suffix_table
    list_occur = []
    if result[0] == False :
        print("No occurence of the substring was found")
        list_occur.append(-1)
    else :
        for i in range(result[1],(result[2]+1)):
            id = sft[i-1]-1
            list_occur.append(id)
            print(text[id:id+len(string)])
    return(list_occur)

def k_positioning(text,patt,bwt,suffix_table):##permet d'obtenir la liste des positions
    ##initialisation de l'alphabet, de l'index et du compteur total de char
    L = list(bwt)
    alphabet = ['$','A','C','G','T']
    total = []
    index = L+[]
    for lett in alphabet:
        cntr = 0
        for i in range(len(L)):
            if L[i]==lett:
                cntr +=1
                index[i]=(lett,cntr)
        total.append((lett,cntr))##le faire en une liste somme et total
    somme = []
    som=0
    for tpl in total:
        som += tpl[1]
        somme.append((tpl[0],som))
    ##recuperation des positions des premiers et derniers patterns trouvés
    mat = pattern_matching_BWT(text,patt,bwt,index,somme)
    ##recupération et renvoi des positions de tout les patterns
    return string_location(text,patt,mat,suffix_table)

### Appel de la fonction :

In [11]:
sf_tab = DC3_chrom1
text = list_genom[0][0] # Corespond au DC3_chrom1 non DC3
pattern = list_reads[0][0] #Liste des pattern donc 15000000 de possibilité, changer le premier terme pour passer au pattern suivant

bwt = BWT(text,sf_tab)


print(k_positioning(text,pattern,bwt,sf_tab))

CTTCCTTTTG
TTTCCTTTTT
ATTCCTTTTT
CTTCCTTTTT
ATTCCTTTTT
GTTCCTTTTT
CTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
GTTCCTTTTT
GTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
GTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
CTTCCTTTTT
TTTCCTTTTT
GTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
CTTCCTTTTT
TTTCCTTTTT
CTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
CTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
GTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
ATTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
[87581, 198275, 291676, 52368, 452329, 441270, 541863, 237424, 346984, 412675, 284667, 504106, 222

In [12]:
print(list_reads[0][0]) # Affiche le vrai pattern pour le moment majoritairement retrouvé mais quelques erreurs d'identifications

TTTCCTTTTT


In [14]:
print(list_genom[0][0][87581:87591]) # Indice du premier pattern : ok position correcte !

CTTCCTTTTG
