In [1]:
from Bio import SeqIO # Permet l'import de la fonction parse
from Bio.SeqIO.QualityIO import FastqGeneralIterator # Permet un parse plus rapide lorsque beaucoup de séquences (fastaq uniquement sinon voir site biopython pour fasta)
import functools as ft # Permet de regrouper des listes pour radix sort
import numpy as np
from collections import Counter 

from tqdm import tqdm # Permet d'estimer le temps d'éxécution sur un boucle
from datetime import datetime # Permet de comparer la vitesse de 2 programmes 

### Import des listes reads et genome

In [2]:
def genome_import() :
    """
    Fonction qui importe depuis un fichier fasta la séquence du génome dans une liste contenant ["seq", "n°k"]
    """
    list_genom=[]
    for record in SeqIO.parse("GCF_000002765.5_GCA_000002765_genomic.fna","fasta"):
        list_genom.append([str(record.seq).upper(),record.description[-14:]]) # attention le marquage est à changer pour la dernière séquence
    
    return list_genom

def reads_import_cuts(k) :
    """
    Fonction qui importe depuis un fichier fasta les séquences des reads et les coupes en les rangeant
    dans une liste contenant ["seq", "nom", "n°kmer"]
    
    Entrée : k, int : longueur du kmer
    
    Sortie : list_reads : liste de tous les read découpés en kmers
    """
    list_reads=[]
    with open("single_Pfal_dat.fq") as in_handle:
        for title, seq, qual in tqdm(FastqGeneralIterator(in_handle), desc = "Import sequence"):
            i = 1    # Incrémenteur du nombre de k-mer
            while len(seq) >= 1 : # On parcoure toute la séquence
                if len(seq) > k : # Cas où le k-mer est entier
                    list_reads.append([str(seq[0:k]).upper(),title, i])
                    i += 1
                    seq = seq[k:]
                else : # Cas où le dernier k-mer n'est pas entier
                    list_reads.append([str(seq).upper(),title, i])
                    seq = ""
            
    return list_reads

In [3]:
now = datetime.now()

list_genom = genome_import()

list_reads= reads_import_cuts(10)

time_imp = datetime.now() - now
print("Temps total =", time_imp)
print(list_reads[0:6])

Import sequence: 1500000it [00:25, 58263.69it/s] 

Temps total = 0:00:26.182152
[['TTTCCTTTTT', 'NC_004325.2-100000', 1], ['AAGCGTTTTA', 'NC_004325.2-100000', 2], ['TTTTTTAATA', 'NC_004325.2-100000', 3], ['AAAAAAATAT', 'NC_004325.2-100000', 4], ['AGTATTATAT', 'NC_004325.2-100000', 5], ['AGTAACGGGT', 'NC_004325.2-100000', 6]]





### Import de la DC3

In [26]:
def import_file(filename, prog) :
    
    if prog == "DC3" :
            filename = "DC3_save/" + filename
    elif prog == "Map" :
            filename = "Mapping_save/" + filename
    
    with open(filename, 'r') as f:
        
        # Partie DC3:
        if prog == "DC3" :
            text = f.readline()
            data = text.split(" ")[:-1]
            print(data)
            for i in range(len(data)) :
                data[i] = int(data[i])
            return(data)
        
        elif prog == "Map" :
            n = int(f.readline())
            data = []
            for i in range(n):
                temp = f.readline()
                temp = temp.split("&")
                temp_lis = temp[3][1:-2].split(",") # Car il reste les []
                lis = []
                for j in range(len(temp_lis)) :
                    lis.append(int(temp_lis[j]))
                data.append([temp[0], temp[1], int(temp[2]), lis])
            return data

In [27]:
DC3_chrom1 = import_file("DC3_chrom1.txt", "DC3")

['640851', '172408', '172409', '120988', '172410', '438563', '120989', '542369', '172411', '438564', '120990', '542370', '172412', '438565', '120991', '550307', '542371', '172413', '152787', '438566', '120992', '550308', '393610', '123276', '542372', '520735', '172414', '152788', '438567', '175697', '120993', '550309', '393611', '123277', '542373', '520736', '172415', '152789', '438568', '337094', '383737', '175698', '120994', '550310', '393612', '359318', '123278', '542374', '520737', '172416', '345874', '152790', '386731', '438569', '170857', '337095', '383738', '175699', '120995', '550311', '393613', '359319', '123279', '542375', '520738', '172417', '345875', '152791', '386732', '438570', '170858', '337096', '383739', '175700', '120996', '550312', '393614', '295131', '270532', '359320', '123280', '542376', '520739', '172418', '259042', '345876', '382971', '152792', '386733', '438571', '170859', '337097', '383740', '175701', '120997', '550313', '393615', '295132', '270533', '359321',

### Fonction de string search

In [22]:
def BWT(text,suffix_table):
    """
    Compute the BWT from the suffix table

    Args:
        T (str): string
        suffix_table (list) : the ordered list of all suffixes

    Return:
        bwt (str): BWT
    """
    bwt = ""
    suffix_table
    for i in range(len(suffix_table)):
        crt = suffix_table[i]
        bwt += text[crt-1]
    return(bwt)

In [29]:
def pattern_matching_BWT(S,pattern,bwt,index,somme):
    """
    Search a pattern in a String using the BWT

    Args:
        S (str): string
        pattern (str): pattern
        bwt (str) : the bwt of the text (to not compute it each time)
        index (list): the index for the chars in the bwt
        somme (list): the sum of each char in the bwt

    Return:
        bool: true if the pattern is in the string
        int : position of the first occurence of the pattern in the ordered text
        int : position of the last occurence of the pattern in the ordered text
    """
    pattern_in_S = False
    L = list(bwt)
    lpattern = list(pattern)
    start_string = -1
    end_string = -1
    ##init des valeurs utiles pour la substring search
    e = 0
    f = len(L)
    i = len(lpattern)-1
    X = lpattern[i]##correspond au dernier char du kmer, le premier a être cherché dans la string search
    for tpl in somme :
        if tpl[0]<X:
            e = tpl[1]+1 ##donne place du premier char dans la liste ordonnée
        if tpl[0]==X:
            f = tpl[1]-1 ##donne place du dernier char

    while e < f and i > 0 :
        X = lpattern[i]
        Y = lpattern[i-1]
        suite_impos = True
        r = e
        s = f
        for u in range(r,s+1):##On cherche le char suivant du pattern dans la bwt
            if(suite_impos==False):##we use the boolean to exit the loop early
                break
            if(L[u]==Y):
                suite_impos= False
                prev = 0
                idx = index[u]
                for tpl in somme :
                    if tpl[0]<Y:
                        prev = tpl[1]
                e = prev + idx ##on calcule la position du char trouvé dans la liste des chars sorted.
                start_string = e

        char_found = False
        for u in reversed(range(r,s+1)):
            if(char_found or suite_impos):
                break
            if(L[u]==Y):
                char_found = True
                prev = 0
                idx = index[u]
                for tpl in somme :
                    if tpl[0]<Y:
                        prev = tpl[1]
                f = prev + idx
                end_string = f
        if suite_impos :## this will stop the loop if no char has been found in the previous one
            break
        i -= 1
    if suite_impos:
        i = 0
        pattern_in_S = False
        return pattern_in_S,start_string,end_string
    ##dans le cas où e = f, il faut vérifier que le reste du substring est bon
    if i > 0 :
        while i > 0 :##on fait une bwt inverse "classique" en vérifiant a chaque fois que le char suivant 
                     ##est bien le même que celui du pattern
            if L[e] != lpattern[i-1]:
                break
            prev = 0
            idx = index[e]
            start_string = e
            end_string = e
            for tpl in somme :
                if tpl[0]<L[e]:
                    prev = tpl[1]

            e = prev + idx
            i -= 1
        idx = index[e]
        start_string = e
        end_string = e
    if i < 1 :
        pattern_in_S = True
    return pattern_in_S,start_string,end_string

In [24]:
def string_location(text,string,matches,suffix_table):
    '''
    Gives the position of each occurence of the substring in the text

    Args :
        text (string) : the text to search in
        string (string) : the substring that was search (the pattern)
        matches (tuple) : the output of the pattern matching function, containing a boolean 
            to say if an occurence was found (matches[0]), the position of the first occurence 
            in the suffix table (matches[1]) and the position of the last occurence (matches[2])
        suffix_table (list) : the ordered list of all suffixes

    Return :
        (list) : the list of all positions of the occurences of the pattern in the original text
    '''
    sft = suffix_table
    list_occur = []
    if result[0] == False :
        print("No occurence of the substring was found")
        list_occur.append(-1)
    else :
        for i in range(matches[1],(matches[2]+1)):##les occurences du patterns se suivent toutes dans la table des suffixes
            idx = sft[i]
            list_occur.append(idx)
            ##print(text[idx:idx+len(string)])
    return(list_occur)

def k_positioning(text,patt,bwt,suffix_table,index,somme):##permet d'obtenir la liste des positions
    '''
    Args :
        text (string) : the text to search in
        string (string) : the substring that will be searched (the pattern)
        bwt (str) : the bwt of the text (to not compute it each time)
        suffix_table (list) : the ordered list of all suffixes
        index (list): the index for the chars in the bwt
        somme (list): the sum of each char in the bwt
    
    Return :
        (list) : the list of all positions of the occurences of the pattern in the original text
    '''
    ##recuperation des positions des premiers et derniers patterns trouvés
    mat = pattern_matching_BWT(text,patt,bwt,index,somme)
    ##recupération et renvoi des positions de tout les patterns
    return string_location(text,patt,mat,suffix_table)

### Appel de la fonction :

In [28]:
sf_tab = DC3_chrom1
text = list_genom[0][0] # Corespond au DC3_chrom1 non DC3
num = 0
pattern = list_reads[num][0] #Liste des pattern donc 15000000 de possibilité, changer le premier terme pour passer au pattern suivant
text = text+"$"
bwt = BWT(text,sf_tab)
##initialisation de l'alphabet, de l'index et du compteur total de char
L = list(bwt)
alphabet = ['$','A','C','G','T']
index = []
char_index = {}
for char in L:
    if char not in char_index: 
        char_index[char] = 0
    index.append(char_index[char])
    char_index[char] += 1
total = Counter(text)
som = 0
somme = []
for char in alphabet:
    som += total[char]
    somme.append((char,som))
print(k_positioning(text,pattern,bwt,sf_tab,index,somme))

608648 608704
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
TTTCCTTTTT
[198275, 346984, 412675, 222197, 222208, 222219, 141536, 278529, 472382, 95770, 370439, 308410, 373485, 593890, 68418, 593802, 423536, 536793, 249822, 201291, 380365, 569951, 361808, 434544, 52645, 389576, 293273, 73765, 444636, 412568, 253527, 436364, 436171, 436187, 436203, 436219, 436235, 436251, 436267, 436283, 436299, 444994, 264242, 225936, 436348, 43

In [10]:
print(list_reads[num][0]) # Affiche le vrai pattern pour le moment majoritairement retrouvé mais quelques erreurs d'identifications

TAAAATATTG


In [11]:
print(list_genom[0][0][87581:87591]) # Indice du premier pattern : ok position correcte !

CTTCCTTTTG
