In [1]:
#instaliranje potrebne datoteke
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [2]:
from Bio import SeqIO
import pandas as pd
import tensorflow as tf
import numpy as np
import random
import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers
import math

In [3]:
#spajanje na google drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
#Funkcija za učitavanje FASTA datoteke
def get_sequences_from_fasta(fasta_file):
    sequences = []
    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequences.append(str(record.seq))
    return sequences

In [5]:
#funkcija za generiranje skupa podataka za treniranje i validaciju
data_length=500
korak=100
def getData(fasta_file,gff_file_path,podaci):

    sequence = get_sequences_from_fasta(fasta_file)[0]

    #ucitavanje gff file-a
    columns = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
    GFF_DataFrame = pd.read_csv(gff_file_path, sep="\t", comment="#", names=columns)

    #stvaranje vektora koji određuje da li je baza dio gena ili nije
    gene_vector=tf.zeros(shape=len(sequence))

    for index,row in GFF_DataFrame.iterrows():
        if(row["type"]=="gene"):


            start_position = row["start"]-1
            end_position = row["end"]-1


            mask = tf.concat([
                tf.zeros(start_position),
                tf.ones(end_position - start_position + 1),
                tf.zeros(gene_vector.shape[0] - end_position - 1)
            ], axis=0)


            gene_vector = gene_vector + mask


    gene_vector=gene_vector.numpy()
    for i in range(len(gene_vector)):
        if(gene_vector[i]!=0):
            gene_vector[i]=1
    gene_vector=tf.convert_to_tensor(gene_vector)



    #dijeljenje sekvence na dijelove
    for i in range(math.ceil(len(sequence)/korak)):
        slice_of_seq=sequence[i*korak:i*korak+data_length]
        slice_of_vector=gene_vector[i*korak:i*korak+data_length]

        if(len(slice_of_vector)!=data_length):

            start=(i*korak-len(sequence))
            end=i*korak+data_length-len(sequence)

            #loop around - spaja kraj i pocetak jer je DNA kružna
            slice_of_seq=sequence[start:] + sequence[:end]
            slice_of_vector=tf.concat([gene_vector[start:], gene_vector[:end]], axis=0)

        podaci.append((slice_of_seq,slice_of_vector))
    print("Gotovo procesiranje podataka za bakteriju")
    return podaci


In [6]:
#dohvaćanje podataka sa google drive-a
import os
data=[]
cwd = os.getcwd()
folder_path = "/content/drive/MyDrive/Test_gene_annotations"
gene_annotations = os.listdir(folder_path)


for bacteria_name in gene_annotations:

    folder_path="/content/drive/MyDrive" +"/Test_gene_annotations/"+bacteria_name+"/data"
    files_in_annotations = os.listdir(folder_path)

    for gca_file in files_in_annotations:


        if(gca_file.startswith("GCA")):
            folder_path="/content/drive/MyDrive"+"/Test_gene_annotations/"+bacteria_name+"/data/"+gca_file
            gca_files = os.listdir(folder_path)

            for file in gca_files:

                if(file.startswith("GCA")):
                    fasta_file="/content/drive/MyDrive/"+"Test_gene_annotations/"+bacteria_name+"/data/"+gca_file+"/"+file

                if(file=="genomic.gff"):
                    gff_file_path="/content/drive/MyDrive/"+"Test_gene_annotations/"+bacteria_name+"/data/"+gca_file+"/"+file
    print("\nProcesiranje podataka za "+ bacteria_name+"\n")
    data=getData(fasta_file,gff_file_path,data)







Procesiranje podataka za Pseudomonas_aeruginosa

Gotovo procesiranje podataka za bakteriju


In [7]:
#kreiranje dataseta za testiranje
test_data = data[:int(len(data))]

(test_seq, test_vector) = zip(*test_data)

test_seq=np.array(test_seq)
test_vector=np.array(test_vector)

test_dataset = tf.data.Dataset.from_tensor_slices((test_seq, test_vector)).batch(32)
print("Veličina skupa za testiranje: " + str(len(test_data)))
print("Veličina dataseta za testiranje: " + str(len(test_dataset)))
print("DONE")

Veličina skupa za testiranje: 62645
Veličina dataseta za testiranje: 1958
DONE


In [8]:
#custom split funkcija
def custom_split_fn(string_tensor):
    return tf.strings.unicode_split(string_tensor, 'UTF-8')

print(custom_split_fn("ACAAGTTAC"))
print("DONE")

tf.Tensor([b'A' b'C' b'A' b'A' b'G' b'T' b'T' b'A' b'C'], shape=(9,), dtype=string)
DONE


In [9]:
#sloj za vekotrizaciju teksta
text_vectorization= TextVectorization(
    output_mode="int",
    split=custom_split_fn,
    output_sequence_length=data_length,
)
dataset=["ACGTTTGAGTCCAT"]
text_vectorization.adapt(dataset)
print(text_vectorization.get_vocabulary())
print("DONE")

['', '[UNK]', 't', 'g', 'c', 'a']
DONE


In [10]:
#vektoriziranje dataseta
int_test_ds= test_dataset.map(
    lambda x,y: (text_vectorization(x)-2,y),
    num_parallel_calls=4
)
print("DONE")

DONE


In [11]:
for inputs, targets in int_test_ds:
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print("inputs[0]:", tf.one_hot(inputs[0], depth=4))
  break

inputs.shape: (32, 500)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32, 500)
targets.dtype: <dtype: 'float32'>
inputs[0]: tf.Tensor(
[0 0 0 3 3 3 1 3 1 3 2 2 1 1 2 1 3 0 0 2 0 3 1 0 1 3 3 3 0 2 1 3 3 2 1 1 1
 2 3 1 1 0 2 3 3 0 0 0 2 2 3 3 2 2 3 1 2 1 3 0 1 3 2 1 0 3 3 0 3 1 3 0 3 1
 3 0 3 2 3 3 1 1 3 3 1 0 2 3 0 0 0 0 0 2 0 0 0 0 3 3 3 1 1 3 0 3 1 3 3 3 2
 1 1 0 0 3 3 0 1 2 0 2 0 0 1 1 1 3 2 1 1 2 1 2 0 0 0 0 2 0 1 0 1 2 3 0 3 3
 2 0 2 1 3 0 1 3 3 1 2 2 2 3 1 2 3 3 0 0 1 2 1 0 1 0 0 0 2 0 2 2 1 1 2 3 1
 1 2 3 3 3 3 1 1 0 0 1 0 2 1 3 1 3 3 2 2 1 1 0 1 0 2 1 3 1 1 2 0 1 0 0 0 2
 2 0 0 2 2 0 1 3 1 2 1 3 3 1 2 2 0 1 1 1 1 3 0 1 3 3 2 1 3 1 3 0 1 1 0 0 3
 0 2 2 3 2 3 1 2 1 1 0 0 0 0 0 0 2 2 3 2 3 2 1 1 2 0 1 0 1 2 1 2 3 1 1 1 3
 0 1 0 3 2 2 2 2 2 0 0 2 3 3 3 1 2 3 3 1 1 1 0 0 3 0 2 2 3 2 3 3 3 1 0 2 2
 3 1 1 3 2 1 3 2 2 1 0 2 2 1 0 2 1 1 2 2 0 1 2 2 0 1 2 0 0 0 0 3 0 0 3 3 1
 1 0 2 0 0 1 3 0 0 0 1 2 0 0 1 1 1 1 2 2 0 2 3 1 2 1 2 3 0 2 1 1 2 3 0 1 0
 1 1 3 0 3 3 1 0 2 2 1 1 2 2 2 1 0 2 2 

In [12]:
#učitavanje modela sa google drive-a
test_model = keras.models.load_model("/content/drive/MyDrive/upgraded_trained_model")
print("Uspješno učitan model")

Uspješno učitan model


In [13]:
#testiranje modela i dohvaćanje izlaza modela
test_loss, test_acc = test_model.evaluate(int_test_ds)
print(f"Test accuracy: {test_acc:.3f}")

predictions = test_model.predict(int_test_ds)
print("Gotovo evaluiranje i predviđanje")

Test accuracy: 0.954
Gotovo evaluiranje i predviđanje


In [14]:
#većinsko glasanje i sastavljanje vektora za genom
final_vector=[]
for counter in range(len(predictions)):

  for i in range(korak):
    rez=0
    for j in range(int(data_length/korak)):

      if(predictions[counter-1*j][i+korak*j]>0.5):
        rez+=1
      else:
        rez+=0

    rez/=(data_length/korak)
    if(rez>0.5):
      final_vector.append(1)

    else:
      final_vector.append(0)


In [23]:
#funkcija za kreiranje anotacija
def dohvacanje_anotacija(izgladeni_vector):
  lista_anotacija=[]
  i=0
  ind=0
  first=True
  while(i<len(izgladeni_vector)):
    if(izgladeni_vector[i]==1 and first):
      ind=i
      first=False
      i+=1
    elif(izgladeni_vector[i]==1 and not first):
      i+=1
      continue
    elif(izgladeni_vector[i]==0 and not first):
      lista_anotacija.append([ind,i])
      first=True
      i+=1
    elif(izgladeni_vector[i]==0 and first):
      i+=1
      continue

  #print(lista_anotacija)
  return lista_anotacija


In [16]:
#zaglađivanje vektora kroz 5 iteracija
def izgladivanje(final_vector):

  izgladeni_vector=[]
  duljina_s_jedne_strane=10

  for baza in range(len(final_vector)):

    rez=0
    for i in range(1,duljina_s_jedne_strane+1):

        if(baza+i>=len(final_vector)):
          rez+=final_vector[baza+i-len(final_vector)] + final_vector[baza-i]
        else:
          rez+=(final_vector[baza+i] + final_vector[baza-i])

    rez=rez/(duljina_s_jedne_strane*2)

    if(rez>0.5):
      izgladeni_vector.append(1)
    else:
      izgladeni_vector.append(0)

  return izgladeni_vector

g=izgladivanje(final_vector)
for i in range(3):
  g=izgladivanje(g)

izgladeni_vector=izgladivanje(g)
print("DONE")

DONE


In [17]:
#učitavanje anotacija iz gff datoteke
folder_path = "/content/drive/MyDrive/Test_gene_annotations"

gene_annotations = os.listdir(folder_path)

for bacteria_name in gene_annotations:

    folder_path="/content/drive/MyDrive" +"/Test_gene_annotations/"+bacteria_name+"/data"

    files_in_annotations = os.listdir(folder_path)

    for gca_file in files_in_annotations:


        if(gca_file.startswith("GCA")):
            folder_path="/content/drive/MyDrive"+"/Test_gene_annotations/"+bacteria_name+"/data/"+gca_file
            gca_files = os.listdir(folder_path)

            for file in gca_files:

                if(file=="genomic.gff"):
                    gff_file_path="/content/drive/MyDrive/"+"Test_gene_annotations/"+bacteria_name+"/data/"+gca_file+"/"+file


columns = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
GFF_DataFrame = pd.read_csv(gff_file_path, sep="\t", comment="#", names=columns)
print(GFF_DataFrame.head())



        seqid   source    type  start      end score strand phase  \
0  AE004091.2  Genbank  region      1  6264404     .      +     .   
1  AE004091.2  Genbank    gene    483     2027     .      +     .   
2  AE004091.2  Genbank     CDS    483     2027     .      +     0   
3  AE004091.2  Genbank    gene   2056     3159     .      +     .   
4  AE004091.2  Genbank     CDS   2056     3159     .      +     0   

                                          attributes  
0  ID=AE004091.2:1..6264404;Dbxref=taxon:208964;I...  
1  ID=gene-PA0001;Name=dnaA;gbkey=Gene;gene=dnaA;...  
2  ID=cds-AAG03391.1;Parent=gene-PA0001;Dbxref=NC...  
3  ID=gene-PA0002;Name=dnaN;gbkey=Gene;gene=dnaN;...  
4  ID=cds-AAG03392.1;Parent=gene-PA0002;Dbxref=NC...  


In [24]:
gff_anotations=[]
for index,row in GFF_DataFrame.iterrows():
        if(row["type"]=="gene"):


            start_position = row["start"]
            end_position = row["end"]

            gff_anotations.append([start_position,end_position])

#print("Anotacije GFF datoteke: ",gff_anotations)
#print("Anotacije modela :",end=" ")
v=dohvacanje_anotacija(izgladeni_vector)

counter=0
error_margin=10
for row in gff_anotations:
  for my_row in v:
    if(abs(row[0]-my_row[0])<error_margin and abs(row[1]-my_row[1])<error_margin):
      counter+=1
print("Broj točno predviđenih anotacija: "+ str(counter))
print("Ukupni broj anotacija: "+str(len(gff_anotations)))
print("Final test accuracy: "+str(round(counter/len(gff_anotations)*100,4))+"%")

Broj točno predviđenih anotacija: 1673
Ukupni broj anotacija: 5677
Final test accuracy: 29.4698%
