<a href="https://colab.research.google.com/github/Flychuban/DNA-Sequencing/blob/main/DNASequencing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
import os
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [2]:
# Convert a sequence string into its overlapping k-mer substrings.
def convertKmer(sample, substring_length):
  """Split ``sample`` into every overlapping substring of a fixed length.

  A window of ``substring_length`` characters slides over ``sample`` one
  position at a time, so consecutive k-mers overlap by
  ``substring_length - 1`` characters.

  Args:
    sample: Sequence to split (a string in this notebook).
    substring_length: Size of each k-mer; must be at least 1.

  Returns:
    A list of ``len(sample) - substring_length + 1`` k-mers in order of
    appearance, or an empty list when ``sample`` is shorter than
    ``substring_length``.

  Raises:
    ValueError: If ``substring_length`` is smaller than 1.
  """
  if substring_length < 1:
    # Without this guard a zero/negative window silently produces empty or
    # truncated slices instead of real k-mers.
    raise ValueError(
        "substring_length must be a positive integer, got %r" % (substring_length,))

  window_count = len(sample) - substring_length + 1
  # range() of a non-positive count is empty, so short samples yield [].
  return [sample[start:start + substring_length] for start in range(window_count)]

In [3]:
# Short demo sequence used to sanity-check convertKmer.
sample_sentence = "ACGTTACGTTACAAGTTGGGACTGATCAGAATGGA"

# 35 characters with a window of 6 give 35 - 6 + 1 = 30 overlapping k-mers.
k_mer_words = convertKmer(sample_sentence, substring_length=6)
k_mer_words

['ACGTTA',
 'CGTTAC',
 'GTTACG',
 'TTACGT',
 'TACGTT',
 'ACGTTA',
 'CGTTAC',
 'GTTACA',
 'TTACAA',
 'TACAAG',
 'ACAAGT',
 'CAAGTT',
 'AAGTTG',
 'AGTTGG',
 'GTTGGG',
 'TTGGGA',
 'TGGGAC',
 'GGGACT',
 'GGACTG',
 'GACTGA',
 'ACTGAT',
 'CTGATC',
 'TGATCA',
 'GATCAG',
 'ATCAGA',
 'TCAGAA',
 'CAGAAT',
 'AGAATG',
 'GAATGG',
 'AATGGA']

In [4]:
# Vectorize the k-mers with CountVectorizer, since a bag-of-words model is used.
# Each list element is treated as its own document here, so every row of the
# resulting matrix describes a single k-mer (one non-zero entry per row in the
# output below).

cv = CountVectorizer()
vectorized_words = cv.fit_transform(k_mer_words).toarray()
vectorized_words

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 

This shows that the data was processed correctly.

In [5]:
# Load the real human DNA dataset (tab-separated text with a header row).

human_dna = pd.read_csv("/content/drive/MyDrive/DNA/data/archive/human.txt", sep="\t")
human_dna.head()

Unnamed: 0,sequence,class
0,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...,4
1,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,4
2,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
3,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
4,ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...,3


In [6]:
# Load the chimpanzee DNA dataset (tab-separated text with a header row).
chimpanze_dna = pd.read_csv('/content/drive/MyDrive/DNA/data/archive/chimpanzee.txt', sep='\t')
chimpanze_dna.head()

Unnamed: 0,sequence,class
0,ATGCCCCAACTAAATACCGCCGTATGACCCACCATAATTACCCCCA...,4
1,ATGAACGAAAATCTATTCGCTTCATTCGCTGCCCCCACAATCCTAG...,4
2,ATGGCCTCGCGCTGGTGGCGGTGGCGACGCGGCTGCTCCTGGAGGC...,4
3,ATGGCCTCGCGCTGGTGGCGGTGGCGACGCGGCTGCTCCTGGAGGC...,4
4,ATGGGCAGCGCCAGCCCGGGTCTGAGCAGCGTGTCCCCCAGCCACC...,6


In [7]:
# Load the dog DNA dataset (tab-separated text with a header row).
dog_dna = pd.read_csv('/content/drive/MyDrive/DNA/data/archive/dog.txt', sep='\t')
dog_dna.head()

Unnamed: 0,sequence,class
0,ATGCCACAGCTAGATACATCCACCTGATTTATTATAATCTTTTCAA...,4
1,ATGAACGAAAATCTATTCGCTTCTTTCGCTGCCCCCTCAATAATAG...,4
2,ATGGAAACACCCTTCTACGGCGATGAGGCGCTGAGCGGCCTGGGCG...,6
3,ATGTGCACTAAAATGGAACAGCCCTTCTACCACGACGACTCATACG...,6
4,ATGAGCCGGCAGCTAAACAGAAGCCAGAACTGCTCCTTCAGTGACG...,0


In [8]:
# Apply k-mer conversion to every sequence in each species' DataFrame.
# NOTE: this cell is not idempotent -- it drops the 'sequence' column, so
# re-running it without reloading the data raises KeyError.
def add_kmer_words(frame, substring_length=6):
  """Return a new frame where 'sequence' is replaced by a 'words' column.

  Args:
    frame: DataFrame holding a 'sequence' column of strings.
    substring_length: k-mer size forwarded to convertKmer.

  Returns:
    A new DataFrame without 'sequence' and with a trailing 'words' column
    holding the list of k-mers for each row; ``frame`` itself is not modified.
  """
  # Series.apply works on the one column we need; DataFrame.apply(axis=1)
  # would build a full row Series for every sample just to read one field.
  words = frame['sequence'].apply(
      lambda sequence: convertKmer(sequence, substring_length=substring_length))
  result = frame.drop('sequence', axis=1)
  result['words'] = words
  return result

human_dna = add_kmer_words(human_dna)
chimpanze_dna = add_kmer_words(chimpanze_dna)
dog_dna = add_kmer_words(dog_dna)

human_dna.head()

Unnamed: 0,class,words
0,4,"[ATGCCC, TGCCCC, GCCCCA, CCCCAA, CCCAAC, CCAAC..."
1,4,"[ATGAAC, TGAACG, GAACGA, AACGAA, ACGAAA, CGAAA..."
2,3,"[ATGTGT, TGTGTG, GTGTGG, TGTGGC, GTGGCA, TGGCA..."
3,3,"[ATGTGT, TGTGTG, GTGTGG, TGTGGC, GTGGCA, TGGCA..."
4,3,"[ATGCAA, TGCAAC, GCAACA, CAACAG, AACAGC, ACAGC..."


In [11]:
# Separate features from labels.
# Features: each sample's k-mer list is joined into one space-delimited string.
# Labels: the 'class' column is selected by name rather than by position, so
# the result no longer depends on the column order left by earlier cells.
human_words = [" ".join(kmers) for kmers in human_dna['words']]
human_labels = human_dna['class'].values

chimpanze_words = [" ".join(kmers) for kmers in chimpanze_dna['words']]
chimpanze_labels = chimpanze_dna['class'].values

dog_words = [" ".join(kmers) for kmers in dog_dna['words']]
dog_labels = dog_dna['class'].values