<a href="https://colab.research.google.com/github/Flychuban/DNA-Sequencing/blob/main/DNASequencing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
import os
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [11]:
# Convert a sequence string into its overlapping k-mer substrings.
def convertKmer(sample, substring_length):
  """Split ``sample`` into all overlapping substrings of a fixed length.

  A window of ``substring_length`` characters is slid over ``sample`` one
  position at a time, so a sample of length n yields
  ``n - substring_length + 1`` substrings, in left-to-right order.

  Args:
    sample: The sequence to split (e.g. a DNA string such as "ACGTTA").
    substring_length: Size k of each substring; must be at least 1.

  Returns:
    A list of substrings. The list is empty when ``sample`` is shorter
    than ``substring_length``.

  Raises:
    ValueError: If ``substring_length`` is smaller than 1.
  """
  # Without this guard a length of 0 yields a list of empty strings and a
  # negative length yields arbitrary slices of mixed length.
  if substring_length < 1:
    raise ValueError(
      f"substring_length must be a positive integer, got {substring_length!r}"
    )

  # Index of the last position where a full-length window still fits.
  last_start = len(sample) - substring_length
  return [
    sample[start:start + substring_length]
    for start in range(last_start + 1)
  ]

In [12]:
# Example DNA fragment used to demonstrate the k-mer conversion.
sample_sentence = "ACGTTACGTTACAAGTTGGGACTGATCAGAATGGA"

# Break the fragment into overlapping 6-letter words and display them.
k_mer_words = convertKmer(sample_sentence, 6)
k_mer_words

['ACGTTA',
 'CGTTAC',
 'GTTACG',
 'TTACGT',
 'TACGTT',
 'ACGTTA',
 'CGTTAC',
 'GTTACA',
 'TTACAA',
 'TACAAG',
 'ACAAGT',
 'CAAGTT',
 'AAGTTG',
 'AGTTGG',
 'GTTGGG',
 'TTGGGA',
 'TGGGAC',
 'GGGACT',
 'GGACTG',
 'GACTGA',
 'ACTGAT',
 'CTGATC',
 'TGATCA',
 'GATCAG',
 'ATCAGA',
 'TCAGAA',
 'CAGAAT',
 'AGAATG',
 'GAATGG',
 'AATGGA']

In [13]:
# Next, vectorize the k-mer words so they can be used with a bag-of-words model.
# Every element of k_mer_words is handed to CountVectorizer as its own document,
# so the matrix has one row per k-mer and one column per distinct k-mer.
# NOTE(review): if a single count vector for the whole sequence is intended,
# the k-mers would need to be joined into one space-separated string first - confirm.

cv = CountVectorizer()
cv.fit(k_mer_words)  # learn the vocabulary of distinct k-mers
vectorized_words = cv.transform(k_mer_words).toarray()  # dense count matrix
vectorized_words

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 