In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

import os
import pandas as pd

from collections import Counter
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
import numpy as np

In [None]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba3/DC/"
files = sorted(
    os.path.join(folder, name)  # join() copes with the trailing "/" in folder
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension. Using the base name removes the leading "/"
# that string-replacing the folder prefix used to leave in the labels.
folder = "../trainSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# Integer-encode the class names; the encoder is fitted on training labels only.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict on both splits and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred_train_enc = knn.predict(X_train)
y_pred = encoder.inverse_transform(y_pred_enc)
y_pred_train = encoder.inverse_transform(y_pred_train_enc)

# Per-class report on the test split, then plain accuracy on test and train.
report = classification_report(y_test, y_pred)
print("Classification report: ")
print(report)

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")
print(f"Accuracy score train: {accuracy_score(y_train, y_pred_train)!s}")

In [None]:
# Results:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       0.99      0.95      0.97       153
   /HCoV-HKU1       0.97      0.97      0.97        98
   /HCoV-NL63       0.99      0.98      0.98       213
   /HCoV-OC43       0.98      0.99      0.99       366
         /IBV       1.00      0.99      0.99      3150
    /MERS-CoV       1.00      0.98      0.99       395
    /SARS-CoV       0.75      1.00      0.86         3
   /SARS-CoV2       0.93      1.00      0.96      1360
     /bat-CoV       1.00      0.80      0.89        15
  /bovine-CoV       0.99      1.00      0.99       375
  /canine-CoV       0.98      0.92      0.95       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       1.00      0.57      0.73         7
  /feline-CoV       0.96      0.94      0.95       536
  /ferret-CoV       1.00      0.69      0.82        13
/hedgehog-CoV       1.00      1.00      1.00         6
 /porcine-CoV       1.00      1.00      1.00       132
  /rabbit-CoV       1.00      0.50      0.67         6
     /rat-CoV       1.00      0.38      0.55         8
  /turkey-CoV       1.00      0.89      0.94        27

     accuracy                           0.98      7084
    macro avg       0.98      0.88      0.91      7084
 weighted avg       0.98      0.98      0.98      7084

Accuracy score: 0.9798136645962733
Accuracy score train: 0.9882746051032807
"""

In [None]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba3/DN/"
files = sorted(
    os.path.join(folder, name)  # join() copes with the trailing "/" in folder
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension. Using the base name removes the leading "/"
# that string-replacing the folder prefix used to leave in the labels.
folder = "../trainSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# Integer-encode the class names; the encoder is fitted on training labels only.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict on both splits and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred_train_enc = knn.predict(X_train)
y_pred = encoder.inverse_transform(y_pred_enc)
y_pred_train = encoder.inverse_transform(y_pred_train_enc)

# Per-class report on the test split, then plain accuracy on test and train.
report = classification_report(y_test, y_pred)
print("Classification report: ")
print(report)

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")
print(f"Accuracy score train: {accuracy_score(y_train, y_pred_train)!s}")

In [None]:
# Results:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       1.00      0.96      0.98       153
   /HCoV-HKU1       0.99      0.97      0.98        98
   /HCoV-NL63       1.00      0.99      1.00       213
   /HCoV-OC43       0.99      0.99      0.99       366
         /IBV       1.00      0.98      0.99      3150
    /MERS-CoV       1.00      0.98      0.99       395
    /SARS-CoV       1.00      1.00      1.00         3
   /SARS-CoV2       0.93      1.00      0.96      1360
     /bat-CoV       1.00      0.87      0.93        15
  /bovine-CoV       0.99      1.00      1.00       375
  /canine-CoV       0.99      0.91      0.95       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       1.00      0.57      0.73         7
  /feline-CoV       0.99      0.98      0.98       536
  /ferret-CoV       1.00      0.85      0.92        13
/hedgehog-CoV       1.00      1.00      1.00         6
 /porcine-CoV       1.00      0.99      1.00       132
  /rabbit-CoV       1.00      0.50      0.67         6
     /rat-CoV       1.00      0.38      0.55         8
  /turkey-CoV       1.00      0.96      0.98        27

     accuracy                           0.98      7084
    macro avg       0.99      0.89      0.93      7084
 weighted avg       0.98      0.98      0.98      7084

Accuracy score: 0.9827780914737436
Accuracy score train: 0.9894896719319562
"""

In [None]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba3/IN/"
files = sorted(
    os.path.join(folder, name)  # join() copes with the trailing "/" in folder
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension. Using the base name removes the leading "/"
# that string-replacing the folder prefix used to leave in the labels.
folder = "../trainSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# Integer-encode the class names; the encoder is fitted on training labels only.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict on both splits and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred_train_enc = knn.predict(X_train)
y_pred = encoder.inverse_transform(y_pred_enc)
y_pred_train = encoder.inverse_transform(y_pred_train_enc)

# Per-class report on the test split, then plain accuracy on test and train.
report = classification_report(y_test, y_pred)
print("Classification report: ")
print(report)

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")
print(f"Accuracy score train: {accuracy_score(y_train, y_pred_train)!s}")

In [2]:
# Results:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       0.99      0.96      0.97       153
   /HCoV-HKU1       0.98      0.97      0.97        98
   /HCoV-NL63       1.00      0.98      0.99       213
   /HCoV-OC43       0.98      0.99      0.99       366
         /IBV       1.00      0.98      0.99      3150
    /MERS-CoV       1.00      0.97      0.99       395
    /SARS-CoV       0.75      1.00      0.86         3
   /SARS-CoV2       0.92      1.00      0.96      1360
     /bat-CoV       1.00      0.73      0.85        15
  /bovine-CoV       1.00      0.99      1.00       375
  /canine-CoV       0.98      0.92      0.95       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       1.00      0.57      0.73         7
  /feline-CoV       0.98      0.95      0.97       536
  /ferret-CoV       1.00      0.85      0.92        13
/hedgehog-CoV       1.00      1.00      1.00         6
 /porcine-CoV       1.00      1.00      1.00       132
  /rabbit-CoV       1.00      0.67      0.80         6
     /rat-CoV       1.00      0.38      0.55         8
  /turkey-CoV       1.00      0.89      0.94        27

     accuracy                           0.98      7084
    macro avg       0.98      0.89      0.92      7084
 weighted avg       0.98      0.98      0.98      7084
 Accuracy score: 0.9802371541501976
 Accuracy score train: 0.9880315917375455
"""

In [None]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba3/IC/"
files = sorted(
    os.path.join(folder, name)  # join() copes with the trailing "/" in folder
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension. Using the base name removes the leading "/"
# that string-replacing the folder prefix used to leave in the labels.
folder = "../trainSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSetNucl/"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# Integer-encode the class names; the encoder is fitted on training labels only.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict on both splits and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred_train_enc = knn.predict(X_train)
y_pred = encoder.inverse_transform(y_pred_enc)
y_pred_train = encoder.inverse_transform(y_pred_train_enc)

# Per-class report on the test split, then plain accuracy on test and train.
report = classification_report(y_test, y_pred)
print("Classification report: ")
print(report)

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")
print(f"Accuracy score train: {accuracy_score(y_train, y_pred_train)!s}")

In [None]:
# Results:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       0.98      0.96      0.97       153
   /HCoV-HKU1       0.99      0.97      0.98        98
   /HCoV-NL63       1.00      0.98      0.99       213
   /HCoV-OC43       0.98      0.99      0.99       366
         /IBV       1.00      0.99      0.99      3150
    /MERS-CoV       1.00      0.96      0.98       395
    /SARS-CoV       1.00      1.00      1.00         3
   /SARS-CoV2       0.93      1.00      0.96      1360
     /bat-CoV       1.00      0.80      0.89        15
  /bovine-CoV       1.00      1.00      1.00       375
  /canine-CoV       0.98      0.93      0.95       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       1.00      0.57      0.73         7
  /feline-CoV       0.98      0.97      0.98       536
  /ferret-CoV       1.00      0.85      0.92        13
/hedgehog-CoV       1.00      1.00      1.00         6
 /porcine-CoV       1.00      0.99      1.00       132
  /rabbit-CoV       1.00      0.50      0.67         6
     /rat-CoV       1.00      0.38      0.55         8
  /turkey-CoV       1.00      0.89      0.94        27

     accuracy                           0.98      7084
    macro avg       0.99      0.89      0.92      7084
 weighted avg       0.98      0.98      0.98      7084
 Accuracy score: 0.9823546019198193
 Accuracy score train: 0.9889428918590523
"""

# This works better without oversampling

In [None]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba2/direct"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension, so the label no longer depends on how the
# folder path happens to be spelled (trailing slash or not).
folder = "../trainSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# The classifier is trained on integer class ids, so the class names are
# label-encoded here and decoded again after prediction.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict on both splits and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred_train_enc = knn.predict(X_train)
y_pred = encoder.inverse_transform(y_pred_enc)
y_pred_train = encoder.inverse_transform(y_pred_train_enc)

# Per-class report on the test split.
report = classification_report(y_test, y_pred)
print("Classification report: ")
print(report)

# Accuracies are computed once and reused for the console and the file.
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f"Accuracy score: {test_accuracy!s}")
print(f"Accuracy score train: {train_accuracy!s}")

# Persist the same report and scores next to the notebook.
with open("classification_report_knn_direct.txt", "w") as f:
    f.write(
        f"{report}\nAccuracy score: {test_accuracy!s}"
        f"\nAccuracy score train: {train_accuracy!s}"
    )

Number of unique repeats: 2210


In [3]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba2/direct"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension, so the label no longer depends on how the
# folder path happens to be spelled (trailing slash or not).
folder = "../trainSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# The classifier is trained on integer class ids, so the class names are
# label-encoded here and decoded again after prediction.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict the test split and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# Per-class report followed by overall accuracy on the test split.
print("Report: ")
print(classification_report(y_test, y_pred))

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")

Number of unique repeats: 798
Report: 
              precision    recall  f1-score   support

    MERS-CoV       0.98      0.99      0.99       395
    SARS-CoV       0.75      1.00      0.86         3
     bat-CoV       1.00      0.88      0.93        16
  bovine-CoV       1.00      0.99      0.99       378
  canine-CoV       0.95      0.96      0.96       219
  feline-CoV       0.98      0.99      0.98       536
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.98      1685
   macro avg       0.96      0.95      0.95      1685
weighted avg       0.98      0.98      0.98      1685

Accuracy score: 0.9827893175074184


In [4]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba2/indirect"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension, so the label no longer depends on how the
# folder path happens to be spelled (trailing slash or not).
folder = "../trainSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# The classifier is trained on integer class ids, so the class names are
# label-encoded here and decoded again after prediction.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict the test split and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# Per-class report followed by overall accuracy on the test split.
print("Report: ")
print(classification_report(y_test, y_pred))

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")

Number of unique repeats: 1007
Report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      0.98      0.98       395
    SARS-CoV       0.75      1.00      0.86         3
     bat-CoV       1.00      0.88      0.93        16
  bovine-CoV       1.00      0.98      0.99       378
  canine-CoV       0.97      0.95      0.96       219
  feline-CoV       0.96      0.99      0.97       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      0.99      1.00       132

    accuracy                           0.98      1685
   macro avg       0.96      0.97      0.96      1685
weighted avg       0.98      0.98      0.98      1685

Accuracy score: 0.9786350148367953


In [5]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba3/DC"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension, so the label no longer depends on how the
# folder path happens to be spelled (trailing slash or not).
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# The classifier is trained on integer class ids, so the class names are
# label-encoded here and decoded again after prediction.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict the test split and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# Per-class report followed by overall accuracy on the test split.
print("Report: ")
print(classification_report(y_test, y_pred))

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")

Number of unique repeats: 4335
Report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.73      0.85        15
  bovine-CoV       0.99      0.99      0.99       375
  canine-CoV       0.98      0.95      0.97       218
  feline-CoV       0.96      0.99      0.98       536
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.98      1680
   macro avg       0.99      0.94      0.96      1680
weighted avg       0.98      0.98      0.98      1680

Accuracy score: 0.9839285714285714


In [6]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba3/DN"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension, so the label no longer depends on how the
# folder path happens to be spelled (trailing slash or not).
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the repeat-count feature vector of one sequence.

    Args:
        sequence: Object exposing ``count(sub)``; the notebook passes the
            ``seq`` of parsed FASTA records, a plain ``str`` works as well.
        repeat_list: Iterable of repeat motifs, one feature per motif.

    Returns:
        list: ``sequence.count(motif)`` for every motif, in the order of
        ``repeat_list``.
    """
    # NOTE(review): str.count counts non-overlapping occurrences; confirm that
    # this is the intended definition of a repeat count.
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one repeat-count vector per sequence; the targets are the
# class names collected while loading.
y_train, y_test = labelTrain, labelTest

X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))

X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# The classifier is trained on integer class ids, so the class names are
# label-encoded here and decoded again after prediction.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)  # not used below

# k-nearest neighbours with k=5 (Minkowski metric; Euclidean with sklearn's default p).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn.fit(X_train, y_train_enc)

# Predict the test split and map the integer ids back to class names.
y_pred_enc = knn.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# Per-class report followed by overall accuracy on the test split.
print("Report: ")
print(classification_report(y_test, y_pred))

print(f"Accuracy score: {accuracy_score(y_test, y_pred)!s}")

Number of unique repeats: 6309
Report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       0.99      1.00      1.00       375
  canine-CoV       0.99      0.96      0.98       218
  feline-CoV       0.98      0.99      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9916666666666667


In [7]:
# Load the repeat vocabulary and the train/test sequences for this experiment.
# Directory listings are sorted because os.listdir() returns entries in
# arbitrary order and the training-row order should not depend on the OS.

# Repeat files: comma-separated lines; field 0 -> labels, field 1 -> repeats.
folder = "../proba3/IC"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")  # skip hidden (dot-prefixed) entries
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines with fewer than two fields are ignored
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file's
# base name without extension, so the label no longer depends on how the
# folder path happens to be spelled (trailing slash or not).
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# Test sequences: same layout and labelling rule as the training folder.
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_name = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Sorted so the feature columns come out in the same order on every run
# (iteration order of a set of strings is not stable between interpreter runs).
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Return the occurrence count of every repeat in `sequence`.

    The result has one entry per element of `repeat_list`, in the same order;
    each entry is the value returned by `sequence.count(repeat)`.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row per sequence, one column per known repeat.
X_train, X_test = (
    [extract_repeat_counts(seq, known_repeats) for seq in seqs]
    for seqs in (sequencesTrain, sequencesTest)
)
y_train, y_test = labelTrain, labelTest

# Map the string class labels to integer codes. The encoder is fitted on the
# training labels and then applied to the test labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# k-nearest-neighbours classifier with k=5; fit() returns the fitted estimator.
# Minkowski with scikit-learn's default p=2 is the Euclidean distance.
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=5).fit(X_train, y_train_enc)

# Predict integer codes for the test set and map them back to label strings.
y_pred_enc = knn.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# Evaluate against the original (string) test labels.
print("Report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 5163
Report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.67      0.80        15
  bovine-CoV       0.99      1.00      0.99       375
  canine-CoV       0.99      0.93      0.96       218
  feline-CoV       0.96      0.99      0.98       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.98      1680
   macro avg       0.99      0.95      0.97      1680
weighted avg       0.98      0.98      0.98      1680

Accuracy score: 0.9845238095238096


In [8]:
# Collect the path of every non-hidden entry in the repeats folder.
folder = "../proba3/IN"
files = [folder + "/" + entry for entry in os.listdir(folder) if not entry.startswith(".")]

labels = []
repeats = []

# Every line is split on commas; the first two fields are stored as
# (label, repeat). Lines with fewer than two fields are skipped.
for path in files:
    with open(path, "r") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(",")
            if len(fields) < 2:
                continue
            labels.append(fields[0])
            repeats.append(fields[1])

# Load the training sequences. Every non-hidden file in the folder is parsed
# as FASTA, and each record is labelled with the file name minus ".fasta".
folder = "../trainSetNucl"

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the training samples identical from run to run.
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTrain = []
sequencesTrain = []

for file in files:
    # Derive the label once per file from the base name instead of stripping a
    # hard-coded folder prefix, so it stays correct if `folder` changes
    # (e.g. gains a trailing slash).
    class_label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_label)

# Load the test sequences. Every non-hidden file in the folder is parsed as
# FASTA, and each record is labelled with the file name minus ".fasta".
labelTest = []
sequencesTest = []

folder = "../testSetNucl"

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the test samples identical from run to run.
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

for file in files:
    # Derive the label once per file from the base name instead of stripping a
    # hard-coded folder prefix, so it stays correct if `folder` changes
    # (e.g. gains a trailing slash).
    class_label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_label)

# Deduplicate the repeats and sort them so the feature columns come out in the
# same order on every run; iteration order of a set of strings is not stable
# across interpreter sessions because of hash randomization.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Return the occurrence count of every repeat in `sequence`.

    The result has one entry per element of `repeat_list`, in the same order;
    each entry is the value returned by `sequence.count(repeat)`.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row per sequence, one column per known repeat.
X_train, X_test = (
    [extract_repeat_counts(seq, known_repeats) for seq in seqs]
    for seqs in (sequencesTrain, sequencesTest)
)
y_train, y_test = labelTrain, labelTest

# Map the string class labels to integer codes. The encoder is fitted on the
# training labels and then applied to the test labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# k-nearest-neighbours classifier with k=5; fit() returns the fitted estimator.
# Minkowski with scikit-learn's default p=2 is the Euclidean distance.
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=5).fit(X_train, y_train_enc)

# Predict integer codes for the test set and map them back to label strings.
y_pred_enc = knn.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# Evaluate against the original (string) test labels.
print("Report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 4897
Report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.73      0.85        15
  bovine-CoV       0.99      1.00      0.99       375
  canine-CoV       0.99      0.91      0.95       218
  feline-CoV       0.95      1.00      0.97       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.98      1680
   macro avg       0.99      0.95      0.97      1680
weighted avg       0.98      0.98      0.98      1680

Accuracy score: 0.9809523809523809
