In [1]:
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

import os
import pandas as pd

from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np

In [None]:
# Input locations for this experiment.
REPEATS_DIR = "../proba2/direct"
TRAIN_DIR = "../trainSet"
TEST_DIR = "../testSet"


def _list_data_files(folder):
    """Return the paths of the non-hidden entries in ``folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    order in which samples are loaded the same from run to run.
    """
    return [
        folder + "/" + name
        for name in sorted(os.listdir(folder))
        if not name.startswith(".")
    ]


def _read_repeats(folder):
    """Read ``label,repeat`` rows from every file in ``folder``.

    Lines with fewer than two comma-separated fields are skipped.

    Returns:
        tuple: ``(labels, repeats)`` as two parallel lists of strings.
    """
    row_labels, row_repeats = [], []
    for path in _list_data_files(folder):
        with open(path, "r") as handle:
            for line in handle:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    row_labels.append(parts[0])
                    row_repeats.append(parts[1])
    return row_labels, row_repeats


def _read_fasta_folder(folder):
    """Load every FASTA record found in ``folder``.

    Each record is labelled with the name of the file it came from, minus
    the directory and the file extension.

    Returns:
        tuple: ``(sequences, labels)`` as two parallel lists.
    """
    sequences, seq_labels = [], []
    for path in _list_data_files(folder):
        label = os.path.splitext(os.path.basename(path))[0]
        for record in SeqIO.parse(path, "fasta"):
            sequences.append(record.seq)
            seq_labels.append(label)
    return sequences, seq_labels


labels, repeats = _read_repeats(REPEATS_DIR)
sequencesTrain, labelTrain = _read_fasta_folder(TRAIN_DIR)
sequencesTest, labelTest = _read_fasta_folder(TEST_DIR)

# Sort the unique repeats: set iteration order for strings can change between
# interpreter sessions, which would shuffle the feature columns from run to run.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count how often each repeat occurs in a sequence.

    Args:
        sequence: Any object exposing ``count(sub)``, such as a ``str``.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        Counting follows the sequence's own ``count`` semantics (for ``str``
        that means non-overlapping matches).
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Build one feature row per sequence: the count of every known repeat.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# Encode the string class labels as integers for training. Transforming the
# test labels as well raises early if the test set contains a class that was
# never seen during training.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fixed seed so the decision tree breaks ties between equally good splits the
# same way on every run.
RANDOM_STATE = 42

# Level-0 learners whose predictions feed the meta-learner.
base_models = [
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Meta-learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5-fold cross-validation for out-of-fold meta features
)

stacked_clf.fit(X_train, y_train_enc)

# Predict encoded classes, then map them back to the original label strings.
y_pred_enc = stacked_clf.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# Compute each metric once and reuse it for both the console and the file.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Classification report: ")
print(report)

print("Accuracy score: " + str(accuracy))

# Persist the same report and accuracy that were printed.
with open("classification_report.txt", "w") as f:
    f.write(report)
    f.write("\nAccuracy score: ")
    f.write(str(accuracy))

Number of unique repeats: 2210




In [9]:
# Input locations for this experiment.
REPEATS_DIR = "../proba2/direct"
TRAIN_DIR = "../trainSet"
TEST_DIR = "../testSet"


def _list_data_files(folder):
    """Return the paths of the non-hidden entries in ``folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    order in which samples are loaded the same from run to run.
    """
    return [
        folder + "/" + name
        for name in sorted(os.listdir(folder))
        if not name.startswith(".")
    ]


def _read_repeats(folder):
    """Read ``label,repeat`` rows from every file in ``folder``.

    Lines with fewer than two comma-separated fields are skipped.

    Returns:
        tuple: ``(labels, repeats)`` as two parallel lists of strings.
    """
    row_labels, row_repeats = [], []
    for path in _list_data_files(folder):
        with open(path, "r") as handle:
            for line in handle:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    row_labels.append(parts[0])
                    row_repeats.append(parts[1])
    return row_labels, row_repeats


def _read_fasta_folder(folder):
    """Load every FASTA record found in ``folder``.

    Each record is labelled with the name of the file it came from, minus
    the directory and the file extension.

    Returns:
        tuple: ``(sequences, labels)`` as two parallel lists.
    """
    sequences, seq_labels = [], []
    for path in _list_data_files(folder):
        label = os.path.splitext(os.path.basename(path))[0]
        for record in SeqIO.parse(path, "fasta"):
            sequences.append(record.seq)
            seq_labels.append(label)
    return sequences, seq_labels


labels, repeats = _read_repeats(REPEATS_DIR)
sequencesTrain, labelTrain = _read_fasta_folder(TRAIN_DIR)
sequencesTest, labelTest = _read_fasta_folder(TEST_DIR)

# Sort the unique repeats: set iteration order for strings can change between
# interpreter sessions, which would shuffle the feature columns from run to run.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count how often each repeat occurs in a sequence.

    Args:
        sequence: Any object exposing ``count(sub)``, such as a ``str``.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        Counting follows the sequence's own ``count`` semantics (for ``str``
        that means non-overlapping matches).
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Build one feature row per sequence: the count of every known repeat.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# Encode the string class labels as integers for training. Transforming the
# test labels as well raises early if the test set contains a class that was
# never seen during training.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fixed seed so the decision tree breaks ties between equally good splits the
# same way on every run.
RANDOM_STATE = 42

# Level-0 learners whose predictions feed the meta-learner.
base_models = [
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Meta-learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5-fold cross-validation for out-of-fold meta features
)

stacked_clf.fit(X_train, y_train_enc)

# Predict encoded classes, then map them back to the original label strings.
y_pred_enc = stacked_clf.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 798
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      1.00      1.00       378
  canine-CoV       0.95      0.98      0.96       219
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1685
   macro avg       0.99      0.97      0.98      1685
weighted avg       0.99      0.99      0.99      1685

Accuracy score: 0.9899109792284867


In [5]:
# Input locations for this experiment.
REPEATS_DIR = "../proba2/indirect"
TRAIN_DIR = "../trainSet"
TEST_DIR = "../testSet"


def _list_data_files(folder):
    """Return the paths of the non-hidden entries in ``folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    order in which samples are loaded the same from run to run.
    """
    return [
        folder + "/" + name
        for name in sorted(os.listdir(folder))
        if not name.startswith(".")
    ]


def _read_repeats(folder):
    """Read ``label,repeat`` rows from every file in ``folder``.

    Lines with fewer than two comma-separated fields are skipped.

    Returns:
        tuple: ``(labels, repeats)`` as two parallel lists of strings.
    """
    row_labels, row_repeats = [], []
    for path in _list_data_files(folder):
        with open(path, "r") as handle:
            for line in handle:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    row_labels.append(parts[0])
                    row_repeats.append(parts[1])
    return row_labels, row_repeats


def _read_fasta_folder(folder):
    """Load every FASTA record found in ``folder``.

    Each record is labelled with the name of the file it came from, minus
    the directory and the file extension.

    Returns:
        tuple: ``(sequences, labels)`` as two parallel lists.
    """
    sequences, seq_labels = [], []
    for path in _list_data_files(folder):
        label = os.path.splitext(os.path.basename(path))[0]
        for record in SeqIO.parse(path, "fasta"):
            sequences.append(record.seq)
            seq_labels.append(label)
    return sequences, seq_labels


labels, repeats = _read_repeats(REPEATS_DIR)
sequencesTrain, labelTrain = _read_fasta_folder(TRAIN_DIR)
sequencesTest, labelTest = _read_fasta_folder(TEST_DIR)

# Sort the unique repeats: set iteration order for strings can change between
# interpreter sessions, which would shuffle the feature columns from run to run.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count how often each repeat occurs in a sequence.

    Args:
        sequence: Any object exposing ``count(sub)``, such as a ``str``.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        Counting follows the sequence's own ``count`` semantics (for ``str``
        that means non-overlapping matches).
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Build one feature row per sequence: the count of every known repeat.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# Encode the string class labels as integers for training. Transforming the
# test labels as well raises early if the test set contains a class that was
# never seen during training.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fixed seed so the decision tree breaks ties between equally good splits the
# same way on every run.
RANDOM_STATE = 42

# Level-0 learners whose predictions feed the meta-learner.
base_models = [
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Meta-learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5-fold cross-validation for out-of-fold meta features
)

stacked_clf.fit(X_train, y_train_enc)

# Predict encoded classes, then map them back to the original label strings.
y_pred_enc = stacked_clf.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 1007
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      1.00      1.00       378
  canine-CoV       0.97      0.96      0.96       219
  feline-CoV       0.98      0.99      0.98       536
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1685
   macro avg       0.99      0.96      0.98      1685
weighted avg       0.99      0.99      0.99      1685

Accuracy score: 0.9875370919881306


In [6]:
# Input locations for this experiment.
REPEATS_DIR = "../proba3/DC"
TRAIN_DIR = "../trainSetNucl"
TEST_DIR = "../testSetNucl"


def _list_data_files(folder):
    """Return the paths of the non-hidden entries in ``folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    order in which samples are loaded the same from run to run.
    """
    return [
        folder + "/" + name
        for name in sorted(os.listdir(folder))
        if not name.startswith(".")
    ]


def _read_repeats(folder):
    """Read ``label,repeat`` rows from every file in ``folder``.

    Lines with fewer than two comma-separated fields are skipped.

    Returns:
        tuple: ``(labels, repeats)`` as two parallel lists of strings.
    """
    row_labels, row_repeats = [], []
    for path in _list_data_files(folder):
        with open(path, "r") as handle:
            for line in handle:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    row_labels.append(parts[0])
                    row_repeats.append(parts[1])
    return row_labels, row_repeats


def _read_fasta_folder(folder):
    """Load every FASTA record found in ``folder``.

    Each record is labelled with the name of the file it came from, minus
    the directory and the file extension.

    Returns:
        tuple: ``(sequences, labels)`` as two parallel lists.
    """
    sequences, seq_labels = [], []
    for path in _list_data_files(folder):
        label = os.path.splitext(os.path.basename(path))[0]
        for record in SeqIO.parse(path, "fasta"):
            sequences.append(record.seq)
            seq_labels.append(label)
    return sequences, seq_labels


labels, repeats = _read_repeats(REPEATS_DIR)
sequencesTrain, labelTrain = _read_fasta_folder(TRAIN_DIR)
sequencesTest, labelTest = _read_fasta_folder(TEST_DIR)

# Sort the unique repeats: set iteration order for strings can change between
# interpreter sessions, which would shuffle the feature columns from run to run.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count how often each repeat occurs in a sequence.

    Args:
        sequence: Any object exposing ``count(sub)``, such as a ``str``.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        Counting follows the sequence's own ``count`` semantics (for ``str``
        that means non-overlapping matches).
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Build one feature row per sequence: the count of every known repeat.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# Encode the string class labels as integers for training. Transforming the
# test labels as well raises early if the test set contains a class that was
# never seen during training.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fixed seed so the decision tree breaks ties between equally good splits the
# same way on every run.
RANDOM_STATE = 42

# Level-0 learners whose predictions feed the meta-learner.
base_models = [
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Meta-learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5-fold cross-validation for out-of-fold meta features
)

stacked_clf.fit(X_train, y_train_enc)

# Predict encoded classes, then map them back to the original label strings.
y_pred_enc = stacked_clf.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 4335
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.98      0.98      0.98       218
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9940476190476191


In [10]:
# Input locations for this experiment.
REPEATS_DIR = "../proba3/DN"
TRAIN_DIR = "../trainSetNucl"
TEST_DIR = "../testSetNucl"


def _list_data_files(folder):
    """Return the paths of the non-hidden entries in ``folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    order in which samples are loaded the same from run to run.
    """
    return [
        folder + "/" + name
        for name in sorted(os.listdir(folder))
        if not name.startswith(".")
    ]


def _read_repeats(folder):
    """Read ``label,repeat`` rows from every file in ``folder``.

    Lines with fewer than two comma-separated fields are skipped.

    Returns:
        tuple: ``(labels, repeats)`` as two parallel lists of strings.
    """
    row_labels, row_repeats = [], []
    for path in _list_data_files(folder):
        with open(path, "r") as handle:
            for line in handle:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    row_labels.append(parts[0])
                    row_repeats.append(parts[1])
    return row_labels, row_repeats


def _read_fasta_folder(folder):
    """Load every FASTA record found in ``folder``.

    Each record is labelled with the name of the file it came from, minus
    the directory and the file extension.

    Returns:
        tuple: ``(sequences, labels)`` as two parallel lists.
    """
    sequences, seq_labels = [], []
    for path in _list_data_files(folder):
        label = os.path.splitext(os.path.basename(path))[0]
        for record in SeqIO.parse(path, "fasta"):
            sequences.append(record.seq)
            seq_labels.append(label)
    return sequences, seq_labels


labels, repeats = _read_repeats(REPEATS_DIR)
sequencesTrain, labelTrain = _read_fasta_folder(TRAIN_DIR)
sequencesTest, labelTest = _read_fasta_folder(TEST_DIR)

# Sort the unique repeats: set iteration order for strings can change between
# interpreter sessions, which would shuffle the feature columns from run to run.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count how often each repeat occurs in a sequence.

    Args:
        sequence: Any object exposing ``count(sub)``, such as a ``str``.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        Counting follows the sequence's own ``count`` semantics (for ``str``
        that means non-overlapping matches).
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Build one feature row per sequence: the count of every known repeat.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# Encode the string class labels as integers for training. Transforming the
# test labels as well raises early if the test set contains a class that was
# never seen during training.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fixed seed so the decision tree breaks ties between equally good splits the
# same way on every run.
RANDOM_STATE = 42

# Level-0 learners whose predictions feed the meta-learner.
base_models = [
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Meta-learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5-fold cross-validation for out-of-fold meta features
)

stacked_clf.fit(X_train, y_train_enc)

# Predict encoded classes, then map them back to the original label strings.
y_pred_enc = stacked_clf.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 6309
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.99      0.96      0.98       218
  feline-CoV       0.99      1.00      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.993452380952381


In [11]:
# Input locations for this experiment.
REPEATS_DIR = "../proba3/IN"
TRAIN_DIR = "../trainSetNucl"
TEST_DIR = "../testSetNucl"


def _list_data_files(folder):
    """Return the paths of the non-hidden entries in ``folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    order in which samples are loaded the same from run to run.
    """
    return [
        folder + "/" + name
        for name in sorted(os.listdir(folder))
        if not name.startswith(".")
    ]


def _read_repeats(folder):
    """Read ``label,repeat`` rows from every file in ``folder``.

    Lines with fewer than two comma-separated fields are skipped.

    Returns:
        tuple: ``(labels, repeats)`` as two parallel lists of strings.
    """
    row_labels, row_repeats = [], []
    for path in _list_data_files(folder):
        with open(path, "r") as handle:
            for line in handle:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    row_labels.append(parts[0])
                    row_repeats.append(parts[1])
    return row_labels, row_repeats


def _read_fasta_folder(folder):
    """Load every FASTA record found in ``folder``.

    Each record is labelled with the name of the file it came from, minus
    the directory and the file extension.

    Returns:
        tuple: ``(sequences, labels)`` as two parallel lists.
    """
    sequences, seq_labels = [], []
    for path in _list_data_files(folder):
        label = os.path.splitext(os.path.basename(path))[0]
        for record in SeqIO.parse(path, "fasta"):
            sequences.append(record.seq)
            seq_labels.append(label)
    return sequences, seq_labels


labels, repeats = _read_repeats(REPEATS_DIR)
sequencesTrain, labelTrain = _read_fasta_folder(TRAIN_DIR)
sequencesTest, labelTest = _read_fasta_folder(TEST_DIR)

# Sort the unique repeats: set iteration order for strings can change between
# interpreter sessions, which would shuffle the feature columns from run to run.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count how often each repeat occurs in a sequence.

    Args:
        sequence: Any object exposing ``count(sub)``, such as a ``str``.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        Counting follows the sequence's own ``count`` semantics (for ``str``
        that means non-overlapping matches).
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Build one feature row per sequence: the count of every known repeat.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# Encode the string class labels as integers for training. Transforming the
# test labels as well raises early if the test set contains a class that was
# never seen during training.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fixed seed so the decision tree breaks ties between equally good splits the
# same way on every run.
RANDOM_STATE = 42

# Level-0 learners whose predictions feed the meta-learner.
base_models = [
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Meta-learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5-fold cross-validation for out-of-fold meta features
)

stacked_clf.fit(X_train, y_train_enc)

# Predict encoded classes, then map them back to the original label strings.
y_pred_enc = stacked_clf.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 4897
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.99      0.97      0.98       218
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.993452380952381


In [12]:
# Input locations for this experiment.
REPEATS_DIR = "../proba3/IC"
TRAIN_DIR = "../trainSetNucl"
TEST_DIR = "../testSetNucl"


def _list_data_files(folder):
    """Return the paths of the non-hidden entries in ``folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    order in which samples are loaded the same from run to run.
    """
    return [
        folder + "/" + name
        for name in sorted(os.listdir(folder))
        if not name.startswith(".")
    ]


def _read_repeats(folder):
    """Read ``label,repeat`` rows from every file in ``folder``.

    Lines with fewer than two comma-separated fields are skipped.

    Returns:
        tuple: ``(labels, repeats)`` as two parallel lists of strings.
    """
    row_labels, row_repeats = [], []
    for path in _list_data_files(folder):
        with open(path, "r") as handle:
            for line in handle:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    row_labels.append(parts[0])
                    row_repeats.append(parts[1])
    return row_labels, row_repeats


def _read_fasta_folder(folder):
    """Load every FASTA record found in ``folder``.

    Each record is labelled with the name of the file it came from, minus
    the directory and the file extension.

    Returns:
        tuple: ``(sequences, labels)`` as two parallel lists.
    """
    sequences, seq_labels = [], []
    for path in _list_data_files(folder):
        label = os.path.splitext(os.path.basename(path))[0]
        for record in SeqIO.parse(path, "fasta"):
            sequences.append(record.seq)
            seq_labels.append(label)
    return sequences, seq_labels


labels, repeats = _read_repeats(REPEATS_DIR)
sequencesTrain, labelTrain = _read_fasta_folder(TRAIN_DIR)
sequencesTest, labelTest = _read_fasta_folder(TEST_DIR)

# Sort the unique repeats: set iteration order for strings can change between
# interpreter sessions, which would shuffle the feature columns from run to run.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count how often each repeat occurs in a sequence.

    Args:
        sequence: Any object exposing ``count(sub)``, such as a ``str``.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        Counting follows the sequence's own ``count`` semantics (for ``str``
        that means non-overlapping matches).
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Build one feature row per sequence: the count of every known repeat.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# Encode the string class labels as integers for training. Transforming the
# test labels as well raises early if the test set contains a class that was
# never seen during training.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fixed seed so the decision tree breaks ties between equally good splits the
# same way on every run.
RANDOM_STATE = 42

# Level-0 learners whose predictions feed the meta-learner.
base_models = [
    ('nb', MultinomialNB()),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Meta-learner that combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5-fold cross-validation for out-of-fold meta features
)

stacked_clf.fit(X_train, y_train_enc)

# Predict encoded classes, then map them back to the original label strings.
y_pred_enc = stacked_clf.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 5163
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.99      0.95      0.97       218
  feline-CoV       0.98      1.00      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9922619047619048
