In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

import os
import pandas as pd
import csv

from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np
from imblearn.over_sampling import SMOTE

In [None]:
files = []
folder = "../proba3/IN/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    max_depth=30,
    class_weight='balanced',
    max_features='sqrt',
    criterion = 'entropy',
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# rezultati:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       0.99      0.97      0.98       153
   /HCoV-HKU1       0.98      0.97      0.97        98
   /HCoV-NL63       0.98      1.00      0.99       213
   /HCoV-OC43       0.99      0.98      0.99       366
         /IBV       1.00      0.90      0.95      3150
    /MERS-CoV       1.00      0.94      0.97       395
    /SARS-CoV       1.00      1.00      1.00         3
   /SARS-CoV2       0.69      1.00      0.81      1360
     /bat-CoV       1.00      0.87      0.93        15
  /bovine-CoV       0.99      0.98      0.98       375
  /canine-CoV       0.78      0.70      0.74       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       1.00      0.71      0.83         7
  /feline-CoV       0.98      0.52      0.68       536
  /ferret-CoV       0.50      0.15      0.24        13
/hedgehog-CoV       0.86      1.00      0.92         6
 /porcine-CoV       0.98      0.99      0.98       132
  /rabbit-CoV       1.00      0.83      0.91         6
     /rat-CoV       1.00      0.62      0.77         8
  /turkey-CoV       0.93      1.00      0.96        27

     accuracy                           0.90      7084
    macro avg       0.93      0.86      0.88      7084
 weighted avg       0.93      0.90      0.90      7084

Accuracy score: 0.9010446075663467
"""

In [None]:
files = []
folder = "../proba3/IC/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    max_depth=30,
    class_weight='balanced',
    max_features='sqrt',
    criterion = 'entropy',
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# rezultati:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       0.99      0.96      0.97       153
   /HCoV-HKU1       0.98      0.97      0.97        98
   /HCoV-NL63       1.00      0.96      0.98       213
   /HCoV-OC43       0.99      0.99      0.99       366
         /IBV       0.99      0.96      0.98      3150
    /MERS-CoV       0.98      0.96      0.97       395
    /SARS-CoV       1.00      1.00      1.00         3
   /SARS-CoV2       0.72      1.00      0.84      1360
     /bat-CoV       0.92      0.80      0.86        15
  /bovine-CoV       0.99      0.98      0.99       375
  /canine-CoV       0.93      0.65      0.76       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       1.00      0.71      0.83         7
  /feline-CoV       0.97      0.36      0.52       536
  /ferret-CoV       0.36      0.92      0.52        13
/hedgehog-CoV       0.75      1.00      0.86         6
 /porcine-CoV       1.00      1.00      1.00       132
  /rabbit-CoV       0.62      0.83      0.71         6
     /rat-CoV       0.67      0.75      0.71         8
  /turkey-CoV       0.93      1.00      0.96        27

     accuracy                           0.91      7084
    macro avg       0.89      0.89      0.87      7084
 weighted avg       0.93      0.91      0.91      7084

Accuracy score: 0.9143139469226426
"""

In [None]:
files = []
folder = "../proba3/DN/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    max_depth=30,
    class_weight='balanced',
    max_features='sqrt',
    criterion = 'entropy',
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# rezultati:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       0.99      0.99      0.99       153
   /HCoV-HKU1       0.97      0.96      0.96        98
   /HCoV-NL63       1.00      0.99      1.00       213
   /HCoV-OC43       0.99      0.98      0.99       366
         /IBV       0.99      0.91      0.95      3150
    /MERS-CoV       1.00      0.93      0.97       395
    /SARS-CoV       0.75      1.00      0.86         3
   /SARS-CoV2       0.75      1.00      0.86      1360
     /bat-CoV       0.88      0.93      0.90        15
  /bovine-CoV       0.99      0.99      0.99       375
  /canine-CoV       0.96      0.69      0.80       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       1.00      0.71      0.83         7
  /feline-CoV       0.97      0.80      0.88       536
  /ferret-CoV       0.62      0.62      0.62        13
/hedgehog-CoV       0.67      0.67      0.67         6
 /porcine-CoV       1.00      1.00      1.00       132
  /rabbit-CoV       1.00      1.00      1.00         6
     /rat-CoV       0.86      0.75      0.80         8
  /turkey-CoV       0.93      1.00      0.96        27

     accuracy                           0.93      7084
    macro avg       0.92      0.90      0.90      7084
 weighted avg       0.94      0.93      0.93      7084

Accuracy score: 0.9278656126482213
"""

In [None]:
files = []
folder = "../proba3/DC/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl/"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    max_depth=30,
    class_weight='balanced',
    max_features='sqrt',
    criterion = 'entropy',
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
#rezultati:
"""
Classification report: 
               precision    recall  f1-score   support

   /HCoV-229E       0.99      0.94      0.96       153
   /HCoV-HKU1       0.93      0.94      0.93        98
   /HCoV-NL63       1.00      0.96      0.98       213
   /HCoV-OC43       1.00      0.92      0.96       366
         /IBV       0.99      0.93      0.96      3150
    /MERS-CoV       0.99      0.95      0.97       395
    /SARS-CoV       1.00      1.00      1.00         3
   /SARS-CoV2       0.68      1.00      0.81      1360
     /bat-CoV       0.81      0.87      0.84        15
  /bovine-CoV       0.99      0.99      0.99       375
  /canine-CoV       0.95      0.61      0.75       218
 /dolphin-CoV       1.00      1.00      1.00         3
  /equine-CoV       0.83      0.71      0.77         7
  /feline-CoV       0.94      0.38      0.54       536
  /ferret-CoV       0.45      0.77      0.57        13
/hedgehog-CoV       1.00      1.00      1.00         6
 /porcine-CoV       1.00      0.99      1.00       132
  /rabbit-CoV       1.00      1.00      1.00         6
     /rat-CoV       1.00      0.50      0.67         8
  /turkey-CoV       0.84      1.00      0.92        27

     accuracy                           0.90      7084
    macro avg       0.92      0.87      0.88      7084
 weighted avg       0.93      0.90      0.89      7084

Accuracy score: 0.8982213438735178
"""

# VECA DUBINA I ENTROPY UMESTO GINNY

In [None]:
files = []
folder = "../proba2/direct"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
#known_repeats = [repeat for repeat in known_repeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    max_depth=30,
    class_weight='balanced',
    max_features='sqrt',
    criterion = 'entropy',
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

with open("classification_report.txt", "w") as f:
    f.write(report)
    f.write("\nAccuracy score: ")
    f.write(str(accuracy_score(y_test, y_pred)))

In [None]:
# rezultati 
"""
          precision    recall  f1-score   support

   HCoV-229E       1.00      0.96      0.98       153
   HCoV-HKU1       1.00      0.98      0.99        99
   HCoV-NL63       1.00      1.00      1.00       214
   HCoV-OC43       1.00      0.97      0.98       366
         IBV       1.00      0.95      0.97      3153
    MERS-CoV       0.99      0.93      0.96       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       1.00      0.96      0.98      2133
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       0.99      1.00      0.99       378
  canine-CoV       0.98      0.74      0.84       219
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       1.00      0.86      0.92         7
  feline-CoV       0.97      0.93      0.95       536
  ferret-CoV       0.03      0.85      0.06        13
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      0.98      0.99       132
  rabbit-CoV       0.75      1.00      0.86         6
     rat-CoV       0.89      1.00      0.94         8
  turkey-CoV       1.00      0.89      0.94        27

    accuracy                           0.95      7867
   macro avg       0.93      0.95      0.92      7867
weighted avg       0.99      0.95      0.97      7867

Accuracy score: 0.9487733570611415
"""

In [None]:
files = []
folder = "../proba2/indirect"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
#known_repeats = [repeat for repeat in known_repeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    max_depth=30,
    class_weight='balanced',
    max_features='sqrt',
    criterion = 'entropy',
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

with open("classification_report.txt", "w") as f:
    f.write(report)
    f.write("\nAccuracy score: ")
    f.write(str(accuracy_score(y_test, y_pred)))

In [None]:
# rezultati 
"""
Classification report: 
              precision    recall  f1-score   support

   HCoV-229E       1.00      0.97      0.98       153
   HCoV-HKU1       1.00      0.92      0.96        99
   HCoV-NL63       0.99      1.00      1.00       214
   HCoV-OC43       0.99      0.95      0.97       366
         IBV       1.00      0.98      0.99      3153
    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       1.00      0.96      0.98      2133
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       0.99      0.99      0.99       378
  canine-CoV       0.94      0.76      0.84       219
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       1.00      0.86      0.92         7
  feline-CoV       0.99      0.74      0.85       536
  ferret-CoV       0.03      0.85      0.06        13
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      0.98      0.99       132
  rabbit-CoV       0.67      0.67      0.67         6
     rat-CoV       0.80      1.00      0.89         8
  turkey-CoV       0.93      0.93      0.93        27

    accuracy                           0.95      7867
   macro avg       0.92      0.92      0.90      7867
weighted avg       0.99      0.95      0.97      7867

Accuracy score: 0.9505529426719207
"""

In [2]:
files = []
folder = "../proba2/direct"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
#known_repeats = [repeat for repeat in known_repeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

with open("classification_report.txt", "w") as f:
    f.write(report)
    f.write("\nAccuracy score: ")
    f.write(str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 2210
Classification report: 
              precision    recall  f1-score   support

   HCoV-229E       1.00      0.89      0.94       153
   HCoV-HKU1       0.00      0.00      0.00        99
   HCoV-NL63       1.00      0.85      0.92       214
   HCoV-OC43       1.00      0.89      0.94       366
         IBV       1.00      0.97      0.99      3153
    MERS-CoV       1.00      0.93      0.96       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       1.00      0.96      0.98      2133
     bat-CoV       1.00      0.88      0.93        16
  bovine-CoV       0.99      0.89      0.93       378
  canine-CoV       0.93      0.53      0.67       219
 dolphin-CoV       0.00      0.00      0.00         3
  equine-CoV       0.00      0.00      0.00         7
  feline-CoV       0.47      0.98      0.64       536
  ferret-CoV       0.00      0.00      0.00        13
hedgehog-CoV       0.00      0.00      0.00         6
 porcine-CoV       0.94   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# REZULTATI DOSTA ZAVISE OD PARAMETARA, DUBINE POGOTOVO

# SA X

In [9]:
def extractRepeatsAndLabels(files):
    labels = []
    repeats = []

    # Read data from each file
    for file in files:
        with open(file, "r") as f:
            for line in f:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    label = parts[0]
                    repeat = parts[1]
                    labels.append(label)
                    repeats.append(repeat)

    return repeats, labels

In [10]:
def extractRepeatsAndLabelsTrain(files):
    labelTrain = []
    sequencesTrain = []

    # Read data from each file
    for file in files:
        records = SeqIO.parse(file, "fasta")
        for record in records:
            sequencesTrain.append(record.seq)
            labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))
    return sequencesTrain, labelTrain

In [11]:
def extractRepeatsAndLabelsTest(files):
    labelTrain = []
    sequencesTrain = []

    # Read data from each file
    for file in files:
        records = SeqIO.parse(file, "fasta")
        for record in records:
            sequencesTrain.append(record.seq)
            labelTrain.append(file.replace("../testSet/", "").replace(".fasta", ""))
    return sequencesTrain, labelTrain

In [12]:
def extractRepeatCounts(sequence, repeatList):
    return [sequence.count(r) for r in repeatList]

In [13]:
def testModel(repeatFolder, trainFolder, testModel):
    print(repeatFolder)
    files = []

    for file in os.listdir(repeatFolder):
        if file.startswith("."):
            continue
        files.append(repeatFolder + "/" + file)
    repeats, labels = extractRepeatsAndLabels(files)

    files.clear()

    for file in os.listdir(trainFolder):
        if file.startswith("."):
            continue
        files.append(trainFolder + "/" + file)
    sequencesTrain, labelTrain = extractRepeatsAndLabelsTrain(files)

    files.clear()

    for file in os.listdir(testFolder):
        if file.startswith("."):
            continue
        files.append(testFolder + "/" + file)
    sequencesTest, labelTest = extractRepeatsAndLabelsTest(files)

    knownRepeats = list(set(repeats))
    print(f"Number of unique repeats: {len(knownRepeats)}")


    X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
    y_train = labelTrain

    X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
    y_test = labelTest

    encoder = LabelEncoder()
    y_train_enc = encoder.fit_transform(y_train)
    y_test_enc = encoder.transform(y_test)

    tree = DecisionTreeClassifier(
        criterion='gini',   # or 'entropy'
        max_depth=10,        # limit depth to avoid overfitting
        random_state=42
    )

# 3️⃣ Train
    tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
    y_pred_enc = tree.predict(X_test)
    y_pred = encoder.inverse_transform(y_pred_enc)

    y_pred_enc_train = tree.predict(X_train)
    y_pred_train = encoder.inverse_transform(y_pred_enc_train)

    
    print("Classification report: ")
    print(classification_report(y_test, y_pred))

    print("Accuracy score test: " + str(accuracy_score(y_test, y_pred)))
    print("Accuracy score train: " + str(accuracy_score(y_train, y_pred_train)))
    print("------------------------------------------------------")

In [None]:
repeatFolder = ["../proba2/direct", "../proba2/indirect"]
trainFolder = "../trainSet/"
testFolder = "../testSet/"

for folder in repeatFolder:
    testModel(folder, trainFolder, testFolder)

../proba2/direct


In [5]:
files = []
folder = "../proba2/direct"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
#known_repeats = [repeat for repeat in known_repeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

headers = known_repeats
headers.append("coronavirusType")
with open("instanca.csv", mode="a", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(headers)

for i in range(50):
    with open("instanca.csv", 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        data = X_train[i]
        data.append(y_train_enc[i])
        writer.writerows([data])
    

Number of unique repeats: 798
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      0.98      0.98       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      0.98      0.99       378
  canine-CoV       0.99      0.75      0.85       219
  feline-CoV       0.89      1.00      0.94       536
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       0.99      1.00      1.00       132

    accuracy                           0.96      1685
   macro avg       0.98      0.93      0.95      1685
weighted avg       0.96      0.96      0.95      1685

Accuracy score: 0.9554896142433235


In [8]:
files = []
folder = "../proba2/indirect"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# LightGBM zahteva da i oznake budu brojevi, pa ih je potrebno enkodirati tako
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

y_pred_enc_train = tree.predict(X_train)
y_pred_train = encoder.inverse_transform(y_pred_enc_train)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Accuracy score train: " + str(accuracy_score(y_train, y_pred_train)))

Number of unique repeats: 1007
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.97      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      0.96      0.98       378
  canine-CoV       0.99      0.63      0.77       219
  feline-CoV       0.83      0.99      0.91       536
hedgehog-CoV       0.86      1.00      0.92         6
 porcine-CoV       1.00      0.99      1.00       132

    accuracy                           0.93      1685
   macro avg       0.96      0.94      0.94      1685
weighted avg       0.94      0.93      0.93      1685

Accuracy score: 0.9347181008902077
Accuracy score train: 0.9407709982129181


# BEZ X

In [3]:
files = []
folder = "../proba2/direct"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
known_repeats = [repeat for repeat in known_repeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# LightGBM zahteva da i oznake budu brojevi, pa ih je potrebno enkodirati tako
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 456
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      0.98      0.98       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      0.98      0.99       378
  canine-CoV       0.99      0.76      0.86       219
  feline-CoV       0.89      1.00      0.94       536
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       1.00      0.99      1.00       132

    accuracy                           0.96      1685
   macro avg       0.98      0.93      0.96      1685
weighted avg       0.96      0.96      0.95      1685

Accuracy score: 0.9554896142433235


In [4]:
files = []
folder = "../proba2/indirect"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
known_repeats = [repeat for repeat in known_repeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# LightGBM zahteva da i oznake budu brojevi, pa ih je potrebno enkodirati tako
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 660
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.97      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      1.00      1.00        16
  bovine-CoV       1.00      0.96      0.98       378
  canine-CoV       0.99      0.63      0.77       219
  feline-CoV       0.84      1.00      0.91       536
hedgehog-CoV       0.86      1.00      0.92         6
 porcine-CoV       1.00      0.99      1.00       132

    accuracy                           0.94      1685
   macro avg       0.96      0.95      0.95      1685
weighted avg       0.95      0.94      0.93      1685

Accuracy score: 0.9364985163204748


In [7]:
files = []
folder = "../proba3/DC"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# LightGBM zahteva da i oznake budu brojevi, pa ih je potrebno enkodirati tako
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 4335
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      0.98      0.99       395
    SARS-CoV       0.60      1.00      0.75         3
     bat-CoV       1.00      0.67      0.80        15
  bovine-CoV       1.00      0.97      0.98       375
  canine-CoV       1.00      0.78      0.87       218
  feline-CoV       0.88      0.99      0.93       536
hedgehog-CoV       0.71      0.83      0.77         6
 porcine-CoV       0.99      0.99      0.99       132

    accuracy                           0.95      1680
   macro avg       0.90      0.90      0.89      1680
weighted avg       0.96      0.95      0.95      1680

Accuracy score: 0.9529761904761904


In [8]:
files = []
folder = "../proba3/DN"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# LightGBM zahteva da i oznake budu brojevi, pa ih je potrebno enkodirati tako
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 6309
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.98      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.67      0.80        15
  bovine-CoV       1.00      0.97      0.98       375
  canine-CoV       0.97      0.91      0.94       218
  feline-CoV       0.93      0.99      0.96       536
hedgehog-CoV       1.00      0.50      0.67         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.97      1680
   macro avg       0.99      0.88      0.92      1680
weighted avg       0.97      0.97      0.97      1680

Accuracy score: 0.968452380952381


In [9]:
files = []
folder = "../proba3/IN"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# LightGBM zahteva da i oznake budu brojevi, pa ih je potrebno enkodirati tako
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 4897
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       0.82      0.93      0.87        15
  bovine-CoV       1.00      0.97      0.98       375
  canine-CoV       0.97      0.81      0.89       218
  feline-CoV       0.90      0.99      0.94       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.96      1680
   macro avg       0.96      0.96      0.96      1680
weighted avg       0.96      0.96      0.96      1680

Accuracy score: 0.9589285714285715


In [10]:
files = []
folder = "../proba3/IC"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# LightGBM zahteva da i oznake budu brojevi, pa ih je potrebno enkodirati tako
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

tree = DecisionTreeClassifier(
    criterion='gini',   # or 'entropy'
    max_depth=10,        # limit depth to avoid overfitting
    random_state=42
)

# 3️⃣ Train
tree.fit(X_train, y_train_enc)

# 4️⃣ Predict
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

# 5️⃣ Evaluate
print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 5163
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       0.75      1.00      0.86         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      0.97      0.98       375
  canine-CoV       0.98      0.79      0.88       218
  feline-CoV       0.88      0.99      0.93       536
hedgehog-CoV       1.00      0.33      0.50         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.95      1680
   macro avg       0.95      0.87      0.89      1680
weighted avg       0.96      0.95      0.95      1680

Accuracy score: 0.9547619047619048


# !!!!!!!!!!!!!!!!!! SA SARSCOV2 !!!!!!!!!!!!!!!!!!!!!!!!!!

In [8]:
import os
import csv
from Bio import SeqIO

# ------------------------------------
# 1️⃣ Get known repeats from the direct folder
# ------------------------------------
labels = []
repeats = []

folder_direct = "../proba2/direct"

for file in os.listdir(folder_direct):
    if file.startswith("."):
        continue
    with open(os.path.join(folder_direct, file), "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                labels.append(parts[0])
                repeats.append(parts[1])

known_repeats = list(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

# ------------------------------------
# 2️⃣ Stream train sequences, write counts to CSV
# ------------------------------------
with open("X_train.csv", "w", newline="") as f_train:
    writer = csv.writer(f_train)
    writer.writerow(known_repeats + ["coronavirusType"])  # header

    folder_train = "../trainSet"
    for file in os.listdir(folder_train):
        if file.startswith("."):
            continue
        filepath = os.path.join(folder_train, file)
        label = file.replace(".fasta", "")
        for record in SeqIO.parse(filepath, "fasta"):
            counts = [record.seq.count(r) for r in known_repeats]
            writer.writerow(counts + [label])

print("Train CSV done.")

# ------------------------------------
# 3️⃣ Stream test sequences, write counts to CSV
# ------------------------------------
with open("X_test.csv", "w", newline="") as f_test:
    writer = csv.writer(f_test)
    writer.writerow(known_repeats + ["coronavirusType"])  # header

    folder_test = "../testSet"
    for file in os.listdir(folder_test):
        if file.startswith("."):
            continue
        filepath = os.path.join(folder_test, file)
        label = file.replace(".fasta", "")
        for record in SeqIO.parse(filepath, "fasta"):
            counts = [record.seq.count(r) for r in known_repeats]
            writer.writerow(counts + [label])

print("Test CSV done.")

Number of unique repeats: 885


KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# ------------------------------------
# Load CSVs
# ------------------------------------
df_train = pd.read_csv("X_train.csv")
df_test = pd.read_csv("X_test.csv")

X_train = df_train[known_repeats].values
y_train = df_train["coronavirusType"]

X_test = df_test[known_repeats].values
y_test = df_test["coronavirusType"]

# ------------------------------------
# Encode labels
# ------------------------------------
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# ------------------------------------
# Train Decision Tree
# ------------------------------------
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    random_state=42
)
tree.fit(X_train, y_train_enc)

# ------------------------------------
# Predict & Evaluate
# ------------------------------------
y_pred_enc = tree.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("\nClassification report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy score:", accuracy_score(y_test, y_pred))