In [2]:
import os
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from collections import Counter

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Note: results are better without oversampling

In [3]:


# --- Load repeats info ---
# Each non-hidden file in the folder is read as comma-separated text; the first
# field of a line is stored as its label and the second as the repeat.
folder = "../proba2/direct"
# Sorted so files are always visited in the same order (os.listdir returns
# entries in arbitrary order).
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Skip lines with fewer than two fields, and lines whose repeat field
            # is empty: counting "" in a sequence yields len(sequence) + 1, which
            # is not a repeat count.
            if len(parts) >= 2 and parts[1]:
                labels.append(parts[0])
                repeats.append(parts[1])

# Sorted so the feature columns come out in the same order on every run;
# iterating a set of strings can give a different order after each kernel restart.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` inside ``sequence``.

    Returns a list with one count per repeat, in the order of ``repeat_list``.
    Counting is delegated to ``sequence.count``.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# --- Load training sequences ---
# Every record in a FASTA file is labelled with the file name minus its extension.
folder = "../trainSet"
# Sorted so files (and therefore samples) are always read in the same order;
# os.listdir returns entries in arbitrary order.
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    # Take the label from the file name itself instead of stripping a
    # hard-coded copy of the folder path.
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(str(record.seq))  # convert Seq to string for count()
        labelTrain.append(class_label)

# --- Load testing sequences (read the same way as the training sequences) ---
folder = "../testSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(str(record.seq))
        labelTest.append(class_label)

# --- Build the feature matrices ---
# Each sequence becomes one row of repeat counts, with one column per entry of
# known_repeats.
y_train, y_test = labelTrain, labelTest
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]

# --- Turn the string labels into integer class ids ---
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)
num_classes = len(encoder.classes_)

# --- Per-sample weights: number of training samples / size of the sample's class ---
counter = Counter(y_train_enc)
total = sum(counter.values())
weights = dict((cls, total / count) for cls, count in counter.items())
sample_weights = [weights[label] for label in y_train_enc]

# --- Wrap the data in DMatrix objects for xgboost.train ---
dtrain = xgb.DMatrix(X_train, label=y_train_enc, weight=sample_weights)
dtest = xgb.DMatrix(X_test, label=y_test_enc)

params = {
    "objective": "multi:softprob",
    "num_class": num_classes,
    "eval_metric": "mlogloss",
    "seed": 42,
    # further booster parameters can be added here
}

model = xgb.train(params, dtrain, num_boost_round=100)

# The prediction holds one probability per class for every row; the predicted
# class id is the column with the highest probability.
y_pred_probs = model.predict(dtest)
y_pred_enc = y_pred_probs.argmax(axis=1)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

# --- Alternatively, you can use sklearn API for simpler syntax ---

# model_skl = xgb.XGBClassifier(
#     n_estimators=100,
#     learning_rate=0.2,
#     use_label_encoder=False,
#     eval_metric='mlogloss',
#     random_state=42
# )
# model_skl.fit(X_train, y_train_enc, sample_weight=sample_weights)
# y_pred_enc = model_skl.predict(X_test)
# y_pred = encoder.inverse_transform(y_pred_enc)

# print("Classification report: ")
# print(classification_report(y_test, y_pred))
# print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))


Number of unique repeats: 1244


XGBoostError: [13:18:33] /workspace/src/c_api/../data/array_interface.h:499: Unicode-1 is not supported.
Stack trace:
  [bt] (0) /home/pc/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x22dcbc) [0x735805e2dcbc]
  [bt] (1) /home/pc/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x240651) [0x735805e40651]
  [bt] (2) /home/pc/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x4acee9) [0x7358060acee9]
  [bt] (3) /home/pc/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x4b3594) [0x7358060b3594]
  [bt] (4) /home/pc/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x4b53c0) [0x7358060b53c0]
  [bt] (5) /home/pc/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb2) [0x735805d35032]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7358510d3e2e]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7358510d0493]
  [bt] (8) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x7358510e63e9]



# SA NOVIM KLASAMA

# AMINOKISELINSKI PONOVCI

In [13]:
# --- Load the known repeats ---
# Each non-hidden file in the folder is read as comma-separated text; the first
# field of a line is stored as its label and the second as the repeat.
folder = "../proba2/direct"
# Sorted so files are always visited in the same order (os.listdir returns
# entries in arbitrary order).
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Skip lines with fewer than two fields, and lines whose repeat field
            # is empty: counting "" in a sequence yields len(sequence) + 1, which
            # is not a repeat count.
            if len(parts) >= 2 and parts[1]:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- Load the training sequences ---
# Every record in a FASTA file is labelled with the file name minus its extension.
folder = "../trainSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    # Take the label from the file name itself instead of stripping a
    # hard-coded copy of the folder path.
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        # Stored as plain str so that counting repeats uses str.count.
        sequencesTrain.append(str(record.seq))
        labelTrain.append(class_label)

# --- Load the test sequences (read the same way as the training sequences) ---
folder = "../testSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(str(record.seq))
        labelTest.append(class_label)

# Sorted so the feature columns come out in the same order on every run;
# iterating a set of strings can give a different order after each kernel restart.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` inside ``sequence``.

    Returns a list with one count per repeat, in the order of ``repeat_list``.
    Counting is delegated to ``sequence.count``.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row of repeat counts per sequence, one column
# per entry of known_repeats.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# XGBoost needs numeric class labels, so the string labels are encoded as integers.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# use_label_encoder is deliberately not passed: the installed XGBoost ignores it
# and reports 'Parameters: { "use_label_encoder" } are not used.'
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='mlogloss',   # multi-class log loss
    random_state=42
)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the predicted class ids back to the original string labels for reporting.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 798


Parameters: { "use_label_encoder" } are not used.



Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      1.00      1.00        16
  bovine-CoV       1.00      0.99      1.00       378
  canine-CoV       0.95      0.98      0.96       219
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1685
   macro avg       0.99      0.97      0.98      1685
weighted avg       0.99      0.99      0.99      1685

Accuracy score: 0.9893175074183976


In [14]:
# --- Load the known repeats ---
# Each non-hidden file in the folder is read as comma-separated text; the first
# field of a line is stored as its label and the second as the repeat.
folder = "../proba2/indirect"
# Sorted so files are always visited in the same order (os.listdir returns
# entries in arbitrary order).
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Skip lines with fewer than two fields, and lines whose repeat field
            # is empty: counting "" in a sequence yields len(sequence) + 1, which
            # is not a repeat count.
            if len(parts) >= 2 and parts[1]:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- Load the training sequences ---
# Every record in a FASTA file is labelled with the file name minus its extension.
folder = "../trainSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    # Take the label from the file name itself instead of stripping a
    # hard-coded copy of the folder path.
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        # Stored as plain str so that counting repeats uses str.count.
        sequencesTrain.append(str(record.seq))
        labelTrain.append(class_label)

# --- Load the test sequences (read the same way as the training sequences) ---
folder = "../testSet"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(str(record.seq))
        labelTest.append(class_label)

# Sorted so the feature columns come out in the same order on every run;
# iterating a set of strings can give a different order after each kernel restart.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` inside ``sequence``.

    Returns a list with one count per repeat, in the order of ``repeat_list``.
    Counting is delegated to ``sequence.count``.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row of repeat counts per sequence, one column
# per entry of known_repeats.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# XGBoost needs numeric class labels, so the string labels are encoded as integers.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# use_label_encoder is deliberately not passed: the installed XGBoost ignores it
# and reports 'Parameters: { "use_label_encoder" } are not used.'
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='mlogloss',   # multi-class log loss
    random_state=42
)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the predicted class ids back to the original string labels for reporting.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 1007


Parameters: { "use_label_encoder" } are not used.



Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      0.99      0.99       378
  canine-CoV       1.00      0.92      0.96       219
  feline-CoV       0.96      1.00      0.98       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      0.99      1.00       132

    accuracy                           0.98      1685
   macro avg       0.99      0.98      0.99      1685
weighted avg       0.98      0.98      0.98      1685

Accuracy score: 0.9839762611275964


In [15]:
# --- Load the known repeats ---
# Each non-hidden file in the folder is read as comma-separated text; the first
# field of a line is stored as its label and the second as the repeat.
folder = "../proba3/DC"
# Sorted so files are always visited in the same order (os.listdir returns
# entries in arbitrary order).
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Skip lines with fewer than two fields, and lines whose repeat field
            # is empty: counting "" in a sequence yields len(sequence) + 1, which
            # is not a repeat count.
            if len(parts) >= 2 and parts[1]:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- Load the training sequences ---
# Every record in a FASTA file is labelled with the file name minus its extension.
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    # Take the label from the file name itself instead of stripping a
    # hard-coded copy of the folder path.
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        # Stored as plain str so that counting repeats uses str.count.
        sequencesTrain.append(str(record.seq))
        labelTrain.append(class_label)

# --- Load the test sequences (read the same way as the training sequences) ---
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(str(record.seq))
        labelTest.append(class_label)

# Sorted so the feature columns come out in the same order on every run;
# iterating a set of strings can give a different order after each kernel restart.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` inside ``sequence``.

    Returns a list with one count per repeat, in the order of ``repeat_list``.
    Counting is delegated to ``sequence.count``.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row of repeat counts per sequence, one column
# per entry of known_repeats.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# XGBoost needs numeric class labels, so the string labels are encoded as integers.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# use_label_encoder is deliberately not passed: the installed XGBoost ignores it
# and reports 'Parameters: { "use_label_encoder" } are not used.'
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='mlogloss',   # multi-class log loss
    random_state=42
)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the predicted class ids back to the original string labels for reporting.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 4335


Parameters: { "use_label_encoder" } are not used.



Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      1.00      1.00        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       1.00      0.95      0.97       218
  feline-CoV       0.98      1.00      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9916666666666667


In [2]:
# --- Load the known repeats ---
# Each non-hidden file in the folder is read as comma-separated text; the first
# field of a line is stored as its label and the second as the repeat.
folder = "../proba3/DN"
# Sorted so files are always visited in the same order (os.listdir returns
# entries in arbitrary order).
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Skip lines with fewer than two fields, and lines whose repeat field
            # is empty: counting "" in a sequence yields len(sequence) + 1, which
            # is not a repeat count.
            if len(parts) >= 2 and parts[1]:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- Load the training sequences ---
# Every record in a FASTA file is labelled with the file name minus its extension.
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    # Take the label from the file name itself instead of stripping a
    # hard-coded copy of the folder path.
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        # Stored as plain str so that counting repeats uses str.count.
        sequencesTrain.append(str(record.seq))
        labelTrain.append(class_label)

# --- Load the test sequences (read the same way as the training sequences) ---
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(str(record.seq))
        labelTest.append(class_label)

# Sorted so the feature columns come out in the same order on every run;
# iterating a set of strings can give a different order after each kernel restart.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` inside ``sequence``.

    Returns a list with one count per repeat, in the order of ``repeat_list``.
    Counting is delegated to ``sequence.count``.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row of repeat counts per sequence, one column
# per entry of known_repeats.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# XGBoost needs numeric class labels, so the string labels are encoded as integers.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# use_label_encoder is deliberately not passed: the installed XGBoost ignores it
# and reports 'Parameters: { "use_label_encoder" } are not used.'
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='mlogloss',   # multi-class log loss
    random_state=42
)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the predicted class ids back to the original string labels for reporting.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 6309


Parameters: { "use_label_encoder" } are not used.



Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      1.00      1.00        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.99      0.98      0.98       218
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      0.67      0.80         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.96      0.97      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9940476190476191


In [3]:
# --- Load the known repeats ---
# Each non-hidden file in the folder is read as comma-separated text; the first
# field of a line is stored as its label and the second as the repeat.
folder = "../proba3/IC"
# Sorted so files are always visited in the same order (os.listdir returns
# entries in arbitrary order).
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Skip lines with fewer than two fields, and lines whose repeat field
            # is empty: counting "" in a sequence yields len(sequence) + 1, which
            # is not a repeat count.
            if len(parts) >= 2 and parts[1]:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- Load the training sequences ---
# Every record in a FASTA file is labelled with the file name minus its extension.
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    # Take the label from the file name itself instead of stripping a
    # hard-coded copy of the folder path.
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        # Stored as plain str so that counting repeats uses str.count.
        sequencesTrain.append(str(record.seq))
        labelTrain.append(class_label)

# --- Load the test sequences (read the same way as the training sequences) ---
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(str(record.seq))
        labelTest.append(class_label)

# Sorted so the feature columns come out in the same order on every run;
# iterating a set of strings can give a different order after each kernel restart.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` inside ``sequence``.

    Returns a list with one count per repeat, in the order of ``repeat_list``.
    Counting is delegated to ``sequence.count``.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row of repeat counts per sequence, one column
# per entry of known_repeats.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# XGBoost needs numeric class labels, so the string labels are encoded as integers.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# use_label_encoder is deliberately not passed: the installed XGBoost ignores it
# and reports 'Parameters: { "use_label_encoder" } are not used.'
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='mlogloss',   # multi-class log loss
    random_state=42
)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the predicted class ids back to the original string labels for reporting.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 5163


Parameters: { "use_label_encoder" } are not used.



Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      1.00      1.00        15
  bovine-CoV       1.00      0.99      1.00       375
  canine-CoV       0.98      0.97      0.97       218
  feline-CoV       0.98      1.00      0.99       536
hedgehog-CoV       1.00      0.67      0.80         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       0.99      0.95      0.97      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9910714285714286


In [4]:
# --- Load the known repeats ---
# Each non-hidden file in the folder is read as comma-separated text; the first
# field of a line is stored as its label and the second as the repeat.
folder = "../proba3/IN"
# Sorted so files are always visited in the same order (os.listdir returns
# entries in arbitrary order).
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []

for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Skip lines with fewer than two fields, and lines whose repeat field
            # is empty: counting "" in a sequence yields len(sequence) + 1, which
            # is not a repeat count.
            if len(parts) >= 2 and parts[1]:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- Load the training sequences ---
# Every record in a FASTA file is labelled with the file name minus its extension.
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []

for file in files:
    # Take the label from the file name itself instead of stripping a
    # hard-coded copy of the folder path.
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        # Stored as plain str so that counting repeats uses str.count.
        sequencesTrain.append(str(record.seq))
        labelTrain.append(class_label)

# --- Load the test sequences (read the same way as the training sequences) ---
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []

for file in files:
    class_label = os.path.splitext(os.path.basename(file))[0]
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(str(record.seq))
        labelTest.append(class_label)

# Sorted so the feature columns come out in the same order on every run;
# iterating a set of strings can give a different order after each kernel restart.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` inside ``sequence``.

    Returns a list with one count per repeat, in the order of ``repeat_list``.
    Counting is delegated to ``sequence.count``.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Build the feature matrices: one row of repeat counts per sequence, one column
# per entry of known_repeats.
X_train = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extract_repeat_counts(seq, known_repeats) for seq in sequencesTest]
y_test = labelTest

# XGBoost needs numeric class labels, so the string labels are encoded as integers.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# use_label_encoder is deliberately not passed: the installed XGBoost ignores it
# and reports 'Parameters: { "use_label_encoder" } are not used.'
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='mlogloss',   # multi-class log loss
    random_state=42
)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the predicted class ids back to the original string labels for reporting.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
print(classification_report(y_test, y_pred))

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

Number of unique repeats: 4897


Parameters: { "use_label_encoder" } are not used.



Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      1.00      1.00        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.99      0.98      0.98       218
  feline-CoV       0.99      1.00      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           1.00      1680
   macro avg       1.00      1.00      1.00      1680
weighted avg       1.00      1.00      1.00      1680

Accuracy score: 0.9952380952380953
