In [1]:
!pip install Bio

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import os
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix, vstack
from imblearn.over_sampling import SMOTE
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import numpy as np
from sklearn.utils import Bunch

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Load the repeat catalogue plus the train/test sequences.
# Listings and the repeat vocabulary are sorted: os.listdir() returns names in
# arbitrary order and iterating a set of strings is not stable between
# interpreter runs, so unsorted data would shuffle samples and feature columns.

# Repeat catalogue: comma-separated lines, field 0 = label, field 1 = repeat.
folder = "../proba3/DC"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")  # ignore dot-files
]

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines without a second field carry no repeat
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file
# name minus its ".fasta" suffix.
folder = "../trainSetNucl"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTrain = []
sequencesTrain = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(label)

# Test sequences, read exactly like the training ones.
folder = "../testSetNucl"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTest = []
sequencesTest = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(label)

# Sorted, de-duplicated repeats give a reproducible feature-column order.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Return one count per entry of ``repeat_list``, in the same order.

    Each count is whatever ``sequence.count(repeat)`` reports for that repeat.
    """
    counts = []
    for repeat in repeat_list:
        counts.append(sequence.count(repeat))
    return counts

# Turn every sequence into a vector of repeat counts (one column per known repeat).
y_train, y_test = labelTrain, labelTest
X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))
X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# Integer-encode the class names; the encoder is fitted on the training labels only.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# Oversample the training split with SMOTE (2 neighbours, fixed seed).
smote = SMOTE(k_neighbors=2, random_state=42)
X_train_res, y_train_enc_res = smote.fit_resample(X_train, y_train_enc)

# Fit LightGBM on the resampled training data.
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "random_state": 42,
    "is_unbalance": True,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_enc_res)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
def get_repeats(folder):
    """Collect the distinct repeats listed in the files of ``folder``.

    Every file whose name does not start with "." is read line by line. A line
    is split on commas and, when it has at least two fields, its second field
    is taken as a repeat.

    Args:
        folder: Path of the directory holding the repeat files.

    Returns:
        A sorted list of the unique repeat strings. Sorting keeps the order
        (and therefore the feature-column order built from it) identical on
        every run; iterating the underlying set directly is not stable across
        interpreter runs.
    """
    seen = set()
    for file in os.listdir(folder):
        if file.startswith("."):
            continue
        with open(os.path.join(folder, file)) as f:
            for line in f:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    seen.add(parts[1])
    return sorted(seen)

def extract_repeat_counts(sequence, repeat_list):
    """Count each repeat of ``repeat_list`` in the string form of ``sequence``.

    Args:
        sequence: Object whose ``str()`` value is searched.
        repeat_list: Substrings to count.

    Returns:
        A list of floats, one count per repeat, in ``repeat_list`` order.
    """
    text = str(sequence)
    # Counts are returned as floats rather than ints.
    return [float(text.count(repeat)) for repeat in repeat_list]

# 3️⃣ Generator for training examples
def sequence_generator(folder, repeat_list):
    """Yield ``(repeat_counts, label)`` for every FASTA record under ``folder``.

    Files whose name starts with "." are skipped. The label of a record is the
    name of the file it came from with ".fasta" removed. Files are visited in
    sorted name order because ``os.listdir`` gives no ordering guarantee and
    the sample order should be the same on every run.

    Args:
        folder: Directory containing the FASTA files.
        repeat_list: Repeats to count in each record's sequence.

    Yields:
        Tuples of (list of counts from ``extract_repeat_counts``, label string).
    """
    for file in sorted(os.listdir(folder)):
        if file.startswith("."):
            continue
        label = file.replace(".fasta", "")
        for record in SeqIO.parse(os.path.join(folder, file), "fasta"):
            yield extract_repeat_counts(record.seq, repeat_list), label


def stream_to_sparse_matrix(generator, repeat_list):
    """Materialise a stream of ``(counts, label)`` pairs as a sparse matrix.

    Args:
        generator: Iterable yielding ``(counts, label)`` where ``counts`` is a
            sequence of numbers, one per repeat.
        repeat_list: The repeats the counts refer to; its length fixes the
            number of columns when the stream is empty.

    Returns:
        ``(X, y)`` where ``X`` is a float32 ``csr_matrix`` with one row per
        yielded pair and ``y`` is a NumPy array of the labels.
    """
    rows = []
    labels = []
    for counts, label in generator:
        rows.append(counts)
        labels.append(label)
    if rows:
        X = csr_matrix(rows, dtype=np.float32)
    else:
        # csr_matrix([]) would produce a (1, 0) matrix; an empty stream must
        # give zero rows with one column per repeat instead.
        X = csr_matrix((0, len(repeat_list)), dtype=np.float32)
    y = np.array(labels)
    return X, y

In [None]:
# Repeat vocabulary read from ../proba3/IN/; its entries become the feature columns.
known_repeats = get_repeats("../proba3/IN/")
print("Unique repeats:", len(known_repeats))

# Sparse repeat-count matrices and label arrays for the two splits.
train_stream = sequence_generator("../trainSetNucl/", known_repeats)
X_train, y_train = stream_to_sparse_matrix(train_stream, known_repeats)
test_stream = sequence_generator("../testSetNucl/", known_repeats)
X_test, y_test = stream_to_sparse_matrix(test_stream, known_repeats)

# Integer-encode the class names; the encoder is fitted on the training labels only.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# Oversample the training split with SMOTE (2 neighbours, fixed seed).
smote = SMOTE(k_neighbors=2, random_state=42)
X_train_res, y_train_enc_res = smote.fit_resample(X_train, y_train_enc)

# Fit LightGBM on the resampled training data.
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "random_state": 42,
    "is_unbalance": True,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_enc_res)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# Results:
"""
Classification report:
              precision    recall  f1-score   support

   HCoV-229E       1.00      0.98      0.99       153
   HCoV-HKU1       1.00      1.00      1.00        98
   HCoV-NL63       1.00      1.00      1.00       213
   HCoV-OC43       1.00      0.99      1.00       366
         IBV       1.00      1.00      1.00      3150
    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       0.75      1.00      0.86         3
   SARS-CoV2       0.99      1.00      0.99      1360
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.98      0.98      0.98       218
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       1.00      0.71      0.83         7
  feline-CoV       0.99      0.99      0.99       536
  ferret-CoV       1.00      0.92      0.96        13
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132
  rabbit-CoV       0.86      1.00      0.92         6
     rat-CoV       1.00      0.88      0.93         8
  turkey-CoV       0.96      1.00      0.98        27

    accuracy                           0.99      7084
   macro avg       0.98      0.97      0.97      7084
weighted avg       0.99      0.99      0.99      7084

Accuracy score: 0.9947769621682665
"""

In [None]:
# Repeat vocabulary read from ../proba3/IC/; its entries become the feature columns.
known_repeats = get_repeats("../proba3/IC/")
print("Unique repeats:", len(known_repeats))

# Sparse repeat-count matrices and label arrays for the two splits.
train_stream = sequence_generator("../trainSetNucl/", known_repeats)
X_train, y_train = stream_to_sparse_matrix(train_stream, known_repeats)
test_stream = sequence_generator("../testSetNucl/", known_repeats)
X_test, y_test = stream_to_sparse_matrix(test_stream, known_repeats)

# Integer-encode the class names; the encoder is fitted on the training labels only.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# Oversample the training split with SMOTE (2 neighbours, fixed seed).
smote = SMOTE(k_neighbors=2, random_state=42)
X_train_res, y_train_enc_res = smote.fit_resample(X_train, y_train_enc)

# Fit LightGBM on the resampled training data.
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "random_state": 42,
    "is_unbalance": True,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_enc_res)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# Results:
"""
Classification report:
              precision    recall  f1-score   support

   HCoV-229E       1.00      0.98      0.99       153
   HCoV-HKU1       1.00      0.99      0.99        98
   HCoV-NL63       1.00      1.00      1.00       213
   HCoV-OC43       1.00      1.00      1.00       366
         IBV       1.00      1.00      1.00      3150
    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       0.99      1.00      0.99      1360
     bat-CoV       1.00      1.00      1.00        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.97      0.96      0.97       218
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       1.00      0.71      0.83         7
  feline-CoV       0.98      0.98      0.98       536
  ferret-CoV       1.00      1.00      1.00        13
hedgehog-CoV       0.86      1.00      0.92         6
 porcine-CoV       1.00      1.00      1.00       132
  rabbit-CoV       0.86      1.00      0.92         6
     rat-CoV       1.00      0.88      0.93         8
  turkey-CoV       0.93      0.96      0.95        27

    accuracy                           0.99      7084
   macro avg       0.98      0.97      0.97      7084
weighted avg       0.99      0.99      0.99      7084

Accuracy score: 0.9940711462450593
"""

In [None]:
# Repeat vocabulary read from ../proba3/DN/; its entries become the feature columns.
known_repeats = get_repeats("../proba3/DN/")
print("Unique repeats:", len(known_repeats))

# Sparse repeat-count matrices and label arrays for the two splits.
train_stream = sequence_generator("../trainSetNucl/", known_repeats)
X_train, y_train = stream_to_sparse_matrix(train_stream, known_repeats)
test_stream = sequence_generator("../testSetNucl/", known_repeats)
X_test, y_test = stream_to_sparse_matrix(test_stream, known_repeats)

# Integer-encode the class names; the encoder is fitted on the training labels only.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# Oversample the training split with SMOTE (2 neighbours, fixed seed).
smote = SMOTE(k_neighbors=2, random_state=42)
X_train_res, y_train_enc_res = smote.fit_resample(X_train, y_train_enc)

# Fit LightGBM on the resampled training data.
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "random_state": 42,
    "is_unbalance": True,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_enc_res)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# Results:
"""
Classification report:
              precision    recall  f1-score   support

   HCoV-229E       1.00      0.98      0.99       153
   HCoV-HKU1       0.98      1.00      0.99        98
   HCoV-NL63       1.00      1.00      1.00       213
   HCoV-OC43       1.00      1.00      1.00       366
         IBV       1.00      1.00      1.00      3150
    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       0.99      1.00      0.99      1360
     bat-CoV       1.00      1.00      1.00        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.97      0.96      0.97       218
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       1.00      0.71      0.83         7
  feline-CoV       0.99      0.98      0.99       536
  ferret-CoV       1.00      0.92      0.96        13
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132
  rabbit-CoV       1.00      0.83      0.91         6
     rat-CoV       1.00      0.75      0.86         8
  turkey-CoV       0.96      1.00      0.98        27

    accuracy                           0.99      7084
   macro avg       0.99      0.96      0.97      7084
weighted avg       0.99      0.99      0.99      7084

Accuracy score: 0.9947769621682665
"""

In [None]:
# Repeat vocabulary read from ../proba3/DC/; its entries become the feature columns.
known_repeats = get_repeats("../proba3/DC/")
print("Unique repeats:", len(known_repeats))

# Sparse repeat-count matrices and label arrays for the two splits.
train_stream = sequence_generator("../trainSetNucl/", known_repeats)
X_train, y_train = stream_to_sparse_matrix(train_stream, known_repeats)
test_stream = sequence_generator("../testSetNucl/", known_repeats)
X_test, y_test = stream_to_sparse_matrix(test_stream, known_repeats)

# Integer-encode the class names; the encoder is fitted on the training labels only.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# Oversample the training split with SMOTE (2 neighbours, fixed seed).
smote = SMOTE(k_neighbors=2, random_state=42)
X_train_res, y_train_enc_res = smote.fit_resample(X_train, y_train_enc)

# Fit LightGBM on the resampled training data.
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "random_state": 42,
    "is_unbalance": True,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_enc_res)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# Results:
"""
Classification report:
              precision    recall  f1-score   support

   HCoV-229E       0.99      0.98      0.99       153
   HCoV-HKU1       1.00      1.00      1.00        98
   HCoV-NL63       1.00      1.00      1.00       213
   HCoV-OC43       1.00      0.99      1.00       366
         IBV       1.00      1.00      1.00      3150
    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       0.99      1.00      0.99      1360
     bat-CoV       1.00      1.00      1.00        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.97      0.97      0.97       218
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       1.00      0.71      0.83         7
  feline-CoV       0.98      0.97      0.98       536
  ferret-CoV       1.00      0.92      0.96        13
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132
  rabbit-CoV       1.00      1.00      1.00         6
     rat-CoV       1.00      0.88      0.93         8
  turkey-CoV       0.96      1.00      0.98        27

    accuracy                           0.99      7084
   macro avg       0.99      0.97      0.98      7084
weighted avg       0.99      0.99      0.99      7084

Accuracy score: 0.9942123094297007

"""

# SA SMOTE - DIREKTNI

In [None]:
# Load the repeat catalogue plus the train/test sequences.
# Listings and the repeat vocabulary are sorted: os.listdir() returns names in
# arbitrary order and iterating a set of strings is not stable between
# interpreter runs, so unsorted data would shuffle samples and feature columns.

# Repeat catalogue: comma-separated lines, field 0 = label, field 1 = repeat.
folder = "../proba2/direct"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")  # ignore dot-files
]

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines without a second field carry no repeat
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file
# name minus its ".fasta" suffix.
folder = "../trainSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTrain = []
sequencesTrain = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(label)

# Test sequences, read exactly like the training ones.
folder = "../testSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTest = []
sequencesTest = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(label)

# Sorted, de-duplicated repeats give a reproducible feature-column order.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Return ``sequence.count(r)`` for every ``r`` in ``repeat_list``, in order."""
    return list(map(lambda repeat: sequence.count(repeat), repeat_list))

# Repeat-count feature vectors for both splits (one column per known repeat).
y_train, y_test = labelTrain, labelTest
X_train = list(map(lambda seq: extract_repeat_counts(seq, known_repeats), sequencesTrain))
X_test = list(map(lambda seq: extract_repeat_counts(seq, known_repeats), sequencesTest))

# LightGBM needs numeric targets as well, so the class names are integer-encoded.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# SMOTE is fed a NumPy array built from the training feature lists.
X_train_np = np.asarray(X_train)
smote = SMOTE(k_neighbors=2, random_state=42)
X_train_res, y_train_enc_res = smote.fit_resample(X_train_np, y_train_enc)

# Fit LightGBM on the oversampled training data.
lgbm_params = {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_enc_res)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score:", accuracy_score(y_test, y_pred))

In [None]:
# Results:
"""
Classification report: 
              precision    recall  f1-score   support

   HCoV-229E       0.99      0.99      0.99       153
   HCoV-HKU1       1.00      0.99      0.99        99
   HCoV-NL63       1.00      1.00      1.00       214
   HCoV-OC43       1.00      0.99      1.00       366
         IBV       1.00      1.00      1.00      3153
    MERS-CoV       1.00      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       1.00      1.00      1.00      2133
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       0.99      1.00      1.00       378
  canine-CoV       0.96      0.99      0.98       219
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       0.54      1.00      0.70         7
  feline-CoV       0.99      0.99      0.99       536
  ferret-CoV       1.00      1.00      1.00        13
hedgehog-CoV       1.00      0.83      0.91         6
 porcine-CoV       1.00      0.99      1.00       132
  rabbit-CoV       0.86      1.00      0.92         6
     rat-CoV       0.89      1.00      0.94         8
  turkey-CoV       0.96      1.00      0.98        27

    accuracy                           1.00      7867
   macro avg       0.96      0.99      0.97      7867
weighted avg       1.00      1.00      1.00      7867

Accuracy score: 0.99682216855218
"""

In [None]:
# Load the repeat catalogue plus the train/test sequences.
# Listings and the repeat vocabulary are sorted: os.listdir() returns names in
# arbitrary order and iterating a set of strings is not stable between
# interpreter runs, so unsorted data would shuffle samples and feature columns.

# Repeat catalogue: comma-separated lines, field 0 = label, field 1 = repeat.
folder = "../proba2/indirect"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")  # ignore dot-files
]

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines without a second field carry no repeat
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file
# name minus its ".fasta" suffix.
folder = "../trainSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTrain = []
sequencesTrain = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(label)

# Test sequences, read exactly like the training ones.
folder = "../testSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTest = []
sequencesTest = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(label)

# Sorted, de-duplicated repeats give a reproducible feature-column order.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Build the list of ``sequence.count(repeat)`` values, one per repeat."""
    counts = []
    for repeat in repeat_list:
        occurrences = sequence.count(repeat)
        counts.append(occurrences)
    return counts

# Repeat-count feature vectors for both splits (one column per known repeat).
y_train, y_test = labelTrain, labelTest
X_train = list(map(lambda seq: extract_repeat_counts(seq, known_repeats), sequencesTrain))
X_test = list(map(lambda seq: extract_repeat_counts(seq, known_repeats), sequencesTest))

# LightGBM needs numeric targets as well, so the class names are integer-encoded.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# SMOTE is fed a NumPy array built from the training feature lists.
X_train_np = np.asarray(X_train)
smote = SMOTE(k_neighbors=2, random_state=42)
X_train_res, y_train_enc_res = smote.fit_resample(X_train_np, y_train_enc)

# Fit LightGBM on the oversampled training data.
lgbm_params = {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_enc_res)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score:", accuracy_score(y_test, y_pred))

In [None]:
# Results:
"""
Classification report: 
              precision    recall  f1-score   support

   HCoV-229E       1.00      1.00      1.00       153
   HCoV-HKU1       1.00      0.99      0.99        99
   HCoV-NL63       1.00      1.00      1.00       214
   HCoV-OC43       1.00      0.99      0.99       366
         IBV       1.00      1.00      1.00      3153
    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
   SARS-CoV2       1.00      1.00      1.00      2133
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       0.99      1.00      0.99       378
  canine-CoV       0.99      0.98      0.98       219
 dolphin-CoV       1.00      1.00      1.00         3
  equine-CoV       1.00      0.86      0.92         7
  feline-CoV       0.99      0.93      0.96       536
  ferret-CoV       1.00      1.00      1.00        13
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      0.99      1.00       132
  rabbit-CoV       0.11      1.00      0.20         6
     rat-CoV       0.89      1.00      0.94         8
  turkey-CoV       0.96      1.00      0.98        27

    accuracy                           0.99      7867
   macro avg       0.95      0.98      0.95      7867
weighted avg       1.00      0.99      0.99      7867

Accuracy score: 0.9918647514935808
"""

# AMINOKISELINSKI PONOVCI

In [4]:
# Load the repeat catalogue plus the train/test sequences.
# Listings and the repeat vocabulary are sorted: os.listdir() returns names in
# arbitrary order and iterating a set of strings is not stable between
# interpreter runs, so unsorted data would shuffle samples and feature columns.

# Repeat catalogue: comma-separated lines, field 0 = label, field 1 = repeat.
folder = "../proba2/direct"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")  # ignore dot-files
]

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines without a second field carry no repeat
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file
# name minus its ".fasta" suffix.
folder = "../trainSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTrain = []
sequencesTrain = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(label)

# Test sequences, read exactly like the training ones.
folder = "../testSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTest = []
sequencesTest = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(label)

# Sorted, de-duplicated repeats give a reproducible feature-column order.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Return, for each repeat in ``repeat_list``, its ``sequence.count`` value."""
    return list(sequence.count(repeat) for repeat in repeat_list)

# Repeat-count feature vectors for both splits (one column per known repeat).
y_train, y_test = labelTrain, labelTest
X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))
X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# LightGBM needs numeric targets as well, so the class names are integer-encoded.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fit LightGBM directly on the training features (no oversampling in this cell).
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,  # candidate for tuning during optimisation
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train_enc)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score:", accuracy_score(y_test, y_pred))

Number of unique repeats: 798
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 3917, number of used features: 402
[LightGBM] [Info] Start training from score -1.447621
[LightGBM] [Info] Start training from score -6.663643
[LightGBM] [Info] Start training from score -4.689562
[LightGBM] [Info] Start training from score -1.492024
[LightGBM] [Info] Start training from score -2.042600
[LightGBM] [Info] Start training from score -1.142183
[LightGBM] [Info] Start training from score -5.875186
[LightGBM] [Info] Start training from score -2.552770
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      1.00      1.00        16
  bovine

In [5]:
# Load the repeat catalogue plus the train/test sequences.
# Listings and the repeat vocabulary are sorted: os.listdir() returns names in
# arbitrary order and iterating a set of strings is not stable between
# interpreter runs, so unsorted data would shuffle samples and feature columns.

# Repeat catalogue: comma-separated lines, field 0 = label, field 1 = repeat.
folder = "../proba2/indirect"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")  # ignore dot-files
]

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines without a second field carry no repeat
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file
# name minus its ".fasta" suffix.
folder = "../trainSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTrain = []
sequencesTrain = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(label)

# Test sequences, read exactly like the training ones.
folder = "../testSet"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTest = []
sequencesTest = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(label)

# Sorted, de-duplicated repeats give a reproducible feature-column order.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count every repeat of ``repeat_list`` in ``sequence`` via ``sequence.count``."""
    counts = []
    for repeat in repeat_list:
        counts += [sequence.count(repeat)]
    return counts

# Repeat-count feature vectors for both splits (one column per known repeat).
y_train, y_test = labelTrain, labelTest
X_train = []
for seq in sequencesTrain:
    X_train.append(extract_repeat_counts(seq, known_repeats))
X_test = []
for seq in sequencesTest:
    X_test.append(extract_repeat_counts(seq, known_repeats))

# LightGBM needs numeric targets as well, so the class names are integer-encoded.
encoder = LabelEncoder().fit(y_train)
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

# Fit LightGBM directly on the training features (no oversampling in this cell).
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,  # candidate for tuning during optimisation
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train_enc)

# Predict the test split and map the integer codes back to class names.
y_pred_enc = model.predict(X_test)
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ", classification_report(y_test, y_pred), sep="\n")
print("Accuracy score:", accuracy_score(y_test, y_pred))

Number of unique repeats: 1007
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 679
[LightGBM] [Info] Number of data points in the train set: 3917, number of used features: 321
[LightGBM] [Info] Start training from score -1.447621
[LightGBM] [Info] Start training from score -6.663643
[LightGBM] [Info] Start training from score -4.689562
[LightGBM] [Info] Start training from score -1.492024
[LightGBM] [Info] Start training from score -2.042600
[LightGBM] [Info] Start training from score -1.142183
[LightGBM] [Info] Start training from score -5.875186
[LightGBM] [Info] Start training from score -2.552770
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         

# NUKLEOTIDNI PONOVCI

In [8]:
# Load the repeat catalogue plus the train/test sequences.
# Listings and the repeat vocabulary are sorted: os.listdir() returns names in
# arbitrary order and iterating a set of strings is not stable between
# interpreter runs, so unsorted data would shuffle samples and feature columns.

# Repeat catalogue: comma-separated lines, field 0 = label, field 1 = repeat.
folder = "../proba3/DC"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")  # ignore dot-files
]

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:  # lines without a second field carry no repeat
                labels.append(parts[0])
                repeats.append(parts[1])

# Training sequences: every record of a FASTA file is labelled with the file
# name minus its ".fasta" suffix.
folder = "../trainSetNucl"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTrain = []
sequencesTrain = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(label)

# Test sequences, read exactly like the training ones.
folder = "../testSetNucl"
files = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if not name.startswith(".")
]

labelTest = []
sequencesTest = []
for file in files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(label)

# Sorted, de-duplicated repeats give a reproducible feature-column order.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count the occurrences of every repeat in ``sequence``.

    Args:
        sequence: Object exposing ``count(sub)``; a ``str`` works, and the
            notebook passes the ``record.seq`` values read from FASTA files.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        For ``str`` inputs ``count`` tallies non-overlapping occurrences.
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one row per sequence, one column per known repeat.
X_train = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTrain]
X_test = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTest]
y_train, y_test = labelTrain, labelTest

# Original author's note: LightGBM needs numeric labels too, so the class
# names are encoded as integers. The encoder is fitted on the training labels
# only and then applied to the test labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,  # candidate for tuning during optimisation
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the integer predictions back to the original class names.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: " + str(accuracy))

Number of unique repeats: 4335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4346
[LightGBM] [Info] Number of data points in the train set: 3903, number of used features: 1903
[LightGBM] [Info] Start training from score -1.445127
[LightGBM] [Info] Start training from score -6.660063
[LightGBM] [Info] Start training from score -4.743140
[LightGBM] [Info] Start training from score -1.496420
[LightGBM] [Info] Start training from score -2.042964
[LightGBM] [Info] Start training from score -1.140203
[LightGBM] [Info] Start training from score -5.871605
[LightGBM] [Info] Start training from score -2.549189
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00       

In [9]:
# Load the inputs for this experiment:
#   1. the repeat catalogue (comma-separated lines, label first, repeat second),
#   2. the training FASTA files,
#   3. the test FASTA files.
# Every FASTA record is labelled with the name of the file it was read from.

# --- 1. Repeat catalogue ---
folder = "../proba3/DN"
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# run reproducible; names starting with "." are skipped.
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Lines with fewer than two comma-separated fields are ignored.
            if len(parts) >= 2:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- 2. Training sequences ---
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []
for file in files:
    # Class name = file name without directory and without ".fasta".
    # basename() keeps this independent of how the path was assembled.
    class_name = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# --- 3. Test sequences ---
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []
for file in files:
    class_name = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Deduplicate the repeats and sort them: iterating a set of strings can yield
# a different order in every Python process (hash randomisation), which would
# silently reorder the feature columns between kernel restarts.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count the occurrences of every repeat in ``sequence``.

    Args:
        sequence: Object exposing ``count(sub)``; a ``str`` works, and the
            notebook passes the ``record.seq`` values read from FASTA files.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        For ``str`` inputs ``count`` tallies non-overlapping occurrences.
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one row per sequence, one column per known repeat.
X_train = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTrain]
X_test = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTest]
y_train, y_test = labelTrain, labelTest

# Original author's note: LightGBM needs numeric labels too, so the class
# names are encoded as integers. The encoder is fitted on the training labels
# only and then applied to the test labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,  # candidate for tuning during optimisation
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the integer predictions back to the original class names.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: " + str(accuracy))

Number of unique repeats: 6309
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.770548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14013
[LightGBM] [Info] Number of data points in the train set: 3903, number of used features: 4616
[LightGBM] [Info] Start training from score -1.445127
[LightGBM] [Info] Start training from score -6.660063
[LightGBM] [Info] Start training from score -4.743140
[LightGBM] [Info] Start training from score -1.496420
[LightGBM] [Info] Start training from score -2.042964
[LightGBM] [Info] Start training from score -1.140203
[LightGBM] [Info] Start training from score -5.871605
[LightGBM] [Info] Start training from score -2.549189
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00      

In [10]:
# Load the inputs for this experiment:
#   1. the repeat catalogue (comma-separated lines, label first, repeat second),
#   2. the training FASTA files,
#   3. the test FASTA files.
# Every FASTA record is labelled with the name of the file it was read from.

# --- 1. Repeat catalogue ---
folder = "../proba3/IN"
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# run reproducible; names starting with "." are skipped.
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Lines with fewer than two comma-separated fields are ignored.
            if len(parts) >= 2:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- 2. Training sequences ---
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []
for file in files:
    # Class name = file name without directory and without ".fasta".
    # basename() keeps this independent of how the path was assembled.
    class_name = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# --- 3. Test sequences ---
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []
for file in files:
    class_name = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Deduplicate the repeats and sort them: iterating a set of strings can yield
# a different order in every Python process (hash randomisation), which would
# silently reorder the feature columns between kernel restarts.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count the occurrences of every repeat in ``sequence``.

    Args:
        sequence: Object exposing ``count(sub)``; a ``str`` works, and the
            notebook passes the ``record.seq`` values read from FASTA files.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        For ``str`` inputs ``count`` tallies non-overlapping occurrences.
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one row per sequence, one column per known repeat.
X_train = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTrain]
X_test = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTest]
y_train, y_test = labelTrain, labelTest

# Original author's note: LightGBM needs numeric labels too, so the class
# names are encoded as integers. The encoder is fitted on the training labels
# only and then applied to the test labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,  # candidate for tuning during optimisation
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the integer predictions back to the original class names.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: " + str(accuracy))

Number of unique repeats: 4897
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4914
[LightGBM] [Info] Number of data points in the train set: 3903, number of used features: 2144
[LightGBM] [Info] Start training from score -1.445127
[LightGBM] [Info] Start training from score -6.660063
[LightGBM] [Info] Start training from score -4.743140
[LightGBM] [Info] Start training from score -1.496420
[LightGBM] [Info] Start training from score -2.042964
[LightGBM] [Info] Start training from score -1.140203
[LightGBM] [Info] Start training from score -5.871605
[LightGBM] [Info] Start training from score -2.549189
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       0.33      1.00      0.50       

In [11]:
# Load the inputs for this experiment:
#   1. the repeat catalogue (comma-separated lines, label first, repeat second),
#   2. the training FASTA files,
#   3. the test FASTA files.
# Every FASTA record is labelled with the name of the file it was read from.

# --- 1. Repeat catalogue ---
folder = "../proba3/IC"
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# run reproducible; names starting with "." are skipped.
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labels = []
repeats = []
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            # Lines with fewer than two comma-separated fields are ignored.
            if len(parts) >= 2:
                labels.append(parts[0])
                repeats.append(parts[1])

# --- 2. Training sequences ---
folder = "../trainSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTrain = []
sequencesTrain = []
for file in files:
    # Class name = file name without directory and without ".fasta".
    # basename() keeps this independent of how the path was assembled.
    class_name = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTrain.append(record.seq)
        labelTrain.append(class_name)

# --- 3. Test sequences ---
folder = "../testSetNucl"
files = sorted(
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if not name.startswith(".")
)

labelTest = []
sequencesTest = []
for file in files:
    class_name = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        sequencesTest.append(record.seq)
        labelTest.append(class_name)

# Deduplicate the repeats and sort them: iterating a set of strings can yield
# a different order in every Python process (hash randomisation), which would
# silently reorder the feature columns between kernel restarts.
known_repeats = sorted(set(repeats))
print(f"Number of unique repeats: {len(known_repeats)}")

def extract_repeat_counts(sequence, repeat_list):
    """Count the occurrences of every repeat in ``sequence``.

    Args:
        sequence: Object exposing ``count(sub)``; a ``str`` works, and the
            notebook passes the ``record.seq`` values read from FASTA files.
        repeat_list: Iterable of repeat motifs to look for.

    Returns:
        list: One count per motif, in the same order as ``repeat_list``.
        For ``str`` inputs ``count`` tallies non-overlapping occurrences.
    """
    counts = []
    for motif in repeat_list:
        counts.append(sequence.count(motif))
    return counts

# Feature matrices: one row per sequence, one column per known repeat.
X_train = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTrain]
X_test = [extract_repeat_counts(genome, known_repeats) for genome in sequencesTest]
y_train, y_test = labelTrain, labelTest

# Original author's note: LightGBM needs numeric labels too, so the class
# names are encoded as integers. The encoder is fitted on the training labels
# only and then applied to the test labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,  # candidate for tuning during optimisation
    "random_state": 42,
}
model = lgb.LGBMClassifier(**lgbm_params)

model.fit(X_train, y_train_enc)
y_pred_enc = model.predict(X_test)
# Map the integer predictions back to the original class names.
y_pred = encoder.inverse_transform(y_pred_enc)

print("Classification report: ")
report = classification_report(y_test, y_pred)
print(report)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: " + str(accuracy))

Number of unique repeats: 5163
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.159290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5465
[LightGBM] [Info] Number of data points in the train set: 3903, number of used features: 2402
[LightGBM] [Info] Start training from score -1.445127
[LightGBM] [Info] Start training from score -6.660063
[LightGBM] [Info] Start training from score -4.743140
[LightGBM] [Info] Start training from score -1.496420
[LightGBM] [Info] Start training from score -2.042964
[LightGBM] [Info] Start training from score -1.140203
[LightGBM] [Info] Start training from score -5.871605
[LightGBM] [Info] Start training from score -2.549189
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       0.60      1.00      0.75       