In [1]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

import os
import pandas as pd

from collections import Counter
from sklearn.model_selection import train_test_split

In [4]:
!pip3 install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 3.4 MB/s eta 0:00:03
   ---- ----------------------------------- 1.0/8.7 MB 2.8 MB/s eta 0:00:03
   -------- ------------------------------- 1.8/8.7 MB 3.2 MB/s eta 0:00:03
   ------------- -------------------------- 2.9/8.7 MB 3.6 MB/s eta 0:00:02
   --------------- ------------------------ 3.4/8.7 MB 3.4 MB/s eta 0:00:02
   ------------------ --------------------- 


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
from Bio import SeqIO
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, accuracy_score

# --- CONFIG ---
TRAIN_FOLDER = "../trainSetNucl"
TEST_FOLDER = "../testSetNucl"
REPEAT_FOLDER = "../proba3/DC"

TRAIN_VEC_FILE = "X_train.npy"
TRAIN_LABEL_FILE = "y_train.npy"

# --- STEP 1: Load known repeats ---
knownRepeats = []

repeat_files = [os.path.join(REPEAT_FOLDER, f) for f in os.listdir(REPEAT_FOLDER) if not f.startswith(".")]
for file in repeat_files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                knownRepeats.append(parts[1])

knownRepeats = list(set(knownRepeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

# --- Function ---
def extractRepeatCounts(sequence, repeat_list):
    return np.array([sequence.count(r) for r in repeat_list], dtype=np.float32)

# --- STEP 2: Precompute train vectors if needed ---
if not (os.path.exists(TRAIN_VEC_FILE) and os.path.exists(TRAIN_LABEL_FILE)):
    print("Precomputing train vectors...")
    train_files = [os.path.join(TRAIN_FOLDER, f) for f in os.listdir(TRAIN_FOLDER) if not f.startswith(".")]

    X_train = []
    y_train = []

    for file in train_files:
        label = os.path.basename(file).replace(".fasta", "")
        records = SeqIO.parse(file, "fasta")
        for record in records:
            vec = extractRepeatCounts(record.seq, knownRepeats)
            X_train.append(vec)
            y_train.append(label)

    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train)

    np.save(TRAIN_VEC_FILE, X_train)
    np.save(TRAIN_LABEL_FILE, y_train)

    print(f"Train vectors saved: {X_train.shape[0]} samples, {X_train.shape[1]} features")

else:
    print("Train vectors already exist. Loading...")

# --- STEP 3: Load train vectors & build index ---
X_train = np.load(TRAIN_VEC_FILE)
y_train = np.load(TRAIN_LABEL_FILE)

print("Building NearestNeighbors index...")
nn = NearestNeighbors(n_neighbors=1, algorithm='auto', n_jobs=-1)
nn.fit(X_train)
print("Index ready!")

# --- STEP 4: Stream test data & classify ---
test_files = [os.path.join(TEST_FOLDER, f) for f in os.listdir(TEST_FOLDER) if not f.startswith(".")]

y_true = []
y_pred = []

print("Classifying test sequences...")
for test_file in test_files:
    print(f"True: {test_file}")
    test_label = os.path.basename(test_file).replace(".fasta", "")
    records = SeqIO.parse(test_file, "fasta")

    for record in records:
        vec = extractRepeatCounts(record.seq, knownRepeats).reshape(1, -1)
        dist, idx = nn.kneighbors(vec)
        predicted_label = y_train[idx[0][0]]

        y_true.append(test_label)
        y_pred.append(predicted_label)

# --- STEP 5: Report ---
print("\nClassification report:")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


Number of unique repeats: 9507
Precomputing train vectors...
Train vectors saved: 16496 samples, 9507 features
Building NearestNeighbors index...
Index ready!
Classifying test sequences...
True: ../testSetNucl\bat-CoV.fasta
True: ../testSetNucl\bovine-CoV.fasta
True: ../testSetNucl\canine-CoV.fasta
True: ../testSetNucl\dolphin-CoV.fasta
True: ../testSetNucl\equine-CoV.fasta
True: ../testSetNucl\feline-CoV.fasta
True: ../testSetNucl\ferret-CoV.fasta
True: ../testSetNucl\HCoV-229E.fasta
True: ../testSetNucl\HCoV-HKU1.fasta
True: ../testSetNucl\HCoV-NL63.fasta
True: ../testSetNucl\HCoV-OC43.fasta
True: ../testSetNucl\hedgehog-CoV.fasta
True: ../testSetNucl\IBV.fasta
True: ../testSetNucl\MERS-CoV.fasta
True: ../testSetNucl\porcine-CoV.fasta
True: ../testSetNucl\rabbit-CoV.fasta
True: ../testSetNucl\rat-CoV.fasta
True: ../testSetNucl\SARS-CoV.fasta
True: ../testSetNucl\SARS-CoV2.fasta
True: ../testSetNucl\turkey-CoV.fasta

Classification report:
              precision    recall  f1-score  

In [3]:
import os
from Bio import SeqIO
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, accuracy_score

# --- CONFIG ---
TRAIN_FOLDER = "../trainSetNucl"
TEST_FOLDER = "../testSetNucl"
REPEAT_FOLDER = "../proba3/IC"

TRAIN_VEC_FILE = "X_train.npy"
TRAIN_LABEL_FILE = "y_train.npy"

# --- STEP 1: Load known repeats ---
knownRepeats = []

repeat_files = [os.path.join(REPEAT_FOLDER, f) for f in os.listdir(REPEAT_FOLDER) if not f.startswith(".")]
for file in repeat_files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                knownRepeats.append(parts[1])

knownRepeats = list(set(knownRepeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

# --- Function ---
def extractRepeatCounts(sequence, repeat_list):
    return np.array([sequence.count(r) for r in repeat_list], dtype=np.float32)

# --- STEP 2: Precompute train vectors if needed ---
print("Precomputing train vectors...")
train_files = [os.path.join(TRAIN_FOLDER, f) for f in os.listdir(TRAIN_FOLDER) if not f.startswith(".")]

X_train = []
y_train = []

for file in train_files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        vec = extractRepeatCounts(record.seq, knownRepeats)
        X_train.append(vec)
        y_train.append(label)

X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train)

np.save(TRAIN_VEC_FILE, X_train)
np.save(TRAIN_LABEL_FILE, y_train)

print(f"Train vectors saved: {X_train.shape[0]} samples, {X_train.shape[1]} features")

# --- STEP 3: Load train vectors & build index ---
X_train = np.load(TRAIN_VEC_FILE)
y_train = np.load(TRAIN_LABEL_FILE)

print("Building NearestNeighbors index...")
nn = NearestNeighbors(n_neighbors=1, algorithm='auto', n_jobs=-1)
nn.fit(X_train)
print("Index ready!")

# --- STEP 4: Stream test data & classify ---
test_files = [os.path.join(TEST_FOLDER, f) for f in os.listdir(TEST_FOLDER) if not f.startswith(".")]

y_true = []
y_pred = []

print("Classifying test sequences...")
for test_file in test_files:
    print(f"True: {test_file}")
    test_label = os.path.basename(test_file).replace(".fasta", "")
    records = SeqIO.parse(test_file, "fasta")

    for record in records:
        vec = extractRepeatCounts(record.seq, knownRepeats).reshape(1, -1)
        dist, idx = nn.kneighbors(vec)
        predicted_label = y_train[idx[0][0]]

        y_true.append(test_label)
        y_pred.append(predicted_label)

# --- STEP 5: Report ---
print("\nClassification report:")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


Number of unique repeats: 11652
Precomputing train vectors...
Train vectors saved: 16496 samples, 11652 features
Building NearestNeighbors index...
Index ready!
Classifying test sequences...
True: ../testSetNucl\bat-CoV.fasta
True: ../testSetNucl\bovine-CoV.fasta
True: ../testSetNucl\canine-CoV.fasta
True: ../testSetNucl\dolphin-CoV.fasta
True: ../testSetNucl\equine-CoV.fasta
True: ../testSetNucl\feline-CoV.fasta
True: ../testSetNucl\ferret-CoV.fasta
True: ../testSetNucl\HCoV-229E.fasta
True: ../testSetNucl\HCoV-HKU1.fasta
True: ../testSetNucl\HCoV-NL63.fasta
True: ../testSetNucl\HCoV-OC43.fasta
True: ../testSetNucl\hedgehog-CoV.fasta
True: ../testSetNucl\IBV.fasta
True: ../testSetNucl\MERS-CoV.fasta
True: ../testSetNucl\porcine-CoV.fasta
True: ../testSetNucl\rabbit-CoV.fasta
True: ../testSetNucl\rat-CoV.fasta
True: ../testSetNucl\SARS-CoV.fasta
True: ../testSetNucl\SARS-CoV2.fasta
True: ../testSetNucl\turkey-CoV.fasta

Classification report:
              precision    recall  f1-score

In [2]:
import os
from Bio import SeqIO
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, accuracy_score

# --- CONFIG ---
TRAIN_FOLDER = "../trainSetNucl"
TEST_FOLDER = "../testSetNucl"
REPEAT_FOLDER = "../proba3/IN"

TRAIN_VEC_FILE = "X_train.npy"
TRAIN_LABEL_FILE = "y_train.npy"

# --- STEP 1: Load known repeats ---
knownRepeats = []

repeat_files = [os.path.join(REPEAT_FOLDER, f) for f in os.listdir(REPEAT_FOLDER) if not f.startswith(".")]
for file in repeat_files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                knownRepeats.append(parts[1])

knownRepeats = list(set(knownRepeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

# --- Function ---
def extractRepeatCounts(sequence, repeat_list):
    return np.array([sequence.count(r) for r in repeat_list], dtype=np.float32)

# --- STEP 2: Precompute train vectors if needed ---
print("Precomputing train vectors...")
train_files = [os.path.join(TRAIN_FOLDER, f) for f in os.listdir(TRAIN_FOLDER) if not f.startswith(".")]

X_train = []
y_train = []

for file in train_files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        vec = extractRepeatCounts(record.seq, knownRepeats)
        X_train.append(vec)
        y_train.append(label)

X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train)

np.save(TRAIN_VEC_FILE, X_train)
np.save(TRAIN_LABEL_FILE, y_train)

print(f"Train vectors saved: {X_train.shape[0]} samples, {X_train.shape[1]} features")

# --- STEP 3: Load train vectors & build index ---
X_train = np.load(TRAIN_VEC_FILE)
y_train = np.load(TRAIN_LABEL_FILE)

print("Building NearestNeighbors index...")
nn = NearestNeighbors(n_neighbors=1, algorithm='auto', n_jobs=-1)
nn.fit(X_train)
print("Index ready!")

# --- STEP 4: Stream test data & classify ---
test_files = [os.path.join(TEST_FOLDER, f) for f in os.listdir(TEST_FOLDER) if not f.startswith(".")]

y_true = []
y_pred = []

print("Classifying test sequences...")
for test_file in test_files:
    print(f"True: {test_file}")
    test_label = os.path.basename(test_file).replace(".fasta", "")
    records = SeqIO.parse(test_file, "fasta")

    for record in records:
        vec = extractRepeatCounts(record.seq, knownRepeats).reshape(1, -1)
        dist, idx = nn.kneighbors(vec)
        predicted_label = y_train[idx[0][0]]

        y_true.append(test_label)
        y_pred.append(predicted_label)

# --- STEP 5: Report ---
print("\nClassification report:")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


Number of unique repeats: 11837
Precomputing train vectors...
Train vectors saved: 16496 samples, 11837 features
Building NearestNeighbors index...
Index ready!
Classifying test sequences...
True: ../testSetNucl\bat-CoV.fasta
True: ../testSetNucl\bovine-CoV.fasta
True: ../testSetNucl\canine-CoV.fasta
True: ../testSetNucl\dolphin-CoV.fasta
True: ../testSetNucl\equine-CoV.fasta
True: ../testSetNucl\feline-CoV.fasta
True: ../testSetNucl\ferret-CoV.fasta
True: ../testSetNucl\HCoV-229E.fasta
True: ../testSetNucl\HCoV-HKU1.fasta
True: ../testSetNucl\HCoV-NL63.fasta
True: ../testSetNucl\HCoV-OC43.fasta
True: ../testSetNucl\hedgehog-CoV.fasta
True: ../testSetNucl\IBV.fasta
True: ../testSetNucl\MERS-CoV.fasta
True: ../testSetNucl\porcine-CoV.fasta
True: ../testSetNucl\rabbit-CoV.fasta
True: ../testSetNucl\rat-CoV.fasta
True: ../testSetNucl\SARS-CoV.fasta
True: ../testSetNucl\SARS-CoV2.fasta
True: ../testSetNucl\turkey-CoV.fasta

Classification report:
              precision    recall  f1-score

In [1]:
import os
from Bio import SeqIO
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, accuracy_score

# --- CONFIG ---
TRAIN_FOLDER = "../trainSetNucl"
TEST_FOLDER = "../testSetNucl"
REPEAT_FOLDER = "../proba3/DN"

TRAIN_VEC_FILE = "X_train.npy"
TRAIN_LABEL_FILE = "y_train.npy"

# --- STEP 1: Load known repeats ---
knownRepeats = []

repeat_files = [os.path.join(REPEAT_FOLDER, f) for f in os.listdir(REPEAT_FOLDER) if not f.startswith(".")]
for file in repeat_files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                knownRepeats.append(parts[1])

knownRepeats = list(set(knownRepeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

# --- Function ---
def extractRepeatCounts(sequence, repeat_list):
    return np.array([sequence.count(r) for r in repeat_list], dtype=np.float32)

# --- STEP 2: Precompute train vectors if needed ---
print("Precomputing train vectors...")
train_files = [os.path.join(TRAIN_FOLDER, f) for f in os.listdir(TRAIN_FOLDER) if not f.startswith(".")]

X_train = []
y_train = []

for file in train_files:
    label = os.path.basename(file).replace(".fasta", "")
    for record in SeqIO.parse(file, "fasta"):
        vec = extractRepeatCounts(record.seq, knownRepeats)
        X_train.append(vec)
        y_train.append(label)

X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train)

np.save(TRAIN_VEC_FILE, X_train)
np.save(TRAIN_LABEL_FILE, y_train)

print(f"Train vectors saved: {X_train.shape[0]} samples, {X_train.shape[1]} features")

# --- STEP 3: Load train vectors & build index ---
X_train = np.load(TRAIN_VEC_FILE)
y_train = np.load(TRAIN_LABEL_FILE)

print("Building NearestNeighbors index...")
nn = NearestNeighbors(n_neighbors=1, algorithm='auto', n_jobs=-1)
nn.fit(X_train)
print("Index ready!")

# --- STEP 4: Stream test data & classify ---
test_files = [os.path.join(TEST_FOLDER, f) for f in os.listdir(TEST_FOLDER) if not f.startswith(".")]

y_true = []
y_pred = []

print("Classifying test sequences...")
for test_file in test_files:
    print(f"True: {test_file}")
    test_label = os.path.basename(test_file).replace(".fasta", "")
    records = SeqIO.parse(test_file, "fasta")

    for record in records:
        vec = extractRepeatCounts(record.seq, knownRepeats).reshape(1, -1)
        dist, idx = nn.kneighbors(vec)
        predicted_label = y_train[idx[0][0]]

        y_true.append(test_label)
        y_pred.append(predicted_label)

# --- STEP 5: Report ---
print("\nClassification report:")
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))


Number of unique repeats: 14226
Precomputing train vectors...
Train vectors saved: 16496 samples, 14226 features
Building NearestNeighbors index...
Index ready!
Classifying test sequences...
True: ../testSetNucl\bat-CoV.fasta
True: ../testSetNucl\bovine-CoV.fasta
True: ../testSetNucl\canine-CoV.fasta
True: ../testSetNucl\dolphin-CoV.fasta
True: ../testSetNucl\equine-CoV.fasta
True: ../testSetNucl\feline-CoV.fasta
True: ../testSetNucl\ferret-CoV.fasta
True: ../testSetNucl\HCoV-229E.fasta
True: ../testSetNucl\HCoV-HKU1.fasta
True: ../testSetNucl\HCoV-NL63.fasta
True: ../testSetNucl\HCoV-OC43.fasta
True: ../testSetNucl\hedgehog-CoV.fasta
True: ../testSetNucl\IBV.fasta
True: ../testSetNucl\MERS-CoV.fasta
True: ../testSetNucl\porcine-CoV.fasta
True: ../testSetNucl\rabbit-CoV.fasta
True: ../testSetNucl\rat-CoV.fasta
True: ../testSetNucl\SARS-CoV.fasta
True: ../testSetNucl\SARS-CoV2.fasta
True: ../testSetNucl\turkey-CoV.fasta

Classification report:
              precision    recall  f1-score

# FINALNO SA SVIM PODACIMA AA

In [None]:
files = []
folder = "../proba2/direct"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

repeats.clear()

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

X_test.clear()
X_train.clear()

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 2210


In [2]:
def extractRepeatsAndLabels(files):
    labels = []
    repeats = []

    # Read data from each file
    for file in files:
        with open(file, "r") as f:
            for line in f:
                parts = line.strip().split(",")
                if len(parts) >= 2:
                    label = parts[0]
                    repeat = parts[1]
                    labels.append(label)
                    repeats.append(repeat)

    return repeats, labels

In [3]:
def extractRepeatsAndLabelsTrain(files):
    labelTrain = []
    sequencesTrain = []

    # Read data from each file
    for file in files:
        records = SeqIO.parse(file, "fasta")
        for record in records:
            sequencesTrain.append(record.seq)
            labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))
    return sequencesTrain, labelTrain

In [4]:
def extractRepeatsAndLabelsTest(files):
    labelTrain = []
    sequencesTrain = []

    # Read data from each file
    for file in files:
        records = SeqIO.parse(file, "fasta")
        for record in records:
            sequencesTrain.append(record.seq)
            labelTrain.append(file.replace("../testSet/", "").replace(".fasta", ""))
    return sequencesTrain, labelTrain

In [6]:
def extractRepeatCounts(sequence, repeatList):
    return [sequence.count(r) for r in repeatList]

In [8]:
def testModel(repeatFolder, trainFolder, testModel):
    print(repeatFolder)
    files = []

    for file in os.listdir(repeatFolder):
        if file.startswith("."):
            continue
        files.append(repeatFolder + "/" + file)
    repeats, labels = extractRepeatsAndLabels(files)

    files.clear()

    for file in os.listdir(trainFolder):
        if file.startswith("."):
            continue
        files.append(trainFolder + "/" + file)
    sequencesTrain, labelTrain = extractRepeatsAndLabelsTrain(files)

    files.clear()

    for file in os.listdir(testFolder):
        if file.startswith("."):
            continue
        files.append(testFolder + "/" + file)
    sequencesTest, labelTest = extractRepeatsAndLabelsTest(files)

    knownRepeats = list(set(repeats))
    print(f"Number of unique repeats: {len(knownRepeats)}")


    X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
    y_train = labelTrain

    X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
    y_test = labelTest

    X_train_np = np.array(X_train)
    X_test_np = np.array(X_test)

    predictions = []

    for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
        dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
        idx = np.argmin(dists)
    # Predict the label of the closest training protein
        predictions.append(y_train[idx])

    print("Classification report: ")
    print(classification_report(y_test, predictions))

    print("Accuracy score test: " + str(accuracy_score(y_test, predictions)))
    print("------------------------------------------------------")

In [10]:
repeatFolder = ["../proba2/direct", "../proba2/indirect"]
trainFolder = "../trainSet/"
testFolder = "../testSet/"

for folder in repeatFolder:
    testModel(folder, trainFolder, testFolder)

../proba2/direct
Number of unique repeats: 798
Classification report: 
               precision    recall  f1-score   support

    /MERS-CoV       1.00      0.98      0.99       395
    /SARS-CoV       1.00      1.00      1.00         3
     /bat-CoV       0.94      0.94      0.94        16
  /bovine-CoV       1.00      0.99      1.00       378
  /canine-CoV       0.90      0.99      0.94       219
  /feline-CoV       0.99      0.97      0.98       536
/hedgehog-CoV       1.00      1.00      1.00         6
 /porcine-CoV       1.00      1.00      1.00       132

     accuracy                           0.98      1685
    macro avg       0.98      0.98      0.98      1685
 weighted avg       0.98      0.98      0.98      1685

Accuracy score test: 0.9821958456973294
------------------------------------------------------
../proba2/indirect
Number of unique repeats: 1007
Classification report: 
               precision    recall  f1-score   support

    /MERS-CoV       1.00      0.99      0

# SA NEPOZNATIM PONOVCIMA

In [3]:
files = []
folder = "../proba2/direct"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 798
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.98      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       0.94      0.94      0.94        16
  bovine-CoV       1.00      0.99      1.00       378
  canine-CoV       0.90      0.99      0.94       219
  feline-CoV       0.99      0.97      0.98       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.98      1685
   macro avg       0.98      0.98      0.98      1685
weighted avg       0.98      0.98      0.98      1685

Accuracy score: 0.9821958456973294


In [4]:
files = []
folder = "../proba2/indirect"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 1007
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       0.94      0.94      0.94        16
  bovine-CoV       1.00      0.99      1.00       378
  canine-CoV       0.63      0.99      0.77       219
  feline-CoV       0.99      0.76      0.86       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.92      1685
   macro avg       0.94      0.96      0.94      1685
weighted avg       0.95      0.92      0.92      1685

Accuracy score: 0.9192878338278931


# BEZ NEPOZNATIH PONOVAKA

In [4]:
files = []
folder = "../proba2/direct"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
knownRepeats = [repeat for repeat in knownRepeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 456
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      1.00      1.00       378
  canine-CoV       0.90      0.99      0.95       219
  feline-CoV       1.00      0.97      0.98       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1685
   macro avg       0.99      0.99      0.99      1685
weighted avg       0.99      0.99      0.99      1685

Accuracy score: 0.9851632047477745


In [6]:
files = []
folder = "../proba2/indirect"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSet/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSet"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSet/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
knownRepeats = [repeat for repeat in knownRepeats if not repeat.startswith("X")]
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 660
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.94      0.97        16
  bovine-CoV       1.00      1.00      1.00       378
  canine-CoV       0.63      0.99      0.77       219
  feline-CoV       1.00      0.77      0.87       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.92      1685
   macro avg       0.95      0.96      0.95      1685
weighted avg       0.95      0.92      0.93      1685

Accuracy score: 0.9222551928783382


In [2]:
files = []
folder = "../proba3/DC"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 4335
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.98      0.97      0.98       218
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9922619047619048


In [3]:
files = []
folder = "../proba3/DN"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 6309
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      1.00      1.00       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.97      0.98      0.97       218
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       0.99      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9922619047619048


In [4]:
files = []
folder = "../proba3/IN"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 4897
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       1.00      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.98      0.98      0.98       218
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       1.00      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9922619047619048


In [5]:
files = []
folder = "../proba3/IC"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labels = []
repeats = []

# Read data from each file
for file in files:
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) >= 2:
                label = parts[0]
                repeat = parts[1]
                labels.append(label)
                repeats.append(repeat)

files = []
folder = "../trainSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)

labelTrain = []
sequencesTrain = []

# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTrain.append(record.seq)
        labelTrain.append(file.replace("../trainSetNucl/", "").replace(".fasta", ""))

labelTest = []
sequencesTest = []

files = []
folder = "../testSetNucl"

for file in os.listdir(folder):
    if file.startswith("."):
        continue
    files.append(folder + "/" + file)


# Read data from each file
for file in files:
    records = SeqIO.parse(file, "fasta")
    for record in records:
        sequencesTest.append(record.seq)
        labelTest.append(file.replace("../testSetNucl/", "").replace(".fasta", ""))

knownRepeats = list(set(repeats))
print(f"Number of unique repeats: {len(knownRepeats)}")

def extractRepeatCounts(sequence, repeat_list):
    return [sequence.count(r) for r in repeat_list]

X_train = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTrain]
y_train = labelTrain

X_test = [extractRepeatCounts(seq, knownRepeats) for seq in sequencesTest]
y_test = labelTest

X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

predictions = []

for test_vec in X_test_np:
    # Compute Euclidean distance to all training repeats
    dists = np.linalg.norm(X_train_np - test_vec, axis=1)
    # Get index of closest training example
    idx = np.argmin(dists)
    # Predict the label of the closest training protein
    predictions.append(y_train[idx])

print("Classification report: ")
print(classification_report(y_test, predictions))

print("Accuracy score: " + str(accuracy_score(y_test, predictions)))

Number of unique repeats: 5163
Classification report: 
              precision    recall  f1-score   support

    MERS-CoV       0.99      0.99      0.99       395
    SARS-CoV       1.00      1.00      1.00         3
     bat-CoV       1.00      0.93      0.97        15
  bovine-CoV       1.00      1.00      1.00       375
  canine-CoV       0.97      0.97      0.97       218
  feline-CoV       0.99      0.99      0.99       536
hedgehog-CoV       1.00      1.00      1.00         6
 porcine-CoV       1.00      1.00      1.00       132

    accuracy                           0.99      1680
   macro avg       0.99      0.99      0.99      1680
weighted avg       0.99      0.99      0.99      1680

Accuracy score: 0.9904761904761905
