In [1]:
import os
import csv

import sys
import random
import numpy as np

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [2]:
def createFolder(folder):
    """Ensure that `folder` exists and report what happened.

    If the path is already present a notice is printed and nothing else is
    done; otherwise the directory (and any missing parents) is created with
    `os.makedirs` and a confirmation is printed.
    """
    if os.path.exists(folder):
        print(f"Folder '{folder}' already exists.")
        return

    os.makedirs(folder)
    print(f"Folder '{folder}' created.")

In [3]:
# Make sure both output directories (train split, then test split) exist
# before any FASTA file is written into them.
for outputFolder in ("../trainSetNucl/", "../testSetNucl/"):
    createFolder(outputFolder)

Folder '../trainSetNucl/' created.
Folder '../testSetNucl/' created.


In [4]:
def shuffle(disprotIds, seed):
    """Shuffle `disprotIds` in place, deterministically for a given `seed`.

    A private `random.Random(seed)` generator is used instead of reseeding
    the module-level generator, so calling this function does not disturb
    the random state used by any other code. For a given seed the resulting
    order is the same as `random.seed(seed); random.shuffle(disprotIds)`.

    Parameters:
        disprotIds: mutable sequence (e.g. a list) to shuffle in place.
        seed: value used to seed the private generator.

    Returns:
        The same `disprotIds` object, now shuffled.
    """
    rng = random.Random(seed)
    rng.shuffle(disprotIds)
    return disprotIds

In [5]:
def splitData(disprotIds, trainPercent=70):
    """Split `disprotIds` into a leading train part and a trailing test part.

    The train size is floor(trainPercent * len(disprotIds) / 100); the first
    `trainSize` items form the train part and the remainder the test part.
    The input is not shuffled here, so callers that want a random split must
    shuffle beforehand.

    Parameters:
        disprotIds: sliceable sequence of ids.
        trainPercent: percentage (0-100) of items assigned to the train
            part. Defaults to 70.

    Returns:
        Tuple `(train, test)` of slices of `disprotIds`.

    Raises:
        ValueError: if `trainPercent` is outside the range 0-100.
    """
    if not 0 <= trainPercent <= 100:
        raise ValueError(
            f"trainPercent must be between 0 and 100, got {trainPercent!r}"
        )

    # Floor division keeps the computation exact for integer percentages;
    # int() normalises the result when trainPercent is a float.
    trainSize = int(trainPercent * len(disprotIds) // 100)
    print("Velicina trening skupa je: ", trainSize)

    train = disprotIds[:trainSize]
    test = disprotIds[trainSize:]
    return train, test

In [6]:
def dumpToFasta(testFile, trainFile, fastaTest, fastaTrain):
    """Write the test and train records to their FASTA files.

    `fastaTest` is written to `testFile` first, then `fastaTrain` to
    `trainFile`. Each file is opened in write mode (existing content is
    overwritten) and the records are emitted one at a time in iteration
    order through `SeqIO.write(..., "fasta")`.
    """
    outputs = (
        (testFile, fastaTest),
        (trainFile, fastaTrain),
    )
    for targetPath, fastaRecords in outputs:
        with open(targetPath, "w") as handle:
            for fastaRecord in fastaRecords:
                SeqIO.write(fastaRecord, handle, "fasta")

In [7]:
def processFile(records):
    """Collect the sequences and ids of all `records`.

    Iterates `records` once, reading `.seq` and `.id` from every item,
    prints the number of sequences and the number of ids collected, and
    returns the two lists as `(sequences, disprotIds)` in input order.
    """
    seqIdPairs = [(entry.seq, entry.id) for entry in records]

    sequences = [sequence for sequence, _ in seqIdPairs]
    disprotIds = [entryId for _, entryId in seqIdPairs]

    for collected in (sequences, disprotIds):
        print(len(collected))

    return sequences, disprotIds

In [8]:
def openFile(filename, seed):
    """Split one FASTA file into a test FASTA file and a train FASTA file.

    The record ids of `filename` are collected with `processFile`, shuffled
    with `shuffle(..., seed)` and divided by `splitData`. Every record whose
    id landed in the test part goes to the test output; every other record
    goes to the train output. The output paths are derived from `filename`
    by replacing "nuclFastaFiles" with "testSetNucl" and "trainSetNucl"
    respectively, and the files are written by `dumpToFasta`.

    Parameters:
        filename: path of the input FASTA file.
        seed: seed passed to `shuffle` to make the split reproducible.

    Returns:
        None. The number of test records and then the number of train
        records are printed.
    """
    # Parse once and keep the records so the file does not have to be read
    # a second time when routing records to the two outputs.
    records = list(SeqIO.parse(filename, "fasta"))

    _, disprotIds = processFile(records)
    shuffle(disprotIds, seed)  # reorders disprotIds in place

    _, test = splitData(disprotIds)

    # Set membership keeps the routing loop linear in the number of records;
    # testing against the list would be quadratic on large files.
    testIds = set(test)

    fastaTest = []
    fastaTrain = []
    for record in records:
        # Records are routed by id: anything not selected for test is train.
        if record.id in testIds:
            fastaTest.append(record)
        else:
            fastaTrain.append(record)

    print(len(fastaTest))
    print(len(fastaTrain))
    dumpToFasta(
        filename.replace("nuclFastaFiles", "testSetNucl"),
        filename.replace("nuclFastaFiles", "trainSetNucl"),
        fastaTest,
        fastaTrain,
    )

In [9]:
# One seed per input file; the list is cycled if there are more files than
# seeds, so the loop cannot run off the end of the list.
seed = [10, 15, 38, 42, 16, 52, 4, 78, 23, 0] * 2
folder = "../nuclFastaFiles/"

# os.listdir returns entries in arbitrary order, so the listing is sorted to
# pair every file with the same seed on every run and every machine.
# NOTE(review): splits produced by an earlier run that relied on the raw
# listdir order may differ from the ones generated with this ordering.
fastaFiles = sorted(
    name for name in os.listdir(folder) if not name.startswith(".")
)

i = 0
for file in fastaFiles:
    path = folder + file
    openFile(path, seed[i % len(seed)])
    i += 1

23
23
Velicina trening skupa je:  16
7
16
90
90
Velicina trening skupa je:  63
27
63
438
438
Velicina trening skupa je:  306
132
306
49
49
Velicina trening skupa je:  34
15
34
10498
10498
Velicina trening skupa je:  7348
3150
7348
24
24
Velicina trening skupa je:  16
8
16
724
724
Velicina trening skupa je:  506
218
506
1784
1784
Velicina trening skupa je:  1248
536
1248
1315
1315
Velicina trening skupa je:  920
395
920
17
17
Velicina trening skupa je:  11
6
11
507
507
Velicina trening skupa je:  354
153
354
710
710
Velicina trening skupa je:  497
213
497
7
7
Velicina trening skupa je:  4
3
4
8
8
Velicina trening skupa je:  5
3
5
42
42
Velicina trening skupa je:  29
13
29
4531
4531
Velicina trening skupa je:  3171
1360
3171
1249
1249
Velicina trening skupa je:  874
375
874
325
325
Velicina trening skupa je:  227
98
227
20
20
Velicina trening skupa je:  14
6
14
1219
1219
Velicina trening skupa je:  853
366
853
