In [1]:
def intersects(start, length, starts, ends):
    """Return True if the window overlaps at least one of the given regions.

    The window is treated as the closed interval ``[start, start + length]``
    and region ``i`` as the closed interval ``[starts[i], ends[i]]``.

    Checking only whether a window endpoint falls inside a region misses the
    case where a region lies strictly inside the window (the window begins
    before the region and finishes after it). The interval-overlap test used
    here covers that case as well as every endpoint case.

    Args:
        start: First coordinate of the window.
        length: Offset added to ``start`` to obtain the window's last coordinate.
        starts: Region start coordinates (inclusive).
        ends: Region end coordinates (inclusive), parallel to ``starts``.

    Returns:
        bool: True when some region shares at least one coordinate with the
        window, False otherwise (including when there are no regions).
    """
    # NOTE(review): the window end is start + length, i.e. one past a
    # length-sized span beginning at `start`; whether that and the coordinate
    # base of `starts`/`ends` are intended is not visible here - confirm.
    end = start + length
    for i in range(len(starts)):
        region_start, region_end = starts[i], ends[i]
        # An inverted (empty) region cannot contain any coordinate.
        if region_start > region_end:
            continue
        # Two closed intervals overlap iff each begins no later than the other ends.
        if start <= region_end and region_start <= end:
            return True
    return False

In [2]:
def isDisorder(start, length, starts, ends):
    """Tell whether the window lies completely inside one of the regions.

    The window runs from ``start`` to ``start + length``; region ``k`` runs
    from ``starts[k]`` to ``ends[k]`` with both bounds inclusive.

    Args:
        start: First coordinate of the window.
        length: Offset added to ``start`` to obtain the window's last coordinate.
        starts: Region start coordinates.
        ends: Region end coordinates, parallel to ``starts``.

    Returns:
        bool: True if a single region contains both window bounds, else False.
    """
    window_end = start + length
    return any(
        start >= region_start and window_end <= ends[k]
        for k, region_start in enumerate(starts)
    )

In [3]:
from typing import List

def update_rolling_hash(current_hash: int, old_char: str, new_char: str, window_size: int, base: int = 31, mod: int = 10**9 + 9) -> int:
    """Slide a polynomial rolling hash one character to the right.

    Removes the contribution of the outgoing left-most character, shifts the
    remaining characters up by one power of ``base`` and appends the incoming
    character.

    Args:
        current_hash: Hash of the current window.
        old_char: Character leaving the window (its left-most character).
        new_char: Character entering the window on the right.
        window_size: Number of characters in the window.
        base: Polynomial base.
        mod: Modulus; expected to be positive.

    Returns:
        int: Hash of the shifted window, in ``[0, mod)`` for a positive ``mod``.
    """
    # Weight carried by the left-most character: base^(window_size - 1) mod `mod`.
    high_order_weight = pow(base, window_size - 1, mod)
    # Python's % takes the sign of the divisor, so with a positive modulus the
    # result is already non-negative even when the subtraction dips below zero;
    # no separate sign correction is required.
    without_old = (current_hash - ord(old_char) * high_order_weight) % mod
    return (without_old * base + ord(new_char)) % mod

In [4]:
def calculate_initial_hash(s: str, window_size: int, base: int = 31, mod: int = 10**9 + 9) -> int:
    """Hash the first ``window_size`` characters of ``s`` as a polynomial in ``base``.

    Characters are folded in from left to right, reducing modulo ``mod`` after
    every step, so the first character ends up with the highest power of
    ``base``.

    Args:
        s: Text to hash.
        window_size: How many leading characters of ``s`` to include.
        base: Polynomial base.
        mod: Modulus applied after each step.

    Returns:
        int: The hash value; 0 when ``window_size`` is not positive.

    Raises:
        IndexError: If ``window_size`` exceeds ``len(s)``.
    """
    acc = 0
    position = 0
    while position < window_size:
        code_point = ord(s[position])
        acc = (acc * base + code_point) % mod
        position += 1
    return acc

In [5]:
from Bio import SeqIO
import numpy as np
import json

def extractSequences(filename, outfile, data, skip_ids=("DP00072",)):
    """Count hashed subsequences of each FASTA record by disorder class and write them out.

    For every record in ``filename`` whose id is not in ``skip_ids``, each
    window of length 3 up to the full sequence length is hashed (initial hash
    for the first window, rolling update for the following ones). Every hash
    maps to a 3-element counter:

    * index 0 - windows for which ``isDisorder`` is true,
    * index 1 - windows for which ``isDisorder`` is false but ``intersects`` is true,
    * index 2 - all remaining windows.

    Args:
        filename: Path of the input FASTA file.
        outfile: Path of the text file to write; one ``hash:counts`` line per entry.
        data: Path of a JSON file containing a list whose j-th item belongs to
            the j-th FASTA record; regions are read from
            ``item['disprot_consensus']['Structural state']`` and only those
            with ``type == "D"`` are used.
        skip_ids: Collection of record ids to leave out (pass a tuple/list/set,
            not a bare string). Defaults to the single id that used to be
            hard-coded.

    Notes:
        * Records and JSON items are paired purely by position.
          NOTE(review): nothing here checks that the ids match - confirm the
          two files are in the same order.
        * Windows of all lengths share one hash table.
        * When the table reaches the flush threshold it is written out and
          cleared, so one hash can appear on several output lines with partial
          counts. NOTE(review): presumably the consumer sums them - confirm.
    """
    print(outfile)
    with open(data) as f:
        annotations = json.load(f)

    # Number of distinct hashes kept in memory before the table is flushed to disk.
    flush_threshold = 7000000
    sequenceMap = {}
    # Context managers close both files, also when an exception is raised.
    with open(filename) as fasta_handle, open(outfile, "w") as out:
        for j, fasta in enumerate(SeqIO.parse(fasta_handle, 'fasta')):
            name, sequence = fasta.id, str(fasta.seq)
            # Looked up before the skip test so the positional pairing is
            # exercised for skipped records too.
            regions = annotations[j]['disprot_consensus']['Structural state']
            if name in skip_ids:
                continue

            disordered = [region for region in regions if region["type"] == "D"]
            starts = [region["start"] for region in disordered]
            ends = [region["end"] for region in disordered]

            seq_len = len(sequence)
            print(name)
            for n in range(3, seq_len + 1):
                current_hash = None
                for i in range(seq_len - n + 1):
                    if i == 0:
                        # First window of this length: hash from scratch.
                        current_hash = calculate_initial_hash(sequence[:n], n)
                    else:
                        # Later windows: drop the outgoing char, append the incoming one.
                        current_hash = update_rolling_hash(current_hash, sequence[i - 1], sequence[i + n - 1], n)

                    counts = sequenceMap.get(current_hash)
                    if counts is None:
                        counts = sequenceMap[current_hash] = np.array([0, 0, 0])

                    if isDisorder(i, n, starts, ends):
                        counts[0] += 1
                    elif intersects(i, n, starts, ends):
                        counts[1] += 1
                    else:
                        counts[2] += 1

                # Checked once per window length to bound memory use.
                if len(sequenceMap) >= flush_threshold:
                    for key, value in sequenceMap.items():
                        out.write(f'{key}:{value}\n')
                    sequenceMap.clear()

        # Write whatever is left after the last record.
        for key, value in sequenceMap.items():
            out.write(f'{key}:{value}\n')

In [6]:
def writeSequencesToFile(inFiles, outFiles, data):
    """Run ``extractSequences`` once for every entry of ``inFiles``.

    The three lists are parallel: position ``k`` of each one supplies the
    FASTA path, the output path and the JSON path of a single run.
    """
    for position, fasta_path in enumerate(inFiles):
        extractSequences(fasta_path, outFiles[position], data[position])

In [None]:
# Already present on the other laptop (presumably the output of this run).
# Fold 3: input FASTA, matching JSON annotations, and the file to write.
inFiles = ["../preprocessedFiles/preprocessed3.fasta"]
data = ["../trainSets/trainSet3.json"]
outFiles = ["../sequences/seq3.txt"]
writeSequencesToFile(inFiles, outFiles, data)

../sequences/seq3.txt
DP00007
DP00013
DP00016
DP00018
DP00023
DP00028
DP00033
DP00040
DP00062
DP00070
DP00073
DP00075


In [7]:
# Fold 4: input FASTA, matching JSON annotations, and the file to write.
inFiles = ["../preprocessedFiles/preprocessed4.fasta"]
data = ["../trainSets/trainSet4.json"]
outFiles = ["../sequences/seq4.txt"]
writeSequencesToFile(inFiles, outFiles, data)

../sequences/seq4.txt
DP00004
DP00007
DP00011
DP00012
DP00013
DP00017
DP00028
DP00033
DP00054
DP00061
DP00062
DP00074
DP00075
DP00078
DP00079
DP00084
DP00085
DP00090
DP00092
DP00099
DP00134
DP00138
DP00141
DP00152
DP00162
DP00166
DP00168
DP00175
DP00177
DP00184
DP00196
DP00214
DP00229
DP00236
DP00240
DP00241
DP00243
DP00260
DP00282
DP00287
DP00297
DP00308
DP00309
DP00315
DP00319
DP00322
DP00323
DP00327
DP00332
DP00333
DP00334
DP00339
DP00356
DP00357
DP00358
DP00378
DP00381
DP00392
DP00418
DP00428
DP00448
DP00457
DP00464
DP00466
DP00468
DP00473
DP00485
DP00486
DP00492
DP00505
DP00506
DP00508
DP00509
DP00510
DP00517
DP00521
DP00537
DP00539
DP00543
DP00546
DP00549
DP00553
DP00555
DP00565
DP00570
DP00576
DP00581
DP00592
DP00607
DP00608
DP00611
DP00616
DP00621
DP00624
DP00628
DP00630
DP00641
DP00666
DP00670
DP00672
DP00678
DP00683
DP00702
DP00704
DP00706
DP00707
DP00712
DP00718
DP00747
DP00765
DP00775
DP00794
DP00796
DP00816
DP00841
DP00865
DP00887
DP00893
DP00901
DP00903
DP00907
DP00910
DP

In [9]:
# Fold 5: input FASTA, matching JSON annotations, and the file to write.
inFiles = ["../preprocessedFiles/preprocessed5.fasta"]
data = ["../trainSets/trainSet5.json"]
outFiles = ["../sequences/seq5.txt"]
writeSequencesToFile(inFiles, outFiles, data)

../sequences/seq5.txt
DP00007
DP00011
DP00012
DP00016
DP00023
DP00030
DP00040
DP00054
DP00062
DP00069
DP00070
DP00075
DP00078
DP00079
DP00090
DP00092
DP00099
DP00119
DP00130
DP00134
DP00138
DP00141
DP00152
DP00156
DP00168
DP00171
DP00184
DP00214
DP00219
DP00223
DP00229
DP00231
DP00238
DP00241
DP00243
DP00260
DP00262
DP00282
DP00298
DP00304
DP00308
DP00310
DP00314
DP00319
DP00320
DP00321
DP00324
DP00327
DP00332
DP00333
DP00334
DP00356
DP00358
DP00363
DP00378
DP00385
DP00420
DP00428
DP00437
DP00445
DP00448
DP00457
DP00468
DP00472
DP00473
DP00486
DP00492
DP00506
DP00508
DP00510
DP00520
DP00521
DP00537
DP00539
DP00543
DP00549
DP00553
DP00554
DP00562
DP00565
DP00608
DP00611
DP00616
DP00630
DP00633
DP00641
DP00670
DP00672
DP00694
DP00702
DP00704
DP00706
DP00716
DP00718
DP00747
DP00748
DP00765
DP00793
DP00794
DP00816
DP00841
DP00864
DP00865
DP00893
DP00901
DP00903
DP00917
DP00930
DP00941
DP00951
DP00953
DP00961
DP00962
DP01024
DP01065
DP01066
DP01068
DP01098
DP01099
DP01100
DP01102
DP01103
DP

In [10]:
# Fold 2: input FASTA, matching JSON annotations, and the file to write.
inFiles = ["../preprocessedFiles/preprocessed2.fasta"]
data = ["../trainSets/trainSet2.json"]
outFiles = ["../sequences/seq2.txt"]
writeSequencesToFile(inFiles, outFiles, data)

../sequences/seq2.txt
DP00004
DP00007
DP00011
DP00016
DP00017
DP00018
DP00028
DP00054
DP00062
DP00069
DP00073
DP00075
DP00079
DP00084
DP00086
DP00092
DP00119
DP00123
DP00134
DP00141
DP00153
DP00162
DP00173
DP00196
DP00214
DP00219
DP00223
DP00230
DP00231
DP00236
DP00238
DP00262
DP00287
DP00307
DP00308
DP00309
DP00311
DP00315
DP00320
DP00322
DP00324
DP00327
DP00332
DP00333
DP00339
DP00343
DP00357
DP00358
DP00364
DP00392
DP00393
DP00420
DP00428
DP00448
DP00466
DP00468
DP00485
DP00486
DP00492
DP00506
DP00517
DP00520
DP00521
DP00537
DP00539
DP00543
DP00546
DP00549
DP00554
DP00555
DP00558
DP00565
DP00570
DP00581
DP00607
DP00608
DP00611
DP00621
DP00630
DP00633
DP00641
DP00652
DP00670
DP00672
DP00683
DP00694
DP00704
DP00706
DP00709
DP00716
DP00718
DP00736
DP00747
DP00748
DP00757
DP00793
DP00796
DP00816
DP00841
DP00846
DP00864
DP00865
DP00873
DP00893
DP00901
DP00903
DP00910
DP00912
DP00917
DP00934
DP00937
DP00941
DP00949
DP00951
DP00959
DP00961
DP00962
DP00969
DP01054
DP01063
DP01065
DP01098
DP

In [None]:
# Fold 1: input FASTA, matching JSON annotations, and the file to write.
# NOTE(review): extractSequences is defined twice in this notebook; which
# records this cell processes depends on which definition was run last.
inFiles = ["../preprocessedFiles/preprocessed1.fasta"]
data = ["../trainSets/trainSet1.json"]
outFiles = ["../sequences/seq1.txt"]
writeSequencesToFile(inFiles, outFiles, data)

In [7]:
def extractSequences(filename, outfile, data, only_ids=("DP00072",)):
    """Count hashed subsequences of selected FASTA records by disorder class and write them out.

    NOTE: this rebinds the ``extractSequences`` name defined earlier in the
    notebook. That earlier version skips the id this one selects by default.

    For every record in ``filename`` whose id is in ``only_ids``, each window
    of length 3 up to the full sequence length is hashed (initial hash for the
    first window, rolling update for the following ones). Every hash maps to a
    3-element counter:

    * index 0 - windows for which ``isDisorder`` is true,
    * index 1 - windows for which ``isDisorder`` is false but ``intersects`` is true,
    * index 2 - all remaining windows.

    Args:
        filename: Path of the input FASTA file.
        outfile: Path of the text file to write; one ``hash:counts`` line per entry.
        data: Path of a JSON file containing a list whose j-th item belongs to
            the j-th FASTA record; regions are read from
            ``item['disprot_consensus']['Structural state']`` and only those
            with ``type == "D"`` are used.
        only_ids: Collection of record ids to process (pass a tuple/list/set,
            not a bare string). Defaults to the single id that used to be
            hard-coded.

    Notes:
        * Records and JSON items are paired purely by position.
          NOTE(review): nothing here checks that the ids match - confirm the
          two files are in the same order.
        * When the table reaches the flush threshold it is written out and
          cleared, so one hash can appear on several output lines with partial
          counts. NOTE(review): presumably the consumer sums them - confirm.
    """
    with open(data) as f:
        annotations = json.load(f)

    # Number of distinct hashes kept in memory before the table is flushed to disk.
    flush_threshold = 7000000
    sequenceMap = {}
    # Context managers close both files, also when an exception is raised.
    with open(filename) as fasta_handle, open(outfile, "w") as out:
        for j, fasta in enumerate(SeqIO.parse(fasta_handle, 'fasta')):
            name, sequence = fasta.id, str(fasta.seq)
            # Looked up before the filter so the positional pairing is
            # exercised for filtered-out records too.
            regions = annotations[j]['disprot_consensus']['Structural state']
            if name not in only_ids:
                continue

            disordered = [region for region in regions if region["type"] == "D"]
            starts = [region["start"] for region in disordered]
            ends = [region["end"] for region in disordered]

            seq_len = len(sequence)
            print(name)
            for n in range(3, seq_len + 1):
                current_hash = None
                for i in range(seq_len - n + 1):
                    if i == 0:
                        # First window of this length: hash from scratch.
                        current_hash = calculate_initial_hash(sequence[:n], n)
                    else:
                        # Later windows: drop the outgoing char, append the incoming one.
                        current_hash = update_rolling_hash(current_hash, sequence[i - 1], sequence[i + n - 1], n)

                    counts = sequenceMap.get(current_hash)
                    if counts is None:
                        counts = sequenceMap[current_hash] = np.array([0, 0, 0])

                    if isDisorder(i, n, starts, ends):
                        counts[0] += 1
                    elif intersects(i, n, starts, ends):
                        counts[1] += 1
                    else:
                        counts[2] += 1

                # Checked once per window length to bound memory use.
                if len(sequenceMap) >= flush_threshold:
                    for key, value in sequenceMap.items():
                        out.write(f'{key}:{value}\n')
                    sequenceMap.clear()

        # Write whatever is left after the last record.
        for key, value in sequenceMap.items():
            out.write(f'{key}:{value}\n')

In [None]:
# Trial run on the fold-1 inputs; the captured output below lists only DP00072.
inFiles = ["../preprocessedFiles/preprocessed1.fasta"]
data = ["../trainSets/trainSet1.json"]
outFiles = ["../sequences/72proba.txt"]
writeSequencesToFile(inFiles, outFiles, data)

DP00072
