small: 100 znakova, 5 instanci

medium: 600 znakova, 5 instanci

large: 3000 znakova, 5 instanci

xlarge: 10000 znakova, 5 instanci

Tipovi: "random", "one_duplicate", "multiple_duplicates", "mixed"

In [None]:
import random
import os

# Directory where the generated instance files are stored;
# exist_ok=True keeps re-runs from failing when it already exists.
output_folder = "data"
os.makedirs(output_folder, exist_ok=True)

# DNA alphabet used as the default symbol set for generated strings
alphabet = "ACGT"

# Build a random string that contains every symbol of the alphabet
def random_string_cover(length, alphabet=alphabet):
    """Return a shuffled random string of ``length`` symbols covering ``alphabet``.

    One copy of every alphabet symbol is placed first, the remaining slots
    are filled with symbols drawn by ``random.choice``, and the whole list is
    shuffled before being joined.

    Args:
        length: Number of symbols in the result; ``length == len(alphabet)``
            is accepted.
        alphabet: Symbols to draw from (defaults to the module alphabet).

    Returns:
        A string of ``length`` symbols in which each alphabet symbol occurs
        at least once.

    Raises:
        ValueError: If ``length`` is smaller than ``len(alphabet)``.
    """
    missing = length - len(alphabet)
    if missing < 0:
        raise ValueError("Dužina stringa mora biti veća od broja slova u alfabetu.")

    symbols = list(alphabet)
    # Pad with random symbols until the requested length is reached.
    symbols.extend(random.choice(alphabet) for _ in range(missing))
    random.shuffle(symbols)
    return "".join(symbols)

# Insert a duplicate (no overlap)
def insert_duplicate_no_overlap(s, duplicate_length, existing_ranges):
    """Copy a random window of ``s`` over another, disjoint window of ``s``.

    Both the source window and the target window avoid every half-open
    ``(begin, end)`` range in ``existing_ranges``, and the target window never
    overlaps the source window, so the duplicated substring is guaranteed to
    occur at two separate positions of the result.

    Candidate positions are enumerated instead of being rejection-sampled, so
    the call always terminates, even when the string is too crowded.

    Args:
        s: Input string; the result has the same length.
        duplicate_length: Requested window length. A value greater than or
            equal to ``len(s)`` is reduced to ``len(s) // 2``.
        existing_ranges: List of ``(begin, end)`` ranges written by earlier
            insertions. The new target range is appended to this list in
            place. Source windows are not recorded.

    Returns:
        A ``(new_string, existing_ranges)`` tuple; the second item is the
        same list object that was passed in.

    Raises:
        ValueError: If the effective window length is smaller than 1, or if
            no two disjoint windows free of ``existing_ranges`` exist.
    """
    if duplicate_length >= len(s):
        duplicate_length = len(s) // 2
    if duplicate_length < 1:
        raise ValueError("duplicate_length must be at least 1 after clamping")

    def is_free(pos):
        # True when [pos, pos + duplicate_length) touches no recorded range.
        return not any(
            pos < end and pos + duplicate_length > begin
            for begin, end in existing_ranges
        )

    free = [pos for pos in range(len(s) - duplicate_length + 1) if is_free(pos)]

    # `free` is ascending, so two disjoint windows exist exactly when its
    # extremes are at least one window length apart.
    if not free or free[-1] - free[0] < duplicate_length:
        raise ValueError(
            f"no room for a non-overlapping duplicate of length {duplicate_length}"
        )
    lowest, highest = free[0], free[-1]

    # Only keep sources that have at least one disjoint free partner window.
    sources = [
        pos
        for pos in free
        if pos - lowest >= duplicate_length or highest - pos >= duplicate_length
    ]
    start = random.choice(sources)
    duplicate = s[start:start + duplicate_length]

    # Targets must not overlap the source, otherwise the copy would overwrite
    # part of the source and the duplicate would be lost.
    targets = [pos for pos in free if abs(pos - start) >= duplicate_length]
    insert_pos = random.choice(targets)

    s = s[:insert_pos] + duplicate + s[insert_pos + duplicate_length:]
    existing_ranges.append((insert_pos, insert_pos + duplicate_length))
    return s, existing_ranges

# Insert a duplicate (overlap allowed)
def insert_duplicate_overlap(s, duplicate_length, existing_ranges):
    """Copy a random window of ``s`` over another random window of equal size.

    No overlap checks are made: the source and target windows are drawn
    independently, so they may overlap each other, coincide, or overlap any
    range already present in ``existing_ranges``.

    Args:
        s: Input string; the result has the same length.
        duplicate_length: Requested window length. A value greater than or
            equal to ``len(s)`` is reduced to ``len(s) // 2``.
        existing_ranges: List that receives the ``(begin, end)`` range of the
            overwritten window (appended in place).

    Returns:
        A ``(new_string, existing_ranges)`` tuple.
    """
    total = len(s)
    if duplicate_length >= total:
        duplicate_length = total // 2

    last_start = total - duplicate_length
    # Draw order matters for reproducibility: source first, then target.
    src = random.randint(0, last_start)
    dst = random.randint(0, last_start)
    dst_end = dst + duplicate_length

    piece = s[src:src + duplicate_length]
    head = s[:dst]
    tail = s[dst_end:]

    existing_ranges.append((dst, dst_end))
    return head + piece + tail, existing_ranges

# Generate a single instance string
def generate_instance(length, type="random", max_dups=3, alphabet=alphabet, allow_overlap=False):
    """Create one random string and optionally plant duplicated substrings.

    Args:
        length: Length of the generated string.
        type: Instance flavour. ``"one_duplicate"`` plants exactly one
            duplicate, ``"multiple_duplicates"`` plants between 2 and
            ``max_dups``, ``"mixed"`` plants between 1 and ``max_dups``.
            Any other value (including ``"random"``) plants none.
        max_dups: Upper bound on the number of planted duplicates.
        alphabet: Symbols used for the base string.
        allow_overlap: Use ``insert_duplicate_overlap`` when true, otherwise
            ``insert_duplicate_no_overlap``.

    Returns:
        The generated string.
    """
    if allow_overlap:
        place = insert_duplicate_overlap
    else:
        place = insert_duplicate_no_overlap

    def pick_length(min_factor, max_factor):
        # Duplicate length is drawn from [max(5, length // min_factor),
        # min(50, length // max_factor)]; the lower bound is pulled down to
        # the upper bound when the interval would otherwise be empty.
        upper = min(50, length // max_factor)
        lower = min(max(5, length // min_factor), upper)
        return random.randint(lower, upper)

    text = random_string_cover(length, alphabet)
    placed = []

    # Decide how many duplicates to plant and which length divisor applies.
    if type == "one_duplicate":
        copies, max_factor = 1, 2
    elif type == "multiple_duplicates":
        copies, max_factor = random.randint(2, max_dups), 4
    elif type == "mixed":
        copies, max_factor = random.randint(1, max_dups), 4
    else:
        copies, max_factor = 0, 4

    for _ in range(copies):
        text, placed = place(text, pick_length(10, max_factor), placed)

    return text

# Generation parameters: string length for each size class
sizes = {
    "small": 100,
    "medium": 600,
    "large": 3000,
    "xlarge": 10000
}
# Number of instances generated for every (size, type) combination
instances_per_size = 5
types = ["random", "one_duplicate", "multiple_duplicates", "mixed"]

# Generate all files: one text file per (size, instance index, type)
for size_name, length in sizes.items():
    for i in range(1, instances_per_size + 1):
        for t in types:
            filename = f"{output_folder}/instance_{size_name}_{i}_{t}.txt"
            s = generate_instance(length, type=t, allow_overlap=False)  # or True to allow overlap
            # The file holds only the generated string, with no trailing newline.
            with open(filename, "w") as f:
                f.write(s)
            print(f"Generated: {filename}")


Generated: data/instance_small_1_random.txt
Generated: data/instance_small_1_one_duplicate.txt
Generated: data/instance_small_1_multiple_duplicates.txt
Generated: data/instance_small_1_mixed.txt
Generated: data/instance_small_2_random.txt
Generated: data/instance_small_2_one_duplicate.txt
Generated: data/instance_small_2_multiple_duplicates.txt
Generated: data/instance_small_2_mixed.txt
Generated: data/instance_small_3_random.txt
Generated: data/instance_small_3_one_duplicate.txt
Generated: data/instance_small_3_multiple_duplicates.txt
Generated: data/instance_small_3_mixed.txt
Generated: data/instance_small_4_random.txt
Generated: data/instance_small_4_one_duplicate.txt
Generated: data/instance_small_4_multiple_duplicates.txt
Generated: data/instance_small_4_mixed.txt
Generated: data/instance_small_5_random.txt
Generated: data/instance_small_5_one_duplicate.txt
Generated: data/instance_small_5_multiple_duplicates.txt
Generated: data/instance_small_5_mixed.txt
Generated: data/instance_