In [6]:
import os
import subprocess
import pandas as pd


def split_fasta(fasta_file, output_dir):
    """Split a multi-record FASTA file into one file per record.

    Records are written to ``output_dir`` as ``seq_<n>.fasta``, where ``<n>``
    is the 1-based record index, zero-padded to the number of digits in the
    record count so that sorting the file names keeps the input order. Each
    output file contains the header line followed by the sequence joined
    onto a single line.

    Records with an empty sequence are skipped. Sequence lines that appear
    before the first header are stored under an empty header.

    Args:
        fasta_file: Path of the FASTA file to read.
        output_dir: Directory that receives the per-record files. It is
            created if missing; files already present are not removed.

    Returns:
        None. A one-line summary is printed when done.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    sequences = []
    header = ""
    chunks = []  # sequence lines of the record currently being read
    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if chunks:  # store the previous record
                    sequences.append((header, "".join(chunks)))
                header = line[1:]
                chunks = []
            elif line:
                chunks.append(line)
    if chunks:  # store the last record
        sequences.append((header, "".join(chunks)))

    # Zero-pad the index to the width of the total record count.
    digits = len(str(len(sequences)))

    for i, (header, seq) in enumerate(sequences, start=1):
        output_file = os.path.join(output_dir, f"seq_{i:0{digits}d}.fasta")
        with open(output_file, 'w') as out_file:
            out_file.write(f">{header}\n{seq}\n")

    print(f"FASTA file split into {len(sequences)} smaller files in {output_dir}.")


def run_isunstruct_on_files(
    output_dir,
    isunstruct_path="/home/user/Desktop/IP2/IsUnstruct_2.02/IsUnstruct",
    isunstruct_args=("-use_pattern", "2", "-long_disp", "1", "-file_save", "1"),
):
    """Run IsUnstruct on every ``.fasta`` file in a directory.

    Files are processed in sorted name order. For each file the executable
    is invoked as ``isunstruct_path *isunstruct_args <file>`` and its
    standard output is captured as text.

    Args:
        output_dir: Directory containing the per-sequence FASTA files.
        isunstruct_path: Path of the IsUnstruct executable.
        isunstruct_args: Command-line arguments placed before the file path.

    Returns:
        A list of ``(file_name, stdout_text)`` tuples, one per file for which
        the process could be started. Files whose process failed to launch
        are reported on stdout and omitted from the list.
    """
    results = []
    fasta_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.fasta'))
    total = len(fasta_files)
    command_prefix = [isunstruct_path, *isunstruct_args]

    for i, fasta_file in enumerate(fasta_files, 1):
        file_path = os.path.join(output_dir, fasta_file)
        print(f"Processing {i}/{total}: {fasta_file}")

        try:
            result = subprocess.run(
                command_prefix + [file_path],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            # Best effort: a launch or decoding failure for one file must not
            # abort the whole batch.
            print(f"Error processing {fasta_file}: {e}")
            continue

        if result.returncode != 0:
            # The output is still kept; the caller decides whether it parses.
            print(f"Warning: IsUnstruct exited with code {result.returncode} for {fasta_file}")
        if result.stderr:
            print(f"Warning: {result.stderr.strip()}")
        results.append((fasta_file, result.stdout))

    return results


def parse_isunstruct_output(isunstruct_output):
    """Dispatch IsUnstruct output text to the parser matching its format.

    The format is detected from the first non-blank line, which must start
    with ``# IsUnstruct`` and contain either ``long format`` or
    ``short format``.

    Args:
        isunstruct_output: Complete standard output of one IsUnstruct run.

    Returns:
        The list of per-residue dictionaries produced by
        ``parse_long_format`` or ``parse_short_format``.

    Raises:
        ValueError: If the output is empty or blank, or if its first
            non-blank line does not identify a known format.
    """
    lines = isunstruct_output.splitlines()

    # Guard against empty output (e.g. a failed run) instead of letting
    # lines[0] raise IndexError.
    header = next((ln.strip() for ln in lines if ln.strip()), None)
    if header is None:
        raise ValueError("Prazan izlaz programa IsUnstruct.")

    if header.startswith("# IsUnstruct"):
        if "long format" in header:
            return parse_long_format(lines)
        if "short format" in header:
            return parse_short_format(lines)

    raise ValueError("Nepoznat format izlaza.")
  

def create_dataframe_from_results(results):
    """Build a per-residue DataFrame from raw IsUnstruct outputs.

    Each output is parsed with ``parse_isunstruct_output``; every parsed
    residue becomes one row carrying the source file name, position, amino
    acid, score and classification.

    Args:
        results: Iterable of ``(file_name, output_text)`` tuples.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Ime fajla``, ``Pozicija``,
        ``Aminokiselina``, ``IsUnstruct rezultat`` and ``Klasifikacija``.
        The columns are present even when no row could be produced.
    """
    columns = [
        "Ime fajla",
        "Pozicija",
        "Aminokiselina",
        "IsUnstruct rezultat",
        "Klasifikacija",
    ]
    rows = []
    for file_name, output in results:
        try:
            parsed_results = parse_isunstruct_output(output)
            print(f"Broj redova za {file_name}: {len(parsed_results)}")
            for parsed_result in parsed_results:
                rows.append({
                    "Ime fajla": file_name,
                    "Pozicija": parsed_result["Pozicija"],
                    "Aminokiselina": parsed_result["Aminokiselina"],
                    "IsUnstruct rezultat": parsed_result["Verovatnoća"],
                    "Klasifikacija": parsed_result["Klasifikacija"]
                })
        except Exception as e:
            # Deliberate best effort: one unparsable output is reported and
            # skipped so the remaining files still reach the DataFrame.
            print(f"Greška u obradi izlaza za {file_name}: {e}")

    print(f"Ukupan broj redova u DataFrame-u: {len(rows)}")
    # Fixing the columns keeps the schema (and the CSV header) stable even
    # when rows is empty.
    return pd.DataFrame(rows, columns=columns)

def parse_long_format(lines):
    """Parse the per-residue lines of IsUnstruct long-format output.

    Blank lines and lines starting with ``#`` are skipped. The remaining
    lines are split on whitespace and read as follows:

    * exactly 4 fields: position, amino acid, state, probability;
    * more than 4 fields: position, amino acid, <ignored>, state,
      probability, <ignored...>;
    * fewer than 4 fields: reported and skipped.

    Lines whose position or probability cannot be converted to a number are
    reported and skipped. The classification is derived from the
    probability (``> 0.5`` means disordered), not from the state column.

    Args:
        lines: Iterable of output lines.

    Returns:
        A list of dictionaries with the keys ``Pozicija``, ``Aminokiselina``,
        ``Stanje``, ``Verovatnoća`` and ``Klasifikacija``.
    """
    results = []
    for raw_line in lines:
        line = raw_line.strip()
        if not line or line.startswith("#"):  # skip blanks and comments
            continue

        parts = line.split()
        if len(parts) < 4:
            print(f"Warning: Ignoring invalid line: {line}")
            continue

        if len(parts) == 4:
            state, prob_token = parts[2], parts[3]
            ignored = []
        else:
            # An extra field sits between the amino acid and the state.
            state, prob_token = parts[3], parts[4]
            ignored = [parts[2], *parts[5:]]

        try:
            position = int(parts[0])
            probability = float(prob_token)
        except ValueError:
            print(f"Invalid line ignored: {line}")
            continue

        if ignored:
            # Report only the fields that were really discarded.
            print(f"Ignorisano: {ignored}")

        results.append({
            "Pozicija": position,
            "Aminokiselina": parts[1],
            "Stanje": state,
            "Verovatnoća": probability,
            "Klasifikacija": "neuređena" if probability > 0.5 else "uređena",
        })
    return results

def _short_format_rows(sequence, state, probability):
    """Expand one sequence/state/probability triple into per-residue rows."""
    rows = []
    # zip() walks the three strings character by character and stops at the
    # shortest one.
    for position, (aa, s, prob) in enumerate(zip(sequence, state, probability), start=1):
        try:
            prob_value = float(prob)
        except ValueError:
            prob_value = -1  # unknown probability
        rows.append({
            "Pozicija": position,
            "Aminokiselina": aa,
            "Stanje": s,
            "Verovatnoća": prob_value,
            "Klasifikacija": "neuređena" if s == "U" else "uređena",
        })
    return rows


def _second_token(line):
    """Return the second whitespace-separated token of ``line``, or ''."""
    tokens = line.split()
    return tokens[1] if len(tokens) > 1 else ""


def parse_short_format(lines):
    """Parse IsUnstruct short-format output into per-residue rows.

    Lines starting with ``sequence``, ``state`` and ``probability`` supply
    the three strings of a record; only the token following the keyword is
    used. A record is emitted at a blank line, or at the end of the input,
    provided all three strings are non-empty. Positions are numbered from 1
    within each record.

    Each character of the probability string is converted with ``float``
    on its own (presumably one digit per residue — TODO confirm against the
    IsUnstruct documentation); characters that are not numeric yield ``-1``.
    The classification is taken from the state character (``U`` means
    disordered).

    Args:
        lines: Iterable of output lines.

    Returns:
        A list of dictionaries with the keys ``Pozicija``, ``Aminokiselina``,
        ``Stanje``, ``Verovatnoća`` and ``Klasifikacija``.
    """
    results = []
    sequence, state, probability = "", "", ""

    for line in lines:
        if line.startswith("sequence"):
            sequence = _second_token(line)
        elif line.startswith("state"):
            state = _second_token(line)
        elif line.startswith("probability"):
            probability = _second_token(line)
        elif line.strip() == "":
            if sequence and state and probability:
                results.extend(_short_format_rows(sequence, state, probability))
            sequence, state, probability = "", "", ""

    # Emit a trailing record when the output does not end with a blank line.
    if sequence and state and probability:
        results.extend(_short_format_rows(sequence, state, probability))

    return results


def main_with_splitting(
    fasta_file="/home/user/Desktop/IP2//izdvojeni_proteini_marburg.fasta",
    output_dir="/home/user/Desktop/IP2/IsUnstruct_2.02/split_sequences_izdvojeno_marburg",
    csv_path="/home/user/Desktop/IP2/IsUnstruct_2.02/isunstruct_izdvojeno_results.csv",
):
    """Run the whole pipeline: split, predict, tabulate and save.

    The FASTA file is split into per-sequence files, IsUnstruct is run on
    each of them, the outputs are collected into a DataFrame, its first 20
    rows are printed and the full table is written to a CSV file.

    Args:
        fasta_file: Multi-record FASTA file to process.
        output_dir: Directory for the per-sequence FASTA files.
        csv_path: Destination of the resulting CSV file.

    Returns:
        None.
    """
    split_fasta(fasta_file, output_dir)
    results = run_isunstruct_on_files(output_dir)
    df = create_dataframe_from_results(results)

    print(df.head(20))
    df.to_csv(csv_path, index=False)
    print("Rezultati su sačuvani u CSV fajl.")


# Run the pipeline only when executed directly (script or notebook cell),
# not when this code is imported as a module.
if __name__ == "__main__":
    main_with_splitting()


FASTA file split into 105 smaller files in /home/user/Desktop/IP2/IsUnstruct_2.02/split_sequences_izdvojeno_marburg.
Processing 1/105: seq_001.fasta
Processing 2/105: seq_002.fasta
Processing 3/105: seq_003.fasta
Processing 4/105: seq_004.fasta
Processing 5/105: seq_005.fasta
Processing 6/105: seq_006.fasta
Processing 7/105: seq_007.fasta
Processing 8/105: seq_008.fasta
Processing 9/105: seq_009.fasta
Processing 10/105: seq_010.fasta
Processing 11/105: seq_011.fasta
Processing 12/105: seq_012.fasta
Processing 13/105: seq_013.fasta
Processing 14/105: seq_014.fasta
Processing 15/105: seq_015.fasta
Processing 16/105: seq_016.fasta
Processing 17/105: seq_017.fasta
Processing 18/105: seq_018.fasta
Processing 19/105: seq_019.fasta
Processing 20/105: seq_020.fasta
Processing 21/105: seq_021.fasta
Processing 22/105: seq_022.fasta
Processing 23/105: seq_023.fasta
Processing 24/105: seq_024.fasta
Processing 25/105: seq_025.fasta
Processing 26/105: seq_026.fasta
Processing 27/105: seq_027.fasta
P

        Ime fajla  Pozicija Aminokiselina  IsUnstruct rezultat Klasifikacija
0   seq_001.fasta         1             M                0.974     neuređena
1   seq_001.fasta         2             D                0.922     neuređena
2   seq_001.fasta         3             L                0.847     neuređena
3   seq_001.fasta         4             H                0.813     neuređena
4   seq_001.fasta         5             S                0.779     neuređena
5   seq_001.fasta         6             L                0.714     neuređena
6   seq_001.fasta         7             L                0.686     neuređena
7   seq_001.fasta         8             E                0.682     neuređena
8   seq_001.fasta         9             L                0.663     neuređena
9   seq_001.fasta        10             G                0.666     neuređena
10  seq_001.fasta        11             T                0.666     neuređena
11  seq_001.fasta        12             K                0.665     neuređena