In [2]:
# Podela FASTA fajla na pojedinacne fajlove
import os

def split_fasta(fasta_file, output_dir):
    """
    Split a multi-record FASTA file into one file per sequence.

    Each record is written to ``output_dir`` as ``seq_<n>.fasta``, where
    ``<n>`` is the 1-based position of the record in the input, zero-padded
    to the width of the total record count so the file names sort in input
    order. The sequence lines of a record are joined into a single line.
    Records whose sequence is empty are skipped.

    Args:
        fasta_file: Path to the input FASTA file.
        output_dir: Directory that receives the per-sequence files. It is
            created (including missing parents) when it does not exist.

    Returns:
        None. A one-line summary is printed when the split is finished.
    """
    # exist_ok=True replaces the race-prone "exists() then makedirs()" pair.
    os.makedirs(output_dir, exist_ok=True)

    sequences = []
    header = ""
    chunks = []  # sequence lines of the record currently being read

    # Read everything first; the input handle is closed before any output
    # file is opened.
    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if chunks:  # flush the previous record
                    sequences.append((header, "".join(chunks)))
                header = line[1:]
                chunks = []
            elif line:
                # Collect pieces and join once per record instead of
                # repeatedly concatenating strings.
                chunks.append(line)
    if chunks:  # flush the last record
        sequences.append((header, "".join(chunks)))

    # The total must be known up front to pick the zero-padding width.
    total_sequences = len(sequences)
    digits = len(str(total_sequences))

    for i, (header, seq) in enumerate(sequences, start=1):
        output_file = os.path.join(output_dir, f"seq_{i:0{digits}d}.fasta")
        with open(output_file, 'w') as out_file:
            out_file.write(f">{header}\n{seq}\n")

    print(f"FASTA file split into {total_sequences} smaller files in {output_dir}.")



In [3]:
#Pokretanje IUPRED3 za svaki fajl
import subprocess

def run_iupred3_on_files(output_dir,
                         iupred_path='/home/pc/Desktop/iupred3/iupred3.py',
                         iupred_type='long'):
    """
    Run IUPred3 on every ``.fasta`` file in a directory.

    Files are processed in sorted name order. A file is left out of the
    returned list when the program cannot be started or when it exits with
    a non-zero status; in both cases an error line is printed and
    processing continues with the next file.

    Args:
        output_dir: Directory containing the per-sequence FASTA files.
        iupred_path: Path of the IUPred3 script. It is invoked directly, so
            it must be executable.
        iupred_type: Prediction type passed as the second command-line
            argument to the script.

    Returns:
        A list of ``(file_name, stdout_text)`` tuples, one per file that
        was processed successfully.
    """
    results = []
    fasta_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.fasta'))
    total = len(fasta_files)

    for i, fasta_file in enumerate(fasta_files, 1):
        file_path = os.path.join(output_dir, fasta_file)
        print(f"Processing {i}/{total}: {fasta_file}")

        try:
            result = subprocess.run(
                [iupred_path, file_path, iupred_type],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True
            )
        except Exception as e:
            # Best effort: one file that cannot be run must not abort the batch.
            print(f"Error processing {fasta_file}: {e}")
            continue

        if result.returncode != 0:
            # A crashed run has no usable prediction; skip it instead of
            # storing its stdout as if it were a result.
            print(
                f"Error processing {fasta_file}: "
                f"exit code {result.returncode}: {result.stderr.strip()}"
            )
            continue

        if result.stderr:
            print(f"Warning: {result.stderr.strip()}")
        results.append((fasta_file, result.stdout))

    return results


In [4]:
def parse_iupred_output(iupred_output, threshold=0.5):
    """
    Parse IUPred3 text output into per-residue records.

    Each data line is expected to hold at least three whitespace-separated
    fields: position, amino acid and score. Lines that are blank, start
    with ``#``, have fewer than three fields, or whose position or score is
    not numeric are skipped.

    IUPred scores above 0.5 indicate a disordered residue, so a residue is
    labelled "neuređena" (disordered) when its score is strictly greater
    than ``threshold`` and "uređena" (ordered) otherwise.

    Args:
        iupred_output: Text captured from IUPred3's standard output.
        threshold: Score cutoff above which a residue counts as disordered.

    Returns:
        A list of ``(position, amino_acid, score, classification)`` tuples
        in the order the lines appear.
    """
    results = []
    for raw_line in iupred_output.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        parts = line.split()
        if len(parts) < 3:
            continue
        try:
            pos = int(parts[0])
            score = float(parts[2])
        except ValueError:
            # Not a data line (e.g. stray program text); ignore it.
            continue
        amino_acid = parts[1]
        # A higher score means more disorder.
        classification = "neuređena" if score > threshold else "uređena"
        results.append((pos, amino_acid, score, classification))
    return results



In [5]:
#Kreiranje dataframe-a
import pandas as pd

def create_dataframe_from_results(results):
    """
    Build a per-residue DataFrame from raw IUPred3 outputs.

    Args:
        results: Iterable of ``(file_name, iupred_output_text)`` pairs. Each
            output text is parsed with ``parse_iupred_output``.

    Returns:
        A DataFrame with one row per parsed residue and the columns
        "Ime fajla", "Pozicija", "Aminokiselina", "IUPRED3 rezultat" and
        "Klasifikacija". The columns are present even when there are no
        rows, so downstream column access and the CSV header stay stable.
    """
    columns = [
        "Ime fajla",
        "Pozicija",
        "Aminokiselina",
        "IUPRED3 rezultat",
        "Klasifikacija",
    ]
    rows = [
        (file_name, pos, amino_acid, score, classification)
        for file_name, output in results
        for pos, amino_acid, score, classification in parse_iupred_output(output)
    ]
    # Passing the columns explicitly fixes the schema for empty input too.
    return pd.DataFrame(rows, columns=columns)




In [6]:
#Glavna funkcija 
def main_with_splitting(
    fasta_file="/home/pc/Desktop/Istrazivanje-podataka-2/Podaci/sequences_mers.fasta",
    output_dir="/home/pc/Desktop/Istrazivanje-podataka-2/Podaci/split_sequences_mers",
):
    """
    Run the whole pipeline: split the FASTA file, run IUPred3, build the table.

    Args:
        fasta_file: Path to the multi-record input FASTA file.
        output_dir: Directory that receives the per-sequence FASTA files and
            is then scanned for files to run IUPred3 on.

    Returns:
        The DataFrame produced by ``create_dataframe_from_results`` from
        the IUPred3 outputs.
    """
    # Split the large FASTA file into one file per sequence.
    split_fasta(fasta_file, output_dir)

    # Run IUPred3 on each of the split files.
    results = run_iupred3_on_files(output_dir)

    # Assemble the outputs into a single table.
    return create_dataframe_from_results(results)


In [None]:
# Run the pipeline and save the results
# Build the results DataFrame
df_results = main_with_splitting()

# Show the first 5 rows
print(df_results.head())

# Save the results to a CSV file
df_results.to_csv(
    "/home/pc/Desktop/Istrazivanje-podataka-2/Podaci/iupred_results_mers.csv",
    index=False,
)


FASTA file split into 7028 smaller files in /home/pc/Desktop/Istrazivanje-podataka-2/Podaci/split_sequences_mers.
Processing 1/7028: seq_0001.fasta
Processing 2/7028: seq_0002.fasta
Processing 3/7028: seq_0003.fasta
Processing 4/7028: seq_0004.fasta
Processing 5/7028: seq_0005.fasta
Processing 6/7028: seq_0006.fasta
Processing 7/7028: seq_0007.fasta
Processing 8/7028: seq_0008.fasta
Processing 9/7028: seq_0009.fasta
Processing 10/7028: seq_0010.fasta
Processing 11/7028: seq_0011.fasta
Processing 12/7028: seq_0012.fasta
  File "/home/pc/Desktop/iupred3/iupred3.py", line 25, in <module>
    iupred2_result = iupred3_lib.iupred(sequence, args.iupred_type, smoothing=args.smoothing)
  File "/home/pc/Desktop/iupred3/iupred3_lib.py", line 179, in iupred
    iupred_score = list(savgol_filter(iupred_score, 19, 5))
  File "/home/pc/.local/lib/python3.10/site-packages/scipy/signal/_savitzky_golay.py", line 345, in savgol_filter
    raise ValueError("If mode is 'interp', window_length must be less 