In [2]:
from pathlib import Path


# Root folder that is scanned recursively for the hill-climbing result JSON files.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run on another machine without editing it; consider a configurable base dir.
original_folder = Path(r"C:\Users\samuel.soukup\Documents\School\AMPEvolve\outputs\natives_climed")


In [30]:
from hill_climbing.hill_climber import HillClimbingResults
import json
from typing import Dict, List
from pathlib import Path


def load_hill_climbing_results(folder_path: Path) -> Dict[str, List["HillClimbingResults"]]:
    """
    Load every ``*.json`` file found under a folder (searched recursively).

    Each file is expected to hold a JSON array; every element of that array is
    validated into a ``HillClimbingResults`` via ``model_validate``.

    Args:
        folder_path: Directory to search for JSON files.

    Returns:
        Dictionary mapping each file's path relative to ``folder_path`` (as a
        string, using the OS path separator) to the list of validated results
        read from that file. Keys are inserted in sorted path order.
    """
    results = {}

    # rglob() does not guarantee any ordering; sorting keeps the key order (and
    # anything printed from it) reproducible between runs and machines.
    for json_file in sorted(folder_path.rglob("*.json")):
        relative_path = str(json_file.relative_to(folder_path))

        # JSON is UTF-8 by specification; without an explicit encoding open()
        # falls back to the locale default, which differs between platforms.
        with open(json_file, 'r', encoding="utf-8") as f:
            data = json.load(f)
        results[relative_path] = [HillClimbingResults.model_validate(x) for x in data]

    return results


# Read every result file below the configured root folder.
hill_climbing_data = load_hill_climbing_results(folder_path=original_folder)

# Quick sanity check: number of files loaded and a sample of their keys.
print(f"Loaded {len(hill_climbing_data)} JSON files")
print(f"Keys: {list(hill_climbing_data.keys())[:5]}")  # only the first 5 keys


Loaded 10 JSON files
Keys: ['dnvnative_seq\\hill_climber_results_AMPKillerPredictor.json', 'dnvnative_seq\\hill_climber_results_MacrelPredictor.json', 'gH626native\\hill_climber_results_AMPKillerPredictor.json', 'gH626native\\hill_climber_results_MacrelPredictor.json', 'hcv7native_seq\\hill_climber_results_AMPKillerPredictor.json']


In [31]:
from predictor import MacrelPredictor

# Score every sequence found in the loaded results with the Macrel predictor.
# `scores` maps sequence -> predictor output, so each sequence has one entry.
macrel = MacrelPredictor()
scores = {}
for file in hill_climbing_data.values():
    for results in file:
        for r in results.results:
            # A sequence may occur in more than one run/file; it only needs to
            # be scored once since the dict keeps a single value per sequence.
            # NOTE(review): assumes calculate_and_predict_seq is deterministic
            # for a given sequence — confirm against the predictor.
            if r.sequence in scores:
                continue
            scores[r.sequence] = macrel.calculate_and_predict_seq(r.sequence)

In [51]:
# Keep only the (sequence, score) pairs whose score reaches the 0.95 cut-off,
# then report how many passed.
good_ones = [(seq, value) for seq, value in scores.items() if value >= 0.95]
print(len(good_ones))

2725


In [52]:
import pandas as pd

# Tabulate the selected (sequence, score) pairs.
df = pd.DataFrame(
    data=good_ones,
    columns=['sequence', 'score'],
)
# Number of distinct values per column (shown as the cell output).
df.nunique()

sequence    2725
score          7
dtype: int64

In [53]:
# Write the selected sequences as FASTA files of at most FASTA_CHUNK_SIZE
# records each. Every record uses the sequence itself as its header line.
FASTA_CHUNK_SIZE = 400

# One FASTA text per chunk. Slicing by range() never yields an empty chunk, so
# no empty trailing file is produced when the number of sequences is zero or an
# exact multiple of the chunk size.
texts = [
    "".join(
        f">{seq}\n{seq}\n"
        for seq, _score in good_ones[start:start + FASTA_CHUNK_SIZE]
    )
    for start in range(0, len(good_ones), FASTA_CHUNK_SIZE)
]

# Files go next to the loaded results: good_ones_0.fasta, good_ones_1.fasta, ...
for i, text in enumerate(texts):
    with open(original_folder / f"good_ones_{i}.fasta", 'w', encoding="utf-8") as f:
        f.write(text)

In [56]:
def get_sequence_source(sequence: str, data: Dict[str, List["HillClimbingResults"]]) -> List[str]:
    """
    Find which file(s) a sequence comes from.

    Args:
        sequence: The peptide sequence to search for
        data: Dictionary mapping file paths to HillClimbingResults

    Returns:
        List of file paths where the sequence was found. Each path appears at
        most once, in the iteration order of ``data``; the list is empty when
        the sequence is not found anywhere.
    """
    sources = []
    for file_path, results_list in data.items():
        # Search across *all* result objects of the file and stop at the first
        # hit, so a file is reported once even if several of its result
        # objects contain the sequence.
        found_in_file = any(
            result.sequence == sequence
            for results in results_list
            for result in results.results
        )
        if found_in_file:
            sources.append(file_path)
    return sources


# Smoke test: look up the origin of one sequence taken from good_ones.
test_seq = "RKVSRLMRWARMHMLRIAF"
print(f"Sequence: {test_seq}")
print(f"Found in: {get_sequence_source(sequence=test_seq, data=hill_climbing_data)}")


Sequence: RKVSRLMRWARMHMLRIAF
Found in: ['gH626native\\hill_climber_results_AMPKillerPredictor.json']
