In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import time
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch

from seqme.metrics import lv_cd_hit

from seqme import ModelCache, compute_metrics, show_table
from seqme.metrics import Fold, LV
from seqme.utils import random_subset, read_fasta_file

from config_paths import PATHS_FOR_CONTROL, PATHS_FOR_MODELS

In [2]:
def time_function(function, *args, **kwargs) -> tuple:
    start_time = time.perf_counter()
    result = function(*args, **kwargs)
    end_time = time.perf_counter()
    time_taken = end_time - start_time
    print(f"Execution time: {time_taken:.4f} seconds")
    return (result, time_taken)

In [3]:
datasets = {name: read_fasta_file(path) for name, path in (PATHS_FOR_CONTROL | PATHS_FOR_MODELS).items()}

for model_name, sequences in datasets.items():
    print(f"{model_name}: {len(sequences)} sequences")

positives: 14941 sequences
positives_hq: 3322 sequences
negatives: 14814 sequences
negatives_hq: 925 sequences
random_uniform: 50000 sequences
random_standard: 50000 sequences
UniProt: 50000 sequences
AMP-Diffusion: 47671 sequences
AMP-GAN: 50000 sequences
CPL-Diff: 46305 sequences
HydrAMP: 50000 sequences
OmegAMP: 50000 sequences
AMP-LM: 25608 sequences
AMP-Muller: 1052 sequences


In [4]:
cache = ModelCache(
    models={},
    init_cache=None,
)

In [66]:
sources = [
    "positives",
    "positives_hq",
    "negatives",
    # "negatives_hq",
    "random_uniform",
    "random_standard",
    "UniProt",
    "AMP-Diffusion",
    "AMP-GAN",
    "CPL-Diff",
    "HydrAMP",
    "OmegAMP",
    "AMP-LM",
    # "AMP-Muller",
]

n_samples = 50_000
seed = 42

benchmark_datasets = {
    source: random_subset(datasets[source], n_samples=n_samples, seed=seed)
    if len(datasets[source]) > n_samples
    else datasets[source]
    for source in sources
}
seqs_ref = datasets["positives_hq"]

In [None]:
# write_to_fasta_file(benchmark_datasets["UniProt"], "/raid/brunopsz/Metrics_Eval/FINAL_MAX_40AA_AMP/uniprot_8_50_100_50K_MAX40.fasta")

# write_to_fasta_file(benchmark_datasets["AMP-GAN"], "/raid/brunopsz/Metrics_Eval/FINAL_MAX_40AA_AMP/amp-gan_MAX40_50K.fasta")

# write_to_fasta_file(benchmark_datasets["OmegAMP"], "/raid/brunopsz/Metrics_Eval/FINAL_MAX_40AA_AMP/omegamp_MAX40_50K.fasta")

In [27]:
metrics = [
    Fold(metric=LV(reference=seqs_ref, objective="maximize"), split_size=len(seqs_ref), drop_last=True)
]

In [28]:
df = compute_metrics(benchmark_datasets, metrics)

  0%|          | 0/12 [00:00<?, ?it/s, data=positives, metric=Levenshtein Distance]

  8%|▊         | 1/12 [00:31<05:42, 31.15s/it, data=positives_hq, metric=Levenshtein Distance]

31.14779072580859


 17%|█▋        | 2/12 [00:38<02:53, 17.39s/it, data=negatives, metric=Levenshtein Distance]   

7.758369840681553


 25%|██▌       | 3/12 [01:09<03:32, 23.62s/it, data=random_uniform, metric=Levenshtein Distance]

31.02767658792436


 33%|███▎      | 4/12 [03:08<08:07, 60.96s/it, data=random_standard, metric=Levenshtein Distance]

118.20964548690245


 42%|████▏     | 5/12 [05:05<09:28, 81.26s/it, data=UniProt, metric=Levenshtein Distance]        

117.26073508895934


 50%|█████     | 6/12 [07:06<09:29, 94.97s/it, data=AMP-Diffusion, metric=Levenshtein Distance]

121.5767376050353


 58%|█████▊    | 7/12 [08:58<08:21, 100.36s/it, data=AMP-GAN, metric=Levenshtein Distance]      

111.4614311880432


 67%|██████▋   | 8/12 [10:54<07:01, 105.45s/it, data=CPL-Diff, metric=Levenshtein Distance]

116.33690644381568


 75%|███████▌  | 9/12 [12:36<05:12, 104.18s/it, data=HydrAMP, metric=Levenshtein Distance] 

101.39666018495336


 83%|████████▎ | 10/12 [14:34<03:36, 108.49s/it, data=OmegAMP, metric=Levenshtein Distance]

118.14045676682144


 92%|█████████▏| 11/12 [16:31<01:51, 111.30s/it, data=AMP-LM, metric=Levenshtein Distance] 

117.66682225186378


100%|██████████| 12/12 [17:26<00:00, 87.22s/it, data=AMP-LM, metric=Levenshtein Distance] 

54.689387066755444





In [None]:
times = [
    31.14779072580859,
    7.758369840681553,
    31.02767658792436,
    118.20964548690245,
    117.26073508895934,
    121.5767376050353,
    111.4614311880432,
    116.33690644381568,
    101.39666018495336,
    118.14045676682144,
    117.66682225186378,
    54.689387066755444
]

In [44]:
df

Unnamed: 0_level_0,Levenshtein Distance,Levenshtein Distance,LV_time
Unnamed: 0_level_1,value,deviation,Unnamed: 3_level_1
positives,0.581212,0.031274,31.147791
positives_hq,1.0,,7.75837
negatives,0.355321,0.012224,31.027677
random_uniform,0.316844,0.00019,118.209645
random_standard,0.367722,0.004352,117.260735
UniProt,0.325625,0.000189,121.576738
AMP-Diffusion,0.436102,0.003346,111.461431
AMP-GAN,0.419296,0.000352,116.336906
CPL-Diff,0.532909,0.000692,101.39666
HydrAMP,0.404823,0.0004,118.140457


In [45]:
df_lv = df['Levenshtein Distance']
df_lv['times'] = times
df_lv

Unnamed: 0,value,deviation,times
positives,0.581212,0.031274,31.147791
positives_hq,1.0,,7.75837
negatives,0.355321,0.012224,31.027677
random_uniform,0.316844,0.00019,118.209645
random_standard,0.367722,0.004352,117.260735
UniProt,0.325625,0.000189,121.576738
AMP-Diffusion,0.436102,0.003346,111.461431
AMP-GAN,0.419296,0.000352,116.336906
CPL-Diff,0.532909,0.000692,101.39666
HydrAMP,0.404823,0.0004,118.140457


In [37]:
sources = [
    "negatives_hq",
    "AMP-Muller",
]

n_samples = len(datasets["positives_hq"])
seed = 42

benchmark_datasets = {
    source: random_subset(datasets[source], n_samples=n_samples, seed=seed)
    if len(datasets[source]) > n_samples
    else datasets[source]
    for source in sources
}
seqs_ref = datasets["positives_hq"]

In [40]:
metrics = [
    LV(reference=seqs_ref, objective="maximize")
]

In [41]:
df_2 = compute_metrics(benchmark_datasets, metrics)

  0%|          | 0/2 [00:00<?, ?it/s, data=negatives_hq, metric=Levenshtein Distance]

 50%|█████     | 1/2 [00:02<00:02,  2.38s/it, data=AMP-Muller, metric=Levenshtein Distance]  

2.3809959003701806


100%|██████████| 2/2 [00:04<00:00,  2.43s/it, data=AMP-Muller, metric=Levenshtein Distance]

2.4759832778945565





In [42]:
times_2 = [2.3809959003701806, 2.4759832778945565]

In [46]:
df_2['Levenshtein Distance']

Unnamed: 0,value,deviation
negatives_hq,0.62405,
AMP-Muller,0.623581,


In [47]:
df_lv_negative = df_2['Levenshtein Distance']
df_lv_negative['times'] = times_2
df_lv_negative

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lv_negative['times'] = times_2


Unnamed: 0,value,deviation,times
negatives_hq,0.62405,,2.380996
AMP-Muller,0.623581,,2.475983


In [52]:
import pandas as pd

combined_df = pd.concat([df_lv, df_lv_negative])
combined_df

Unnamed: 0,value,deviation,times
positives,0.581212,0.031274,31.147791
positives_hq,1.0,,7.75837
negatives,0.355321,0.012224,31.027677
random_uniform,0.316844,0.00019,118.209645
random_standard,0.367722,0.004352,117.260735
UniProt,0.325625,0.000189,121.576738
AMP-Diffusion,0.436102,0.003346,111.461431
AMP-GAN,0.419296,0.000352,116.336906
CPL-Diff,0.532909,0.000692,101.39666
HydrAMP,0.404823,0.0004,118.140457


In [53]:
combined_df.to_csv("lv_results_cross_validation.csv")

### Benchmark CD-HIT LV 

In [5]:
def run_similarity_experiments():
    positives_hq_path = PATHS_FOR_CONTROL["positives_hq"]
    all_paths = {**PATHS_FOR_CONTROL, **PATHS_FOR_MODELS}

    results = {}

    for name, path in all_paths.items():

        print(f"\nRunning experiment: positives_hq vs {name}")
        result, time_taken = time_function(
            lv_cd_hit.mean_levenshtein_similarity_cd_hit_experiment,
            positives_hq_path,
            path
        )
        results[name] = {"result": result[0], "time": time_taken, "coverage": result[1]}

    return results

In [6]:
all_results = run_similarity_experiments()


Running experiment: positives_hq vs positives
Coverage: 0.30406264640920955
Execution time: 3.5196 seconds

Running experiment: positives_hq vs positives_hq
Coverage: 0.2959060806742926
Execution time: 0.9903 seconds

Running experiment: positives_hq vs negatives
Coverage: 0.7735250438774133
Execution time: 9.0873 seconds

Running experiment: positives_hq vs negatives_hq
Coverage: 0.49945945945945946
Execution time: 0.5216 seconds

Running experiment: positives_hq vs random_uniform
Coverage: 0.83502
Execution time: 31.4758 seconds

Running experiment: positives_hq vs random_standard
Coverage: 0.7818
Execution time: 27.7009 seconds

Running experiment: positives_hq vs UniProt
Coverage: 0.91586
Execution time: 33.9371 seconds

Running experiment: positives_hq vs AMP-Diffusion
Coverage: 0.6615552432296364
Execution time: 24.6110 seconds

Running experiment: positives_hq vs AMP-GAN
Coverage: 0.69336
Execution time: 26.2956 seconds

Running experiment: positives_hq vs CPL-Diff
Coverage: 0.

In [7]:
print("\n=== Experiment Summary ===")
for name, data in all_results.items():
    print(f"{name}: time={data['time']:.2f}s, result={data['result']}, coverage={data['coverage']:.2f}")


=== Experiment Summary ===
positives: time=3.52s, result=0.6668095957507639, coverage=0.30
positives_hq: time=0.99s, result=1.0, coverage=0.30
negatives: time=9.09s, result=0.2899160517673812, coverage=0.77
negatives_hq: time=0.52s, result=0.48477817245103527, coverage=0.50
random_uniform: time=31.48s, result=0.3906932765253448, coverage=0.84
random_standard: time=27.70s, result=0.4344713977942926, coverage=0.78
UniProt: time=33.94s, result=0.4169796010198117, coverage=0.92
AMP-Diffusion: time=24.61s, result=0.5155783885285317, coverage=0.66
AMP-GAN: time=26.30s, result=0.46336056316318663, coverage=0.69
CPL-Diff: time=19.30s, result=0.5354948738317479, coverage=0.57
HydrAMP: time=43.18s, result=0.4567799029936456, coverage=0.81
OmegAMP: time=17.09s, result=0.5775454758679757, coverage=0.48
AMP-LM: time=12.91s, result=0.5055660777505538, coverage=0.72
AMP-Muller: time=0.52s, result=0.40750520812236346, coverage=0.38


In [8]:
import pandas as pd

df_results = pd.DataFrame(
    [{"source": name, "value": data["result"], "time": data["time"], "coverage": data["coverage"]} for name, data in all_results.items()]
).set_index("source")[["value", "time", "coverage"]]

df_results

Unnamed: 0_level_0,value,time,coverage
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
positives,0.66681,3.519611,0.304063
positives_hq,1.0,0.990269,0.295906
negatives,0.289916,9.087318,0.773525
negatives_hq,0.484778,0.521577,0.499459
random_uniform,0.390693,31.475786,0.83502
random_standard,0.434471,27.700939,0.7818
UniProt,0.41698,33.937083,0.91586
AMP-Diffusion,0.515578,24.611037,0.661555
AMP-GAN,0.463361,26.295559,0.69336
CPL-Diff,0.535495,19.303415,0.567347


In [9]:
df_results.to_csv("lv_results_cd_hit.csv")