# Are positives almost always from the same species?

In [1]:
import pandas as pd
import os

In [2]:
# Input locations for this analysis (absolute paths on the cluster file system).
# Directory that holds the train.csv / val.csv / test.csv split files read further down.
dataset_path = "/nfs/scratch/pinder/negative_dataset/my_repository/datasets/no_duplicates/deleak_cdhit/fully_balanced"
# Tab-separated lineage table; the `tax_id` and `species` columns are used below.
lineage_path = "/nfs/scratch/pinder/negative_dataset/my_repository/datasets/no_duplicates/deleak_cdhit/sunburst_data/uniprot_lineage.tsv"
# FASTA file whose ">" header lines carry the accession and an "OX=<tax id>" field.
uniprot_seqs_path = "/nfs/scratch/pinder/negative_dataset/my_repository/datasets/no_duplicates/deleak_cdhit/sunburst_data/full_uniprot_sequences.fasta"

In [3]:
# Lookup table: taxonomy id -> species name, taken from the lineage TSV.
# If a tax_id occurs on several rows, the last row wins.
# NOTE(review): the keys keep whatever dtype pandas infers for `tax_id` (the sample
# output shows integers), while the tax ids parsed from the FASTA headers are
# strings -- convert one side before looking FASTA-derived ids up in this dict.
lineage_df = pd.read_csv(lineage_path, sep="\t")
taxid_to_species = {
    taxon_id: species_name
    for taxon_id, species_name in zip(lineage_df['tax_id'], lineage_df['species'])
}

print(f"Found {len(taxid_to_species)} tax_id to species mappings.")
print(f"Sample mappings: {list(taxid_to_species.items())[:5]}")

Found 2220 tax_id to species mappings.
Sample mappings: [(9606, 'Homo sapiens'), (559292, 'Saccharomyces cerevisiae'), (83333, 'Escherichia coli'), (10090, 'Mus musculus'), (185431, 'Trypanosoma brucei')]


In [4]:
# Lookup table: UniProt accession -> taxonomy id (kept as a string),
# collected from the ">" header lines of the sequence file.
accession_to_taxid = {}
with open(uniprot_seqs_path, 'r') as fasta_handle:
    for record_line in fasta_handle:
        # Sequence lines carry no metadata; only headers are of interest.
        if not record_line.startswith(">"):
            continue
        header_fields = record_line.split("|")
        # First field is ">ACCESSION": drop the marker and surrounding whitespace.
        accession_key = header_fields[0][1:].strip()
        # Second field contains "OX=<tax id>": keep everything after the tag.
        taxon_value = header_fields[1].split("OX=")[1].strip()
        accession_to_taxid[accession_key] = taxon_value

print(f"Found {len(accession_to_taxid)} accession to tax_id mappings.")
print(f"Sample mappings: {list(accession_to_taxid.items())[:5]}")

Found 26624 accession to tax_id mappings.
Sample mappings: [('A5YKK6', '9606'), ('O75794', '9606'), ('P00125', '9913'), ('P00157', '9913'), ('P01848', '9606')]


In [12]:
# Read every split file and keep only its negative pairs (label == 0);
# the same-species statistics below are computed on these filtered frames.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(dataset_path, csv_name)).loc[
        lambda frame: frame["label"] == 0
    ]
    for csv_name in ("train.csv", "val.csv", "test.csv")
)

def find_same_species_interactions(df, split: str, accession_map=None) -> None:
    """Print how many pairs in ``df`` join two proteins with the same tax id.

    Each value of the ``entry`` column is split on ``"--"`` into a receptor
    and a ligand part; the text after the last ``"_"`` of each part is used
    as the accession. Pairs for which either accession is missing from the
    accession -> tax id mapping are counted as "not found" and excluded from
    all other counts and percentages.

    Three lines are printed:
      1. same / different tax id counts and percentages, plus the not-found count;
      2. self (identical accession) vs non-self counts, and the percentages
         recomputed with the self pairs removed;
      3. up to five example entries whose two tax ids differ, in the order
         they were first encountered in ``df``.

    Percentages whose denominator is zero (no resolvable pair, or only self
    pairs) are printed as ``nan`` instead of raising ``ZeroDivisionError``.

    NOTE(review): the comparison is on raw tax ids, not on species names, so
    two different tax ids that belong to one species would be reported as
    "different species". Confirm this is intended, or map the ids to species
    before comparing.

    Args:
        df: DataFrame with an ``entry`` column of ``"<receptor>--<ligand>"`` strings.
        split: Label used as a prefix in the printed lines (e.g. ``"Train"``).
        accession_map: Optional accession -> tax id mapping. When omitted, the
            notebook-level ``accession_to_taxid`` dictionary is used.

    Returns:
        None. The results are only printed.
    """
    if accession_map is None:
        # Backward-compatible default: the mapping built earlier in the notebook.
        accession_map = accession_to_taxid

    def percentage(part, whole):
        # Undefined ratios are reported as nan rather than crashing the cell.
        return (part / whole) * 100 if whole else float("nan")

    class_self = 0
    class_non_self = 0
    same = 0
    different = 0
    not_found = 0
    # A dict is used as an insertion-ordered set so the printed sample is
    # reproducible across kernels (plain set order depends on string hashing).
    different_species_pairs = {}

    # Only the "entry" column is needed, so iterate it directly instead of
    # materialising a Series per row with iterrows().
    for entry in df["entry"]:
        partners = entry.split("--")
        rec_uni = partners[0].split("_")[-1]
        lig_uni = partners[1].split("_")[-1]

        if rec_uni not in accession_map or lig_uni not in accession_map:
            not_found += 1
            continue

        if rec_uni == lig_uni:
            class_self += 1
        else:
            class_non_self += 1

        if accession_map[rec_uni] == accession_map[lig_uni]:
            same += 1
        else:
            different += 1
            different_species_pairs[entry] = None

    resolved = same + different
    # Self pairs always share a tax id, so they are removed from both the
    # "same" numerator and the denominator for the adjusted percentages.
    resolved_non_self = resolved - class_self

    print(f"{split} set - Same species interactions: {same} - {percentage(same, resolved):.1f}%, Different species interactions: {different} - {percentage(different, resolved):.1f}%, Not found: {not_found}")
    print(f"{split} set - Self interactions: {class_self}, Non-self interactions: {class_non_self},\n ----> new percentages: Same species: {percentage(same - class_self, resolved_non_self):.1f}%, Different species: {percentage(different, resolved_non_self):.1f}%")
    print(f"Different species pairs in {split} set: {list(different_species_pairs)[:5]}\n")  # Print first 5 for brevity

# Report the same-species statistics for each split, in train / validation / test order.
for split_label, negatives_frame in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    find_same_species_interactions(negatives_frame, split_label)

Train set - Same species interactions: 15061 - 13.8%, Different species interactions: 94042 - 86.2%, Not found: 30857
Train set - Self interactions: 9357, Non-self interactions: 99746,
 ----> new percentages: Same species: 5.7%, Different species: 94.3%
Different species pairs in Train set: ['8a3w__IB1_Q4QC11--1gqp__A1_P53068', '4qii__A1_P9WNP5--1seb__F1_P01911', '6k61__O1_P58565--5ez4__D1_Q9L4P8', '3s13__B2_Q80A30--8oqu__C1_O53871', '7w5z__H1_Q950Y9--7tgh__Z1_I7MFL6']

Validation set - Same species interactions: 531 - 65.6%, Different species interactions: 278 - 34.4%, Not found: 9
Validation set - Self interactions: 524, Non-self interactions: 285,
 ----> new percentages: Same species: 2.5%, Different species: 97.5%
Different species pairs in Validation set: ['6mou__A1_B3C969--1ny5__A1_O67198', '5ol8__A1_Q96QE5--2gsk__B1_P02929', '6pal__A1_A0A412SRH5--1ext__B1_P19438', '5amo__B1_O88998--3axb__A2_Q9YCJ0', '3ctv__A1_O28011--3o4w__B1_P33284']

Test set - Same species interactions: 383 -