In [1]:
import pandas as pd
import random

In [2]:
# Load the hemoglobin record table. Later cells read these columns from it:
# superorder, order, genus, species, precedence, seq_length, sequence, protein_id.
input_csv = 'hemoglobin_filtered_unique.csv'
df = pd.read_csv(input_csv)

In [3]:
# Number of records to draw from each superorder (14 + 13 + 4 + 79 + 99 = 209).
# Key order matters: the sampling cell walks the superorders in this sequence.
superorder_targets = dict(
    Marsupials=14,
    Afrotheria=13,
    Xenarthra=4,
    Laurasiatheria=79,
    Euarchontoglires=99,
)

In [7]:
# Stratified sampling of hemoglobin records.
#
# For every superorder in `superorder_targets`:
#   1. split the superorder's target across its orders in proportion to how
#      many records each order has (every order gets at least 1);
#   2. push any rounding difference onto the order with the largest quota so
#      the quotas sum to the target;
#   3. inside each order, take one record per genus (lowest `precedence`,
#      then longest `seq_length`) until the quota is met;
#   4. if the genera run out first, top up at random from species that have
#      not been chosen yet.
#
# Result: `selected_records`, a list of row dicts.

# Fixed seed so the random top-up in step 4 picks the same records on every
# run. A private generator is used so the global `random` state is untouched.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

selected_records = []

print("\n=== STRATIFIED SAMPLING ===\n")

for superorder, target in superorder_targets.items():
    super_df = df[df['superorder'] == superorder].copy()
    available = len(super_df)

    print(f"{superorder}: {available} available, targeting {target}")

    # With no rows there is nothing to allocate, and the quota correction
    # below would call max() on an empty dict and raise ValueError.
    if available == 0:
        print(f"  WARNING: no records for {superorder}; skipping")
        continue

    # Orders in order of first appearance; this also fixes which order wins
    # a tie for "largest quota" below.
    orders = super_df['order'].unique()

    # Proportional share per order, truncated to an int, but never below 1.
    order_quotas = {}
    for order in orders:
        order_count = int((super_df['order'] == order).sum())
        order_quotas[order] = max(1, int(target * order_count / available))

    # Truncation and the minimum of 1 can make the quotas miss the target in
    # either direction; the order with the largest quota absorbs the difference.
    diff = target - sum(order_quotas.values())
    if diff != 0:
        largest_order = max(order_quotas, key=order_quotas.get)
        order_quotas[largest_order] += diff

    superorder_selected = 0
    for order in orders:
        order_df = super_df[super_df['order'] == order].copy()
        quota = order_quotas[order]

        # One representative per genus, visiting genera in order of first
        # appearance and stopping as soon as the quota is filled.
        sampled_records = []
        for genus in order_df['genus'].unique():
            if len(sampled_records) >= quota:
                break
            genus_df = order_df[order_df['genus'] == genus]
            if genus_df.empty:
                # e.g. a missing (NaN) genus never compares equal to itself
                continue
            best = genus_df.sort_values(['precedence', 'seq_length'],
                                        ascending=[True, False]).iloc[0]
            sampled_records.append(best.to_dict())

        # Fewer genera than the quota: fill the gap with a random draw from
        # records whose species has not been picked yet.
        if len(sampled_records) < quota:
            used_species = {r['species'] for r in sampled_records}
            remaining = order_df[~order_df['species'].isin(used_species)].to_dict('records')
            if remaining:
                n_fill = min(quota - len(sampled_records), len(remaining))
                sampled_records.extend(rng.sample(remaining, n_fill))

        selected_records.extend(sampled_records)
        superorder_selected += len(sampled_records)
        print(f"  {order}: selected {len(sampled_records)}")

    # An order can hold fewer records than its (corrected) quota; report the
    # shortfall instead of letting the total drift from the target silently.
    if superorder_selected != target:
        print(f"  WARNING: selected {superorder_selected} of {target} for {superorder}")


=== STRATIFIED SAMPLING ===

Marsupials: 14 available, targeting 14
  Dasyuromorphia: selected 4
  Didelphimorphia: selected 3
  Diprotodontia: selected 6
  Peramelemorphia: selected 1
Afrotheria: 13 available, targeting 13
  Afrosoricida: selected 3
  Sirenia: selected 3
  Macroscelidea: selected 1
  Proboscidea: selected 4
  Tubulidentata: selected 1
  Hyracoidea: selected 1
Xenarthra: 4 available, targeting 4
  Pilosa: selected 3
  Cingulata: selected 1
Laurasiatheria: 203 available, targeting 79
  Carnivora: selected 25
  Artiodactyla: selected 30
  Chiroptera: selected 13
  Eulipotyphla: selected 5
  Perissodactyla: selected 5
  Pholidota: selected 1
Euarchontoglires: 164 available, targeting 99
  Rodentia: selected 50
  Primates: selected 39
  Dermoptera: selected 1
  Lagomorpha: selected 8
  Scandentia: selected 1


In [9]:

# Gather every sampled row dict into a single table.
final_df = pd.DataFrame(selected_records)

n_selected = len(final_df)
print(f"\nFinal selection: {n_selected} species")

# Rows per superorder, sorted by superorder label, to compare against the targets.
per_superorder = final_df['superorder'].value_counts().sort_index()
print("\nFinal distribution:")
print(per_superorder)

# Save the selection without the DataFrame index column.
output_csv = 'hemoglobin_209_species_final.csv'
final_df.to_csv(output_csv, index=False)


✓ Final selection: 209 species

Final distribution:
superorder
Afrotheria          13
Euarchontoglires    99
Laurasiatheria      79
Marsupials          14
Xenarthra            4
Name: count, dtype: int64


In [12]:
!pip install Bio


Collecting Bio
  Downloading bio-1.8.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.1-py3-none-any.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.3/321.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl (9.

In [16]:
# SeqRecord is imported from Bio.SeqRecord, the module that defines it, rather
# than through Bio.SeqIO, which only happens to re-export the name.
from Bio.SeqIO import write
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# One FASTA entry per selected row: the protein id becomes the record id and
# "species | order | superorder" becomes the description text.
fasta_records = [
    SeqRecord(
        Seq(row['sequence']),
        id=str(row['protein_id']),
        description=f"{row['species']} | {row['order']} | {row['superorder']}",
    )
    for _, row in final_df.iterrows()
]

# `write` returns the number of records written; keeping it as the cell's last
# expression displays that count as the cell output.
write(fasta_records, "hemoglobin_209_species_final.fasta", "fasta")

209