In [7]:
!pip install Bio



In [8]:
import json
import os
import time

from Bio import Entrez, SeqIO

In [9]:
# Contact address set on the Entrez module before any request is made.
# Read from the ENTREZ_EMAIL environment variable when it is set, so that people
# re-running the notebook do not issue requests under the original author's address;
# otherwise fall back to the address the notebook was written with.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "as.mohammed@ufl.edu")

In [10]:
def map_order_to_superorder(order_name):
    """Map a mammalian order name to the superorder-level group used for stratification.

    Args:
        order_name: Order name to look up (e.g. ``'Carnivora'``). The lookup is an
            exact, case-sensitive dictionary match.

    Returns:
        One of ``'Marsupials'``, ``'Afrotheria'``, ``'Xenarthra'``,
        ``'Laurasiatheria'`` or ``'Euarchontoglires'``; ``'Unclassified'`` when the
        order has no entry in the table.
    """
    superorder_map = {
        # Marsupials (all seven extant orders)
        'Didelphimorphia': 'Marsupials',
        'Diprotodontia': 'Marsupials',
        'Peramelemorphia': 'Marsupials',
        'Dasyuromorphia': 'Marsupials',
        'Microbiotheria': 'Marsupials',
        'Notoryctemorphia': 'Marsupials',
        'Paucituberculata': 'Marsupials',
        # Afrotheria
        'Proboscidea': 'Afrotheria',
        'Sirenia': 'Afrotheria',
        'Hyracoidea': 'Afrotheria',
        'Tubulidentata': 'Afrotheria',
        'Macroscelidea': 'Afrotheria',
        'Afrosoricida': 'Afrotheria',
        # Xenarthra
        'Cingulata': 'Xenarthra',
        'Pilosa': 'Xenarthra',
        # Laurasiatheria
        'Eulipotyphla': 'Laurasiatheria',
        'Chiroptera': 'Laurasiatheria',
        'Carnivora': 'Laurasiatheria',
        'Pholidota': 'Laurasiatheria',
        'Perissodactyla': 'Laurasiatheria',
        # Both names are listed so either spelling of the even-toed ungulate order matches.
        'Cetartiodactyla': 'Laurasiatheria',
        'Artiodactyla': 'Laurasiatheria',
        # Euarchontoglires
        'Primates': 'Euarchontoglires',
        'Scandentia': 'Euarchontoglires',
        'Dermoptera': 'Euarchontoglires',
        'Rodentia': 'Euarchontoglires',
        'Lagomorpha': 'Euarchontoglires',
    }
    # NOTE(review): Monotremata has no entry and therefore maps to 'Unclassified',
    # together with any missing/unknown order; confirm whether a separate group is wanted.
    return superorder_map.get(order_name, 'Unclassified')

In [11]:
def get_precedence(definition):
    """
    Score a protein definition line by how it names the alpha chain.

    Matching is case-insensitive and purely substring-based:
    1 = contains "alpha" but neither "subunit alpha" nor "alpha-like"
    2 = contains "subunit alpha" but not "alpha-like"
    3 = contains "alpha-like" (with or without "subunit")
    4 = anything else (will be filtered out)
    """
    text = definition.lower()

    # Test the most specific phrase first; every later check can then rely on the
    # earlier phrases being absent.
    if "alpha-like" in text:
        return 3
    if "subunit alpha" in text:
        return 2
    if "alpha" in text:
        return 1
    return 4


In [12]:
print("Fetching all mammalian hemoglobin alpha sequences...")

# esearch returns at most `retmax` IDs, however many records match the query.
handle = Entrez.esearch(db="protein",
                        term="hemoglobin alpha AND Mammalia[Organism]",
                        retmax=5000)
try:
    record = Entrez.read(handle)
finally:
    # Release the HTTP connection even if parsing fails.
    handle.close()
all_protein_ids = record['IdList']

print(f"Found {len(all_protein_ids)} sequences total")

# 'Count' is the number of records matching the query; if it exceeds the number of
# IDs returned, the list above is only a truncated subset of the matches.
total_matches = int(record.get('Count', len(all_protein_ids)))
if total_matches > len(all_protein_ids):
    print(f"WARNING: the search matched {total_matches} records but only "
          f"{len(all_protein_ids)} IDs were returned; the ID list is truncated by retmax.")

Fetching all mammalian hemoglobin alpha sequences...
Found 5000 sequences total


In [13]:
# One dict per protein record that has a taxon ID and a resolvable lineage.
sequences_data = []

# Lineage ({rank: scientific name}) per taxonomy ID. Many protein records share a
# source organism, so each taxon is fetched from the taxonomy database only once.
lineage_by_tax_id = {}

for i, protein_id in enumerate(all_protein_ids):
    # Progress message (and a short pause) every 50 records.
    if i % 50 == 0:
        print(f"Processing {i}/{len(all_protein_ids)}...")
        time.sleep(0.5)

    try:
        fetch_handle = Entrez.efetch(db="protein", id=protein_id,
                                     rettype="gb", retmode="xml")
        try:
            fetch_records = Entrez.read(fetch_handle)
        finally:
            # Always release the connection, even when parsing raises.
            fetch_handle.close()

        if not fetch_records:
            continue

        record_data = fetch_records[0]
        sequence = record_data['GBSeq_sequence']
        organism = record_data['GBSeq_organism']
        protein_def = record_data['GBSeq_definition']

        # The taxonomy ID is stored as a "taxon:<id>" db_xref qualifier on the
        # record's 'source' feature.
        tax_id = None
        if 'GBSeq_feature-table' in record_data:
            for feature in record_data['GBSeq_feature-table']:
                if feature['GBFeature_key'] == 'source':
                    for qualifier in feature['GBFeature_quals']:
                        if qualifier['GBQualifier_name'] == 'db_xref':
                            if 'taxon' in qualifier['GBQualifier_value']:
                                tax_id = qualifier['GBQualifier_value'].split(':')[1]

        # Records without a taxon ID are skipped: they cannot be placed in an order.
        if tax_id:
            if tax_id not in lineage_by_tax_id:
                tax_handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
                try:
                    tax_records = Entrez.read(tax_handle)
                finally:
                    tax_handle.close()
                tax_record = tax_records[0]

                # When several lineage entries share a rank, the last one wins.
                lineage_by_tax_id[tax_id] = {
                    tax_level['Rank']: tax_level['ScientificName']
                    for tax_level in tax_record.get('LineageEx', [])
                }

            lineage_dict = lineage_by_tax_id[tax_id]

            order_name = lineage_dict.get('order', 'Unknown')
            superorder_name = map_order_to_superorder(order_name)

            sequences_data.append({
                'protein_id': protein_id,
                'tax_id': tax_id,
                'species': organism,
                'sequence': sequence,
                'definition': protein_def,
                'kingdom': lineage_dict.get('kingdom', 'Animalia'),
                'phylum': lineage_dict.get('phylum', ''),
                'class': lineage_dict.get('class', ''),
                'order': order_name,
                'family': lineage_dict.get('family', ''),
                'genus': lineage_dict.get('genus', ''),
                'seq_length': len(sequence),
                'superorder': superorder_name
            })

    except Exception as e:
        # Best-effort harvest: report the failing record and move on to the next one.
        print(f"Error processing {protein_id}: {e}")
        continue

print(f"Successfully retrieved {len(sequences_data)} sequences with taxonomy")

Processing 0/5000...
Processing 50/5000...
Processing 100/5000...
Processing 150/5000...
Processing 200/5000...
Processing 250/5000...
Processing 300/5000...
Processing 350/5000...
Processing 400/5000...
Processing 450/5000...
Processing 500/5000...
Processing 550/5000...
Processing 600/5000...
Processing 650/5000...
Processing 700/5000...
Processing 750/5000...
Processing 800/5000...
Processing 850/5000...
Processing 900/5000...
Processing 950/5000...
Processing 1000/5000...
Processing 1050/5000...
Processing 1100/5000...
Processing 1150/5000...
Processing 1200/5000...
Processing 1250/5000...
Processing 1300/5000...
Processing 1350/5000...
Processing 1400/5000...
Processing 1450/5000...
Processing 1500/5000...
Processing 1550/5000...
Processing 1600/5000...
Processing 1650/5000...
Processing 1700/5000...
Processing 1750/5000...
Processing 1800/5000...
Processing 1850/5000...
Processing 1900/5000...
Processing 1950/5000...
Processing 2000/5000...
Processing 2050/5000...
Processing 2100

In [14]:
import pandas as pd



In [18]:
print("\n=== APPLYING FILTERS ===\n")

# Build the working table from the fetched records and score each definition line.
df = pd.DataFrame(sequences_data)
df['precedence'] = df['definition'].apply(get_precedence)

# Keep precedence 1-3 only; 4 is the "other" score returned by get_precedence.
has_alpha_name = df['precedence'].le(3)
df = df.loc[has_alpha_name].copy()
print(f"After filtering for alpha/subunit alpha/alpha-like: {len(df)} sequences")

# Drop records whose definition mentions "partial" (case-insensitive).
is_partial = df['definition'].str.lower().str.contains('partial', na=False)
df = df.loc[~is_partial].copy()
print(f"After removing partial sequences: {len(df)} sequences")

# Lower length bound: strictly more than 100 amino acids (full-length).
longer_than_min = df['seq_length'].gt(100)
df = df.loc[longer_than_min].copy()
print(f"After length filter (>100 aa): {len(df)} sequences")

# Upper length bound: strictly fewer than 200 amino acids.
shorter_than_max = df['seq_length'].lt(200)
df = df.loc[shorter_than_max].copy()
print(f"After length filter (<200 aa): {len(df)} sequences")

# Within each species put the best (lowest) precedence first and, among equal
# precedence, the longest sequence first; the first row per species is then kept.
sort_columns = ['species', 'precedence', 'seq_length']
sort_ascending = [True, True, False]
df = df.sort_values(by=sort_columns, ascending=sort_ascending)
df_unique = df.drop_duplicates(subset=['species'], keep='first')

print(f"After deduplication (one per species): {len(df_unique)} unique species")





=== APPLYING FILTERS ===

After filtering for alpha/subunit alpha/alpha-like: 3117 sequences
After removing partial sequences: 3040 sequences
After length filter (>100 aa): 2824 sequences
After length filter (<200 aa): 2769 sequences
After deduplication (one per species): 402 unique species


In [21]:

print("\n=== DATASET SUMMARY ===\n")

# Group sizes, listed alphabetically by superorder label.
superorder_counts = df_unique['superorder'].value_counts().sort_index()
print("Sequences per superorder:")
print(superorder_counts)

# The 15 most frequent orders, most frequent first.
top_order_counts = df_unique['order'].value_counts().head(15)
print("\nSequences per order (top 15):")
print(top_order_counts)

# How many of the kept sequences came from each precedence score.
precedence_counts = df_unique['precedence'].value_counts().sort_index()
print("\nPrecedence distribution:")
print(precedence_counts)

# Range and mean of the kept sequence lengths.
seq_lengths = df_unique['seq_length']
print("\nSequence length stats (amino acids):")
print(f"  Min: {seq_lengths.min()}")
print(f"  Max: {seq_lengths.max()}")
print(f"  Mean: {seq_lengths.mean():.1f}")



=== DATASET SUMMARY ===

Sequences per superorder:
superorder
Afrotheria           13
Euarchontoglires    164
Laurasiatheria      203
Marsupials           14
Unclassified          4
Xenarthra             4
Name: count, dtype: int64

Sequences per order (top 15):
order
Rodentia          81
Artiodactyla      72
Carnivora         66
Primates          65
Chiroptera        35
Eulipotyphla      15
Lagomorpha        14
Perissodactyla    13
Diprotodontia      6
Proboscidea        4
Dasyuromorphia     4
Pilosa             3
Afrosoricida       3
Sirenia            3
Monotremata        3
Name: count, dtype: int64

Precedence distribution:
precedence
1    125
2    269
3      8
Name: count, dtype: int64

Sequence length stats (amino acids):
  Min: 101
  Max: 187
  Mean: 138.8


In [22]:

# Write the deduplicated table twice: as CSV (no index column) and as a JSON list
# with one object per row.
csv_path = 'hemoglobin_filtered_unique.csv'
json_path = 'hemoglobin_filtered_unique.json'

df_unique.to_csv(csv_path, index=False)

row_records = df_unique.to_dict('records')
with open(json_path, 'w') as json_file:
    json.dump(row_records, json_file, indent=2)

print("\nSaved to hemoglobin_filtered_unique.csv and .json")
print(f"\nTotal unique species ready for stratified sampling: {len(df_unique)}")


Saved to hemoglobin_filtered_unique.csv and .json

Total unique species ready for stratified sampling: 402
