Setup & Import Previous Results

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from config import Config
from utils import setup_logging, validate_sequence, save_checkpoint, clean_sequence

# Import Biopython
from Bio import Entrez, SeqIO
from Bio.Seq import Seq

# Session setup: create the project logger, then hand the configured contact
# address to Biopython's Entrez module before any request is made.
logger = setup_logging()
Entrez.email = Config.NCBI_EMAIL
logger.info("Starting improved global data fetching notebook")

# Banner describing the run; emitted as a single write, one line per entry.
banner_lines = [
    "IMPROVED GLOBAL BISTON BETULARIA COI SEARCH",
    "=" * 60,
    f"Target: {Config.TARGET_SPECIES} {Config.TARGET_GENE}",
    "Search strategy: Global (no geographic restrictions)",
    "Target sequences: 100+ for robust ABM simulation",
]
print("\n".join(banner_lines))


2025-06-01 13:04:38,762 - INFO - Starting data fetching notebook


PREVIOUS EXPLORATION RESULTS:
Total sequences available: 327
Target species: Biston betularia
Exploration date: 2025-06-01T13:04:29.105895


Global Search

In [None]:
def search_ncbi_global_aggressive(species, gene, max_results=100):
    """
    Search NCBI for sequences of a species/gene with no geographic filter.

    The query ANDs together the organism term, the gene term, a 300-900 bp
    sequence-length window and the genomic-biomolecule property filter.

    Args:
        species (str): Target species, used as the [Organism] term.
        gene (str): Target gene, used as the [Gene] term.
        max_results (int): Maximum number of IDs to request (retmax).

    Returns:
        tuple: (id_list, total_found). id_list is the list of IDs returned
        by the search; total_found is the total hit count reported for the
        query. If anything raises, the error is logged and ([], 0) is
        returned instead.
    """
    try:
        # Build search term - GLOBAL, NO RESTRICTIONS beyond minimal quality filters
        search_terms = [
            f"{species}[Organism]",
            f"{gene}[Gene]",
            "300:900[SLEN]",           # Expanded length range (300-900 bp)
            "biomol_genomic[PROP]",    # Genomic sequences only
        ]
        search_query = " AND ".join(search_terms)

        print(f"Search query: {search_query}")
        print(f"Searching globally for up to {max_results} sequences...")
        print(f"Length range: 300-900 bp (expanded from 400-800)")
        print(f"Geographic filter: NONE (truly global)")

        handle = Entrez.esearch(
            db=Config.NCBI_DATABASE,
            term=search_query,
            retmax=max_results,
            sort="relevance"
        )
        # Close the handle even when parsing the response raises.
        try:
            search_results = Entrez.read(handle)
        finally:
            handle.close()

        id_list = search_results["IdList"]
        total_found = int(search_results["Count"])

        # A query with no hits is a valid outcome, not a failure: report it
        # and skip the coverage percentage, which would divide by zero.
        if total_found == 0:
            print("No sequences matched the search query")
            return id_list, total_found

        print(f"SUCCESS: Found {total_found} total sequences globally!")
        print(f"Retrieved {len(id_list)} sequence IDs for download")
        print(f"Coverage: {len(id_list)/total_found*100:.1f}% of available sequences")

        return id_list, total_found

    except Exception as e:
        # Best-effort by design: callers receive an empty result rather than
        # an exception, and the cause is recorded in the log.
        logger.error(f"Search failed: {e}")
        return [], 0

# Run the unrestricted search for the configured species and gene, asking
# for at most 100 IDs.
global_ids, global_total = search_ncbi_global_aggressive(
    Config.TARGET_SPECIES,
    Config.TARGET_GENE,
    max_results=100,
)

# Report what the search returned. The last line is a fixed expectation
# written into the notebook, not a value computed from the results.
print(
    "\nGLOBAL SEARCH RESULTS:",
    f"Total available sequences: {global_total}",
    f"IDs selected for download: {len(global_ids)}",
    "Expected final dataset: 80-95 sequences (after QC)",
    sep="\n",
)

Search query: Biston betularia[Organism] AND COI[Gene] AND 400:800[SLEN] AND biomol_genomic[PROP]
Searching for up to 20 sequences...
Found 327 total sequences matching criteria
Retrieved 20 sequence IDs for download

search results:
Total global sequences: 327
IDs to download: 20


Batch Download Function

In [None]:
def fetch_sequences_batch_optimized(id_list, batch_size=8, delay=1.2):
    """
    Download GenBank records in batches, keeping those that pass validation.

    Each batch is requested with Entrez.efetch and parsed with SeqIO. Every
    parsed record's sequence is checked with validate_sequence. A batch that
    raises is logged, all of its requested IDs are recorded as failed, and
    the loop carries on with the next batch. The pause between batches is
    applied after failed batches as well as successful ones.

    Args:
        id_list (list): List of sequence IDs
        batch_size (int): Sequences per batch (smaller for stability)
        delay (float): Delay between batches (seconds)

    Returns:
        tuple: (all_records, failed_ids). all_records holds the records that
        passed validation. failed_ids holds the record IDs of sequences that
        failed validation together with the requested IDs of every batch
        that raised.

    Raises:
        ValueError: If batch_size is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    all_records = []
    failed_ids = []
    total_ids = len(id_list)
    total_batches = (total_ids + batch_size - 1) // batch_size

    print(f"\nSTARTING OPTIMIZED BATCH DOWNLOAD")
    print("=" * 50)
    print(f"Total sequences to download: {total_ids}")
    print(f"Batch size: {batch_size} (optimized for stability)")
    print(f"Estimated time: {total_ids/batch_size*delay/60:.1f} minutes")

    start_time = time.time()

    for batch_num, offset in enumerate(range(0, total_ids, batch_size), start=1):
        batch_ids = id_list[offset:offset + batch_size]

        # Progress indicator
        progress = (batch_num / total_batches) * 100
        print(f"\nBatch {batch_num}/{total_batches} ({progress:.1f}%) - {len(batch_ids)} sequences")

        try:
            handle = Entrez.efetch(
                db=Config.NCBI_DATABASE,
                id=batch_ids,
                rettype="gb",
                retmode="text"
            )
            # Close the handle even if parsing the response raises.
            try:
                batch_records = list(SeqIO.parse(handle, "genbank"))
            finally:
                handle.close()

            # Keep records whose sequence passes validation; note the rest.
            batch_success = 0
            for record in batch_records:
                seq_str = str(record.seq)
                if validate_sequence(seq_str):
                    all_records.append(record)
                    batch_success += 1
                    print(f"  ✓ {record.id} ({len(seq_str)} bp)")
                else:
                    print(f"  ✗ {record.id} (failed QC)")
                    failed_ids.append(record.id)

            print(f"  Batch result: {batch_success}/{len(batch_records)} sequences added")

            # Parsed record IDs are not the requested IDs, so a shortfall
            # can only be reported as a count, not attributed to an ID.
            if len(batch_records) < len(batch_ids):
                print(f"  ! Only {len(batch_records)} of {len(batch_ids)} requested records were returned")

        except Exception as e:
            logger.error(f"Batch {batch_num} failed: {e}")
            print(f"  ✗ Batch failed - continuing with next batch")
            failed_ids.extend(batch_ids)

        # Rate limiting sits outside the try block so that a failed request
        # is also followed by a pause before the next one is sent.
        if offset + batch_size < total_ids:
            print(f"  Waiting {delay}s... (Total so far: {len(all_records)} valid sequences)")
            time.sleep(delay)

    elapsed_time = (time.time() - start_time) / 60
    attempted = len(all_records) + len(failed_ids)
    success_rate = len(all_records) / attempted * 100 if attempted > 0 else 0

    print(f"\n{'='*50}")
    print(f"DOWNLOAD COMPLETE!")
    print(f"{'='*50}")
    print(f"Time elapsed: {elapsed_time:.1f} minutes")
    print(f"Successfully downloaded: {len(all_records)} sequences")
    print(f"Failed downloads: {len(failed_ids)} sequences")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"Data quality: All downloaded sequences passed basic validation")

    return all_records, failed_ids

# Download every ID returned by the global search, eight per request with a
# 1.2 s pause between requests. The second return value collects the IDs
# that failed QC or belonged to a batch that raised.
global_records, global_failed = fetch_sequences_batch_optimized(global_ids, batch_size=8, delay=1.2)

Starting global sequence download...
Downloading 20 sequences in batches of 5...
Processing batch 1/4 (5 sequences)...
  Added: OR369609.1
  Added: OR369606.1
  Added: OR369463.1
  Added: OR369263.1
  Added: OR368775.1
  Waiting 1.5s before next batch...
Processing batch 2/4 (5 sequences)...
  Added: OR368139.1
  Added: OQ564151.1
  Added: OQ563084.1
  Added: OQ182911.1
  Added: MG470639.1
  Waiting 1.5s before next batch...
Processing batch 3/4 (5 sequences)...
  Added: OK073271.1
  Added: MN689344.1
  Added: MF054227.1
  Added: MF054187.1
  Added: MF054181.1
  Waiting 1.5s before next batch...
Processing batch 4/4 (5 sequences)...
  Added: MF054004.1
  Added: MF053762.1
  Added: MF053700.1
  Added: MF053670.1
  Added: MF053668.1

Download complete!
Successfully downloaded: 20 sequences
Failed downloads: 0 sequences


Extract Comprehensive Metadata

In [None]:
def extract_enhanced_metadata(records):
    """
    Build a metadata table from sequence records, including parsed geography.

    For each record this collects record-level annotations, the qualifiers
    of its first "source" feature, and simple sequence statistics (GC
    content, N count, ambiguous-base count and percentage, validation
    status, Shannon entropy over A/T/G/C).

    Every source-derived column is always present. A record without a
    source feature, or without a given qualifier, gets 'Unknown'. The
    location is read from the 'country' qualifier, falling back to
    'geo_loc_name'. An empty sequence yields 0 for all ratio-based
    statistics instead of raising.

    Args:
        records (list): Record objects exposing id, description,
            annotations, features and seq.

    Returns:
        pandas.DataFrame: One row per record, in input order.
    """
    # Source-derived columns, in output order, all defaulting to 'Unknown'.
    source_columns = (
        'country', 'lat_lon', 'country_parsed', 'region',
        'collection_date', 'collected_by', 'identified_by',
        'specimen_voucher', 'isolate',
        'sex', 'life_stage', 'tissue_type',
    )
    # Output column -> qualifier name, for values copied through unchanged.
    direct_qualifiers = (
        ('lat_lon', 'lat_lon'),
        ('collection_date', 'collection_date'),
        ('collected_by', 'collected_by'),
        ('identified_by', 'identified_by'),
        ('specimen_voucher', 'specimen_voucher'),
        ('isolate', 'isolate'),
        ('sex', 'sex'),
        ('life_stage', 'dev_stage'),
        ('tissue_type', 'tissue_type'),
    )

    def first_value(qualifiers, *names):
        """Return the first value of the first non-empty qualifier, else 'Unknown'."""
        for name in names:
            values = qualifiers.get(name)
            if values:
                return values[0]
        return 'Unknown'

    metadata_list = []

    print(f"\nEXTRACTING ENHANCED METADATA")
    print("=" * 40)
    print(f"Processing {len(records)} sequence records...")

    for i, record in enumerate(records):
        metadata = {
            # NOTE(review): both keys hold record.id, as in the original
            # code; confirm whether accession_id should drop the version.
            'accession_id': record.id,
            'accession_version': record.id,
            'description': record.description,
            'organism': record.annotations.get('organism', 'Unknown'),
            'sequence_length': len(record.seq),
            'sequence': str(record.seq),
            'date_added': record.annotations.get('date', 'Unknown'),
            'keywords': ','.join(record.annotations.get('keywords', [])),
            'source': record.annotations.get('source', 'Unknown')
        }
        metadata.update(dict.fromkeys(source_columns, 'Unknown'))

        # Only the first "source" feature is consulted.
        source_feature = next(
            (feature for feature in record.features if feature.type == "source"),
            None,
        )
        if source_feature is not None:
            qualifiers = source_feature.qualifiers

            # Newer GenBank flat files carry the location under
            # 'geo_loc_name' rather than 'country'; accept either.
            raw_country = first_value(qualifiers, 'country', 'geo_loc_name')
            metadata['country'] = raw_country
            if raw_country != 'Unknown':
                # Value looks like "Country: region"; strip the padding
                # around the colon so grouping by either part is clean.
                country_part, _, region_part = raw_country.partition(':')
                metadata['country_parsed'] = country_part.strip()
                metadata['region'] = region_part.strip() or 'Unknown'

            for column, qualifier in direct_qualifiers:
                metadata[column] = first_value(qualifiers, qualifier)

        # Sequence statistics; ratios fall back to 0 for an empty sequence.
        seq_str = str(record.seq).upper()
        seq_len = len(seq_str)
        gc_bases = seq_str.count('G') + seq_str.count('C')
        metadata['gc_content'] = round(gc_bases / seq_len * 100, 2) if seq_len else 0.0
        metadata['n_count'] = seq_str.count('N')
        metadata['ambiguous_bases'] = sum(seq_str.count(code) for code in 'RYSWKMBDHV')
        metadata['ambiguous_percentage'] = (
            round(metadata['ambiguous_bases'] / seq_len * 100, 2) if seq_len else 0.0
        )

        # Quality assessment
        metadata['sequence_quality'] = 'Valid' if validate_sequence(seq_str) else 'Invalid'

        # Shannon entropy (bits) over the four unambiguous bases.
        base_counts = [seq_str.count(base) for base in 'ATGC']
        total_bases = sum(base_counts)
        if total_bases > 0:
            fractions = [count / total_bases for count in base_counts if count > 0]
            complexity = -sum(p * np.log2(p) for p in fractions)
            # Adding 0.0 turns a negative zero (single-base sequence) into 0.0.
            metadata['sequence_complexity'] = round(complexity + 0.0, 3)
        else:
            metadata['sequence_complexity'] = 0

        metadata_list.append(metadata)

        # Progress indicator
        if (i + 1) % 20 == 0 or (i + 1) == len(records):
            print(f"  Processed {i + 1}/{len(records)} records...")

    df = pd.DataFrame(metadata_list)
    print(f"Enhanced metadata extraction complete!")

    return df

# Build the metadata table from the downloaded records and print an
# overview: record counts, country breakdown, QC status and basic
# length / GC statistics.
if not global_records:
    print("No records to process")
else:
    global_metadata_df = extract_enhanced_metadata(global_records)

    print("\nENHANCED METADATA SUMMARY:")
    print("=" * 40)
    print(f"Total records processed: {len(global_metadata_df)}")
    print(f"Unique countries: {global_metadata_df['country_parsed'].nunique()}")
    # 'Unknown' is the placeholder written when a qualifier is absent.
    with_coordinates = (global_metadata_df['lat_lon'] != 'Unknown').sum()
    print(f"Records with coordinates: {with_coordinates}")
    with_dates = (global_metadata_df['collection_date'] != 'Unknown').sum()
    print(f"Records with collection dates: {with_dates}")

    # Ten most frequent countries, most common first.
    print("\nTop 10 countries represented:")
    country_counts = global_metadata_df['country_parsed'].value_counts().head(10)
    for country in country_counts.index:
        print(f"  {country}: {country_counts[country]} sequences")

    # Number of records per validation outcome.
    print("\nQuality summary:")
    quality_counts = global_metadata_df['sequence_quality'].value_counts()
    for quality in quality_counts.index:
        print(f"  {quality}: {quality_counts[quality]} sequences")

    print("\nSequence statistics:")
    length_column = global_metadata_df['sequence_length']
    print(f"  Length range: {length_column.min()}-{length_column.max()} bp")
    print(f"  Mean length: {length_column.mean():.1f} bp")
    gc_column = global_metadata_df['gc_content']
    print(f"  GC content range: {gc_column.min():.1f}-{gc_column.max():.1f}%")
    print(f"  Mean GC content: {gc_column.mean():.1f}%")


Extracting metadata from 20 records...
  Processed 10/20 records...
  Processed 20/20 records...
Metadata extraction complete!

global METADATA SUMMARY:
Records processed: 20
Unique countries: 1
Date range: 04-Jul-2014 to Unknown


Save Data Set

In [None]:
# Persist the downloaded sequences, the metadata table, a filtered
# high-quality subset and a JSON run summary. Nothing is written when the
# download produced no records or the metadata table is empty.
if global_records and not global_metadata_df.empty:
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Create the output folders up front so the first write cannot fail
    # with FileNotFoundError on a fresh checkout.
    raw_dir = "../data/raw"
    processed_dir = "../data/processed"
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(processed_dir, exist_ok=True)

    # Save sequences in FASTA format
    fasta_filename = f"{raw_dir}/biston_betularia_global_enhanced_{timestamp}.fasta"
    SeqIO.write(global_records, fasta_filename, "fasta")
    print(f"\nSequences saved: {fasta_filename}")

    # Save enhanced metadata
    metadata_filename = f"{processed_dir}/biston_betularia_global_enhanced_{timestamp}.csv"
    global_metadata_df.to_csv(metadata_filename, index=False)
    print(f"Enhanced metadata saved: {metadata_filename}")

    # High-quality subset: valid, 400-800 bp, at most 5% ambiguous bases.
    high_quality = global_metadata_df[
        (global_metadata_df['sequence_quality'] == 'Valid') &
        (global_metadata_df['sequence_length'] >= 400) &
        (global_metadata_df['sequence_length'] <= 800) &
        (global_metadata_df['ambiguous_percentage'] <= 5.0)
    ].copy()

    # The subset file is only written when at least one record qualifies.
    if not high_quality.empty:
        hq_filename = f"{processed_dir}/biston_betularia_high_quality_{timestamp}.csv"
        high_quality.to_csv(hq_filename, index=False)
        print(f"High-quality dataset saved: {hq_filename}")
        print(f"High-quality sequences: {len(high_quality)}")

    # Run summary. Values are cast to built-in int/float so json.dump does
    # not depend on which numeric types pandas returns.
    exploration_results = {
        'enhanced_global_search_date': datetime.now().isoformat(),
        'total_sequences_available': int(global_total),
        'sequences_downloaded': len(global_records),
        'sequences_failed': len(global_failed),
        'enhanced_metadata_records': len(global_metadata_df),
        'high_quality_records': len(high_quality),
        'target_species': Config.TARGET_SPECIES,
        'target_gene': Config.TARGET_GENE,
        'search_strategy': 'enhanced_global_aggressive',
        'length_range': '300-900bp',
        # global_records is non-empty here, so the denominator is positive.
        'success_rate': len(global_records)/(len(global_records)+len(global_failed))*100,
        'countries_represented': int(global_metadata_df['country_parsed'].nunique()),
        'mean_sequence_length': float(global_metadata_df['sequence_length'].mean()),
        'mean_gc_content': float(global_metadata_df['gc_content'].mean())
    }

    with open(f'{processed_dir}/exploration_results_enhanced.json', 'w') as f:
        json.dump(exploration_results, f, indent=2)

    print(f"Enhanced exploration results saved")

GEOGRAPHIC DISTRIBUTION ANALYSIS:

Country distribution:
  Unknown: 20 sequences

Coordinate data:
  Records with coordinates: 11
  Records without coordinates: 9
  Sample coordinates:
    47.867 N 16.267 E
    47.157 N 10.923 E
    47.157 N 10.923 E
    46.635 N 14.895 E
    48.3989 N 15.5314 E

Temporal distribution:
  Records with dates: 11
  Date range: 04-Jul-2014 to 28-May-2013

FILTERED global DATASET:
Original records: 20
High-quality global records: 0


Data Quality Summary

In [None]:
# Closing report for the notebook: download totals, dataset shape and a
# readiness verdict derived from the number of downloaded sequences.
print(f"\n{'='*70}")
print("ENHANCED GLOBAL DATA FETCHING COMPLETE")
print(f"{'='*70}")

if global_records:
    downloaded = len(global_records)

    # One size-based verdict drives both the readiness line and the closing
    # message, so the two can no longer contradict each other.
    if downloaded >= 80:
        readiness = "EXCELLENT"
        verdict = f"\n🎉 SUCCESS: Dataset size ({downloaded} sequences) is excellent for ABM modeling!"
    elif downloaded >= 50:
        readiness = "GOOD"
        verdict = f"\n✅ GOOD: Dataset size ({downloaded} sequences) is sufficient for ABM modeling"
    else:
        readiness = "LIMITED"
        verdict = f"\n⚠️  NOTE: Dataset size ({downloaded} sequences) is small but usable"

    print(f"FINAL RESULTS:")
    print(f"  Search strategy: Global aggressive (no geographic restrictions)")
    print(f"  Total sequences available: {global_total}")
    print(f"  Sequences successfully downloaded: {downloaded}")
    print(f"  Success rate: {downloaded/(downloaded+len(global_failed))*100:.1f}%")
    print(f"  Countries represented: {global_metadata_df['country_parsed'].nunique()}")
    print(f"  Geographic coverage: Global")
    print(f"  Length range: {global_metadata_df['sequence_length'].min()}-{global_metadata_df['sequence_length'].max()} bp")
    print(f"  Quality: All sequences validated")

    # Distinct sequence strings; this can never exceed the row count.
    expected_haplotypes = global_metadata_df['sequence'].nunique()
    print(f"  Expected unique haplotypes: {expected_haplotypes}")
    print(f"  ABM simulation readiness: {readiness}")

    print(f"\nNEXT STEPS:")
    print(f"  1. Run Notebook 3 with enhanced global dataset")
    print(f"  2. Expected 15-25+ haplotypes for robust ABM simulation")
    print(f"  3. Rich geographic diversity for environmental modeling")
    print(f"  4. Strong statistical foundation for evolutionary analysis")

    print(verdict)

else:
    print("❌ ERROR: No sequences were successfully downloaded")
    print("   Check NCBI connection and search parameters")
    print("   Consider relaxing quality filters or expanding search criteria")


DATA FETCHING SUMMARY
Search parameters:
  Target species: Biston betularia
  Target gene: COI
  Geographic filter: United Kingdom
  Sequence length: 400-800 bp

Download results:
  Total sequences found: 327
  Sequences downloaded: 20
  Failed downloads: 0
  Success rate: 100.0%

Data quality:
  Valid sequences: 20

Sequence statistics:
  Mean length: 650.0 bp
  Length range: 540-681 bp
  Mean GC content: 30.2%
