Setup & Import Previous Results

In [1]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from config import Config
from utils import setup_logging, validate_sequence, save_checkpoint, clean_sequence

# Import Biopython
from Bio import Entrez, SeqIO
from Bio.Seq import Seq

# Setup
logger = setup_logging()
Entrez.email = Config.NCBI_EMAIL
logger.info("Starting data fetching notebook")

# Load previous exploration results
with open('../data/processed/exploration_results.json', 'r') as f:
    exploration_results = json.load(f)

print("PREVIOUS EXPLORATION RESULTS:")
print(f"Total sequences available: {exploration_results['total_sequences_available']}")
print(f"Target species: {exploration_results['target_species']}")
print(f"Exploration date: {exploration_results['exploration_date']}")

2025-05-31 22:32:51,096 - INFO - Starting data fetching notebook


PREVIOUS EXPLORATION RESULTS:
Total sequences available: 327
Target species: Biston betularia
Exploration date: 2025-05-31T22:16:22.668568


Advanced Search with Filters

In [2]:
def search_ncbi_with_filters(species, gene, location=None, max_results=50):
    """
    Search NCBI dengan filters geografis dan kualitas
    
    Args:
        species (str): Target species
        gene (str): Target gene
        location (str): Geographic filter (optional)
        max_results (int): Maximum results to return
    
    Returns:
        list: List of sequence IDs
    """
    try:
        # Build search term
        search_terms = [
            f"{species}[Organism]",
            f"{gene}[Gene]"
        ]
        
        # Add geographic filter if specified
        if location:
            search_terms.append(f"{location}[Country]")
        
        # Add quality filters
        search_terms.extend([
            "400:800[SLEN]",  # Sequence length between 400-800 bp
            "biomol_genomic[PROP]"  # Genomic sequences only
        ])
        
        search_query = " AND ".join(search_terms)
        
        print(f"Search query: {search_query}")
        print(f"Searching for up to {max_results} sequences...")
        
        # Perform search
        handle = Entrez.esearch(
            db=Config.NCBI_DATABASE,
            term=search_query,
            retmax=max_results,
            sort="relevance"
        )
        
        search_results = Entrez.read(handle)
        handle.close()
        
        id_list = search_results["IdList"]
        total_found = int(search_results["Count"])
        
        print(f"Found {total_found} total sequences matching criteria")
        print(f"Retrieved {len(id_list)} sequence IDs for download")
        
        return id_list, total_found
        
    except Exception as e:
        logger.error(f"Search failed: {e}")
        return [], 0

# Search dengan filter UK
uk_ids, uk_total = search_ncbi_with_filters(
    species=Config.TARGET_SPECIES,
    gene=Config.TARGET_GENE,
    location=Config.TARGET_LOCATION,
    max_results=Config.MAX_SEQUENCES_TEST
)

print(f"\nUK-specific search results:")
print(f"Total UK sequences: {uk_total}")
print(f"IDs to download: {len(uk_ids)}")

Search query: Biston betularia[Organism] AND COI[Gene] AND United Kingdom[Country] AND 400:800[SLEN] AND biomol_genomic[PROP]
Searching for up to 20 sequences...
Found 4 total sequences matching criteria
Retrieved 4 sequence IDs for download

UK-specific search results:
Total UK sequences: 4
IDs to download: 4


Batch Download Function

In [3]:
def fetch_sequences_batch(id_list, batch_size=10, delay=1.0):
    """
    Download sequences dalam batches untuk avoid rate limits
    
    Args:
        id_list (list): List of sequence IDs
        batch_size (int): Number of sequences per batch
        delay (float): Delay between batches (seconds)
    
    Returns:
        list: List of SeqRecord objects
    """
    all_records = []
    failed_ids = []
    
    print(f"Downloading {len(id_list)} sequences in batches of {batch_size}...")
    
    for i in range(0, len(id_list), batch_size):
        batch_ids = id_list[i:i+batch_size]
        batch_num = (i // batch_size) + 1
        total_batches = (len(id_list) + batch_size - 1) // batch_size
        
        print(f"Processing batch {batch_num}/{total_batches} ({len(batch_ids)} sequences)...")
        
        try:
            # Fetch batch
            handle = Entrez.efetch(
                db=Config.NCBI_DATABASE,
                id=batch_ids,
                rettype="gb",
                retmode="text"
            )
            
            # Parse records
            batch_records = list(SeqIO.parse(handle, "genbank"))
            handle.close()
            
            # Validate and add records
            for record in batch_records:
                if validate_sequence(str(record.seq)):
                    all_records.append(record)
                    print(f"  Added: {record.id}")
                else:
                    print(f"  Skipped (invalid): {record.id}")
                    failed_ids.append(record.id)
            
            # Rate limiting delay
            if i + batch_size < len(id_list):  # Don't delay after last batch
                print(f"  Waiting {delay}s before next batch...")
                time.sleep(delay)
                
        except Exception as e:
            logger.error(f"Batch {batch_num} failed: {e}")
            failed_ids.extend(batch_ids)
            continue
    
    print(f"\nDownload complete!")
    print(f"Successfully downloaded: {len(all_records)} sequences")
    print(f"Failed downloads: {len(failed_ids)} sequences")
    
    return all_records, failed_ids

# Download UK sequences
print("Starting UK sequence download...")
uk_records, uk_failed = fetch_sequences_batch(uk_ids, batch_size=5, delay=1.5)

Starting UK sequence download...
Downloading 4 sequences in batches of 5...
Processing batch 1/1 (4 sequences)...
  Added: KX043781.1
  Added: KX071139.1
  Added: KX044146.1
  Added: KX043790.1

Download complete!
Successfully downloaded: 4 sequences
Failed downloads: 0 sequences


Extract Comprehensive Metadata

In [5]:
def extract_comprehensive_metadata(records):
    """
    Extract detailed metadata dari GenBank records
    
    Args:
        records (list): List of SeqRecord objects
    
    Returns:
        pandas.DataFrame: Comprehensive metadata
    """
    metadata_list = []
    
    print(f"Extracting metadata from {len(records)} records...")
    
    for i, record in enumerate(records):
        metadata = {
            'accession_id': record.id,
            'accession_version': record.id,
            'description': record.description,
            'organism': record.annotations.get('organism', 'Unknown'),
            'sequence_length': len(record.seq),
            'sequence': str(record.seq),
            'date_added': record.annotations.get('date', 'Unknown'),
            'keywords': ','.join(record.annotations.get('keywords', [])),
            'source': record.annotations.get('source', 'Unknown')
        }
        
        # Extract from source feature
        for feature in record.features:
            if feature.type == "source":
                qualifiers = feature.qualifiers
                
                # Geographic information
                metadata['country'] = qualifiers.get('country', ['Unknown'])[0]
                metadata['lat_lon'] = qualifiers.get('lat_lon', ['Unknown'])[0]
                
                # Collection information
                metadata['collection_date'] = qualifiers.get('collection_date', ['Unknown'])[0]
                metadata['collected_by'] = qualifiers.get('collected_by', ['Unknown'])[0]
                metadata['identified_by'] = qualifiers.get('identified_by', ['Unknown'])[0]
                
                # Specimen information
                metadata['specimen_voucher'] = qualifiers.get('specimen_voucher', ['Unknown'])[0]
                metadata['isolate'] = qualifiers.get('isolate', ['Unknown'])[0]
                
                # Additional fields
                metadata['sex'] = qualifiers.get('sex', ['Unknown'])[0]
                metadata['life_stage'] = qualifiers.get('dev_stage', ['Unknown'])[0]
                metadata['tissue_type'] = qualifiers.get('tissue_type', ['Unknown'])[0]
                
                break
        
        # Sequence composition analysis
        seq_str = str(record.seq).upper()
        metadata['gc_content'] = round((seq_str.count('G') + seq_str.count('C')) / len(seq_str) * 100, 2)
        metadata['n_count'] = seq_str.count('N')
        metadata['ambiguous_bases'] = sum(1 for char in seq_str if char in 'RYSWKMBDHV')
        
        # Quality metrics
        metadata['sequence_quality'] = 'Valid' if validate_sequence(seq_str) else 'Invalid'
        
        metadata_list.append(metadata)
        
        if (i + 1) % 10 == 0:
            print(f"  Processed {i + 1}/{len(records)} records...")
    
    df = pd.DataFrame(metadata_list)
    print(f"Metadata extraction complete!")
    
    return df

# Extract metadata from UK records
if uk_records:
    uk_metadata_df = extract_comprehensive_metadata(uk_records)
    print(f"\nUK METADATA SUMMARY:")
    print(f"Records processed: {len(uk_metadata_df)}")
    print(f"Unique countries: {uk_metadata_df['country'].nunique()}")
    print(f"Date range: {uk_metadata_df['collection_date'].min()} to {uk_metadata_df['collection_date'].max()}")
else:
    print("No UK records to process")

Extracting metadata from 4 records...
Metadata extraction complete!

UK METADATA SUMMARY:
Records processed: 4
Unique countries: 1
Date range: 13-Jul-2007 to 29-Jun-2007


Geographic Analysis & Filterin

In [6]:
def analyze_geographic_distribution(df):
    """
    Analyze geographic distribution of sequences
    """
    if df.empty:
        return
    
    print("GEOGRAPHIC DISTRIBUTION ANALYSIS:")
    print("="*50)
    
    # Country distribution
    country_counts = df['country'].value_counts()
    print(f"\nCountry distribution:")
    for country, count in country_counts.head(10).items():
        print(f"  {country}: {count} sequences")
    
    # Coordinates analysis
    valid_coords = df[df['lat_lon'] != 'Unknown']['lat_lon']
    print(f"\nCoordinate data:")
    print(f"  Records with coordinates: {len(valid_coords)}")
    print(f"  Records without coordinates: {len(df) - len(valid_coords)}")
    
    if len(valid_coords) > 0:
        print(f"  Sample coordinates:")
        for coord in valid_coords.head(5):
            print(f"    {coord}")
    
    # Collection date analysis
    valid_dates = df[df['collection_date'] != 'Unknown']['collection_date']
    print(f"\nTemporal distribution:")
    print(f"  Records with dates: {len(valid_dates)}")
    print(f"  Date range: {valid_dates.min()} to {valid_dates.max()}" if len(valid_dates) > 0 else "  No valid dates")
    
    return country_counts

# Analyze UK data
if 'uk_metadata_df' in locals() and not uk_metadata_df.empty:
    uk_geo_analysis = analyze_geographic_distribution(uk_metadata_df)
    
    # Filter for high-quality UK records
    uk_filtered = uk_metadata_df[
        (uk_metadata_df['sequence_quality'] == 'Valid') &
        (uk_metadata_df['country'].str.contains('United Kingdom|UK|England|Scotland|Wales', na=False, case=False))
    ].copy()
    
    print(f"\nFILTERED UK DATASET:")
    print(f"Original records: {len(uk_metadata_df)}")
    print(f"High-quality UK records: {len(uk_filtered)}")

GEOGRAPHIC DISTRIBUTION ANALYSIS:

Country distribution:
  Unknown: 4 sequences

Coordinate data:
  Records with coordinates: 3
  Records without coordinates: 1
  Sample coordinates:
    51.4328 N 0.94377 W
    51.62 N 0.04 E
    52.548 N 0.881 E

Temporal distribution:
  Records with dates: 4
  Date range: 13-Jul-2007 to 29-Jun-2007

FILTERED UK DATASET:
Original records: 4
High-quality UK records: 0


Save Processed Data

In [7]:
if uk_records:
    # Save sequences in FASTA format
    fasta_filename = f"../data/raw/biston_betularia_uk_raw_{datetime.now().strftime('%Y%m%d')}.fasta"
    SeqIO.write(uk_records, fasta_filename, "fasta")
    print(f"Raw sequences saved: {fasta_filename}")
    
    # Save comprehensive metadata
    metadata_filename = f"../data/processed/biston_betularia_uk_metadata_{datetime.now().strftime('%Y%m%d')}.csv"
    uk_metadata_df.to_csv(metadata_filename, index=False)
    print(f"Metadata saved: {metadata_filename}")
    
    # Save filtered high-quality data
    if 'uk_filtered' in locals() and not uk_filtered.empty:
        filtered_filename = f"../data/processed/biston_betularia_uk_filtered_{datetime.now().strftime('%Y%m%d')}.csv"
        uk_filtered.to_csv(filtered_filename, index=False)
        print(f"Filtered data saved: {filtered_filename}")
    
    # Update exploration results
    exploration_results.update({
        'uk_download_date': datetime.now().isoformat(),
        'uk_sequences_downloaded': len(uk_records),
        'uk_sequences_failed': len(uk_failed),
        'uk_metadata_records': len(uk_metadata_df),
        'uk_filtered_records': len(uk_filtered) if 'uk_filtered' in locals() else 0
    })
    
    with open('../data/processed/exploration_results.json', 'w') as f:
        json.dump(exploration_results, f, indent=2)
    
    print(f"\nExploration results updated")

Raw sequences saved: ../data/raw/biston_betularia_uk_raw_20250531.fasta
Metadata saved: ../data/processed/biston_betularia_uk_metadata_20250531.csv

Exploration results updated


Data Quality Summary

In [10]:
if 'uk_metadata_df' in locals() and not uk_metadata_df.empty:
    print(f"\n{'='*60}")
    print("DATA FETCHING SUMMARY")
    print(f"{'='*60}")
    
    print(f"Search parameters:")
    print(f"  Target species: {Config.TARGET_SPECIES}")
    print(f"  Target gene: {Config.TARGET_GENE}")
    print(f"  Geographic filter: {Config.TARGET_LOCATION}")
    print(f"  Sequence length: {Config.MIN_SEQUENCE_LENGTH}-{Config.MAX_SEQUENCE_LENGTH} bp")
    
    print(f"\nDownload results:")
    print(f"  Total sequences found: {uk_total}")
    print(f"  Sequences downloaded: {len(uk_records)}")
    print(f"  Failed downloads: {len(uk_failed)}")
    print(f"  Success rate: {len(uk_records)/(len(uk_records)+len(uk_failed))*100:.1f}%")
    
    print(f"\nData quality:")
    quality_counts = uk_metadata_df['sequence_quality'].value_counts()
    for quality, count in quality_counts.items():
        print(f"  {quality} sequences: {count}")
    
    print(f"\nSequence statistics:")
    print(f"  Mean length: {uk_metadata_df['sequence_length'].mean():.1f} bp")
    print(f"  Length range: {uk_metadata_df['sequence_length'].min()}-{uk_metadata_df['sequence_length'].max()} bp")
    print(f"  Mean GC content: {uk_metadata_df['gc_content'].mean():.1f}%")

else:
    print("No data downloaded - check NCBI connection and search parameters")


DATA FETCHING SUMMARY
Search parameters:
  Target species: Biston betularia
  Target gene: COI
  Geographic filter: United Kingdom
  Sequence length: 400-800 bp

Download results:
  Total sequences found: 4
  Sequences downloaded: 4
  Failed downloads: 0
  Success rate: 100.0%

Data quality:
  Valid sequences: 4

Sequence statistics:
  Mean length: 630.5 bp
  Length range: 597-658 bp
  Mean GC content: 30.3%
