In [21]:
import sys
import os
sys.path.append('../src')  # Add src folder to path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from config import Config
from utils import setup_logging, validate_sequence

# Setup logging
logger = setup_logging()
logger.info("Starting data exploration notebook")

2025-05-31 22:14:25,296 - INFO - Starting data exploration notebook


In [23]:
try:
    from Bio import Entrez, SeqIO
    from Bio.Seq import Seq
    print("Biopython imported successfully")
    
    # IMPORTANT: GANTI EMAIL INI DENGAN EMAIL ANDA
    Entrez.email = Config.NCBI_EMAIL  # Pastikan sudah diupdate di config.py
    
    if Config.NCBI_EMAIL == "18222042@std.stei.itb.ac.id":
        print("WARNING: Please update your email in src/config.py")
        print("   Change NCBI_EMAIL = '18222042@std.stei.itb.ac.id'")
    else:
        print(f"NCBI email configured: {Config.NCBI_EMAIL}")
        
except ImportError as e:
    print(f"Error importing Biopython: {e}")
    print("Please install: pip install biopython")

Biopython imported successfully
NCBI email configured: YOUR_EMAIL@gmail.com


In [24]:
def test_ncbi_connection():
    """
    Test basic connection ke NCBI dan search functionality
    """
    try:
        # Simple search untuk test connection
        search_term = f"{Config.TARGET_SPECIES}[Organism] AND {Config.TARGET_GENE}[Gene]"
        print(f"Testing search: {search_term}")
        
        # Search without downloading
        handle = Entrez.esearch(
            db=Config.NCBI_DATABASE,
            term=search_term,
            retmax=5  # Only get 5 IDs for testing
        )
        
        search_results = Entrez.read(handle)
        handle.close()
        
        id_count = len(search_results["IdList"])
        total_count = int(search_results["Count"])
        
        print(f"NCBI connection successful!")
        print(f"   Found {total_count} total sequences for {Config.TARGET_SPECIES} {Config.TARGET_GENE}")
        print(f"   Sample IDs: {search_results['IdList'][:3]}")
        
        return search_results["IdList"][:5], total_count
        
    except Exception as e:
        print(f"NCBI connection failed: {e}")
        print("   Check your internet connection and email configuration")
        return [], 0

# Test connection
test_ids, total_sequences = test_ncbi_connection()

Testing search: Biston betularia[Organism] AND COI[Gene]
NCBI connection successful!
   Found 327 total sequences for Biston betularia COI
   Sample IDs: ['2698115618', '2698115612', '2698115326']


In [29]:
def fetch_sample_records(id_list, num_samples=3):
    """
    Fetch beberapa sample records untuk explore structure
    """
    if not id_list:
        print("No IDs available for fetching")
        return []
    
    sample_records = []
    
    try:
        # Fetch records dalam format GenBank
        handle = Entrez.efetch(
            db=Config.NCBI_DATABASE,
            id=id_list[:num_samples],  # Ambil sample saja
            rettype="gb",
            retmode="text"
        )
        
        records = SeqIO.parse(handle, "genbank")
        
        for record in records:
            sample_records.append(record)
            print(f"Fetched record: {record.id}")
            
        handle.close()
        
    except Exception as e:
        print(f"Error fetching records: {e}")
    
    return sample_records

sample_records = fetch_sample_records(test_ids, num_samples=2)

Fetched record: OR369609.1
Fetched record: OR369606.1


In [30]:
def explore_record_structure(records):
    """
    Explore structure dari GenBank records
    """
    if not records:
        print("No records to explore")
        return
    
    for i, record in enumerate(records):
        print(f"\n{'='*50}")
        print(f"RECORD {i+1}: {record.id}")
        print(f"{'='*50}")
        
        # Basic info
        print(f"Description: {record.description}")
        print(f"Sequence length: {len(record.seq)} bp")
        print(f"Sequence preview: {str(record.seq)[:50]}...")
        
        # Annotations
        print("\n--- ANNOTATIONS ---")
        for key, value in record.annotations.items():
            if isinstance(value, str) and len(value) < 100:
                print(f"{key}: {value}")
            elif isinstance(value, list) and len(value) < 10:
                print(f"{key}: {value}")
        
        # Features (genes, CDS, etc.)
        print(f"\n--- FEATURES ({len(record.features)} total) ---")
        for feature in record.features[:5]:  # Show first 5 features
            print(f"Type: {feature.type}")
            if 'gene' in feature.qualifiers:
                print(f"  Gene: {feature.qualifiers['gene']}")
            if 'product' in feature.qualifiers:
                print(f"  Product: {feature.qualifiers['product']}")
            print()

# Explore sample records
explore_record_structure(sample_records)


RECORD 1: OR369609.1
Description: Biston betularia voucher TLMF Lep 17110 cytochrome oxidase subunit 1 (COI) gene, partial cds; mitochondrial
Sequence length: 658 bp
Sequence preview: AACATTATACTTTATTTTTGGTATTTGAGCAGGAATAATTGGAACATCTT...

--- ANNOTATIONS ---
molecule_type: DNA
topology: linear
data_file_division: INV
date: 19-MAR-2024
accessions: ['OR369609']
keywords: ['BARCODE']
source: mitochondrion Biston betularia (pepper-and-salt moth)
organism: Biston betularia
references: [Reference(title='A DNA barcode library of Austrian geometridae (Lepidoptera) reveals high potential for DNA-based species identification', ...), Reference(title='Direct Submission', ...)]

--- FEATURES (3 total) ---
Type: source

Type: gene
  Gene: ['COI']

Type: CDS
  Gene: ['COI']
  Product: ['cytochrome oxidase subunit 1']


RECORD 2: OR369606.1
Description: Biston betularia voucher TLMF Lep 12565 cytochrome oxidase subunit 1 (COI) gene, partial cds; mitochondrial
Sequence length: 658 bp
Sequence preview:

In [31]:
def extract_metadata_preview(records):
    """
    Extract metadata dari sample records untuk preview
    """
    metadata_list = []
    
    for record in records:
        metadata = {
            'accession_id': record.id,
            'description': record.description,
            'sequence_length': len(record.seq),
            'organism': record.annotations.get('organism', 'Unknown'),
            'sequence': str(record.seq)
        }
        
        # Extract geographic and date info from features
        for feature in record.features:
            if feature.type == "source":
                qualifiers = feature.qualifiers
                
                # Country/location
                if 'country' in qualifiers:
                    metadata['country'] = qualifiers['country'][0]
                
                # Collection date
                if 'collection_date' in qualifiers:
                    metadata['collection_date'] = qualifiers['collection_date'][0]
                
                # Lat/lon if available
                if 'lat_lon' in qualifiers:
                    metadata['lat_lon'] = qualifiers['lat_lon'][0]
                
                # Collected by
                if 'collected_by' in qualifiers:
                    metadata['collected_by'] = qualifiers['collected_by'][0]
        
        metadata_list.append(metadata)
    
    return pd.DataFrame(metadata_list)

# Extract metadata
if sample_records:
    metadata_df = extract_metadata_preview(sample_records)
    print("METADATA PREVIEW:")
    print(metadata_df.to_string())
    
    # Sequence quality check
    print(f"\n{'='*50}")
    print("SEQUENCE QUALITY CHECK:")
    print(f"{'='*50}")
    
    for i, record in enumerate(sample_records):
        seq_str = str(record.seq)
        is_valid = validate_sequence(seq_str)
        print(f"Record {i+1} ({record.id}): {'Valid' if is_valid else 'Invalid'}")
        print(f"  Length: {len(seq_str)} bp")
        print(f"  GC content: {(seq_str.count('G') + seq_str.count('C')) / len(seq_str) * 100:.1f}%")

METADATA PREVIEW:
  accession_id                                                                                                  description  sequence_length          organism                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            sequence collection_date            lat_lon collected_by
0   OR369609.1  Biston betularia voucher TLMF Lep 17110 cytochrome oxidase subunit 1 (COI) gene, partial cds; mitoc

In [33]:
print(f"\n{'='*60}")
print("EXPLORATION SUMMARY & NEXT STEPS")
print(f"{'='*60}")

print(f"Total {Config.TARGET_SPECIES} COI sequences available: {total_sequences}")
print(f"Sample records successfully fetched and analyzed")
print(f"Metadata structure identified")
print(f"Sequence quality validation working")

# Save exploration results
exploration_results = {
    'total_sequences_available': total_sequences,
    'sample_ids': test_ids,
    'exploration_date': datetime.now().isoformat(),
    'target_species': Config.TARGET_SPECIES,
    'target_gene': Config.TARGET_GENE
}

# Save to processed data folder
import json
with open('../data/processed/exploration_results.json', 'w') as f:
    json.dump(exploration_results, f, indent=2)

print(f"\nExploration results saved to data/processed/exploration_results.json")


EXPLORATION SUMMARY & NEXT STEPS
Total Biston betularia COI sequences available: 327
Sample records successfully fetched and analyzed
Metadata structure identified
Sequence quality validation working

Exploration results saved to data/processed/exploration_results.json
