SETUP & IMPORT PREVIOUS RESULTS

In [1]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from config import Config
from utils import setup_logging, validate_sequence, clean_sequence, save_checkpoint

# Import Biopython
from Bio import Entrez, SeqIO
from Bio.Seq import Seq

# Bring up logging, then register the contact e-mail on the Entrez module so
# that the NCBI requests issued by later cells carry it.
logger = setup_logging()
Entrez.email = Config.NCBI_EMAIL
logger.info("Starting cortex data processing notebook")

# Run banner: what is being processed and why.
print(
    "CORTEX GENE DATA PROCESSING",
    "=" * 50,
    f"Target: {Config.TARGET_SPECIES} cortex gene",
    "Purpose: Clean data for ABM simulation",
    sep="\n",
)


2025-06-11 11:00:53,884 - INFO - Starting cortex data processing notebook


CORTEX GENE DATA PROCESSING
Target: Biston betularia cortex gene
Purpose: Clean data for ABM simulation


CORTEX GENE CONFIGURATION

In [2]:
class CortexConfig:
    """Configuration for cortex gene processing.

    A plain namespace of class-level constants. The visible cells read the
    values directly as ``CortexConfig.<NAME>``; the class is never
    instantiated there.
    """
    
    # Search parameters
    # Gene label. NOTE(review): the search cell writes "cortex" literally in
    # its query strings instead of reading this constant.
    GENE_NAME = "cortex"
    # Lower / upper bound of the accepted sequence length, reported in bp by
    # the summary print. NOTE(review): the visible cells only print these
    # bounds; no cell in view filters sequences by them.
    MIN_LENGTH = 200
    MAX_LENGTH = 3000
    # Passed as ``max_results`` to the NCBI search cell.
    MAX_SEQUENCES = 100
    # Passed as ``batch_size`` to the fetch cell (IDs requested per efetch call).
    BATCH_SIZE = 5
    
    # Quality thresholds (more lenient than COI)
    # Reported as a percentage by the summary print; not otherwise used in the
    # visible cells.
    MAX_AMBIGUOUS_PERCENTAGE = 15.0
    # Not referenced by any visible cell. Presumably a fractional identity
    # cut-off for grouping sequences into haplotypes - TODO confirm.
    SIMILARITY_THRESHOLD = 0.97
    
# Echo the active settings so the recorded output documents the run.
print(
    "Cortex processing configuration:",
    f"Length range: {CortexConfig.MIN_LENGTH}-{CortexConfig.MAX_LENGTH} bp",
    f"Max sequences: {CortexConfig.MAX_SEQUENCES}",
    f"Max ambiguous: {CortexConfig.MAX_AMBIGUOUS_PERCENTAGE}%",
    sep="\n",
)

Cortex processing configuration:
Length range: 200-3000 bp
Max sequences: 100
Max ambiguous: 15.0%


NCBI SEARCH

In [3]:
def search_cortex_sequences(species, max_results=100):
    """Search NCBI for cortex gene sequence IDs.

    Runs three Entrez ``esearch`` queries in order (species + ``cortex[Gene]``,
    species + ``cortex protein``, then a broader Lepidoptera-wide
    ``cortex[Gene]`` query) against ``Config.NCBI_DATABASE`` and merges the
    returned IDs.

    Args:
        species: Organism name interpolated into the ``[Organism]`` filter of
            the two species-specific queries.
        max_results: Upper bound on the number of IDs returned. Each single
            query requests ``max_results // 2`` IDs, but never fewer than one.

    Returns:
        list: Unique IDs in first-seen order (query order, then the order
        NCBI returned them within a query), truncated to ``max_results``.
        Empty when every query fails or matches nothing.

    A failing query is reported on stdout and skipped; it does not abort the
    remaining queries.
    """

    print("\nSearching for cortex sequences...")

    search_terms = [
        f"{species}[Organism] AND cortex[Gene]",
        f"{species}[Organism] AND cortex protein",
        "Lepidoptera[Organism] AND cortex[Gene]"  # Broader search
    ]

    # Floor of 1: with max_results=1 the integer division would ask NCBI for
    # zero IDs per query and the search could never return anything.
    per_query_limit = max(1, max_results // 2)

    # A dict is used as an insertion-ordered set. Truncating a plain set
    # would keep an arbitrary subset that can differ from run to run, and
    # would let broad-search hits displace species-specific ones.
    ordered_ids = {}

    for search_term in search_terms:
        try:
            print(f"  Searching: {search_term}")

            handle = Entrez.esearch(
                db=Config.NCBI_DATABASE,
                term=search_term,
                retmax=per_query_limit,
                sort="relevance"
            )
            try:
                results = Entrez.read(handle)
            finally:
                # Release the connection even when the response cannot be parsed.
                handle.close()

            ids = results["IdList"]
            total = int(results["Count"])

            for seq_id in ids:
                ordered_ids.setdefault(seq_id, None)
            print(f"    Found: {len(ids)} retrieved / {total} total")

            # Pause between queries to stay polite towards the NCBI servers.
            time.sleep(0.5)

        except Exception as e:
            # Best effort: one failed query must not abort the others.
            print(f"    Search failed: {e}")

    final_ids = list(ordered_ids)[:max_results]
    print(f"\nTotal unique cortex IDs: {len(final_ids)}")

    return final_ids

# Query NCBI for the configured target species, capped at the configured maximum
cortex_ids = search_cortex_sequences(Config.TARGET_SPECIES, CortexConfig.MAX_SEQUENCES)


Searching for cortex sequences...
  Searching: Biston betularia[Organism] AND cortex[Gene]
    Found: 0 retrieved / 0 total
  Searching: Biston betularia[Organism] AND cortex protein
    Found: 1 retrieved / 1 total
  Searching: Lepidoptera[Organism] AND cortex[Gene]
    Found: 0 retrieved / 0 total

Total unique cortex IDs: 1


SEQUENCE FETCHING

In [4]:
def fetch_cortex_sequences(id_list, batch_size=5):
    """Fetch cortex sequences in batches.

    Downloads GenBank records for ``id_list`` from ``Config.NCBI_DATABASE``
    with Entrez ``efetch``, ``batch_size`` IDs per request, and parses each
    response with ``SeqIO.parse(..., "genbank")``.

    Args:
        id_list: Sliceable sequence of NCBI IDs (for example the list returned
            by the search cell).
        batch_size: Number of IDs sent per ``efetch`` request.

    Returns:
        list: The parsed records of every batch that succeeded, in request
        order. Empty when ``id_list`` is empty.

    A batch that fails to download or parse is reported on stdout and
    skipped; its IDs are not retried.
    """

    if not id_list:
        print("No IDs to fetch")
        return []

    print(f"\nFetching {len(id_list)} cortex sequences...")

    all_records = []

    # Compute the batch layout once instead of on every iteration.
    batch_starts = range(0, len(id_list), batch_size)
    total_batches = len(batch_starts)

    for batch_num, start in enumerate(batch_starts, start=1):
        batch_ids = id_list[start:start + batch_size]

        print(f"Batch {batch_num}/{total_batches} - {len(batch_ids)} sequences")

        try:
            handle = Entrez.efetch(
                db=Config.NCBI_DATABASE,
                id=batch_ids,
                rettype="gb",
                retmode="text"
            )
            try:
                # SeqIO.parse is lazy; list() forces the whole response to be
                # read while the handle is still open.
                batch_records = list(SeqIO.parse(handle, "genbank"))
            finally:
                # Close the connection even when parsing raises, so a bad
                # batch does not leak an open handle.
                handle.close()

            all_records.extend(batch_records)
            print(f"  Retrieved {len(batch_records)} records")

            # Pause between requests to stay polite towards the NCBI servers.
            time.sleep(1.0)

        except Exception as e:
            # Best effort: skip the failed batch and keep going.
            print(f"  Batch failed: {e}")
            continue

    print(f"Total cortex records fetched: {len(all_records)}")
    return all_records

# Download the GenBank records for every ID found by the search
cortex_records = fetch_cortex_sequences(
    id_list=cortex_ids,
    batch_size=CortexConfig.BATCH_SIZE,
)


Fetching 1 cortex sequences...
Batch 1/1 - 1 sequences
  Retrieved 1 records
Total cortex records fetched: 1


DIRECT EXPORT

In [5]:
def create_cortex_dataset_direct(records):
    """Create dataset directly from single cortex record.

    When exactly one record is supplied, its sequence is upper-cased, passed
    through ``clean_sequence`` and turned into a one-row DataFrame plus a
    one-entry haplotype list.

    Args:
        records: List of record objects exposing ``seq``, ``id``,
            ``description``, ``annotations`` and ``features`` (as produced by
            the fetch cell).

    Returns:
        tuple: ``(DataFrame, list)``.
        - Exactly one usable record: a one-row frame with the columns
          accession_id, description, organism, sequence, sequence_length,
          gene_type, haplotype_id, country, collection_date, collected_by and
          gc_content (percent, 2 decimals), plus a one-element list
          describing the single haplotype.
        - No records, more than one record, or a sequence that is empty after
          cleaning: ``(empty DataFrame, [])``.

    NOTE(review): no length or ambiguity filtering happens here. The recorded
    run emitted a 377523 bp sequence although CortexConfig declares a
    200-3000 bp range - confirm that is intended.
    """

    if not records:
        print("No cortex records to process")
        return pd.DataFrame(), []

    if len(records) != 1:
        # NOTE(review): despite the message, no further processing happens in
        # this function; the records are dropped and an empty result returned.
        print("Multiple sequences found - using standard processing...")
        return pd.DataFrame(), []

    print("\nSingle cortex sequence found - creating direct dataset...")

    record = records[0]
    cleaned_seq = clean_sequence(str(record.seq).upper())

    if not cleaned_seq:
        # Nothing left after cleaning: return the same empty result as the
        # other "nothing to do" paths instead of dividing by zero below.
        print("Cortex sequence is empty after cleaning - nothing to process")
        return pd.DataFrame(), []

    # Create single-row dataframe
    data = {
        'accession_id': record.id,
        'description': record.description,
        'organism': record.annotations.get('organism', 'Unknown'),
        'sequence': cleaned_seq,
        'sequence_length': len(cleaned_seq),
        'gene_type': 'cortex',
        'haplotype_id': 'Cortex_Hap_001'
    }

    # Extract source information from the first "source" feature. A record
    # without one still gets all three columns (as 'Unknown'), so the frame
    # schema does not depend on the record's annotation.
    source_qualifiers = next(
        (feature.qualifiers for feature in record.features if feature.type == "source"),
        {},
    )
    for field in ('country', 'collection_date', 'collected_by'):
        # Qualifier values are lists; only the first entry is kept.
        data[field] = source_qualifiers.get(field, ['Unknown'])[0]

    # Calculate GC content as a percentage of the cleaned sequence length
    gc_count = cleaned_seq.count('G') + cleaned_seq.count('C')
    data['gc_content'] = round((gc_count / len(cleaned_seq)) * 100, 2)

    df = pd.DataFrame([data])

    # Create single haplotype group: one sequence, hence frequency 1.0
    haplotype_groups = [{
        'haplotype_id': 'Cortex_Hap_001',
        'representative_sequence': cleaned_seq,
        'sequence_count': 1,
        'frequency': 1.0
    }]

    print("Cortex dataset created: 1 sequence, 1 haplotype")
    print(f"Sequence length: {len(cleaned_seq)} bp")
    print(f"GC content: {data['gc_content']}%")

    return df, haplotype_groups

# Build the final frame and haplotype list from the fetched records
cortex_final_df, cortex_haplotype_groups = create_cortex_dataset_direct(
    records=cortex_records,
)


Single cortex sequence found - creating direct dataset...
Cortex dataset created: 1 sequence, 1 haplotype
Sequence length: 377523 bp
GC content: 36.42%


EXPORT FINAL DATASET

In [6]:
def export_cortex_dataset(df, haplotype_groups, output_dir="../data/final", sequences_found=None):
    """Export final cortex dataset.

    Writes up to three timestamped files into ``output_dir`` (created if
    missing): the dataset CSV, a haplotype summary CSV (only when
    ``haplotype_groups`` is non-empty) and a JSON processing summary.

    Args:
        df: Dataset frame; must contain ``sequence_length`` and ``gc_content``
            columns, whose means go into the summary.
        haplotype_groups: List of haplotype dicts, or an empty list / ``None``
            when there is nothing to summarise.
        output_dir: Directory the files are written to. Defaults to the
            notebook's ``../data/final``.
        sequences_found: Number of IDs the search returned, recorded in the
            summary. When omitted, the notebook-level ``cortex_ids`` variable
            is used if it exists, otherwise 0.

    Returns:
        None. Nothing is written when ``df`` is empty.
    """

    if df.empty:
        print("No cortex data to export")
        return

    # Create final directory
    os.makedirs(output_dir, exist_ok=True)

    # One clock reading for both the file names and the summary, so they agree.
    now = datetime.now()
    timestamp = now.strftime('%Y%m%d_%H%M%S')

    # Export main dataset
    # NOTE(review): the species in the file name is fixed, while the summary
    # below records Config.TARGET_SPECIES - keep them in sync if the target
    # species is ever changed.
    dataset_file = f"{output_dir}/biston_betularia_cortex_dataset_{timestamp}.csv"
    df.to_csv(dataset_file, index=False)
    print(f"\nCortex dataset exported: {dataset_file}")
    print(f"Records: {len(df)}")
    print(f"Columns: {list(df.columns)}")

    # Export haplotype summary
    if haplotype_groups:
        haplotype_file = f"{output_dir}/cortex_haplotype_summary_{timestamp}.csv"
        pd.DataFrame(haplotype_groups).to_csv(haplotype_file, index=False)
        print(f"Cortex haplotype summary exported: {haplotype_file}")

    if sequences_found is None:
        # Backward-compatible fallback: read the search result left in the
        # notebook namespace, but tolerate it being absent instead of
        # raising NameError.
        found_ids = globals().get('cortex_ids')
        sequences_found = len(found_ids) if found_ids else 0

    # Export processing summary
    summary = {
        'processing_date': now.isoformat(),
        'gene_type': 'cortex',
        'target_species': Config.TARGET_SPECIES,
        'sequences_found': sequences_found,
        'sequences_processed': len(df),
        'unique_haplotypes': len(haplotype_groups) if haplotype_groups else 0,
        # float() turns the numpy scalar into a plain Python number for JSON.
        'mean_sequence_length': float(df['sequence_length'].mean()),
        'mean_gc_content': float(df['gc_content'].mean())
    }

    summary_file = f"{output_dir}/cortex_processing_summary_{timestamp}.json"
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"Processing summary exported: {summary_file}")

# Write the dataset, haplotype summary and processing summary to disk
export_cortex_dataset(
    df=cortex_final_df,
    haplotype_groups=cortex_haplotype_groups,
)


Cortex dataset exported: ../data/final/biston_betularia_cortex_dataset_20250611_110102.csv
Records: 1
Columns: ['accession_id', 'description', 'organism', 'sequence', 'sequence_length', 'gene_type', 'haplotype_id', 'country', 'collection_date', 'collected_by', 'gc_content']
Cortex haplotype summary exported: ../data/final/cortex_haplotype_summary_20250611_110102.csv
Processing summary exported: ../data/final/cortex_processing_summary_20250611_110102.json
