# E. coli Genome Analysis

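This notebook downloads the *E. coli* K-12 MG1655 genome, visualizes its base composition, and reports basic sequence statistics (length, per-base counts, and GC content).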

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import gzip
import os

In [None]:
# Import our genome analysis functions
import comp_bio

# Download the E. coli genome.
# NOTE(review): the helper presumably returns a falsy value when the download
# fails (the visualization cell checks `if genome_path:`) -- confirm against
# comp_bio. Only announce a path when one was actually returned, so a failed
# download is not reported as "Genome downloaded to: None".
genome_path = comp_bio.download_ecoli_genome()
if genome_path:
    print(f"Genome downloaded to: {genome_path}")
else:
    print("Failed to download genome file")

In [None]:
# Plot the genome composition, but only when the download step produced a path.
if not genome_path:
    # Nothing to plot: the earlier download cell did not yield a usable path.
    print("Failed to download genome file")
else:
    print("Generating genome visualizations...")
    comp_bio.visualize_genome_composition(genome_path)

## Genome Statistics

Let's also examine some basic statistics about the E. coli genome: total length, per-base counts, and GC/AT content.

In [None]:
# Read the gzipped FASTA file and report basic sequence statistics:
# total length, per-base counts/percentages, and GC/AT content.
if genome_path and os.path.exists(genome_path):
    # Skip FASTA header lines ('>') and join the remaining lines in one pass.
    # Building the string with repeated `+=` can re-copy the growing sequence
    # on every line; a single join avoids that.
    with gzip.open(genome_path, 'rt') as f:
        sequence = "".join(
            line.strip().upper() for line in f if not line.startswith('>')
        )

    # Basic statistics. The length counts every sequence character, including
    # any symbols other than A/T/G/C.
    print(f"Genome length: {len(sequence):,} base pairs")
    print(f"Genome size: ~{len(sequence)/1e6:.1f} Mb")

    base_counts = Counter(sequence)
    # Percentages below are relative to A/T/G/C only, not the full length.
    total_bases = sum(base_counts[base] for base in 'ATGC')

    if total_bases == 0:
        # An empty or header-only file would otherwise divide by zero below.
        print("\nNo A/T/G/C bases found; skipping base composition.")
    else:
        print("\nBase composition:")
        for base in 'ATGC':
            count = base_counts[base]
            percent = (count / total_bases) * 100
            print(f"  {base}: {count:,} ({percent:.2f}%)")

        gc_content = (base_counts['G'] + base_counts['C']) / total_bases * 100
        print(f"\nGC content: {gc_content:.2f}%")
        print(f"AT content: {100-gc_content:.2f}%")
else:
    # Make the skip visible instead of leaving the cell with no output.
    print("Genome file not available; skipping statistics.")

## Additional Analysis

You can extend this analysis by:
- Finding open reading frames (ORFs)
- Analyzing codon usage
- Identifying repetitive sequences
- Comparing with other bacterial genomes