<a href="https://colab.research.google.com/github/Chandan0731/bioinformatics_lab/blob/main/Experiment_10_genome_annotation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Block 1: Cloud Data Acquisition
# We fetch a bacterial genome (E. coli) from the NCBI Cloud Database.

!pip install biopython

from Bio import Entrez, SeqIO

# 1. Setup
Entrez.email = "student_rvce@example.com"
accession_id = "U00096.3" # Escherichia coli str. K-12 substr. MG1655 (Complete Genome)

print(f"üì° Connecting to NCBI Cloud to fetch genome: {accession_id}...")

try:
    # 2. Download Genome
    handle = Entrez.efetch(db="nucleotide", id=accession_id, rettype="fasta", retmode="text")
    record = SeqIO.read(handle, "fasta")
    handle.close()

    # 3. Save to file
    SeqIO.write(record, "genome.fasta", "fasta")

    print(f"‚úÖ Download Successful!")
    print(f"Organism: {record.description}")
    print(f"Genome Size: {len(record.seq)} bp")
    print("File saved as: 'genome.fasta'")

except Exception as e:
    print(f"‚ùå Error downloading genome: {e}")

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86
üì° Connecting to NCBI Cloud to fetch genome: U00096.3...
‚úÖ Download Successful!
Organism: U00096.3 Escherichia coli str. K-12 substr. MG1655, complete genome
Genome Size: 4641652 bp
File saved as: 'genome.fasta'


In [4]:
# Block 2: Gene Prediction (ORF Finding)
# We scan the genome for start (ATG) and stop (TAA/TAG/TGA) codons.

from Bio.Seq import Seq

print("--- STARTING GENE PREDICTION WORKFLOW ---")

# 1. Load Genome
record = SeqIO.read("genome.fasta", "fasta")
dna_seq = record.seq
genome_len = len(dna_seq)

# 2. Find ORFs (Simple Gene Finder Logic)
# We look for genes > 300 base pairs (100 amino acids)
min_protein_len = 100
predicted_genes = []

# Scan only the forward strand for this demo
print("Scanning forward strand for genes...")
for frame in range(3):
    # Translate the whole frame
    trans = dna_seq[frame:].translate(table=11)

    # Split by Stop Codons (*)
    potential_proteins = trans.split("*")

    current_pos = frame
    for prot in potential_proteins:
        if len(prot) >= min_protein_len:
            # Check if it starts with Methionine (M) - True Start Codon
            if prot.startswith("M"):
                start = current_pos
                end = current_pos + len(prot) * 3

                predicted_genes.append({
                    "start": start,
                    "end": end,
                    "sequence": prot,
                    "id": f"GENE_{len(predicted_genes)+1}"
                })

        # Update position tracker
        current_pos += len(prot) * 3 + 3 # +3 for the stop codon

print(f"‚úÖ Prediction Complete.")
print(f"Found {len(predicted_genes)} potential genes.")


--- STARTING GENE PREDICTION WORKFLOW ---
Scanning forward strand for genes...
‚úÖ Prediction Complete.
Found 124 potential genes.


In [6]:
# Block 3: Cloud Functional Annotation
# We send our predicted genes to the NCBI Cloud to identify their function.

from Bio.Blast import NCBIWWW, NCBIXML
import time

print("--- FUNCTIONAL ANNOTATION (Batch Mode) ---")

# We limit to the first 3 genes to save time (Full genome takes hours)
genes_to_annotate = predicted_genes[:3]
annotations = []

for i, gene in enumerate(genes_to_annotate):
    print(f"\nüì° Annotating {gene['id']} ({len(gene['sequence'])} aa)...")

    try:
        # 1. Send to NCBI Cloud (BLAST)
        result_handle = NCBIWWW.qblast("blastp", "swissprot", gene['sequence'])
        blast_record = NCBIXML.read(result_handle)

        # 2. Get Top Hit
        if blast_record.alignments:
            top_hit = blast_record.alignments[0]
            function_name = top_hit.hit_def.split(">")[0]
            gene['function'] = function_name
            gene['confidence'] = top_hit.hsps[0].expect

            print(f"   ‚úÖ Identified: {function_name[:50]}...")
        else:
            gene['function'] = "Hypothetical Protein"
            print("   ‚ö†Ô∏è No database match found.")

    except Exception as e:
        print(f"   ‚ùå Network Error: {e}")
        gene['function'] = "Unknown"

    time.sleep(2) # Respect API limits

print("\n‚úÖ Annotation Workflow Complete.")

--- FUNCTIONAL ANNOTATION (Batch Mode) ---

üì° Annotating GENE_1 (638 aa)...




   ‚úÖ Identified: RecName: Full=Chaperone protein DnaK; AltName: Ful...

üì° Annotating GENE_2 (107 aa)...
   ‚ö†Ô∏è No database match found.

üì° Annotating GENE_3 (404 aa)...
   ‚úÖ Identified: RecName: Full=Putrescine transport ATP-binding pro...

‚úÖ Annotation Workflow Complete.


In [8]:
# Block 4: Generate Annotation Report (GFF3 Format)
# We save our results in the standard bioinformatics format.

output_file = "annotated_genome.gff"

print(f"Generating GFF3 Report: {output_file}...")

with open(output_file, "w") as f:
    f.write("##gff-version 3\n")
    f.write(f"##sequence-region {record.id} 1 {len(record.seq)}\n")

    # Write the annotated genes
    for gene in predicted_genes[:3]: # Only writing the ones we annotated
        # GFF Format: SeqID, Source, Type, Start, End, Score, Strand, Phase, Attributes
        function = gene.get('function', 'Hypothetical protein')
        line = f"{record.id}\tPython_Workflow\tgene\t{gene['start']}\t{gene['end']}\t.\t+\t0\tID={gene['id']};Name={function}\n"
        f.write(line)

print("‚úÖ File Created Successfully.")

# Display contents
print("\n--- GFF FILE PREVIEW ---")
!head -n 10 annotated_genome.gff

Generating GFF3 Report: annotated_genome.gff...
‚úÖ File Created Successfully.

--- GFF FILE PREVIEW ---
##gff-version 3
##sequence-region U00096.3 1 4641652
U00096.3	Python_Workflow	gene	12162	14076	.	+	0	ID=GENE_1;Name=RecName: Full=Chaperone protein DnaK; AltName: Full=HSP70; AltName: Full=Heat shock 70 kDa protein; AltName: Full=Heat shock protein 70 [Escherichia coli APEC O1] 
U00096.3	Python_Workflow	gene	266598	266919	.	+	0	ID=GENE_2;Name=Hypothetical Protein
U00096.3	Python_Workflow	gene	894909	896121	.	+	0	ID=GENE_3;Name=RecName: Full=Putrescine transport ATP-binding protein PotG [Escherichia coli K-12]
