In [4]:
organism_name = "Vibrio natriegens"

In [5]:
from Bio import Entrez
from Bio.Blast import NCBIWWW, NCBIXML

Entrez.email = "your.email@example.com"  # Required by NCBI



# Step 1: Get a representative 16S rRNA sequence for E. coli
print(f"Fetching 16S rRNA sequence for {organism_name}...")
search = Entrez.esearch(db="nucleotide", term=f"{organism_name}[Organism] AND 16S ribosomal RNA[Title]", retmax=1)
record = Entrez.read(search)
seq_id = record["IdList"][0]

fasta = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text").read()

# Step 2: Run BLAST against nt database, excluding E. coli
print("Running BLAST (excluding Escherichia coli)...")
result_handle = NCBIWWW.qblast(
    program="blastn",
    database="nt",
    sequence=fasta,
    entrez_query="NOT Escherichia coli[Organism]",
    hitlist_size=100  # get many results so we can deduplicate
)

# Step 3: Parse BLAST results
blast_record = next(NCBIXML.parse(result_handle))

unique_species = {}
for alignment in blast_record.alignments:
    title = alignment.hit_def
    hsp = alignment.hsps[0]
    identity = (hsp.identities / hsp.align_length) * 100
    score = hsp.score

    # Extract species name (usually first two words of the title)
    parts = title.split()
    if len(parts) >= 2:
        species_name = " ".join(parts[:2])
    else:
        continue

    # Skip if it's E. coli or already recorded
    if "Escherichia coli" in species_name or species_name in unique_species:
        continue

    # Keep best (highest score) alignment per unique species
    unique_species[species_name] = {
        "title": title,
        "score": score,
        "identity": identity,
        "align_length": hsp.align_length
    }

# Step 4: Sort by similarity (descending)
sorted_species = sorted(unique_species.items(), key=lambda x: x[1]["identity"], reverse=True)

print("\nTop genetically related distinct species (excluding E. coli):\n")
for species, data in sorted_species[:10]:
    print(f"{species}\n  Identity: {data['identity']:.2f}% | Score: {data['score']} | Length: {data['align_length']}\n")


Fetching 16S rRNA sequence for Vibrio natriegens...
Running BLAST (excluding Escherichia coli)...

Top genetically related distinct species (excluding E. coli):

Vibrio natriegens
  Identity: 100.00% | Score: 2814.0 | Length: 1407

Vibrio parahaemolyticus
  Identity: 100.00% | Score: 2814.0 | Length: 1407

Vibrio chemaguriensis
  Identity: 99.93% | Score: 2805.0 | Length: 1407

Vibrio sp.
  Identity: 99.93% | Score: 2805.0 | Length: 1407

Uncultured bacterium
  Identity: 99.36% | Score: 2749.0 | Length: 1411

Vibrio campbellii
  Identity: 99.00% | Score: 2744.0 | Length: 1407

Mutant Vibrio
  Identity: 99.00% | Score: 2744.0 | Length: 1407

Vibrio alginolyticus
  Identity: 98.93% | Score: 2739.0 | Length: 1407

MAG: Vibrio
  Identity: 98.93% | Score: 2739.0 | Length: 1407

MAG: uncultured
  Identity: 98.93% | Score: 2739.0 | Length: 1407



In [6]:
# Filter out 'Uncultured bacterium' and entries with 'sp.'
filtered_species = {}
for species, data in unique_species.items():
    # Skip if it contains 'Uncultured' (includes 'Uncultured bacterium', 'Uncultured prokaryote', etc.)
    if 'Uncultured' in species:
        continue
    # Skip if it ends with 'sp.' (e.g., "Shigella sp.", "Escherichia sp.")
    if species.endswith('sp.'):
        continue
    # Skip if it contains ' sp.'
    if ' sp.' in species:
        continue
    # Keep the rest
    filtered_species[species] = data

# Sort by similarity (descending)
sorted_filtered = sorted(filtered_species.items(), key=lambda x: x[1]["identity"], reverse=True)

print(f"Original species count: {len(unique_species)}")
print(f"Filtered species count: {len(filtered_species)}")
print(f"\nTop genetically related distinct species (filtered):\n")
for species, data in sorted_filtered[:10]:
    print(f"{species}\n  Identity: {data['identity']:.2f}% | Score: {data['score']} | Length: {data['align_length']}\n")


Original species count: 13
Filtered species count: 11

Top genetically related distinct species (filtered):

Vibrio natriegens
  Identity: 100.00% | Score: 2814.0 | Length: 1407

Vibrio parahaemolyticus
  Identity: 100.00% | Score: 2814.0 | Length: 1407

Vibrio chemaguriensis
  Identity: 99.93% | Score: 2805.0 | Length: 1407

Vibrio campbellii
  Identity: 99.00% | Score: 2744.0 | Length: 1407

Mutant Vibrio
  Identity: 99.00% | Score: 2744.0 | Length: 1407

Vibrio alginolyticus
  Identity: 98.93% | Score: 2739.0 | Length: 1407

MAG: Vibrio
  Identity: 98.93% | Score: 2739.0 | Length: 1407

MAG: uncultured
  Identity: 98.93% | Score: 2739.0 | Length: 1407

Bacterium ST4
  Identity: 98.86% | Score: 2734.0 | Length: 1407

Vibrio neocaledonicus
  Identity: 98.86% | Score: 2734.0 | Length: 1407



Creates a list (not a list) called sorted_filtered that has the closest related organisms

In [7]:
sorted_filtered

[('Vibrio natriegens',
  {'title': 'Vibrio natriegens strain 80980-3 16S ribosomal RNA gene, partial sequence',
   'score': 2814.0,
   'identity': 100.0,
   'align_length': 1407}),
 ('Vibrio parahaemolyticus',
  {'title': 'Vibrio parahaemolyticus strain FL1-6 16S ribosomal RNA gene, partial sequence',
   'score': 2814.0,
   'identity': 100.0,
   'align_length': 1407}),
 ('Vibrio chemaguriensis',
  {'title': 'Vibrio chemaguriensis strain Iso1 16S ribosomal RNA gene, partial sequence',
   'score': 2805.0,
   'identity': 99.92892679459844,
   'align_length': 1407}),
 ('Vibrio campbellii',
  {'title': 'Vibrio campbellii strain KJ-W18 16S ribosomal RNA gene, partial sequence',
   'score': 2744.0,
   'identity': 99.00497512437812,
   'align_length': 1407}),
 ('Mutant Vibrio',
  {'title': 'Mutant Vibrio natriegens strain Vmax chromosome 1, complete sequence',
   'score': 2744.0,
   'identity': 99.00497512437812,
   'align_length': 1407}),
 ('Vibrio alginolyticus',
  {'title': 'Vibrio alginoly

In [12]:
# Create a list with the names of the 10 highest related organisms
top_10_organisms = [species for species, data in sorted_filtered[:10]]
print(f"\nTop 10 related organisms list: {top_10_organisms}")


Top 10 related organisms list: ['Vibrio natriegens', 'Vibrio parahaemolyticus', 'Vibrio chemaguriensis', 'Vibrio campbellii', 'Mutant Vibrio', 'Vibrio alginolyticus', 'MAG: Vibrio', 'MAG: uncultured', 'Bacterium ST4', 'Vibrio neocaledonicus']


In [13]:
top_10_organisms

['Vibrio natriegens',
 'Vibrio parahaemolyticus',
 'Vibrio chemaguriensis',
 'Vibrio campbellii',
 'Mutant Vibrio',
 'Vibrio alginolyticus',
 'MAG: Vibrio',
 'MAG: uncultured',
 'Bacterium ST4',
 'Vibrio neocaledonicus']