In [1]:
import os
from Bio import Entrez
from Bio import SeqIO

In [2]:
def download_cytochrome_b_sequences(species_list, email='your_email@example.com',
                                    folder_name="cytb_sequences", retmax=5):
    """Download cytochrome b nucleotide sequences from NCBI, one FASTA file per species.

    For every species name, the NCBI ``nucleotide`` database is searched for
    "mitochondrial cytochrome b gene, complete cds" entries whose length is
    within 1..2000 bases. Up to ``retmax`` hits are fetched in FASTA format and
    written to ``<folder_name>/<species>_cytb.fasta``. A status line is printed
    for every species processed.

    Parameters
    ----------
    species_list : iterable of str
        Species names to query. Blank / whitespace-only entries are skipped,
        because they would otherwise produce a query with no species filter.
    email : str, optional
        Contact address assigned to ``Entrez.email``. Replace the placeholder
        default with your own address.
    folder_name : str, optional
        Directory that receives the FASTA files; created if it does not exist.
    retmax : int, optional
        Maximum number of search hits to download per species.

    Returns
    -------
    None
    """
    Entrez.email = email  # Enter your email address

    # Create folder to store all the FASTA files
    os.makedirs(folder_name, exist_ok=True)

    for species in species_list:
        species = str(species).strip()
        if not species:
            # An empty name would search for *any* organism's cytochrome b.
            continue

        search_term = f"{species} AND mitochondrial cytochrome b gene, complete cds AND 1:2000[SLEN]"
        handle = Entrez.esearch(db='nucleotide', term=search_term, retmax=retmax)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even if the response cannot be parsed.
            handle.close()

        seq_ids = record['IdList']
        if int(record['Count']) == 0 or not seq_ids:
            print(f"No cytochrome b sequences found for {species}")
            continue

        handle = Entrez.efetch(db='nucleotide', id=seq_ids, rettype='fasta', retmode='text')
        try:
            # Materialise the records before the handle is closed.
            sequence_list = list(SeqIO.parse(handle, 'fasta'))
        finally:
            handle.close()

        output_file = os.path.join(folder_name, f"{species}_cytb.fasta")
        SeqIO.write(sequence_list, output_file, 'fasta')
        num_sequences = len(sequence_list)
        print(f"Downloaded {num_sequences} cytochrome b sequence(s) for {species}")


In [4]:
# Example usage:
file_path = 'species_file.txt'  # Path to the file containing species names, one per line

with open(file_path, 'r') as file:
    # Strip surrounding whitespace and drop blank lines: an empty species name
    # would turn into a search with no species restriction at all.
    species_list = [name for name in (line.strip() for line in file) if name]

# Download cytochrome b sequences for each species
download_cytochrome_b_sequences(species_list)

Downloaded 1 cytochrome b sequence(s) for Carcharodon carcharias
No cytochrome b sequences found for Otodus megalodon
Downloaded 2 cytochrome b sequence(s) for Galeocerdo cuvier
Downloaded 5 cytochrome b sequence(s) for Ginglimostoma cirratum
Downloaded 1 cytochrome b sequence(s) for Heterodontus francisci
Downloaded 1 cytochrome b sequence(s) for Isurus oxyrinchus
No cytochrome b sequences found for Squatina squatina
Downloaded 2 cytochrome b sequence(s) for Mitsukurina owstoni
Downloaded 5 cytochrome b sequence(s) for Prionace glauca


In [5]:
# Merge one representative cytochrome b record per species file into a single FASTA.
input_folder = "cytb_sequences"
output_file = 'merged.fasta'  # Path to the output merged FASTA file

sequences = []  # List to store the selected sequences

# Iterate over files in the input folder. os.listdir() gives no ordering
# guarantee, so sort the names to make the merged file reproducible.
for file_name in sorted(os.listdir(input_folder)):
    # Only FASTA files are considered
    if not file_name.endswith(('.fasta', '.fa')):
        continue

    file_path = os.path.join(input_folder, file_name)
    with open(file_path, 'r') as file:
        records = list(SeqIO.parse(file, 'fasta'))

    if not records:
        continue

    # Prefer the first record whose description names the complete cytb cds;
    # fall back to the first record in the file when none matches.
    selected_record = next(
        (rec for rec in records
         if "mitochondrial cytochrome b gene, complete cds" in rec.description),
        records[0],
    )
    sequences.append(selected_record)

# Write the merged sequences to the output file
with open(output_file, 'w') as output:
    SeqIO.write(sequences, output, 'fasta')

# Print the number of sequences in the merged file and their lengths
num_sequences = len(sequences)
print(f"Number of sequences in the merged file: {num_sequences}")

for seq in sequences:
    print(f"Sequence Length: {len(seq)}")

Number of sequences in the merged file: 7
Sequence Length: 1146
Sequence Length: 1146
Sequence Length: 1164
Sequence Length: 1146
Sequence Length: 1146
Sequence Length: 1192
Sequence Length: 1146
