In [1]:
import os
import re
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Build a per-record inventory of every genome FASTA file in `genomes_dir`.
# Each sequence record contributes one row: accession, replicon type, source
# genome file, the matching proteome file name, and sequence length in Mb.

# Define the directory containing the genomes
genomes_dir = "/Volumes/T7 Shield/TIFR-ARG-Projects/Sequence-Data/Genomes"

# Initialize an empty list to hold genome descriptions
genomes_descriptions = []

# Get the list of genome files in the directory (all of them are processed).
# - Hidden entries (names starting with ".", e.g. .DS_Store or AppleDouble
#   "._*" files) and sub-directories are skipped: they are not sequence files
#   and should never be handed to the FASTA parser.
# - os.listdir returns entries in arbitrary order, so the list is sorted to
#   make the row order of the resulting table reproducible between runs.
genome_files = sorted(
    entry
    for entry in os.listdir(genomes_dir)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(genomes_dir, entry))
)

# Iterate over each genome file
for genome in tqdm(genome_files):
    # Construct the corresponding proteome file name
    proteome = genome.replace('_genomic.fna', '_cds_proteins.faa')

    # Construct the full path to the genome file
    fasta_path = os.path.join(genomes_dir, genome)

    # Parse the genome file and extract relevant information
    for record in SeqIO.parse(fasta_path, "fasta"):
        accession = record.id
        genome_description = record.description
        sequence_length = len(record.seq) / 1_000_000  # Convert length to megabases

        # Lower-case the description once; every keyword check below uses it
        description_lower = genome_description.lower()

        # Determine the type of replicon based on the description.
        # 'plasmid' is tested first so that a header mentioning both
        # 'plasmid' and 'genome' is labelled as a plasmid.
        if 'plasmid' in description_lower:
            replicon = 'Plasmid'
        elif 'chromosome' in description_lower or 'genome' in description_lower:
            replicon = 'Chromosome'
        else:
            replicon = 'Undefined'

        # Append the extracted information to the list
        genomes_descriptions.append((accession, replicon, genome, proteome, sequence_length))

# Create a DataFrame from the list of genome descriptions
genome_data = pd.DataFrame(
    genomes_descriptions,
    columns=['Accession', 'Replicon', 'GenomeFile', 'ProteomeFile', 'SequenceLength(Mb)'],
)

100%|████████████████| 42593/42593 [12:00<00:00, 59.14it/s]


In [4]:
# Persist the genome inventory as CSV; the DataFrame index is not written.
genome_information_csv = '/Users/akshayonly/Work/Updated/Data/01/genome_information.csv'
genome_data.to_csv(genome_information_csv, index=False)