In [1]:
import pandas as pd
import os

# Helper function to analyze chromosome files
def analyze_chrom_sizes(file_path, species_name):
    """Summarise a tab-separated chromosome-sizes file for one species.

    The file is read without a header row. The first column is taken as the
    chromosome name and the second as its size in base pairs; any further
    columns (e.g. the extra fields of a ``.fai`` index) are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Location of the tab-separated file.
    species_name : str
        Label copied verbatim into the ``"Species"`` field of the result.

    Returns
    -------
    dict
        Keys: ``"Species"``, ``"Total genome size (bp)"``,
        ``"Number of chromosomes"``, ``"Largest chromosome"``,
        ``"Largest size (bp)"``, ``"Smallest chromosome"``,
        ``"Smallest size (bp)"`` and ``"Mean size (bp)"``.

    Raises
    ------
    ValueError
        If the file contains no rows, or a size cannot be parsed as an
        integer.
    """
    df = pd.read_csv(
        file_path,
        sep="\t",
        header=None,
        names=["chrom", "size"],
        # Without usecols, a file with more than two columns would have its
        # leading columns promoted to the index and the names attached to the
        # wrong (trailing) columns.
        usecols=[0, 1],
        # Keep names such as "1" or "01" as text instead of letting pandas
        # infer integers; force sizes to integers so a stray header line fails
        # loudly rather than being summed as strings.
        dtype={"chrom": str, "size": "int64"},
    )

    if df.empty:
        raise ValueError(f"No chromosome records found in {file_path!r}")

    sizes = df["size"]
    # idxmax/idxmin return the first matching row label when sizes tie.
    largest_chrom = df.loc[sizes.idxmax()]
    smallest_chrom = df.loc[sizes.idxmin()]

    return {
        "Species": species_name,
        "Total genome size (bp)": sizes.sum(),
        "Number of chromosomes": len(df),
        "Largest chromosome": largest_chrom["chrom"],
        "Largest size (bp)": largest_chrom["size"],
        "Smallest chromosome": smallest_chrom["chrom"],
        "Smallest size (bp)": smallest_chrom["size"],
        "Mean size (bp)": sizes.mean()
    }

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Directory holding the *.chrom.sizes files. The empty string leaves every
# path relative to the current working directory; point it elsewhere to read
# the same file names from another location.
DATA_DIR = ""

# Map each species label to the path of its chromosome-sizes file.
files = {
    "E. coli K12": os.path.join(DATA_DIR, "ecoli.chrom.sizes"),
    "Yeast (sacCer3)": os.path.join(DATA_DIR, "yeast.chrom.sizes"),
    "Worm (ce10)": os.path.join(DATA_DIR, "worm.chrom.sizes"),
    "Fruit Fly (dm6)": os.path.join(DATA_DIR, "fly.chrom.sizes"),
    "Arabidopsis (TAIR10)": os.path.join(DATA_DIR, "arabidopsis.chrom.sizes"),
    "Tomato (v4.00)": os.path.join(DATA_DIR, "tomato.chrom.sizes"),
    "Human (hg38)": os.path.join(DATA_DIR, "human.chrom.sizes"),
    "Wheat (IWGSC)": os.path.join(DATA_DIR, "wheat.chrom.sizes"),
}

In [None]:
# run analysis: one summary record per species, in the order listed in `files`.
# A comprehension keeps the loop temporaries out of the notebook namespace.
results = [
    analyze_chrom_sizes(path, species)
    for species, path in files.items()
]

# One row per species; the columns come from the keys of each record.
summary_table = pd.DataFrame(results)
summary_table