To install the required packages, run this on the command line: py -m pip install biopython pandas

In [1]:
import pandas as pd
from Bio import Entrez, SeqIO

# Set your email (required by NCBI). Replace the address below with your own
# before running the queries.
Entrez.email = "daniel.weinberger@yale.edu"

In [9]:

# Function to fetch gene product label
def fetch_gene_product(accession, start, end):
    """Return the product name of the CDS covering the start of a region.

    Downloads the GenBank record for ``accession`` from the NCBI ``nuccore``
    database, restricted to the region ``start``..``end``, and looks for a
    CDS feature whose span includes the first base of that region.

    Args:
        accession: Nucleotide accession, passed to efetch as ``id``.
        start: First base of the region, passed to efetch as ``seq_start``.
        end: Last base of the region, passed to efetch as ``seq_stop``.

    Returns:
        The first ``product`` qualifier of the matching CDS (``"Unknown"``
        when the CDS has no product qualifier), ``"No matching CDS"`` when no
        CDS covers the start of the region, or ``"Error: <message>"`` when
        fetching or parsing fails. Exceptions are never propagated.
    """
    try:
        # Fetch sequence data from NCBI
        handle = Entrez.efetch(db="nuccore", id=accession, seq_start=start, seq_stop=end, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the connection even when parsing fails.
            handle.close()

        # efetch with seq_start/seq_stop returns a record cut down to the
        # requested region, and its feature coordinates are renumbered
        # relative to that region. The requested start is therefore local
        # offset 0 (0-based), not the absolute coordinate ``start``.
        region_start = 0

        # Search for the CDS feature covering the start of the region
        for feature in record.features:
            if feature.type != "CDS":
                continue
            location = feature.location
            if location is None:
                # Feature without a usable location; nothing to compare.
                continue
            # Locations are 0-based with an exclusive end; int() also
            # handles fuzzy boundaries such as "<1" or ">66".
            if int(location.start) <= region_start < int(location.end):
                return feature.qualifiers.get('product', ['Unknown'])[0]
    except Exception as e:
        # Deliberately broad: one failed lookup must not abort a whole batch
        # of rows, so the failure is reported in the returned label instead.
        return f"Error: {str(e)}"

    return "No matching CDS"


In [12]:

# File locations: where the annotated table is written, and the table to read.
output_file = "output.csv"  # Name of the output file
input_file = "./Data/Alignment-HitTable AY insertions.csv"  # Replace with your input file name

# Load the whole input table. For a quick trial run, keep only the first
# few rows instead, e.g. pd.read_csv(input_file).head(5).
df = pd.read_csv(input_file)


In [13]:
# Look up the CDS product for each row and store it in a new column.
def _product_for_row(row):
    """Return the CDS product label for one row of the table."""
    return fetch_gene_product(row['accession'], row['start'], row['end'])


df['cds_product'] = df.apply(_product_for_row, axis=1)

# Write the region columns together with the new label to a new CSV file.
export_columns = ['accession', 'start', 'end', 'cds_product']
df[export_columns].to_csv(output_file, index=False)
print(f"CDS products exported to {output_file}")

CDS products exported to output.csv
