In [1]:
# ---
# title: Replace RefSeq IDs with Ensembl Gene IDs in BED file
# ---

import pandas as pd

# Rewrite the name column (4th field, index 3) of a BED file, replacing
# RefSeq accessions with Ensembl gene IDs looked up in a gene2ensembl table.
# Names with no usable mapping are kept as they are.

# Step 1: Load the BED file.
# Every field is read as a literal string (no dtype or NA inference) so that
# columns this script does not modify are written back exactly as read, and
# so the name column always supports the .str accessor used in Step 3.
bed = pd.read_csv(
    "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/mm39.bed",
    sep="\t",
    header=None,
    dtype=str,
    keep_default_na=False,
)

# Step 2: Load the mouse_gene2ensembl file.
# It has 7 columns: tax_id, GeneID, EnsemblGene, RefSeq, EnsemblTranscript,
# RefSeqProtein, EnsemblProtein.
# Read as strings so the accession columns are never inferred as numeric.
# "-" is treated as missing: presumably this table follows the NCBI
# convention of "-" for absent values -- TODO confirm against the file.
mapping = pd.read_csv(
    "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/Adapters_and_Annotations/mouse_gene2ensembl.tsv",
    sep="\t",
    header=None,
    names=[
        "tax_id",
        "GeneID",
        "EnsemblGene",
        "RefSeq",
        "EnsemblTranscript",
        "RefSeqProtein",
        "EnsemblProtein",
    ],
    dtype=str,
    na_values=["-"],
)

# Step 3: Remove trailing version suffixes (e.g. ".3") so that IDs match
# regardless of which accession version each file carries.
mapping["RefSeq"] = mapping["RefSeq"].str.replace(r"\.\d+$", "", regex=True)
bed[3] = bed[3].str.replace(r"\.\d+$", "", regex=True)

# Step 4: Build the lookup and replace RefSeq IDs with Ensembl gene IDs.
# Rows lacking either ID are dropped first so that a BED name can never be
# overwritten with a missing value. If a RefSeq ID occurs more than once,
# the last row wins (dict construction semantics).
usable = mapping.dropna(subset=["RefSeq", "EnsemblGene"])
refseq_to_ensg = dict(zip(usable["RefSeq"], usable["EnsemblGene"]))
converted = bed[3].map(refseq_to_ensg)
# Keep the original value when no match was found.
bed[3] = converted.fillna(bed[3])
print(f"Replaced {int(converted.notna().sum())} of {len(bed)} BED names with Ensembl gene IDs")

# Step 5: Save the new BED file (tab-separated, no header row, no index).
bed.to_csv("/projectnb/perissilab/Xinyu/GPS2_CHIPseq/mm39_ensembl.bed", sep="\t", header=False, index=False)
print("Done! Saved as mm39_ensembl.bed")


Done! Saved as mm39_ensembl.bed
