# Filter BED Entries by Ensembl IDs via MyGeneInfo

This notebook:
1. Reads a list of gene symbols.
2. Uses MyGeneInfo to map symbols to Ensembl gene IDs (mouse).
3. Loads a genome-wide BED file whose name column (column 4) holds Ensembl gene IDs.
4. Filters the BED to only those entries whose `gene_id` matches the mapped IDs.
5. Writes out the filtered BED.


In [1]:
#!/usr/bin/env python3
# ─── Cell 1: Imports ───────────────────────────────────────────────────────
# - mygene: query gene annotation service
# - pandas: data manipulation for gene lists and BED tables
from mygene import MyGeneInfo
import pandas as pd


# ─── Cell 2: Configuration ─────────────────────────────────────────────────
# Inputs
#   BED_FILE      – genome-wide BED; column 4 is expected to hold the Ensembl gene ID
#   GENELIST_FILE – plain-text gene list, one symbol per line
BED_FILE = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/mm39_ensembl.bed"
GENELIST_FILE = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/CTCF_3T3L1/results/annotation/GPS2_CTCF_common_genes.txt"

# Output
#   OUTPUT_FILE   – where the filtered subset of BED rows is written
OUTPUT_FILE = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/CTCF_3T3L1/results/annotation/GPS2_CTCF_common_genes_filtered.bed"


# ─── Cell 3: Load Gene Symbols & Map to Ensembl IDs ────────────────────────
# 1) Read symbols into a list.
#    dtype=str + keep_default_na=False keep every symbol exactly as written;
#    otherwise pandas would turn a symbol spelled like one of its NA markers
#    (e.g. "NA", "nan", "null") into a missing value before the query.
genes = pd.read_csv(
    GENELIST_FILE,
    header=None,
    dtype=str,
    keep_default_na=False
)[0].tolist()

# 2) Query MyGeneInfo for each symbol, retrieving 'ensembl.gene'
mg      = MyGeneInfo()
results = mg.querymany(
    genes,
    scopes='symbol',
    fields='ensembl.gene',
    species='mouse'
)

# 3) Build DataFrame and keep only successful mappings.
#    pandas only creates a column for a key that occurs in at least one result
#    record, so 'notfound' is absent when every symbol was matched and
#    'ensembl' is absent when no hit carries an Ensembl annotation. Indexing
#    those columns unconditionally would raise KeyError in exactly those cases.
df = pd.DataFrame(results)
if 'notfound' in df.columns:
    # Rows without the flag hold NaN here, and NaN != True keeps them.
    df = df[df['notfound'] != True]
# reindex selects the two columns and creates any missing one filled with NaN.
df = df.reindex(columns=['query', 'ensembl'])

# Helper to extract the actual Ensembl gene ID
def extract_ensembl(x):
    """Return a single Ensembl gene ID from one value of the 'ensembl' column.

    Parameters
    ----------
    x : dict, list, or any other value
        A dict is read through its 'gene' key. A list is scanned in order and
        the first entry that is a dict with a non-None 'gene' wins. Any other
        value (e.g. NaN for a hit without Ensembl annotation) has no ID.

    Returns
    -------
    The 'gene' value found, or None when there is none.

    Notes
    -----
    Only the first usable ID of a list is returned; further IDs in the same
    list are ignored. An empty list or a list with non-dict entries yields
    None instead of raising.
    """
    if isinstance(x, dict):
        return x.get('gene')
    if isinstance(x, list):
        for entry in x:
            # Skip malformed entries rather than failing on x[0].
            if isinstance(entry, dict) and entry.get('gene') is not None:
                return entry['gene']
    return None

# 4) Resolve every hit to an Ensembl ID, keep it next to the hit, and collapse
#    the non-missing values into a set of unique IDs
resolved_ids     = df['ensembl'].apply(extract_ensembl)
df['ensembl_id'] = resolved_ids
ensembl_ids      = set(resolved_ids[resolved_ids.notna()])


# ─── Cell 4: Load BED & Filter by Ensembl IDs ─────────────────────────────
# Define column names for the BED file (12-column UCSC-style).
# Column 4 is named 'gene_id' because it is matched against the Ensembl IDs.
bed_cols = [
    'chr','start','end','gene_id','score','strand',
    'thickStart','thickEnd','itemRgb','blockCount',
    'blockSizes','blockStarts'
]

# Read the BED into a DataFrame.
# dtype=str disables type inference so every field is written back exactly as
# it was read: no integer column silently becomes float (e.g. 123 -> 123.0)
# when a value is missing, and no mixed-type chromosome column.
bed = pd.read_csv(
    BED_FILE,
    sep='\t',
    header=None,
    names=bed_cols,
    dtype=str
)

# A BED with fewer than 12 columns is padded with all-missing columns by
# `names=`. Drop those optional columns so the output does not gain empty
# trailing fields. The first four columns are always kept.
absent_cols = [col for col in bed_cols[4:] if bed[col].isna().all()]
bed = bed.drop(columns=absent_cols)

# Keep only rows whose gene_id column is in our set of Ensembl IDs
filtered_bed = bed[bed['gene_id'].isin(ensembl_ids)]


# ─── Cell 5: Write Filtered BED ───────────────────────────────────────────
# Plain tab-separated output: no header row and no index column
filtered_bed.to_csv(OUTPUT_FILE, sep='\t', index=False, header=False)

# Report where the result was written
print(f"Done! Filtered BED written to: {OUTPUT_FILE}")


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
153 input query terms found dup hits:	[('Gm26901', 2), ('Gm33222', 2), ('Gm6644', 2), ('Gm29083', 2), ('2900060B14Rik', 2), ('2610027F03Ri
14 input query terms found no hit:	['Bves', 'Fut11', 'LOC118567337', '2310039H08Rik', 'LOC102631992', 'Gm50217', 'Tmem88b', 'Sagsin1', 


✅ Done! Filtered BED written to: /projectnb/perissilab/Xinyu/GPS2_CHIPseq/CTCF_3T3L1/results/annotation/GPS2_CTCF_common_genes_filtered.bed
