In [2]:
import pandas as pd

# load a list of house-keeping genes
hk_df = pd.read_csv('../references/mm10.HouseKeepingGene.bed', sep='\t', header=None)
hk_df.columns = ['chr', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'ItemRgb', 'blockCount', 'blockSizes', 'blockStarts']
hk_df = hk_df.set_index('name')
refseq_id = list(hk_df.index.unique())

# load gtf file to get chr positions
gtf  = pd.read_csv("../references/genome.gtf", sep='\t', header=None, skiprows=5, usecols=[0, 3, 4, 8])
gtf.columns = ['chr', 'chrstart', 'chrend', 'Gene stable ID']
gtf['Gene stable ID'] = gtf['Gene stable ID'].map(lambda x: x.split(';')[0].replace('gene_id "', '').replace('"', '') if 'transcript_id' not in x else None)
gtf['chr'] = 'chr' + gtf['chr'].astype(str)
gtf.dropna(inplace=True)

# load biomart annotation to get refseq ids
biomart_df = pd.read_csv("../references/biomart_export.txt", sep='\t')
biomart_hk = biomart_df[biomart_df["RefSeq mRNA ID"].isin(refseq_id)]
biomart_hk = biomart_hk.merge(gtf, on='Gene stable ID').set_index('RefSeq mRNA ID')

# create a bed file
merge_col = ['Gene stable ID', 'chrstart', 'chrend']
merge_df = hk_df.merge(biomart_hk[merge_col], left_index=True, right_index=True).reset_index()
bed_file = merge_df[['chr', 'chrstart', 'chrend', 'Gene stable ID', 'score', 'strand', 'thickStart', 'thickEnd', 'ItemRgb', 'blockCount', 'blockSizes', 'blockStarts']]

In [85]:

bed_file

Unnamed: 0,chr,chrstart,chrend,name,score,strand,thickStart,thickEnd,ItemRgb,blockCount,blockSizes,blockStarts
0,chr1,36737195,36753503,NM_146107,0,-,36700075,36709853,0,11,9774162175932171251267665120,"0,1135,1334,1932,2219,2483,2857,3273,7668,8989..."
1,chr1,72346586,72434111,NM_009533,0,+,72307546,72394722,0,21,"147,114,184,49,123,192,115,139,113,63,138,91,1...","0,3014,4987,6733,8038,11578,17653,18795,22505,..."
2,chr1,127796510,127871605,NM_178690,0,+,127868811,127942610,0,24,"57,56,76,133,79,120,166,100,82,69,74,93,167,90...","0,152,5307,20383,22245,32806,34899,41078,46866..."
3,chr1,130981437,131025563,NM_008551,0,-,131055091,131097525,0,10,15328186125761278065140255,02055222527103185379042974648491943585
4,chr1,135190450,135211822,NM_145417,0,-,135262970,135284008,0,11,431143225109113114236117149141523,"0,1354,2484,4171,4471,5384,8830,9663,14690,160..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4249,chr19,52920357,53028645,NM_133216,0,-,52991590,53038545,0,21,"540,99,77,174,70,61,72,78,59,51,90,211,82,96,1...","0,4241,5069,5720,6694,7972,9209,11294,11499,12..."
4250,chr19,53367821,53379009,NM_172429,0,-,53380463,53389422,0,6,1388154162143120158,02027434352401008911202
4251,chr19,56385561,56430776,NM_007611,0,+,56404415,56441095,0,7,1891101371291761301478,072873611336673390983959043737
4252,chr19,59285610,59334212,NM_001033222,0,-,59299522,59345587,0,5,56251631031231062,09074191743166748635
