In [24]:
import pandas as pd
import numpy as np
import os
import warnings

# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session; the option accepts 'warn', 'raise' or None.
pd.set_option('mode.chained_assignment', None)

In [25]:
# Read the BrainVar mean CPM table for snRNA and rRNA intervals.
# The file has no header row, so column names are supplied explicitly.
brainvar_columns = ['chr', 'start', 'end', 'meanCPM']
brainvar_wg = pd.read_csv('brainvar_ensembl_rRNA_snRNA_meanCPM.txt', sep='\t', names=brainvar_columns)

# Input coordinates are zero-based, half-open; shifting the start by one
# turns them into 1-based coordinates (the end column is left as it is).
brainvar_wg['start'] += 1

# Keep only intervals whose meanCPM is present and non-zero.
has_expression = brainvar_wg['meanCPM'].notna() & (brainvar_wg['meanCPM'] != 0.00000)
brainvar_wg = brainvar_wg[has_expression]

In [35]:
# Read the Ensembl exon coordinates (headerless file) and discard rRNA exons,
# leaving the remaining biotypes for annotation.
ensembl_columns = ['chr', 'start', 'end', 'gene_id', 'gene_name', 'biotype']
ens_genes = pd.read_csv('ensembl_biotype_exons.txt', sep='\t', names=ensembl_columns)
ens_genes = ens_genes[ens_genes['biotype'].ne('rRNA')]

# Annotate the BrainVar intervals by exact coordinate match. The left merge
# leaves biotype as NaN for intervals without a (non-rRNA) Ensembl exon, and
# those rows are then dropped.
coordinate_keys = ['chr', 'start', 'end']
output_columns = ['chr', 'start', 'end', 'gene_id', 'gene_name', 'biotype', 'meanCPM']
brainvar_ens = brainvar_wg.merge(ens_genes, on=coordinate_keys, how='left')
brainvar_ens = brainvar_ens[output_columns].dropna(subset=['biotype'])


In [40]:
# Keep only the annotated intervals with meanCPM > 5.
brainvar_ens_cpm5 = brainvar_ens[brainvar_ens['meanCPM'] > 5]

# Drop entries whose gene name ends with 'P' (the naming pattern used here to
# spot pseudogenes). na=False makes a missing gene_name count as "does not end
# with 'P'": without it the mask contains NaN and the `~` inversion raises a
# TypeError, because only biotype was NaN-filtered upstream.
name_ends_with_p = brainvar_ens_cpm5['gene_name'].str.endswith('P', na=False)
brainvar_ens_cpm5 = brainvar_ens_cpm5[~name_ends_with_p]

# manually curated out ENSG00000273694, ENSG00000283489, ENSG00000283509 as these genes are known to be pseudogenes
manual_pseudo = ['ENSG00000273694', 'ENSG00000283489', 'ENSG00000283509']
brainvar_ens_cpm5 = brainvar_ens_cpm5[~brainvar_ens_cpm5['gene_id'].isin(manual_pseudo)]

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('results', exist_ok=True)
brainvar_ens_cpm5.to_csv('results/brainvar_snRNA_cpm5.txt', sep='\t', index=False)
